diff --git a/README.md b/README.md
index 585479f7..d141dfe0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,67 @@
+### Change log [2024-04-09 11:08:55]
+1. Item Updated: `coxph_trainer` (from version: `1.1.0` to `1.1.0`)
+2. Item Updated: `load_dataset` (from version: `1.2.0` to `1.2.0`)
+3. Item Updated: `question_answering` (from version: `0.4.0` to `0.4.0`)
+4. Item Updated: `azureml_serving` (from version: `1.1.0` to `1.1.0`)
+5. Item Updated: `hugging_face_serving` (from version: `1.1.0` to `1.1.0`)
+6. Item Updated: `concept_drift_streaming` (from version: `1.1.0` to `1.1.0`)
+7. Item Updated: `sql_to_file` (from version: `1.1.0` to `1.1.0`)
+8. Item Updated: `transcribe` (from version: `1.1.0` to `1.1.0`)
+9. Item Updated: `v2_model_tester` (from version: `1.1.0` to `1.1.0`)
+10. Item Updated: `xgb_test` (from version: `1.1.1` to `1.1.1`)
+11. Item Updated: `describe_spark` (from version: `1.1.0` to `1.1.0`)
+12. Item Updated: `test_classifier` (from version: `1.1.0` to `1.1.0`)
+13. Item Updated: `coxph_test` (from version: `1.1.0` to `1.1.0`)
+14. Item Updated: `tf2_serving` (from version: `1.1.0` to `1.1.0`)
+15. Item Updated: `sklearn_classifier` (from version: `1.1.1` to `1.1.1`)
+16. Item Updated: `churn_server` (from version: `1.1.0` to `1.1.0`)
+17. Item Updated: `pii_recognizer` (from version: `0.3.0` to `0.3.0`)
+18. Item Updated: `gen_class_data` (from version: `1.2.0` to `1.2.0`)
+19. Item Updated: `open_archive` (from version: `1.1.0` to `1.1.0`)
+20. Item Updated: `structured_data_generator` (from version: `1.5.0` to `1.5.0`)
+21. Item Updated: `describe` (from version: `1.2.0` to `1.2.0`)
+22. Item Updated: `xgb_trainer` (from version: `1.1.1` to `1.1.1`)
+23. Item Updated: `xgb_serving` (from version: `1.1.2` to `1.1.2`)
+24. Item Updated: `send_email` (from version: `1.2.0` to `1.2.0`)
+25. Item Updated: `tf2_serving_v2` (from version: `1.1.0` to `1.1.0`)
+26. Item Updated: `get_offline_features` (from version: `1.2.0` to `1.2.0`)
+27. Item Updated: `slack_notify` (from version: `1.1.0` to `1.1.0`)
+28. Item Updated: `model_server_tester` (from version: `1.1.0` to `1.1.0`)
+29. Item Updated: `arc_to_parquet` (from version: `1.4.1` to `1.4.1`)
+30. Item Updated: `bert_embeddings` (from version: `1.3.0` to `1.3.0`)
+31. Item Updated: `feature_perms` (from version: `1.1.0` to `1.1.0`)
+32. Item Updated: `concept_drift` (from version: `1.1.0` to `1.1.0`)
+33. Item Updated: `describe_dask` (from version: `1.1.0` to `1.1.0`)
+34. Item Updated: `batch_inference` (from version: `1.7.0` to `1.7.0`)
+35. Item Updated: `model_monitoring_stream` (from version: `1.1.0` to `1.1.0`)
+36. Item Updated: `huggingface_auto_trainer` (from version: `1.1.0` to `1.1.0`)
+37. Item Updated: `feature_selection` (from version: `1.4.0` to `1.4.0`)
+38. Item Updated: `pyannote_audio` (from version: `1.2.0` to `1.2.0`)
+39. Item Updated: `ingest` (from version: `1.1.0` to `1.1.0`)
+40. Item Updated: `batch_inference_v2` (from version: `2.5.0` to `2.5.0`)
+41. Item Updated: `validate_great_expectations` (from version: `1.1.0` to `1.1.0`)
+42. Item Updated: `model_server` (from version: `1.1.0` to `1.1.0`)
+43. Item Updated: `xgb_custom` (from version: `1.1.0` to `1.1.0`)
+44. Item Updated: `snowflake_dask` (from version: `1.1.0` to `1.1.0`)
+45. Item Updated: `azureml_utils` (from version: `1.3.0` to `1.3.0`)
+46. Item Updated: `github_utils` (from version: `1.1.0` to `1.1.0`)
+47. Item Updated: `pandas_profiling_report` (from version: `1.1.0` to `1.1.0`)
+48. 
Item Updated: `translate` (from version: `0.1.0` to `0.1.0`) +49. Item Updated: `silero_vad` (from version: `1.3.0` to `1.3.0`) +50. Item Updated: `tf1_serving` (from version: `1.1.0` to `1.1.0`) +51. Item Updated: `model_monitoring_batch` (from version: `1.1.0` to `1.1.0`) +52. Item Updated: `hugging_face_classifier_trainer` (from version: `0.3.0` to `0.3.0`) +53. Item Updated: `stream_to_parquet` (from version: `1.1.0` to `1.1.0`) +54. Item Updated: `load_dask` (from version: `1.1.0` to `1.1.0`) +55. Item Updated: `text_to_audio_generator` (from version: `1.2.0` to `1.2.0`) +56. Item Updated: `virtual_drift` (from version: `1.1.0` to `1.1.0`) +57. Item Updated: `aggregate` (from version: `1.3.0` to `1.3.0`) +58. Item Updated: `auto_trainer` (from version: `1.7.0` to `1.7.0`) +59. Item Updated: `v2_model_server` (from version: `1.1.0` to `1.1.0`) +60. Item Updated: `rnn_serving` (from version: `1.1.0` to `1.1.0`) +61. Item Updated: `sklearn_classifier_dask` (from version: `1.1.1` to `1.1.1`) +62. Item Updated: `onnx_utils` (from version: `1.2.0` to `1.2.0`) + ### Change log [2024-04-08 12:13:34] 1. Item Updated: `coxph_trainer` (from version: `1.1.0` to `1.1.0`) 2. Item Updated: `load_dataset` (from version: `1.2.0` to `1.2.0`) diff --git a/catalog.json b/catalog.json index 691963bf..5071f6ca 100644 --- a/catalog.json +++ b/catalog.json @@ -1 +1 @@ -{"functions": {"development": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": 
"", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": 
"mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test 
model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": 
"tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", 
"handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": 
"feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.1.1": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT 
based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": 
"model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": 
{"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0"}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": 
"3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": 
"mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", 
"kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": 
"", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": 
"Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": 
"yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": 
["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", 
"icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", 
"machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": 
"mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}}, 
"model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", 
"platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0"}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", 
"image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model 
servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", 
"version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.8.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX 
intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "onnx_utils", "platformVersion": "", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some 
utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "1.2.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "0.9.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", 
"labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1"}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", 
"handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4"}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", 
"example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict 
functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": 
["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", 
"platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset 
using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": 
"mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": 
"pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": 
"eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, 
"1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": 
"structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", 
"torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": 
"guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}}}, "master": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", 
"example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": 
"yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", 
"handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", 
"tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", 
"labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", 
"example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", 
"spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", 
"handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", 
"requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": 
["pandas_profiling"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": 
"deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": 
"model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a 
stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", 
"generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": 
"gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object 
archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": 
"0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", 
"handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", 
"platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", 
"kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", 
"version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", 
"machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", 
"generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": 
"3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": 
{"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", 
"generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": 
"job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model 
server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration 
in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], 
"description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.2": {"apiVersion": "v1", 
"categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML 
platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic 
train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": 
"1.4.0"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and 
LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": 
"v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", 
"kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question 
answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}}, 
"pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", 
"https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1"}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": 
false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing 
data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": 
"Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": 
"structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": 
"silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}}}}} \ No newline at end of file +{"functions": {"development": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", 
"generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": 
{"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": 
{"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": 
"0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL 
query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": 
"", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": 
"2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": 
["torch"]}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}}, 
"describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model 
server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0"}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": 
"load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": 
"0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, 
"icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", 
"example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", 
"example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": 
"describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": 
"Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", 
"example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": 
"0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": 
"tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": 
"model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", 
"example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0"}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": 
"mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox 
proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": 
{"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.8.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX 
integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "onnx_utils", "platformVersion": "", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some 
utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "1.2.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "0.9.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", 
"labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1"}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", 
"handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4"}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", 
"example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict 
functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": 
["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", 
"platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset 
using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": 
"mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": 
"pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": 
"eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, 
"1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": 
"structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", 
"torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": 
"guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}}}, "master": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", 
"example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": 
"yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", 
"handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", 
"tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", 
"labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", 
"example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", 
"spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": 
"3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": 
"", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": 
"handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", 
"requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", 
"labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.8.0": {"apiVersion": 
"v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can 
lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": 
["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": 
{"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": 
["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", 
"handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": 
"concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, 
"url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", 
"image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": 
["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock 
analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "0.0.1"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and 
save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", 
"handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", 
"doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, 
"0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX integration in MLRun, some utility functions for the ONNX framework, optimizing and converting models from different frameworks to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": 
{"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", 
"kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": 
"azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, 
"0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", 
"image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", 
"doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}}, "hugging_face_serving": {"latest": 
{"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", 
"requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach 
of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", 
"tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", 
"flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1"}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", 
"platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference 
(also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", 
"spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": 
["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": 
["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", 
"labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", 
"audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}}}}} \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/src/bert_embeddings.ipynb b/functions/master/bert_embeddings/1.3.0/src/bert_embeddings.ipynb new file mode 100644 index 00000000..cb6d5584 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/src/bert_embeddings.ipynb @@ -0,0 +1,503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BERT Embeddings Serverless Function\n", + "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Embeddings without bert" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", + "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# some sentences to do examine\n", + "sentences = ['the quick brown fox jumps over the lazy dog',\n", + " 'Hello I am Jacob',\n", + " 'Daniel visited Tel-Aviv last month']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "lets see the difference between bert embeddings and one-hot encoding" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" + ] + } + ], + "source": [ + "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", + "tokens = []\n", + "for sentence in sentences:\n", + " for word in sentence.split():\n", + " tokens.append(word) if word not in tokens else \"\"\n", + "print(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# constructing the one hot vector\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", + "# filling our empty dataframe with each sentence encoding\n", + "for sentence in sentences:\n", + " vector = np.zeros(len(tokens))\n", + " for word in sentence.split():\n", + " vector[tokens.index(word)]=1\n", + " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", + "one_hot.columns = tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", + "
" + ], + "text/plain": [ + " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", + "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " Daniel visited Tel-Aviv last month \n", + "0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 1.0 1.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_hot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", + "this representation is very slim and will be a very weak learning dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introducing Bert embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from mlrun import import_function, auto_mount" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# importing the function from the hub\n", + "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-02-02 09:29:59,002 [info] Starting remote function deploy\n", + "2023-02-02 09:29:59 (info) Deploying function\n", + "2023-02-02 09:29:59 (info) Building\n", + "2023-02-02 09:29:59 (info) Staging files and preparing base images\n", + "2023-02-02 09:29:59 (info) Building processor image\n", + "2023-02-02 09:32:09 (info) Build complete\n", + "2023-02-02 09:32:35 (info) Function deploy complete\n", + "> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}\n" + ] + } + ], + "source": [ + "# deploying the function\n", + "addr = fn.deploy()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "# sending a request to the function endpoint to get the sentences' embeddings\n", + "resp = requests.post(addr, json=json.dumps(sentences))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "output_embeddings = pickle.loads(resp.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" + ] + } + ], + "source": [ + "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    0 1 2 3 4 5 6 7 8 9 ... 758 759 760 761 762 763 764 765 766 767
0   -0.733322 -0.223540  0.342462  0.383463 -0.164796  0.040522  0.802845  0.152842  0.331639 -0.999779 ...  0.206564  0.231415  0.196433  0.797908  0.435175  0.749370  0.246098  0.427603 -0.577384  0.842063
1   -0.953005 -0.535132 -0.743822  0.893934  0.646276 -0.279388  0.943513  0.275504 -0.555109 -0.999992 ...  0.582386 -0.004614  0.976079  0.931517 -0.391442  0.530384  0.675933 -0.682721 -0.746339  0.957809
2   -0.843678 -0.453405 -0.826011  0.650805  0.494036 -0.154117  0.821642  0.349507 -0.650629 -0.999978 ...  0.618286 -0.336700  0.936262  0.857577 -0.787489  0.246137  0.676243 -0.612532 -0.708786  0.840879
\n", + "

3 rows × 768 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", + "1 -0.953005 -0.535132 -0.743822 0.893934 0.646276 -0.279388 0.943513 \n", + "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", + "\n", + " 7 8 9 ... 758 759 760 761 \\\n", + "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196433 0.797908 \n", + "1 0.275504 -0.555109 -0.999992 ... 0.582386 -0.004614 0.976079 0.931517 \n", + "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", + "\n", + " 762 763 764 765 766 767 \n", + "0 0.435175 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", + "1 -0.391442 0.530384 0.675933 -0.682721 -0.746339 0.957809 \n", + "2 -0.787489 0.246137 0.676243 -0.612532 -0.708786 0.840879 \n", + "\n", + "[3 rows x 768 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(output_embeddings[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", + "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", + "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", + "Now you tell me, which encoding are you gonna use in your project ??" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/functions/master/bert_embeddings/1.3.0/src/bert_embeddings.py b/functions/master/bert_embeddings/1.3.0/src/bert_embeddings.py new file mode 100644 index 00000000..109081b1 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/src/bert_embeddings.py @@ -0,0 +1,41 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import pickle + +import torch +from transformers import BertModel, BertTokenizer + + +def init_context(context): + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertModel.from_pretrained("bert-base-uncased") + model.eval() + + setattr(context.user_data, "tokenizer", tokenizer) + setattr(context.user_data, "model", model) + + +def handler(context, event): + docs = json.loads(event.body) + docs = [doc.lower() for doc in docs] + docs = context.user_data.tokenizer.batch_encode_plus( + docs, pad_to_max_length=True, return_tensors="pt" + ) + + with torch.no_grad(): + embeddings = context.user_data.model(**docs) + embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] + return pickle.dumps(embeddings) diff --git a/functions/master/bert_embeddings/1.3.0/src/function.yaml b/functions/master/bert_embeddings/1.3.0/src/function.yaml new file mode 100644 index 00000000..15319c16 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/src/function.yaml @@ -0,0 +1,42 @@ +kind: remote +metadata: + name: bert-embeddings + tag: '' + hash: ecf6647fe4716e0df54ce50278b735034536a568 + project: '' + labels: + framework: pytorch + categories: + - huggingface + - machine-learning + - data-preparation + - pytorch +spec: + command: '' + args: [] + image: mlrun/mlrun + build: + functionSourceCode: 
IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - torch + description: Get BERT based embeddings for given text + default_handler: '' + disable_auto_mount: false + clone_target_dir: '' + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled + priority_class_name: '' + preemption_mode: prevent + min_replicas: 1 + max_replicas: 4 + source: '' + function_handler: bert_embeddings:handler + base_image_pull: false + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/bert_embeddings/1.3.0/src/item.yaml b/functions/master/bert_embeddings/1.3.0/src/item.yaml new file mode 100644 index 00000000..f96e54ea --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/src/item.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +categories: +- huggingface +- machine-learning +- data-preparation +- pytorch +description: Get BERT based embeddings for given text +doc: '' +example: bert_embeddings.ipynb +generationDate: 2022-08-28:17-25 +hidden: false +icon: '' +labels: + framework: pytorch +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.4.1 +name: bert-embeddings +platformVersion: 3.5.3 +spec: + filename: bert_embeddings.py + handler: handler + image: mlrun/mlrun + kind: nuclio + requirements: + - torch +url: '' +version: 1.3.0 diff --git a/functions/master/bert_embeddings/1.3.0/src/requirements.txt b/functions/master/bert_embeddings/1.3.0/src/requirements.txt new file mode 100644 index 00000000..747b7aa9 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/src/requirements.txt @@ -0,0 +1 @@ +transformers \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/src/test_bert_embeddings.py 
b/functions/master/bert_embeddings/1.3.0/src/test_bert_embeddings.py new file mode 100644 index 00000000..7ad9101c --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/src/test_bert_embeddings.py @@ -0,0 +1,32 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from bert_embeddings import init_context,handler +import nuclio +import json +import pickle +import numpy as np + +ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" +ARTIFACTS_PATH = 'artifacts' + + +def test_bert_embeddings(): + event = nuclio.Event(body=json.dumps(['John loves Mary'])) + ctx = nuclio.Context() + init_context(ctx) + outputs = pickle.loads(handler(ctx, event)) + assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True + assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 else False) is True + diff --git a/functions/master/bert_embeddings/1.3.0/static/bert_embeddings.html b/functions/master/bert_embeddings/1.3.0/static/bert_embeddings.html new file mode 100644 index 00000000..863a0261 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/static/bert_embeddings.html @@ -0,0 +1,181 @@ + + + + + + + +bert_embeddings.bert_embeddings + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + +
+
+ +
+
+
+
+
+ +
+

+ +
+
+
+
+
+
+
+

Source code for bert_embeddings.bert_embeddings

+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import json
+import pickle
+
+import torch
+from transformers import BertModel, BertTokenizer
+
+
+
[docs]def init_context(context): + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertModel.from_pretrained("bert-base-uncased") + model.eval() + + setattr(context.user_data, "tokenizer", tokenizer) + setattr(context.user_data, "model", model)
+ + +
[docs]def handler(context, event): + docs = json.loads(event.body) + docs = [doc.lower() for doc in docs] + docs = context.user_data.tokenizer.batch_encode_plus( + docs, pad_to_max_length=True, return_tensors="pt" + ) + + with torch.no_grad(): + embeddings = context.user_data.model(**docs) + embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] + return pickle.dumps(embeddings)
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/static/documentation.html b/functions/master/bert_embeddings/1.3.0/static/documentation.html new file mode 100644 index 00000000..b99c805d --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/static/documentation.html @@ -0,0 +1,230 @@ + + + + + + + +bert_embeddings package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ +
+

bert_embeddings package

+ +
+ +
+
+
+
+
+

bert_embeddings package#

+
+

Submodules#

+
+
+

bert_embeddings.bert_embeddings module#

+
+
+bert_embeddings.bert_embeddings.handler(context, event)[source]#
+
+
+
+bert_embeddings.bert_embeddings.init_context(context)[source]#
+
+
+
+

Module contents#

+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/static/example.html b/functions/master/bert_embeddings/1.3.0/static/example.html new file mode 100644 index 00000000..59984155 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/static/example.html @@ -0,0 +1,584 @@ + + + + + + + +BERT Embeddings Serverless Function + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ +
+

BERT Embeddings Serverless Function

+ +
+ +
+
+
+
+
+

BERT Embeddings Serverless Function#

+

This notebook presents the deployment of a pretrained BERT model, which outputs embeddings for given textual sequences, as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are frequently used as input to various learning tasks in NLP.

+
+
+

Embeddings without bert#

+

One-Hot Encoding is a general method that can vectorize any categorical feature. The vectorization is simple and fast to create and update.
+In the case of text, each row is a sentence and each column is a word/character/n-gram.

+
+
+
# some sentences to examine
+sentences = ['the quick brown fox jumps over the lazy dog',
+              'Hello I am Jacob',
+              'Daniel visited Tel-Aviv last month']
+
+
+
+
+

Let's look at the difference between BERT embeddings and one-hot encoding.

+
+
+
# constructing a list of all the words (these will be our columns), making sure no word is duplicated
+tokens = []
+for sentence in sentences:
+    for word in sentence.split():
+        tokens.append(word) if word not in tokens else ""
+print(tokens)
+
+
+
+
+
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
+
+
+
+
+
+
+
# constructing the one hot vector
+import pandas as pd
+import numpy as np
+
+one_hot = pd.DataFrame(columns = range(len(tokens)))
+# filling our empty dataframe with each sentence encoding
+for sentence in sentences:
+    vector = np.zeros(len(tokens))
+    for word in sentence.split():
+        vector[tokens.index(word)]=1
+    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
+one_hot.columns = tokens
+
+
+
+
+
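For reference, a similar binary bag-of-words matrix can be built without the manual loop above. This is a minimal sketch that is not part of the original notebook and assumes scikit-learn is available; note that CountVectorizer lowercases and re-tokenizes by default, so columns such as Tel-Aviv will come out slightly differently from the hand-built version.

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)    # 1 if a word appears in the sentence, else 0
bow = vectorizer.fit_transform(sentences)    # sparse matrix with one row per sentence
one_hot_sklearn = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
print(one_hot_sklearn.shape)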
+
+
one_hot
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    the quick brown fox jumps over lazy dog Hello I am Jacob Daniel visited Tel-Aviv last month
0   1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1   0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
2   0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0
+
+
+

The table above shows the one-hot encoding of our sentences: each row is a sentence and each column is a word. +This representation is very sparse and makes for a weak learning dataset, as the quick check below shows.

+
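A quick check, not part of the original notebook, that makes the sparsity claim concrete, assuming one_hot is the DataFrame built above:

values = one_hot.to_numpy()
zero_entries = int((values == 0).sum())
print(f"shape: {values.shape}, zero entries: {zero_entries} of {values.size}")
# For these three sentences that is 34 zeros out of 51 cells, and every new word in the
# corpus adds another mostly-zero column.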
+
+

Introducing Bert embeddings#

+
+
+
from mlrun import import_function, auto_mount
+
+
+
+
+
+
+
# importing the function from the hub
+fn = import_function("hub://bert_embeddings").apply(auto_mount())
+
+
+
+
+
+
+
# deploying the function
+addr = fn.deploy()
+
+
+
+
+
> 2023-02-02 09:29:59,002 [info] Starting remote function deploy
+2023-02-02 09:29:59  (info) Deploying function
+2023-02-02 09:29:59  (info) Building
+2023-02-02 09:29:59  (info) Staging files and preparing base images
+2023-02-02 09:29:59  (info) Building processor image
+2023-02-02 09:32:09  (info) Build complete
+2023-02-02 09:32:35  (info) Function deploy complete
+> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}
+
+
+
+
+
+
+
import requests
+import json
+# sending a request to the function endpoint to get the sentences' embeddings
+resp = requests.post(addr, json=json.dumps(sentences))
+
+
+
+
+
+
+
import pickle
+output_embeddings = pickle.loads(resp.content)
+
+
+
+
+
+
+
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
+
+
+
+
+
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
+
+
+
+
+
+
+
pd.DataFrame(output_embeddings[1])
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    0 1 2 3 4 5 6 7 8 9 ... 758 759 760 761 762 763 764 765 766 767
0   -0.733322 -0.223540  0.342462  0.383463 -0.164796  0.040522  0.802845  0.152842  0.331639 -0.999779 ...  0.206564  0.231415  0.196433  0.797908  0.435175  0.749370  0.246098  0.427603 -0.577384  0.842063
1   -0.953005 -0.535132 -0.743822  0.893934  0.646276 -0.279388  0.943513  0.275504 -0.555109 -0.999992 ...  0.582386 -0.004614  0.976079  0.931517 -0.391442  0.530384  0.675933 -0.682721 -0.746339  0.957809
2   -0.843678 -0.453405 -0.826011  0.650805  0.494036 -0.154117  0.821642  0.349507 -0.650629 -0.999978 ...  0.618286 -0.336700  0.936262  0.857577 -0.787489  0.246137  0.676243 -0.612532 -0.708786  0.840879
+

3 rows × 768 columns

+
+
+

We can see that the first dimension of both outputs is three because we passed in three sequences. The intermediate dimension of the first output is the maximum number of tokens across all input sequences; sequences with fewer tokens are padded with zero values.
+Note that this intermediate dimension is 11, which corresponds to the token count of the longest input sentence after the tokenizer adds two special tokens marking the beginning and end of the sequence.
+The last dimension of both outputs is 768, which is the embedding size for this default BERT configuration.
+So, which encoding will you use in your project?
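A minimal sketch, not part of the original notebook, that checks these shape claims directly on the unpickled arrays; the mean-pooling step is only one common way to derive an alternative per-sentence vector:

token_embeddings, pooled_embeddings = output_embeddings
print(token_embeddings.shape)    # (3, 11, 768): sentences x padded tokens x hidden size
print(pooled_embeddings.shape)   # (3, 768): one vector per sentence
# Averaging over the token axis gives another per-sentence vector. This simple version
# also averages the zero padding; a masked mean would be more faithful.
sentence_vectors = token_embeddings.mean(axis=1)
print(sentence_vectors.shape)    # (3, 768)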

+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/static/function.html b/functions/master/bert_embeddings/1.3.0/static/function.html new file mode 100644 index 00000000..9ab9c5fd --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/static/function.html @@ -0,0 +1,64 @@ + + + + + + + + + + + Source + + + + +
+        
+kind: remote
+metadata:
+  name: bert-embeddings
+  tag: ''
+  hash: ecf6647fe4716e0df54ce50278b735034536a568
+  project: ''
+  labels:
+    framework: pytorch
+  categories:
+  - huggingface
+  - machine-learning
+  - data-preparation
+  - pytorch
+spec:
+  command: ''
+  args: []
+  image: mlrun/mlrun
+  build:
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
+    commands: []
+    code_origin: ''
+    origin_filename: ''
+    requirements:
+    - torch
+  description: Get BERT based embeddings for given text
+  default_handler: ''
+  disable_auto_mount: false
+  clone_target_dir: ''
+  env:
+  - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK
+    value: enabled
+  priority_class_name: ''
+  preemption_mode: prevent
+  min_replicas: 1
+  max_replicas: 4
+  source: ''
+  function_handler: bert_embeddings:handler
+  base_image_pull: false
+  affinity: null
+  tolerations: null
+  security_context: {}
+verbose: false
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/static/item.html b/functions/master/bert_embeddings/1.3.0/static/item.html new file mode 100644 index 00000000..25e99ef6 --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/static/item.html @@ -0,0 +1,50 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- huggingface
+- machine-learning
+- data-preparation
+- pytorch
+description: Get BERT based embeddings for given text
+doc: ''
+example: bert_embeddings.ipynb
+generationDate: 2022-08-28:17-25
+hidden: false
+icon: ''
+labels:
+  framework: pytorch
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.4.1
+name: bert-embeddings
+platformVersion: 3.5.3
+spec:
+  filename: bert_embeddings.py
+  handler: handler
+  image: mlrun/mlrun
+  kind: nuclio
+  requirements:
+  - torch
+url: ''
+version: 1.3.0
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/bert_embeddings/1.3.0/static/source.html b/functions/master/bert_embeddings/1.3.0/static/source.html new file mode 100644 index 00000000..1df4accf --- /dev/null +++ b/functions/master/bert_embeddings/1.3.0/static/source.html @@ -0,0 +1,63 @@ + + + + + + + + + + + Source + + + + +
+        
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import json
+import pickle
+
+import torch
+from transformers import BertModel, BertTokenizer
+
+
+def init_context(context):
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    model = BertModel.from_pretrained("bert-base-uncased")
+    model.eval()
+
+    setattr(context.user_data, "tokenizer", tokenizer)
+    setattr(context.user_data, "model", model)
+
+
+def handler(context, event):
+    docs = json.loads(event.body)
+    docs = [doc.lower() for doc in docs]
+    docs = context.user_data.tokenizer.batch_encode_plus(
+        docs, pad_to_max_length=True, return_tensors="pt"
+    )
+
+    with torch.no_grad():
+        embeddings = context.user_data.model(**docs)
+    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
+    return pickle.dumps(embeddings)
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/bert_embeddings/latest/src/function.yaml b/functions/master/bert_embeddings/latest/src/function.yaml index 4a3fcf54..15319c16 100644 --- a/functions/master/bert_embeddings/latest/src/function.yaml +++ b/functions/master/bert_embeddings/latest/src/function.yaml @@ -2,13 +2,15 @@ kind: remote metadata: name: bert-embeddings tag: '' - hash: 57a2ce8e0da1f6e813a8649e9ea6fcbb69a1ce5f + hash: ecf6647fe4716e0df54ce50278b735034536a568 project: '' labels: framework: pytorch categories: + - huggingface - machine-learning - data-preparation + - pytorch spec: command: '' args: [] @@ -16,15 +18,17 @@ spec: build: functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= commands: [] - code_origin: http://github.com/aviaIguazio/functions.git#a1c9940e4c2420c88063768b4038e29b1f4e37a6:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py - origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py + code_origin: '' + origin_filename: '' requirements: - torch description: Get BERT based embeddings for given text default_handler: '' disable_auto_mount: false clone_target_dir: '' - env: [] + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled priority_class_name: '' preemption_mode: prevent min_replicas: 1 diff --git a/functions/master/bert_embeddings/latest/src/item.yaml b/functions/master/bert_embeddings/latest/src/item.yaml index f0eaed1c..f96e54ea 100644 --- a/functions/master/bert_embeddings/latest/src/item.yaml +++ b/functions/master/bert_embeddings/latest/src/item.yaml @@ -1,7 +1,9 @@ apiVersion: v1 categories: +- huggingface - machine-learning - data-preparation +- pytorch description: Get BERT based embeddings for given text 
doc: '' example: bert_embeddings.ipynb @@ -23,4 +25,4 @@ spec: requirements: - torch url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/functions/master/bert_embeddings/latest/static/function.html b/functions/master/bert_embeddings/latest/static/function.html index 985f9e26..9ab9c5fd 100644 --- a/functions/master/bert_embeddings/latest/static/function.html +++ b/functions/master/bert_embeddings/latest/static/function.html @@ -19,13 +19,15 @@ metadata: name: bert-embeddings tag: '' - hash: 57a2ce8e0da1f6e813a8649e9ea6fcbb69a1ce5f + hash: ecf6647fe4716e0df54ce50278b735034536a568 project: '' labels: framework: pytorch categories: + - huggingface - machine-learning - data-preparation + - pytorch spec: command: '' args: [] @@ -33,15 +35,17 @@ build: functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= commands: [] - code_origin: http://github.com/aviaIguazio/functions.git#a1c9940e4c2420c88063768b4038e29b1f4e37a6:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py - origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py + code_origin: '' + origin_filename: '' requirements: - torch description: Get BERT based embeddings for given text default_handler: '' disable_auto_mount: false clone_target_dir: '' - env: [] + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled priority_class_name: '' preemption_mode: prevent min_replicas: 1 diff --git a/functions/master/bert_embeddings/latest/static/item.html b/functions/master/bert_embeddings/latest/static/item.html index 612e78b6..25e99ef6 100644 --- a/functions/master/bert_embeddings/latest/static/item.html +++ b/functions/master/bert_embeddings/latest/static/item.html @@ -17,8 +17,10 @@ apiVersion: v1 categories: +- huggingface - 
machine-learning - data-preparation +- pytorch description: Get BERT based embeddings for given text doc: '' example: bert_embeddings.ipynb @@ -40,7 +42,7 @@ requirements: - torch url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/functions/master/catalog.json b/functions/master/catalog.json index 3c25469b..cc0de364 100644 --- a/functions/master/catalog.json +++ b/functions/master/catalog.json @@ -1 +1 @@ -{"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": 
"tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": 
"yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", 
"platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": 
"feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": 
"feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/feature_selection.ipynb", "source": 
"src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", 
"machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": 
{"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": 
"src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": 
"2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", 
"platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": 
"aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": 
{"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, 
"icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/concept_drift.ipynb", 
"source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", 
"name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", 
"example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": 
"auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", 
"icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": 
"src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": 
{"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, 
"url": "", "version": "0.0.1", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", 
"machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "virtual_drift": {"latest": {"apiVersion": 
"v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", 
"requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": 
"handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": 
{}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", 
"spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and 
converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "1.1.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": 
["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": 
{"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": 
{"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": 
"handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": 
{"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", 
"categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": 
"", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": 
"1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", 
"assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", 
"requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", 
"tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", 
"platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": 
"answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": 
"transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", 
"function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": 
{"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", 
"labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", 
"image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": 
"1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity 
Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}} \ No newline at end of file +{"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": 
"nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": 
"", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1", "assets": 
{"example": "src/load_dask.ipynb", "source": "src/load_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": 
[]}, "url": "", "version": "0.9.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/sql_to_file.ipynb", "source": 
"src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": 
"src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": 
"slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": 
"yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the 
featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", 
"kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0", 
"assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", 
"platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", 
"categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": 
"pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", 
"framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": 
"auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", 
"icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": 
"src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": 
{"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, 
"url": "", "version": "0.0.1", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", 
"machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "virtual_drift": {"latest": {"apiVersion": 
"v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", 
"requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": 
"handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": 
{}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", 
"spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and 
converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "1.1.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": 
["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": 
{"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": 
{"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": 
"handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": 
{"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", 
"categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": 
"", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": 
"1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", 
"assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", 
"requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": 
"mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": 
"hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": 
"validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", 
"image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe 
audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": 
{"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", 
"flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", 
"requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": 
"batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while 
performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", 
"generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": 
"structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", 
"handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0", "assets": {"example": 
"src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}} \ No newline at end of file diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/src/function.yaml b/functions/master/hugging_face_classifier_trainer/0.3.0/src/function.yaml new file mode 100644 index 00000000..65f5aeb1 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/src/function.yaml @@ -0,0 +1,370 @@ +kind: job +metadata: + name: hugging-face-classifier-trainer + tag: '' + hash: f9d8aa4a2c66e24fa418bb163829adc3e2ada06c + project: '' + labels: + author: davids + categories: + - deep-learning + - huggingface + - machine-learning + - model-training +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset, load_dataset, load_metric
from mlrun import MLClientCtx
from mlrun import feature_store as fs
from mlrun.artifacts import Artifact, PlotlyArtifact
from mlrun.datastore import DataItem
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import create_class
from plotly import graph_objects as go
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)


# ----------------------from MLRUN--------------------------------
class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
    """
    Interface for adding MLRun features to the Hugging Face ONNX Runtime optimizer (optimum `ORTOptimizer`) API.
    """

    # MLRun's context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to be inserted so the MLRun interface will be fully enabled.
    _PROPERTIES = {
        "_auto_log": False,
        "_context": None,
        "_model_name": "model",
        "_tag": "",
        "_labels": None,
        "_extra_data": None,
    }
    _METHODS = ["enable_auto_logging"]
    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "optimize",
    ]

    @classmethod
    def add_interface(
        cls,
        obj,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        """
        Enrich the object with this interface's properties, methods and functions, so it will have MLRun's
        Hugging Face ONNX Runtime optimizer features.
        :param obj:                     The object whose interface is to be enriched.
        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
                                        add the interface in a certain state.
        """
        super(HFORTOptimizerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_optimize(cls):
        """
        MLRun's wrapper for the `ORTOptimizer.optimize` method. It calls the original optimize method and,
        when auto logging is enabled, logs the resulting ONNX model to the MLRun context.
        """

        def wrapper(self, *args, **kwargs):
            save_dir = cls._get_function_argument(
                self.optimize,
                argument_name="save_dir",
                passed_args=args,
                passed_kwargs=kwargs,
            )[0]

            # Call the original optimize method:
            result = self.original_optimize(*args, **kwargs)

            if self._auto_log:
                # Log the onnx model:
                self._context.log_model(
                    key="model",
                    db_key=self._model_name,
                    model_file=f"{save_dir}/model_optimized.onnx",
                    tag=self._tag,
                    framework="ONNX",
                    labels=self._labels,
                    extra_data=self._extra_data,
                )

            return result

        return wrapper

    def enable_auto_logging(
        self,
        context: mlrun.MLClientCtx,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        self._auto_log = True

        self._context = context
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data


class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    Interface for adding MLRun features to the Hugging Face transformers `Trainer` API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: Trainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        """
        Enrich the object with this interface's properties, methods and functions, so it will have MLRun's
        Hugging Face Trainer features.
        :param obj:                     The object whose interface is to be enriched.
        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
                                        add the interface in a certain state.
        """

        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):

        """
        MLRun's wrapper for the `transformers.Trainer.train` method. It calls the original train method,
        while the registered `MLRunCallback` handles logging the metrics and the trained model.
        """

        def wrapper(self: Trainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        self._log_metrics()

        temp_directory = tempfile.gettempdir()

        # Save and log the tokenizer:
        if tokenizer is not None:
            # Save tokenizer:
            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
            tokenizer.save_pretrained(save_directory=tokenizer_dir)
            # Zip the tokenizer directory:
            tokenizer_zip = shutil.make_archive(
                base_name="tokenizer",
                format="zip",
                root_dir=tokenizer_dir,
            )
            # Log the zip file:
            self._artifacts["tokenizer"] = self._context.log_artifact(
                item="tokenizer", local_path=tokenizer_zip
            )

        # Save the model:
        model_dir = os.path.join(temp_directory, "model")
        model.save_pretrained(save_directory=model_dir)

        # Zip the model directory:
        shutil.make_archive(
            base_name="model",
            format="zip",
            root_dir=model_dir,
        )

        # Log the model:
        self._context.log_model(
            key="model",
            db_key=self._model_name,
            model_file="model.zip",
            tag=self._tag,
            framework="Hugging Face",
            labels=self._labels,
            extra_data={**self._artifacts, **self._extra_data},
        )

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._log_metrics()

        if self._is_training:
            return

        # TODO: Update the model object

    def _log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self._log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def _log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def _apply_mlrun_on_trainer(
    trainer: transformers.Trainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


def _apply_mlrun_on_optimizer(
    optimizer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(
            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
        )

    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)

    if auto_log:
        optimizer.enable_auto_logging(
            context=context,
            model_name=model_name,
            tag=tag,
            labels=labels,
            extra_data=extra_data,
        )


def apply_mlrun(
    huggingface_object,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
    :param model_name:         The model name to use for storing the model artifact. Default: "model".
    :param tag:                The model's tag to log with.
    :param context:            MLRun context to work with. If no context is given, it will be retrieved via
                               'mlrun.get_or_create_ctx' using the interface's default context name.
    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
    """

    if isinstance(huggingface_object, transformers.Trainer):
        return _apply_mlrun_on_trainer(
            trainer=huggingface_object,
            model_name=model_name,
            tag=tag,
            context=context,
            auto_log=auto_log,
            labels=labels,
            extra_data=extra_data,
        )
    import optimum.onnxruntime as optimum_ort

    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
        return _apply_mlrun_on_optimizer(
            optimizer=huggingface_object,
            model_name=model_name,
            tag=tag,
            context=context,
            auto_log=auto_log,
            labels=labels,
            extra_data=extra_data,
        )
    raise mlrun.errors.MLRunInvalidArgumentError
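
# Illustrative sketch: a minimal example of how `apply_mlrun` can be used with a
# `transformers.Trainer`. The trainer object and names below are hypothetical; wrapping
# the trainer registers the `MLRunCallback`, so metrics and the trained model are logged
# to the active MLRun context during `trainer.train()`.
def _example_apply_mlrun(trainer: transformers.Trainer):
    apply_mlrun(trainer, model_name="my-model", tag="v1")
    trainer.train()
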


# ---------------------- from auto_trainer--------------------------------
class KWArgsPrefixes:
    MODEL_CLASS = "CLASS_"
    FIT = "FIT_"
    TRAIN = "TRAIN_"
    PREDICT = "PREDICT_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }
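
# Illustrative sketch: how the `KWArgsPrefixes` convention is resolved with
# `_get_sub_dict_by_prefix`. The parameter names and values below are hypothetical.
def _example_prefix_split():
    params = {"TRAIN_num_train_epochs": 3, "TRAIN_output_dir": "out", "CLASS_num_labels": 2}
    train_kwargs = _get_sub_dict_by_prefix(src=params, prefix_key=KWArgsPrefixes.TRAIN)
    model_kwargs = _get_sub_dict_by_prefix(src=params, prefix_key=KWArgsPrefixes.MODEL_CLASS)
    # train_kwargs == {"num_train_epochs": 3, "output_dir": "out"}
    # model_kwargs == {"num_labels": 2}
    return train_kwargs, model_kwargs
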


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Get the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset, for regression or
                            classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
    """
    if isinstance(dataset, (list, dict)):
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                context.logger.error(
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                raise ValueError
            dataset.drop(drop_columns, axis=1, inplace=True)

        return dataset, label_columns

    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        # feature-vector case:
        label_columns = label_columns or dataset.meta.status.label_column
        dataset = fs.get_offline_features(
            dataset.meta.uri, drop_columns=drop_columns
        ).to_dataframe()

        context.logger.info(f"label columns: {label_columns}")
    else:
        # simple URL case:
        dataset = dataset.as_df()
        if drop_columns:
            if all(col in dataset for col in drop_columns):
                dataset = dataset.drop(drop_columns, axis=1)
            else:
                context.logger.info(
                    "not all of the columns to drop in the dataset, drop columns process skipped"
                )
    return dataset, label_columns


# ---------------------- Hugging Face Trainer --------------------------------


def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
    """
    This function creates and returns a function that will be used to compute metrics at evaluation.
    :param metrics: List of metrics to evaluate the model with, such as f1, accuracy, etc.

    :returns: A function that will be used to compute metrics at evaluation.
             It must take an [`EvalPrediction`] and return a dictionary mapping metric names to metric values.
    """

    def _compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        metric_dict_results = {}
        for metric in metrics:
            load_met = load_metric(metric)
            metric_res = load_met.compute(predictions=predictions, references=labels)[
                metric
            ]
            metric_dict_results[metric] = metric_res

        return metric_dict_results

    return _compute_metrics
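
# Illustrative sketch: building a metric-computation callable for the Hugging Face
# `Trainer`. The metric names are hypothetical examples; any metric name accepted by
# `datasets.load_metric` can be used.
def _example_compute_metrics_factory():
    compute_metrics = _create_compute_metrics(["accuracy", "f1"])
    # The returned callable can be passed to `Trainer(compute_metrics=compute_metrics)`.
    return compute_metrics
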


def _edit_columns(
    dataset: Dataset,
    drop_columns: List[str] = None,
    rename_columns: Dict[str, str] = None,
) -> Dataset:
    """
    Drops and renames columns of the given dataset.
    :param dataset:         Dataset to process.
    :param drop_columns:    The columns to drop from the dataset.
    :param rename_columns:  Dict of columns to rename: {<old_name>: <new_name>, ...}

    :returns: The dataset after the requested processing.
    """
    if drop_columns:
        dataset = dataset.remove_columns(drop_columns)
    if rename_columns:
        dataset = dataset.rename_columns(rename_columns)
    return dataset


def _prepare_dataset(
    context: MLClientCtx,
    dataset_name: str,
    label_name: str = None,
    drop_columns: Optional[List[str]] = None,
    num_of_train_samples: int = None,
    train_test_split_size: float = None,
    random_state: int = None,
) -> Tuple[Dataset, Dataset]:
    """
    Loading the dataset and editing the columns

    :param context:                 MLRun context
    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
    :param label_name:              The target label of the column in the dataset.
    :param drop_columns:            The columns to drop from the dataset.
    :param num_of_train_samples:    Max number of training samples, for debugging.
    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split.
    :param random_state:            Random state for train_test_split

    """

    context.logger.info(
        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
    )
    rename_cols = {label_name: "labels"}

    # Loading and editing dataset:
    dataset = load_dataset(dataset_name)

    # train set
    train_dataset = dataset["train"]
    if num_of_train_samples:
        train_dataset = train_dataset.shuffle(seed=random_state).select(
            list(range(num_of_train_samples))
        )
    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)

    # test set
    test_dataset = dataset["test"]
    if train_test_split_size or num_of_train_samples:
        train_test_split_size = train_test_split_size or 0.2
        num_of_test_samples = int(
            (train_dataset.num_rows * train_test_split_size)
            // (1 - train_test_split_size)
        )
        test_dataset = test_dataset.shuffle(seed=random_state).select(
            list(range(num_of_test_samples))
        )
    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)

    return train_dataset, test_dataset
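
# Illustrative sketch: loading a Hugging Face hub dataset the way `train` does when
# `hf_dataset` is passed. The dataset name, label column and sample sizes below are
# hypothetical.
def _example_prepare_dataset():
    ctx = mlrun.get_or_create_ctx("hf-trainer-example")
    train_set, test_set = _prepare_dataset(
        context=ctx,
        dataset_name="emotion",
        label_name="label",
        num_of_train_samples=100,
        random_state=42,
    )
    return train_set, test_set
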


def train(
    context: MLClientCtx,
    hf_dataset: str = None,
    dataset: DataItem = None,
    test_set: DataItem = None,
    drop_columns: Optional[List[str]] = None,
    pretrained_tokenizer: str = None,
    pretrained_model: str = None,
    model_class: str = None,
    model_name: str = "huggingface-model",
    label_name: str = "labels",
    text_col: str = "text",
    num_of_train_samples: int = None,
    train_test_split_size: float = None,
    metrics: List[str] = None,
    random_state: int = None,
):
    """
    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
    The dataset can either be the name of a dataset hosted on the HuggingFace hub,
    or a URI or a FeatureVector.

    :param context:                 MLRun context
    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param test_set:                The test set to train the model with.
    :param drop_columns:            The columns to drop from the dataset.
    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
    :param model_name:              The model's name to use for storing the model artifact. Default: 'huggingface-model'.
    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
    :param label_name:              The target label of the column in the dataset.
    :param text_col:                The input text column in the dataset.
    :param num_of_train_samples:    Max number of training samples, for debugging.
    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split.
    :param metrics:                 List of metrics to evaluate the model with, such as f1, accuracy, etc.
    :param random_state:            Random state for train_test_split
    """

    if train_test_split_size is None and test_set is None:
        context.logger.info(
            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
        )
        train_test_split_size = 0.2

    # Creating tokenizer:
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)

    def preprocess_function(examples):
        return tokenizer(examples[text_col], truncation=True)

    # prepare data for training
    if hf_dataset:
        train_dataset, test_dataset = _prepare_dataset(
            context,
            hf_dataset,
            label_name,
            drop_columns,
            num_of_train_samples,
            train_test_split_size,
            random_state=random_state,
        )
    elif dataset:
        # Get DataFrame by URL or by FeatureVector:
        train_dataset, label_name = _get_dataframe(
            context=context,
            dataset=dataset,
            label_columns=label_name,
            drop_columns=drop_columns,
        )
        if test_set:
            test_dataset, _ = _get_dataframe(
                context=context,
                dataset=test_set,
                label_columns=label_name,
                drop_columns=drop_columns,
            )
        else:
            train_dataset, test_dataset = train_test_split(
                train_dataset,
                test_size=train_test_split_size,
                random_state=random_state,
            )
        train_dataset = Dataset.from_pandas(train_dataset)
        test_dataset = Dataset.from_pandas(test_dataset)
    else:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "Training data was not provided. A training dataset is mandatory for training."
            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
        )

    # Mapping datasets with the tokenizer:
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    # Creating data collator for batching:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Parsing kwargs:
    train_kwargs = _get_sub_dict_by_prefix(
        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
    )
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Loading our pretrained model:
    model_class_kwargs["pretrained_model_name_or_path"] = (
        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
    )
    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
    if not model_class_kwargs["pretrained_model_name_or_path"]:
        raise mlrun.errors.MLRunRuntimeError(
            "Must provide pretrained_model name as "
            "function argument or in extra params"
        )
    model = create_class(model_class).from_pretrained(**model_class_kwargs)

    # Preparing training arguments:
    training_args = TrainingArguments(
        **train_kwargs,
    )

    compute_metrics = _create_compute_metrics(metrics) if metrics else None
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    apply_mlrun(trainer, model_name=model_name)

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()


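# --- Illustrative sketch (not used by the handlers above) --------------------------------
# The `train` handler routes run parameters by prefix: keys starting with "TRAIN_" become
# `TrainingArguments` keyword arguments, and keys starting with "CLASS_" are forwarded to the
# model class's `from_pretrained`. The helpers below are a hypothetical, self-contained
# re-implementation of that convention for documentation purposes only; the real handler uses
# `_get_sub_dict_by_prefix` with `KWArgsPrefixes`, whose exact behavior may differ.
def _example_split_params_by_prefix(params: dict, prefix: str) -> dict:
    """Return the entries of `params` whose keys start with `prefix`, with the prefix stripped."""
    return {key[len(prefix):]: value for key, value in params.items() if key.startswith(prefix)}


def _example_parameter_routing():
    # Example run parameters as they might appear in `context.parameters` (illustrative values):
    run_params = {
        "TRAIN_output_dir": "finetuning-sentiment-model",
        "TRAIN_num_train_epochs": 3,
        "TRAIN_per_device_train_batch_size": 16,
        "CLASS_num_labels": 2,
    }
    train_kwargs = _example_split_params_by_prefix(run_params, "TRAIN_")  # -> TrainingArguments(**train_kwargs)
    model_kwargs = _example_split_params_by_prefix(run_params, "CLASS_")  # -> model_class.from_pretrained(..., **model_kwargs)
    return train_kwargs, model_kwargs

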
def _get_model_dir(model_uri: str):
    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
    model_dir = tempfile.gettempdir()
    # Unzip the Model:
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_dir)

    return model_dir


def optimize(
    model_path: str,
    model_name: str = "optimized_model",
    target_dir: str = "./optimized",
    optimization_level: int = 1,
):
    """
    Optimize the transformer model using ONNX Runtime graph optimization.

    :param model_path:          The path of the model to optimize.
    :param model_name:          The name of the optimized model.
    :param target_dir:          The directory in which to save the ONNX model.
    :param optimization_level:  The ONNX Runtime graph optimization level to apply to the loaded graph (default: 1).
    """
    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
    from optimum.onnxruntime.configuration import OptimizationConfig

    model_dir = _get_model_dir(model_uri=model_path)
    # Creating configuration for optimization step:
    optimization_config = OptimizationConfig(optimization_level=optimization_level)

    # Converting our pretrained model to an ONNX-Runtime model:
    ort_model = ORTModelForSequenceClassification.from_pretrained(
        model_dir, from_transformers=True
    )

    # Creating an ONNX-Runtime optimizer from ONNX model:
    optimizer = ORTOptimizer.from_pretrained(ort_model)

    apply_mlrun(optimizer, model_name=model_name)
    # Optimizing and saving the ONNX model:
    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
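

# --- Illustrative sketch (not part of the handlers above) --------------------------------
# One possible way to load the ONNX model written by `optimize` for local inference, assuming
# the default `target_dir` ("./optimized") and that the tokenizer name matching the trained
# model is known (both are assumptions for this example, not values produced by the handler).
# Imports are kept inside the function, as in `optimize`, so ONNX stays optional.
def _example_onnx_inference(
    optimized_dir: str = "./optimized", tokenizer_name: str = "distilbert-base-uncased"
):
    from optimum.onnxruntime import ORTModelForSequenceClassification
    from transformers import AutoTokenizer, pipeline

    # Load the optimized ONNX model and run it through a regular transformers pipeline:
    ort_model = ORTModelForSequenceClassification.from_pretrained(optimized_dir)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    classifier = pipeline("text-classification", model=ort_model, tokenizer=tokenizer)
    return classifier("The flight was on time and the crew was great!")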
 + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - onnx~=1.14.1 + - onnxruntime~=1.16.1 + - optimum~=1.6.4 + - transformers~=4.26.1 + - datasets~=2.10.1 + - scikit-learn~=1.0.2 + entry_points: + add_interface: + name: add_interface + doc: 'Enrich the object with this interface properties, methods and functions, + so it will have this TensorFlow.Keras + + MLRuns features.' + parameters: + - name: cls + - name: obj + type: Trainer + doc: The object to enrich his interface. + - name: restoration + type: MLRunInterfaceRestorationType + doc: Restoration information tuple as returned from 'remove_interface' in + order to add the interface in a certain state. + default: null + outputs: [] + lineno: 146 + has_varargs: false + has_kwargs: false + mlrun_optimize: + name: mlrun_optimize + doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when + using horovod. The optimizer must be + + passed in a keyword argument and when using horovod, it must be passed as + an Optimizer instance, not a string. + + + raise MLRunInvalidArgumentError: In case the optimizer provided did not follow + the instructions above.' + parameters: + - name: cls + outputs: [] + lineno: 79 + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + parameters: + - name: self + type: Trainer + outputs: [] + lineno: 173 + has_varargs: true + has_kwargs: true + enable_auto_logging: + name: enable_auto_logging + doc: '' + parameters: + - name: self + - name: context + type: MLClientCtx + - name: model_name + type: str + default: model + - name: tag + type: str + default: '' + - name: labels + type: Dict[str, str] + default: null + - name: extra_data + type: dict + default: null + outputs: [] + lineno: 114 + has_varargs: false + has_kwargs: false + mlrun_train: + name: mlrun_train + doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using + horovod. The optimizer must be + + passed in a keyword argument and when using horovod, it must be passed as + an Optimizer instance, not a string. + + + raise MLRunInvalidArgumentError: In case the optimizer provided did not follow + the instructions above.' 
+ parameters: + - name: cls + outputs: [] + lineno: 164 + has_varargs: false + has_kwargs: false + on_epoch_begin: + name: on_epoch_begin + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 220 + has_varargs: false + has_kwargs: true + on_epoch_end: + name: on_epoch_end + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 229 + has_varargs: false + has_kwargs: true + on_log: + name: on_log + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + - name: logs + type: Dict[str, float] + default: null + outputs: [] + lineno: 238 + has_varargs: false + has_kwargs: true + on_train_begin: + name: on_train_begin + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 262 + has_varargs: false + has_kwargs: true + on_train_end: + name: on_train_end + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + - name: model + type: PreTrainedModel + default: null + - name: tokenizer + type: PreTrainedTokenizer + default: null + outputs: [] + lineno: 271 + has_varargs: false + has_kwargs: true + on_evaluate: + name: on_evaluate + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 322 + has_varargs: false + has_kwargs: true + apply_mlrun: + name: apply_mlrun + doc: Wrap the given model with MLRun's interface providing it with mlrun's additional + features. + parameters: + - name: huggingface_object + doc: The model to wrap. Can be loaded from the model path given as well. + - name: model_name + type: str + doc: 'The model name to use for storing the model artifact. Default: "model".' + default: null + - name: tag + type: str + doc: The model's tag to log with. + default: '' + - name: context + type: MLClientCtx + doc: MLRun context to work with. If no context is given it will be retrieved + via 'mlrun.get_or_create_ctx(None)' + default: null + - name: auto_log + type: bool + doc: 'Whether to enable MLRun''s auto logging. Default: True.' + default: true + - name: labels + type: Dict[str, str] + default: null + - name: extra_data + type: dict + default: null + outputs: [] + lineno: 421 + has_varargs: false + has_kwargs: true + train: + name: train + doc: 'Training and evaluating a pretrained model with a pretrained tokenizer + over a dataset. + + The dataset can be either be the name of the dataset that contains in the + HuggingFace hub, + + or a URI or a FeatureVector' + parameters: + - name: context + type: MLClientCtx + doc: MLRun context + - name: hf_dataset + type: str + doc: The name of the dataset to get from the HuggingFace hub + default: null + - name: dataset + type: DataItem + doc: The dataset to train the model on. Can be either a URI or a FeatureVector + default: null + - name: test_set + type: DataItem + doc: The test set to train the model with. + default: null + - name: drop_columns + type: Optional[List[str]] + doc: The columns to drop from the dataset. 
+ default: null + - name: pretrained_tokenizer + type: str + doc: The name of the pretrained tokenizer from the HuggingFace hub. + default: null + - name: pretrained_model + type: str + doc: The name of the pretrained model from the HuggingFace hub. + default: null + - name: model_class + type: str + doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` + default: null + - name: model_name + type: str + doc: The model's name to use for storing the model artifact, default to 'model' + default: huggingface-model + - name: label_name + type: str + doc: The target label of the column in the dataset. + default: labels + - name: text_col + type: str + doc: The input text column un the dataset. + default: text + - name: num_of_train_samples + type: int + doc: Max number of training samples, for debugging. + default: null + - name: train_test_split_size + type: float + doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset + to include in the test split. + default: null + - name: metrics + type: List[str] + doc: List of different metrics for evaluate the model such as f1, accuracy + etc. + default: null + - name: random_state + type: int + doc: Random state for train_test_split + default: null + outputs: [] + lineno: 647 + has_varargs: false + has_kwargs: false + preprocess_function: + name: preprocess_function + doc: '' + parameters: + - name: examples + outputs: [] + lineno: 696 + has_varargs: false + has_kwargs: false + optimize: + name: optimize + doc: Optimizing the transformer model using ONNX optimization. + parameters: + - name: model_path + type: str + doc: The path of the model to optimize. + - name: model_name + type: str + doc: Name of the optimized model. + default: optimized_model + - name: target_dir + type: str + doc: The directory to save the ONNX model. + default: ./optimized + - name: optimization_level + type: int + doc: Optimization level performed by ONNX Runtime of the loaded graph. (default + is 1) + default: 1 + outputs: [] + lineno: 799 + has_varargs: false + has_kwargs: false + description: Automatic train and optimize functions for HuggingFace framework + default_handler: train + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/src/hugging_face_classifier_trainer.ipynb b/functions/master/hugging_face_classifier_trainer/0.3.0/src/hugging_face_classifier_trainer.ipynb new file mode 100644 index 00000000..2768d2dc --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/src/hugging_face_classifier_trainer.ipynb @@ -0,0 +1,2533 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "# MLRun Hugging Face Classifier Trainer Tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This notebook shows how to use the handlers of the Hugging Face classifier trainer.\n", + "the following handlers are:\n", + "- `train`\n", + "- `optimize`\n", + "\n", + "All you need is simply **HF model type** and a **HF dataset name**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: onnx~=1.14.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (1.14.1)\n", + "Requirement already satisfied: onnxruntime==1.16.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.16.1)\n", + "Requirement already satisfied: optimum~=1.6.4 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.6.4)\n", + "Requirement already satisfied: transformers~=4.26.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (4.26.1)\n", + "Requirement already satisfied: datasets~=2.10.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (2.10.1)\n", + "Requirement already satisfied: scikit-learn~=1.0.2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.0.2)\n", + "Requirement already satisfied: coloredlogs in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (15.0.1)\n", + "Requirement already satisfied: flatbuffers in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)\n", + "Requirement already satisfied: numpy>=1.21.6 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.23.5)\n", + "Requirement already satisfied: packaging in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (21.3)\n", + "Requirement already satisfied: protobuf in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.20.2)\n", + "Requirement already satisfied: sympy in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)\n", + "Requirement already satisfied: typing-extensions>=3.6.2.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnx~=1.14.1->-r requirements.txt (line 1)) (4.7.1)\n", + "Requirement already satisfied: torch>=1.9 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.2)\n", + "Requirement already satisfied: huggingface-hub>=0.8.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (0.20.1)\n", + "Requirement already satisfied: filelock in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (3.13.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (5.4.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2023.12.25)\n", + "Requirement already satisfied: requests in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2.31.0)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages 
(from transformers~=4.26.1->-r requirements.txt (line 4)) (0.13.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (4.65.0)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (11.0.0)\n", + "Requirement already satisfied: dill<0.3.7,>=0.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.3.6)\n", + "Requirement already satisfied: pandas in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.4)\n", + "Requirement already satisfied: xxhash in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.3.0)\n", + "Requirement already satisfied: multiprocess in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.70.14)\n", + "Requirement already satisfied: fsspec>=2021.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from fsspec[http]>=2021.11.1->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.9.2)\n", + "Requirement already satisfied: aiohttp in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.9.1)\n", + "Requirement already satisfied: responses<0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.18.0)\n", + "Requirement already satisfied: scipy>=1.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.11.4)\n", + "Requirement already satisfied: joblib>=0.11 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (3.2.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (19.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (4.0.3)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from packaging->onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.1.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in 
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (1.26.16)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2023.7.22)\n", + "Requirement already satisfied: networkx in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.1.3)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", + "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (8.9.2.26)\n", + "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.3.1)\n", + "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.0.2.54)\n", + "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (10.3.2.106)\n", + "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.4.5.107)\n", + "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.0.106)\n", + "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.18.1)\n", + "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", + "Requirement already satisfied: triton==2.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.0)\n", + "Requirement already satisfied: nvidia-nvjitlink-cu12 in 
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.3.101)\n", + "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers[sentencepiece]>=4.26.0->optimum~=1.6.4->-r requirements.txt (line 3)) (0.2.0)\n", + "Requirement already satisfied: humanfriendly>=9.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from coloredlogs->onnxruntime==1.16.1->-r requirements.txt (line 2)) (9.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.3.post1)\n", + "Requirement already satisfied: mpmath>=0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from sympy->onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.3.0)\n", + "Requirement already satisfied: six>=1.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from jinja2->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.3)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:10:17,091 [info] Project loaded successfully: {'project_name': 'hugging-face-trainer'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project('hugging-face-trainer', context=\"./\", user_project=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### **Importing the hugging_face_classifier_trainer function from the Marketplace**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "hugging_face_classifier_trainer = mlrun.import_function(\"hub://hugging_face_classifier_trainer\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### **Training a model**\n", + "\n", + "Choosing the `train` handler" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Define task parameters¶\n", + "* Class parameters should contain the prefix `CLASS_`\n", + "* Train parameters should contain the prefix `TRAIN_`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "model_class = \"transformers.AutoModelForSequenceClassification\"\n", + "additional_parameters = {\n", + " \"TRAIN_output_dir\": \"finetuning-sentiment-model-3000-samples\",\n", + " \"TRAIN_learning_rate\": 2e-5,\n", + " 
\"TRAIN_per_device_train_batch_size\": 16,\n", + " \"TRAIN_per_device_eval_batch_size\": 16,\n", + " \"TRAIN_num_train_epochs\": 3,\n", + " \"TRAIN_weight_decay\": 0.01,\n", + " \"TRAIN_push_to_hub\": False,\n", + " \"TRAIN_evaluation_strategy\": \"epoch\",\n", + " \"TRAIN_eval_steps\": 1,\n", + " \"TRAIN_logging_steps\": 1,\n", + " \"CLASS_num_labels\": 2\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Running the Training job with the \"train\" handler" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:10:21,025 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '514d8d5530c842238b1cc81983cd943e', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-03-24 17:11:03,727 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2\n", + "> 2024-03-24 17:11:03,882 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f43b1388d0b344888323bec590baadee", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00 2024-03-24 17:11:08,938 [info] training 'huggingface-model'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", + "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + "***** Running training *****\n", + " Num examples = 100\n", + " Num Epochs = 3\n", + " Instantaneous batch size per device = 16\n", + " Total train batch size (w. parallel, distributed & accumulation) = 16\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 21\n", + " Number of trainable parameters = 66955010\n", + "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [21/21 00:15, Epoch 3/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation LossAccuracyF1
10.7389000.5153110.7916670.000000
20.5259000.4815630.7916670.000000
30.4908000.4716750.7916670.000000

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 24\n", + " Batch size = 16\n", + "/tmp/tmp0c1aawrq.py:561: FutureWarning:\n", + "\n", + "load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", + "\n", + "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 24\n", + " Batch size = 16\n", + "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 24\n", + " Batch size = 16\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json\n", + "Special tokens file saved in /tmp/tokenizer/special_tokens_map.json\n", + "Configuration saved in /tmp/model/config.json\n", + "Model weights saved in /tmp/model/pytorch_model.bin\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:10:21completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.4908
learning_rate=0.0
eval_loss=0.47167453169822693
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=0.5186
eval_samples_per_second=46.276
eval_steps_per_second=3.856
train_runtime=17.6054
train_samples_per_second=17.04
train_steps_per_second=1.193
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:12:01,880 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" + ] + } + ], + "source": [ + "train_run = hugging_face_classifier_trainer.run(params={\n", + " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", + " \"drop_columns\": [\n", + " \"airline_sentiment_confidence\",\n", + " \"negativereason_confidence\",\n", + " ],\n", + " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", + " \"pretrained_model\": \"distilbert-base-uncased\",\n", + " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", + " \"label_name\": \"airline_sentiment\",\n", + " \"num_of_train_samples\": 100,\n", + " \"metrics\": [\"accuracy\", \"f1\"],\n", + " \"random_state\": 42,\n", + " **additional_parameters\n", + " },\n", + " handler=\"train\",\n", + " local=True,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### The result of the train run" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'loss': 0.4908,\n", + " 'learning_rate': 0.0,\n", + " 'eval_loss': 0.47167453169822693,\n", + " 'eval_accuracy': 0.7916666666666666,\n", + " 'eval_f1': 0.0,\n", + " 'eval_runtime': 0.5186,\n", + " 'eval_samples_per_second': 46.276,\n", + " 'eval_steps_per_second': 3.856,\n", + " 'train_runtime': 17.6054,\n", + " 'train_samples_per_second': 17.04,\n", + " 'train_steps_per_second': 1.193,\n", + " 'total_flos': 3327208489680.0,\n", + " 'loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',\n", + " 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',\n", + " 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',\n", + " 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',\n", + " 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',\n", + " 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',\n", + " 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',\n", + " 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',\n", + " 'tokenizer': 'store://artifacts/hugging-face-trainer-avia/hugging-face-classifier-trainer-train_tokenizer@514d8d5530c842238b1cc81983cd943e',\n", + " 'model': 'store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e'}" + ] + }, + 
"execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_run.outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train_run.artifact('loss_plot').show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Getting the model for evaluating and predicting" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "model_path = train_run.outputs['model']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### **Optimize the model**\n", + "\n", + "Choosing the `optimize` handler\n", + "\n", + "The result of using this handled is an onnx optimized model." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:12:02,020 [info] Storing function: {'name': 'hugging-face-classifier-trainer-optimize', 'uid': 'fbee1ead18444824a4b5c0308a677bf4', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/optimum/onnxruntime/configuration.py:726: FutureWarning:\n", + "\n", + "disable_embed_layer_norm will be deprecated soon, use disable_embed_layer_norm_fusion instead, disable_embed_layer_norm_fusion is set to True.\n", + "\n", + "loading configuration file /tmp/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/config.json\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading configuration file /tmp/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading weights file /tmp/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing DistilBertForSequenceClassification.\n", + "\n", + "All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint 
at /tmp.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.\n", + "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py:218: TracerWarning:\n", + "\n", + "torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + "\n", + "Configuration saved in /tmp/tmp79wjp8m8/config.json\n", + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file /tmp/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading configuration file /tmp/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file /tmp/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 
30522\n", + "}\n", + "\n", + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file /tmp/tmp79wjp8m8/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading configuration file /tmp/tmp79wjp8m8/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file /tmp/tmp79wjp8m8/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "Configuration saved in optimized/config.json\n", + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file /tmp/tmp79wjp8m8/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " 
\"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading configuration file /tmp/tmp79wjp8m8/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file /tmp/tmp79wjp8m8/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.26.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "Failed to remove node input: \"/distilbert/transformer/layer.0/attention/Transpose_output_0\"\n", + "input: \"/distilbert/transformer/layer.0/attention/Constant_11_output_0\"\n", + "output: \"/distilbert/transformer/layer.0/attention/Div_output_0\"\n", + "name: \"/distilbert/transformer/layer.0/attention/Div\"\n", + "op_type: \"Div\"\n", + "\n", + "Failed to remove node input: \"/distilbert/transformer/layer.1/attention/Transpose_output_0\"\n", + "input: \"/distilbert/transformer/layer.1/attention/Constant_11_output_0\"\n", + "output: \"/distilbert/transformer/layer.1/attention/Div_output_0\"\n", + "name: \"/distilbert/transformer/layer.1/attention/Div\"\n", + "op_type: \"Div\"\n", + "\n", + "Failed to remove node input: \"/distilbert/transformer/layer.2/attention/Transpose_output_0\"\n", + "input: \"/distilbert/transformer/layer.2/attention/Constant_11_output_0\"\n", + "output: \"/distilbert/transformer/layer.2/attention/Div_output_0\"\n", + "name: \"/distilbert/transformer/layer.2/attention/Div\"\n", + "op_type: \"Div\"\n", + "\n", + "Failed to remove node input: 
\"/distilbert/transformer/layer.3/attention/Transpose_output_0\"\n", + "input: \"/distilbert/transformer/layer.3/attention/Constant_11_output_0\"\n", + "output: \"/distilbert/transformer/layer.3/attention/Div_output_0\"\n", + "name: \"/distilbert/transformer/layer.3/attention/Div\"\n", + "op_type: \"Div\"\n", + "\n", + "Failed to remove node input: \"/distilbert/transformer/layer.4/attention/Transpose_output_0\"\n", + "input: \"/distilbert/transformer/layer.4/attention/Constant_11_output_0\"\n", + "output: \"/distilbert/transformer/layer.4/attention/Div_output_0\"\n", + "name: \"/distilbert/transformer/layer.4/attention/Div\"\n", + "op_type: \"Div\"\n", + "\n", + "Failed to remove node input: \"/distilbert/transformer/layer.5/attention/Transpose_output_0\"\n", + "input: \"/distilbert/transformer/layer.5/attention/Constant_11_output_0\"\n", + "output: \"/distilbert/transformer/layer.5/attention/Div_output_0\"\n", + "name: \"/distilbert/transformer/layer.5/attention/Div\"\n", + "op_type: \"Div\"\n", + "\n", + "Configuration saved in optimized/config.json\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:12:02completedhugging-face-classifier-trainer-optimize
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
model_path=store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:12:22,721 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-optimize'}\n" + ] + } + ], + "source": [ + "optimize_run = hugging_face_classifier_trainer.run(params={\n", + " \"model_path\": str(model_path)\n", + " },\n", + " handler=\"optimize\",\n", + " local=True,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'model': 'store://artifacts/hugging-face-trainer-avia/optimized_model@fbee1ead18444824a4b5c0308a677bf4'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimize_run.outputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### **Running the training remotely**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlrun/projects/operations.py:276: OverwriteBuildParamsWarning:\n", + "\n", + "The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:14:22,792 [info] Started building image: .mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest\n", + "\u001b[36mINFO\u001b[0m[0000] Retrieving image manifest mlrun/mlrun:1.6.1 \n", + "\u001b[36mINFO\u001b[0m[0000] Retrieving image mlrun/mlrun:1.6.1 from registry index.docker.io \n", + "\u001b[36mINFO\u001b[0m[0000] Built cross stage deps: map[] \n", + "\u001b[36mINFO\u001b[0m[0000] Retrieving image manifest mlrun/mlrun:1.6.1 \n", + "\u001b[36mINFO\u001b[0m[0000] Returning cached image manifest \n", + "\u001b[36mINFO\u001b[0m[0000] Executing 0 build triggers \n", + "\u001b[36mINFO\u001b[0m[0000] Building stage 'mlrun/mlrun:1.6.1' [idx: '0', base-idx: '-1'] \n", + "\u001b[36mINFO\u001b[0m[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. \n", + "\u001b[36mINFO\u001b[0m[0047] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt \n", + "\u001b[36mINFO\u001b[0m[0047] Initializing snapshotter ... \n", + "\u001b[36mINFO\u001b[0m[0047] Taking snapshot of full filesystem... \n", + "\u001b[36mINFO\u001b[0m[0074] Cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0074] Args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] \n", + "\u001b[36mINFO\u001b[0m[0074] Running: [/bin/sh -c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] \n", + "Installing /empty/requirements.txt...\n", + "mlrun[complete]==1.6.1\n", + "onnx~=1.14.1\n", + "onnxruntime~=1.16.1\n", + "optimum~=1.6.4\n", + "transformers~=4.26.1\n", + "datasets~=2.10.1\n", + "scikit-learn~=1.0.2\n", + "\u001b[36mINFO\u001b[0m[0074] Taking snapshot of full filesystem... 
\n", + "\u001b[36mINFO\u001b[0m[0078] No files were changed, appending empty layer to config. No layer added to image. \n", + "\u001b[36mINFO\u001b[0m[0078] RUN python -m pip install -r /empty/requirements.txt \n", + "\u001b[36mINFO\u001b[0m[0078] Cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0078] Args: [-c python -m pip install -r /empty/requirements.txt] \n", + "\u001b[36mINFO\u001b[0m[0078] Running: [/bin/sh -c python -m pip install -r /empty/requirements.txt] \n", + "Requirement already satisfied: mlrun[complete]==1.6.1 in /opt/conda/lib/python3.9/site-packages (from -r /empty/requirements.txt (line 1)) (1.6.1)\n", + "Collecting onnx~=1.14.1 (from -r /empty/requirements.txt (line 2))\n", + " Obtaining dependency information for onnx~=1.14.1 from https://files.pythonhosted.org/packages/ff/24/0e522fdcadf0e15fc304145a5b6e5d7246d7f2c507fd9bfe6e1fafb2aa95/onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n", + "Collecting onnxruntime~=1.16.1 (from -r /empty/requirements.txt (line 3))\n", + " Obtaining dependency information for onnxruntime~=1.16.1 from https://files.pythonhosted.org/packages/de/ab/ed3ae0d649cee41e870f8b1653cf4a1c1fc321e0ded4e3e1a3d4a25c0131/onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n", + "Collecting optimum~=1.6.4 (from -r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for optimum~=1.6.4 from https://files.pythonhosted.org/packages/31/72/a7e3b2c57d6368c5f4bb6fba54a85cbf07d25c385a2db3f1a638f3c0ddb2/optimum-1.6.4-py3-none-any.whl.metadata\n", + " Downloading optimum-1.6.4-py3-none-any.whl.metadata (17 kB)\n", + "Collecting transformers~=4.26.1 (from -r /empty/requirements.txt (line 5))\n", + " Obtaining dependency information for transformers~=4.26.1 from https://files.pythonhosted.org/packages/1e/e2/60c3f4691b16d126ee9cfe28f598b13c424b60350ab339aba81aef054b8f/transformers-4.26.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.26.1-py3-none-any.whl.metadata (100 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.3/100.3 kB 6.2 MB/s eta 0:00:00\n", + "Collecting datasets~=2.10.1 (from -r /empty/requirements.txt (line 6))\n", + " Obtaining dependency information for datasets~=2.10.1 from https://files.pythonhosted.org/packages/fe/17/5825fdf034ff1a315becdbb9b6fe5a2bd9d8e724464535f18809593bf9c2/datasets-2.10.1-py3-none-any.whl.metadata\n", + " Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)\n", + "Collecting scikit-learn~=1.0.2 (from -r /empty/requirements.txt (line 7))\n", + " Obtaining dependency information for scikit-learn~=1.0.2 from https://files.pythonhosted.org/packages/57/aa/483fbe6b5314bce2d49801e6cec1f2139a9c220d0d51494788fff47233b3/scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n", + "Requirement already satisfied: urllib3<1.27,>=1.26.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.18)\n", + "Requirement already satisfied: GitPython>=3.1.41,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.42)\n", + "Requirement already satisfied: 
aiohttp~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.3)\n", + "Requirement already satisfied: aiohttp-retry~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.3)\n", + "Requirement already satisfied: click~=8.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.1.7)\n", + "Requirement already satisfied: kfp~=1.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.22)\n", + "Requirement already satisfied: nest-asyncio~=1.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.0)\n", + "Requirement already satisfied: ipython~=8.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.18.1)\n", + "Requirement already satisfied: nuclio-jupyter~=0.9.15 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.16)\n", + "Requirement already satisfied: numpy<1.27.0,>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.4)\n", + "Requirement already satisfied: pandas<2.2,>=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.4)\n", + "Requirement already satisfied: pyarrow<15,>=10.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (14.0.2)\n", + "Requirement already satisfied: pyyaml~=5.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.4.1)\n", + "Requirement already satisfied: requests~=2.31 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.31.0)\n", + "Requirement already satisfied: tabulate~=0.8.6 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.10)\n", + "Requirement already satisfied: v3io~=0.5.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.23)\n", + "Requirement already satisfied: pydantic>=1.10.8,~=1.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.10.14)\n", + "Requirement already satisfied: mergedeep~=1.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.4)\n", + "Requirement already satisfied: v3io-frames~=0.10.12 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.13)\n", + "Requirement already satisfied: semver~=3.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", + "Requirement already satisfied: dependency-injector~=4.41 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.41.0)\n", + "Requirement already satisfied: fsspec==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", + "Requirement already satisfied: v3iofs~=0.1.17 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1)) (0.1.18)\n", + "Requirement already satisfied: storey~=1.6.18 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.18)\n", + "Requirement already satisfied: inflection~=0.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", + "Requirement already satisfied: python-dotenv~=0.17.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.17.1)\n", + "Requirement already satisfied: setuptools~=68.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (68.2.2)\n", + "Requirement already satisfied: deprecated~=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.14)\n", + "Requirement already satisfied: jinja2>=3.1.3,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.3)\n", + "Requirement already satisfied: anyio~=3.7 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.7.1)\n", + "Requirement already satisfied: orjson~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.15)\n", + "Requirement already satisfied: adlfs==2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.0)\n", + "Requirement already satisfied: aiobotocore<2.8,>=2.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5.4)\n", + "Requirement already satisfied: avro~=1.11 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.11.3)\n", + "Requirement already satisfied: azure-core~=1.24 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.30.0)\n", + "Requirement already satisfied: azure-identity~=1.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.15.0)\n", + "Requirement already satisfied: azure-keyvault-secrets~=4.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.8.0)\n", + "Requirement already satisfied: boto3<1.29.0,>=1.28.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.28.17)\n", + "Requirement already satisfied: dask~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)\n", + "Requirement already satisfied: databricks-sdk~=0.13.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.13.0)\n", + "Requirement already satisfied: distributed~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)\n", + "Requirement already satisfied: gcsfs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", + "Requirement already satisfied: google-cloud-bigquery[bqstorage,pandas]==3.14.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) 
(3.14.1)\n", + "Requirement already satisfied: graphviz~=0.20.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.1)\n", + "Requirement already satisfied: kafka-python~=2.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.2)\n", + "Requirement already satisfied: mlflow~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.10.2)\n", + "Requirement already satisfied: msrest~=0.6.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.21)\n", + "Requirement already satisfied: plotly<5.12.0,~=5.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.11.0)\n", + "Requirement already satisfied: pyopenssl>=23 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.0)\n", + "Requirement already satisfied: redis~=4.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.6.0)\n", + "Requirement already satisfied: s3fs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", + "Requirement already satisfied: sqlalchemy~=1.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.51)\n", + "Requirement already satisfied: azure-datalake-store<0.1,>=0.0.46 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.0.53)\n", + "Requirement already satisfied: azure-storage-blob>=12.12.0 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (12.19.0)\n", + "Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.1.1)\n", + "Requirement already satisfied: google-auth>=1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.28.1)\n", + "Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", + "Requirement already satisfied: google-cloud-storage in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.14.0)\n", + "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.1)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.7.0)\n", + "Requirement already satisfied: packaging>=20.0.0 in 
/opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1)\n", + "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)\n", + "Requirement already satisfied: db-dtypes<2.0.0dev,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", + "Requirement already satisfied: google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.24.0)\n", + "Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)\n", + "Requirement already satisfied: protobuf>=3.20.2 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (3.20.3)\n", + "Requirement already satisfied: typing-extensions>=3.6.2.1 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (4.10.0)\n", + "Collecting coloredlogs (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", + " Obtaining dependency information for coloredlogs from https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata\n", + " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n", + "Collecting flatbuffers (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", + " Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/bf/45/c961e3cb6ddad76b325c163d730562bb6deb1ace5acbed0306f5fbefb90e/flatbuffers-24.3.7-py2.py3-none-any.whl.metadata\n", + " Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)\n", + "Collecting sympy (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", + " Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/d2/05/e6600db80270777c4a64238a98d442f0fd07cc8915be2a1c16da7f2b9e74/sympy-1.12-py3-none-any.whl.metadata\n", + " Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)\n", + "Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0a/fd/280f4385e76f3c1890efc15fa93f7206134fefad6351397e1bfab6d0d0de/transformers-4.39.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 40.1 MB/s eta 0:00:00\n", + "Collecting torch>=1.9 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for torch>=1.9 from https://files.pythonhosted.org/packages/98/04/95a12556d068786d6505c609daf2805bed91c9210c5185499a7c121eba47/torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata\n", + " Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata (25 kB)\n", + "Collecting numpy<1.27.0,>=1.16.5 (from mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1))\n", + " Obtaining dependency information for numpy<1.27.0,>=1.16.5 from https://files.pythonhosted.org/packages/4c/b9/038abd6fbd67b05b03cb1af590cfc02b7f1e5a37af7ac6a868f5093c29f5/numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)\n", + "Collecting huggingface-hub>=0.8.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for huggingface-hub>=0.8.0 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata\n", + " Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)\n", + "Collecting filelock (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", + " Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata\n", + " Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)\n", + "Collecting regex!=2019.12.17 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", + " Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/05/9e/80c20f1151432a6025690c9c2037053039b028a7b236fa81d7e7ac9dec60/regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 217.5 MB/s eta 0:00:00\n", + "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", + " Obtaining dependency information for tokenizers!=0.11.3,<0.14,>=0.11.1 from https://files.pythonhosted.org/packages/d6/27/07a337087dd507170a1b20fed3bbf8da81401185a7130a6e74e440c52040/tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.9/site-packages (from transformers~=4.26.1->-r /empty/requirements.txt (line 5)) (4.65.0)\n", + "Collecting dill<0.3.7,>=0.3.0 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", + " Obtaining dependency information for dill<0.3.7,>=0.3.0 from https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl.metadata\n", + " Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)\n", + "Requirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets~=2.10.1->-r /empty/requirements.txt (line 6)) (3.4.1)\n", + "Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", + " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl.metadata\n", + " Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)\n", + "Collecting responses<0.19 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", + " Obtaining dependency information for responses<0.19 from 
https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl.metadata\n", + " Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)\n", + "Requirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.12.0)\n", + "Requirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (3.3.0)\n", + "Requirement already satisfied: botocore<1.31.18,>=1.31.17 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.31.17)\n", + "Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", + "Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.11.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.3)\n", + "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.4)\n", + "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", + "Requirement already satisfied: exceptiongroup in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", + "Requirement already satisfied: six>=1.11.0 in /opt/conda/lib/python3.9/site-packages (from azure-core~=1.24->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", + "Requirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (42.0.2)\n", + "Requirement already satisfied: msal<2.0.0,>=1.24.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1)) (1.27.0)\n", + "Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.1.0)\n", + "Requirement already satisfied: isodate>=0.6.1 in /opt/conda/lib/python3.9/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.1)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.2)\n", + "Requirement already satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.2.1)\n", + "Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)\n", + "Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.0)\n", + "Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.1)\n", + "Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", + "Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.7)\n", + "Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.8)\n", + "Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)\n", + "Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)\n", + "Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.4)\n", + "Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.9/site-packages (from GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.11)\n", + "Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.19.1)\n", + "Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.6)\n", + "Requirement already 
satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.43)\n", + "Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.2)\n", + "Requirement already satisfied: stack-data in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.3)\n", + "Requirement already satisfied: traitlets>=5 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.14.1)\n", + "Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from jinja2>=3.1.3,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.5)\n", + "Requirement already satisfied: absl-py<2,>=0.9 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.0)\n", + "Requirement already satisfied: kubernetes<26,>=8.0.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (25.3.0)\n", + "Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.12.11)\n", + "Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.1)\n", + "Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.5)\n", + "Requirement already satisfied: jsonschema<5,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.21.1)\n", + "Requirement already satisfied: strip-hints<1,>=0.1.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.10)\n", + "Requirement already satisfied: docstring-parser<1,>=0.7.3 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.15)\n", + "Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.16 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.16)\n", + "Requirement already satisfied: fire<1,>=0.3.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.0)\n", + "Requirement already satisfied: uritemplate<4,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)\n", + "Requirement already satisfied: typer<1.0,>=0.3.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", + "Requirement already satisfied: entrypoints<1 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) 
(0.4)\n", + "Requirement already satisfied: pytz<2024 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.4)\n", + "Requirement already satisfied: sqlparse<1,>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4.4)\n", + "Requirement already satisfied: alembic!=1.10.0,<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13.1)\n", + "Requirement already satisfied: docker<8,>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.0)\n", + "Requirement already satisfied: Flask<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", + "Requirement already satisfied: querystring-parser<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.4)\n", + "Requirement already satisfied: markdown<4,>=3.3 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.5.2)\n", + "Requirement already satisfied: matplotlib<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.8.3)\n", + "Requirement already satisfied: gunicorn<22 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)\n", + "Requirement already satisfied: requests-oauthlib>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.2.2)\n", + "Requirement already satisfied: nbconvert>=6.4.5 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.16.1)\n", + "Requirement already satisfied: notebook<7.0.0,>=6.4 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.5.6)\n", + "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.9/site-packages (from pandas<2.2,>=1.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.1)\n", + "Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly<5.12.0,~=5.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.2.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.9/site-packages (from requests~=2.31->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.4)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.9/site-packages (from sqlalchemy~=1.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.3)\n", + "Requirement already satisfied: nuclio-sdk>=0.5.3 in /opt/conda/lib/python3.9/site-packages (from storey~=1.6.18->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.9)\n", + "Collecting networkx (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for networkx from 
https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl.metadata\n", + " Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cuda-nvrtc-cu12==12.1.105 from https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cuda-runtime-cu12==12.1.105 from https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cuda-cupti-cu12==12.1.105 from https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cudnn-cu12==8.9.2.26 from https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cublas-cu12==12.1.3.1 from https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cufft-cu12==11.0.2.54 from https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-curand-cu12==10.3.2.106 from https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata 
(1.5 kB)\n", + "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cusolver-cu12==11.4.5.107 from https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-cusparse-cu12==12.1.0.106 from https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-nccl-cu12==2.19.3 from https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)\n", + "Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-nvtx-cu12==12.1.105 from https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", + " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n", + "Collecting triton==2.2.0 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for triton==2.2.0 from https://files.pythonhosted.org/packages/6a/5c/01d9f062f719581cf6e60053e1a005d666ec67dcb59630fffaa3a3e5c9d8/triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)\n", + "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for nvidia-nvjitlink-cu12 from https://files.pythonhosted.org/packages/58/d1/d1c80553f9d5d07b6072bc132607d75a0ef3600e28e1890e11c0f55d7346/nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata\n", + " Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", + "INFO: pip is looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. 
This could take a while.\n", + "Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/a4/73/f620d76193954e16db3d5c53a07d956d7b9c800e570758d3bff91906d4a4/transformers-4.39.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 115.9 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 126.3 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3e/6b/1b589f7b69aaea8193cf5bc91cf97410284aecd97b6312cdb08baedbdffe/transformers-4.38.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 138.2 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/91/89/5416dc364c7ef0711c564fd61a69b03d1e40eeb5c506c38e53ba8a969e79/transformers-4.38.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 186.3 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/85/f6/c5065913119c41ecad148c34e3a861f719e16b89a522287213698da911fc/transformers-4.37.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 236.8 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ad/67/b4d6a51dcaf988cb45b31e26c6e33fb169fe34ba5fb168b086309bd7c028/transformers-4.37.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 156.4 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3c/45/52133ce6bce49a099cc865599803bf1fad93de887276f728e56848d77a70/transformers-4.37.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 102.0 MB/s eta 0:00:00\n", + "INFO: pip is still looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. 
This could take a while.\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 108.8 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/fc/04/0aad491cd98b09236c54ab849863ee85421eeda5138bbf9d33ecc594652b/transformers-4.36.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 140.6 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0f/12/d8e27a190ca67811f81deea3183b528d9169f10b74d827e0b9211520ecfa/transformers-4.36.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 267.8 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 130.2 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/92/ba/cfff7e01f7070d9fca3964bf42b2257b86964c3e6763b8d5435436cc1d77/transformers-4.35.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.35.1-py3-none-any.whl.metadata (123 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 183.6 MB/s eta 0:00:00\n", + "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. 
If you want to abort this run, press Ctrl + C.\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 177.3 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 270.5 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 133.4 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata\n", + " Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 163.1 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.9 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/13/30/54b59e73400df3de506ad8630284e9fd63f4b94f735423d55fc342181037/transformers-4.33.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.2 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 185.9 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.32.1-py3-none-any.whl.metadata (118 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 144.4 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)\n", + " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 150.3 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.9/116.9 kB 156.7 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/5b/0b/e45d26ccd28568013523e04f325432ea88a442b4e3020b757cf4361f0120/transformers-4.30.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.7 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b8/df/b01b5e67cde3883757c9212455cbb9169385dcab5858b7172199126b756d/transformers-4.30.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.30.1-py3-none-any.whl.metadata (113 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.8 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e2/72/1af3d38e98fdcceb3876de4567ac395a66c26976e259fe2d46266e052d61/transformers-4.30.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 266.5 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/17/aa/a89864288afe45abe1ab79f002140a20348140e86836d96096d8f8a3bac0/transformers-4.29.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.29.2-py3-none-any.whl.metadata (112 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 272.7 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.29.1-py3-none-any.whl.metadata (112 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 262.3 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/45/e4/4914b11df70954d95a7c36b74bf9010c8594fcec960471479449b0deb4f7/transformers-4.29.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.29.0-py3-none-any.whl.metadata (111 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.9/111.9 kB 269.5 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 245.6 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/8b/13/1ce598763b3669d43f192a7911bf2bf730a328012ab8801b93187a4f70d0/transformers-4.28.0-py3-none-any.whl.metadata\n", + " Downloading 
transformers-4.28.0-py3-none-any.whl.metadata (109 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 256.3 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/87/f0/2a152ed10ab8601431e87a606d397f7473c5fa4f8162f4ec5bda6ddb2df4/transformers-4.27.4-py3-none-any.whl.metadata\n", + " Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 254.4 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/52/ac/9dc5a17ba60bc354d99250d9d1629f99d76f6729cee438fa91c8cc74bc5d/transformers-4.27.3-py3-none-any.whl.metadata\n", + " Downloading transformers-4.27.3-py3-none-any.whl.metadata (106 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 251.5 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/73/f0/4a795505387a3e7cd7f0c2a2a87f876658f9a07947a38fb67bffceff9246/transformers-4.27.2-py3-none-any.whl.metadata\n", + " Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 246.1 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/6d/9b/2f536f9e73390209e0b27b74691355dac494b7ec8154f3012fdc6debbae7/transformers-4.27.1-py3-none-any.whl.metadata\n", + " Downloading transformers-4.27.1-py3-none-any.whl.metadata (106 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 114.0 MB/s eta 0:00:00\n", + " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/4d/3e/1378ed266cf991f5ab5fcb29e953d97d793c7f9242ea5dc52f856415ea3a/transformers-4.27.0-py3-none-any.whl.metadata\n", + " Downloading transformers-4.27.0-py3-none-any.whl.metadata (106 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 247.2 MB/s eta 0:00:00\n", + "Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", + " Obtaining dependency information for sentencepiece!=0.1.92,>=0.1.91 from https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", + " Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n", + "Collecting protobuf>=3.20.2 (from onnx~=1.14.1->-r /empty/requirements.txt (line 2))\n", + " Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/38/b1/d9b615dceb67ac38e13cbd7680c27182b40154996022cbb244ba1ac7d30f/protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata\n", + " Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)\n", + "Requirement already satisfied: future>=0.18.2 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", + "Requirement already satisfied: ujson>=3 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.0)\n", + "Requirement already satisfied: googleapis-common-protos>=1.5.3 in 
/opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)\n", + "Requirement already satisfied: grpcio-tools!=1.34.0,<1.49,>=1.30 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)\n", + "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", + " Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata\n", + " Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n", + "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.\n", + "Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", + " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/c6/c9/820b5ab056f4ada76fbe05bd481a948f287957d6cbfd59e2dd2618b408c1/multiprocess-0.70.15-py39-none-any.whl.metadata\n", + " Downloading multiprocess-0.70.15-py39-none-any.whl.metadata (7.2 kB)\n", + " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/6a/f4/fbeb03ef7abdda54db4a6a75c971b88ab73d724ff09e3275cc1e99f1c946/multiprocess-0.70.14-py39-none-any.whl.metadata\n", + " Downloading multiprocess-0.70.14-py39-none-any.whl.metadata (6.6 kB)\n", + "Collecting mpmath>=0.19 (from sympy->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", + " Obtaining dependency information for mpmath>=0.19 from https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl.metadata\n", + " Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n", + "Requirement already satisfied: Mako in /opt/conda/lib/python3.9/site-packages (from alembic!=1.10.0,<2->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.2)\n", + "Requirement already satisfied: cffi in /opt/conda/lib/python3.9/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", + "Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire<1,>=0.3.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)\n", + "Requirement already satisfied: Werkzeug>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)\n", + "Requirement already satisfied: itsdangerous>=2.1.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.2)\n", + "Requirement already satisfied: blinker>=1.6.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.0.1)\n", + "Requirement already satisfied: httplib2<1dev,>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from 
google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.22.0)\n", + "Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.3.3)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery-storage<3.0.0dev,>=2.6.0->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.23.0)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-storage->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.0)\n", + "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata>=4.13.0->dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.17.0)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from jedi>=0.16->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.3)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.12.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.33.0)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.9/site-packages (from kubernetes<26,>=8.0.0->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.49.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) 
(1.4.5)\n", + "Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (10.2.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.1)\n", + "Requirement already satisfied: importlib-resources>=3.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.2)\n", + "Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from msal<2.0.0,>=1.24.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.0)\n", + "Requirement already satisfied: portalocker<3,>=1.0 in /opt/conda/lib/python3.9/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)\n", + "Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.12.3)\n", + "Requirement already satisfied: bleach!=5.0.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.0)\n", + "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.1)\n", + "Requirement already satisfied: jupyter-core>=4.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.7.1)\n", + "Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)\n", + "Requirement already satisfied: mistune<4,>=2.0.3 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", + "Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", + "Requirement already satisfied: nbformat>=5.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.2)\n", + "Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)\n", + "Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.1)\n", + "Requirement already satisfied: pyzmq<25,>=17 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.1)\n", + "Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.9/site-packages (from 
notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1.0)\n", + "Requirement already satisfied: jupyter-client<8,>=5.3.4 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.4.9)\n", + "Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)\n", + "Requirement already satisfied: ipykernel in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.29.3)\n", + "Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.2)\n", + "Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)\n", + "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.0)\n", + "Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.0)\n", + "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.13)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from requests-oauthlib>=0.5.0->msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.2.2)\n", + "Requirement already satisfied: wheel in /opt/conda/lib/python3.9/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.41.2)\n", + "Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.1)\n", + "Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)\n", + "Requirement already satisfied: pure-eval in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.2)\n", + "Requirement already satisfied: webencodings in /opt/conda/lib/python3.9/site-packages (from bleach!=5.0.0->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", + "Requirement already satisfied: pycparser in /opt/conda/lib/python3.9/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.21)\n", + "Requirement already satisfied: 
grpcio-status<2.0.dev0,>=1.33.2 in /opt/conda/lib/python3.9/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)\n", + "Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.9/site-packages (from jupyter-core>=4.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.10.0)\n", + "Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.12.5)\n", + "Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.4)\n", + "Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.9/site-packages (from nbformat>=5.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.19.1)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", + "Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.9/site-packages (from argon2-cffi->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)\n", + "Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.9/site-packages (from beautifulsoup4->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5)\n", + "Requirement already satisfied: comm>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.1)\n", + "Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.1)\n", + "Requirement already satisfied: jupyter-events>=0.9.0 in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", + "Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.2)\n", + "Requirement already satisfied: overrides in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.7.0)\n", + "Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.7)\n", + "Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.9/site-packages (from 
jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.4)\n", + "Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.1)\n", + "Requirement already satisfied: fqdn in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)\n", + "Requirement already satisfied: isoduration in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (20.11.0)\n", + "Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1)\n", + "Requirement already satisfied: uri-template in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)\n", + "Requirement already satisfied: webcolors>=1.11 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13)\n", + "Requirement already satisfied: arrow>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)\n", + "Requirement already satisfied: types-python-dateutil>=2.8.10 in /opt/conda/lib/python3.9/site-packages (from arrow>=0.15.0->isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.19.20240106)\n", + "Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 274.2 MB/s eta 0:00:00\n", + "Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 277.9 MB/s eta 0:00:00\n", + "Downloading optimum-1.6.4-py3-none-any.whl (227 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.8/227.8 kB 291.3 MB/s eta 0:00:00\n", + "Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 242.4 MB/s eta 0:00:00\n", + "Downloading datasets-2.10.1-py3-none-any.whl (469 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 469.0/469.0 kB 185.9 MB/s eta 0:00:00\n", + "Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 275.9 MB/s eta 0:00:00\n", + "Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 282.3 MB/s eta 0:00:00\n", + "Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 346.4/346.4 kB 311.7 MB/s eta 0:00:00\n", + "Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 269.6 MB/s eta 0:00:00\n", + "Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", + " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 773.4/773.4 kB 311.9 MB/s eta 0:00:00\n", + "Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 264.1 MB/s eta 0:00:00\n", + "Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.5/755.5 MB 204.0 MB/s eta 0:00:00\n", + "Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 40.3 MB/s eta 0:00:00\n", + "Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 43.0 MB/s eta 0:00:00\n", + "Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 46.9 MB/s eta 0:00:00\n", + "Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 51.0 MB/s eta 0:00:00\n", + "Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 58.2 MB/s eta 0:00:00\n", + "Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 69.0 MB/s eta 0:00:00\n", + "Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 36.0 MB/s eta 0:00:00\n", + "Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 52.8 MB/s eta 0:00:00\n", + "Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 45.9 MB/s eta 0:00:00\n", + "Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.0/166.0 MB 19.6 MB/s eta 0:00:00\n", + "Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 27.7 MB/s eta 0:00:00\n", + "Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.9/167.9 MB 41.3 MB/s eta 0:00:00\n", + "Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 42.8 MB/s eta 0:00:00\n", + "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 192.0 MB/s eta 0:00:00\n", + "Downloading filelock-3.13.1-py3-none-any.whl (11 kB)\n", + "Downloading flatbuffers-24.3.7-py2.py3-none-any.whl (26 kB)\n", + "Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 kB 100.7 MB/s eta 0:00:00\n", + "Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 41.4 MB/s eta 0:00:00\n", + "Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 253.7 MB/s eta 0:00:00\n", + "Downloading mpmath-1.3.0-py3-none-any.whl (536 
kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 45.4 MB/s eta 0:00:00\n", + "Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 46.1 MB/s eta 0:00:00\n", + "Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 43.7 MB/s eta 0:00:00\n", + "Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 43.8 MB/s eta 0:00:00\n", + "Installing collected packages: tokenizers, sentencepiece, mpmath, flatbuffers, sympy, regex, protobuf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, humanfriendly, filelock, dill, triton, responses, onnx, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, coloredlogs, transformers, scikit-learn, onnxruntime, nvidia-cusolver-cu12, torch, datasets, optimum\n", + " Attempting uninstall: protobuf\n", + " Found existing installation: protobuf 3.20.3\n", + " Uninstalling protobuf-3.20.3:\n", + " Successfully uninstalled protobuf-3.20.3\n", + " Attempting uninstall: numpy\n", + " Found existing installation: numpy 1.26.4\n", + " Uninstalling numpy-1.26.4:\n", + " Successfully uninstalled numpy-1.26.4\n", + " Attempting uninstall: scikit-learn\n", + " Found existing installation: scikit-learn 1.4.1.post1\n", + " Uninstalling scikit-learn-1.4.1.post1:\n", + " Successfully uninstalled scikit-learn-1.4.1.post1\n", + "Successfully installed coloredlogs-15.0.1 datasets-2.10.1 dill-0.3.6 filelock-3.13.1 flatbuffers-24.3.7 huggingface-hub-0.21.4 humanfriendly-10.0 mpmath-1.3.0 multiprocess-0.70.14 networkx-3.2.1 numpy-1.23.5 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 onnx-1.14.1 onnxruntime-1.16.3 optimum-1.6.4 protobuf-3.20.2 regex-2023.12.25 responses-0.18.0 scikit-learn-1.0.2 sentencepiece-0.2.0 sympy-1.12 tokenizers-0.13.3 torch-2.2.1 transformers-4.26.1 triton-2.2.0\n", + "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", + "\u001b[36mINFO\u001b[0m[0238] Taking snapshot of full filesystem... 
\n", + "\u001b[36mINFO\u001b[0m[0463] Pushing image to docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest \n", + "\u001b[36mINFO\u001b[0m[0493] Pushed docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer@sha256:691d0bb3c23487b4b5d2f84ab323c24735626ee81681475f53a4158b72d4cfee \n" + ] + }, + { + "data": { + "text/plain": [ + "BuildStatus(ready=True, outputs={'image': '.mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest'})" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.build_function(\"hugging-face-classifier-trainer\",with_mlrun=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:22:42,252 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '53252ce7aacb4b1aacf86bf3b862daa2', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-03-24 17:22:42,536 [info] Job is running in the background, pod: hugging-face-classifier-trainer-train-dqqfr\n", + "> 2024-03-24 17:24:43,288 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2\n", + "> 2024-03-24 17:24:43,847 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub\n", + "Downloading metadata: 100%|██████████| 1.03k/1.03k [00:00<00:00, 6.77MB/s]\n", + "Downloading and preparing dataset None/None (download: 265.13 KiB, generated: 1.50 MiB, post-processed: Unknown size, total: 1.76 MiB) to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n", + "Downloading data files: 0%| | 0/3 [00:00 2024-03-24 17:24:47,076 [info] training 'huggingface-model'\n", + "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", + "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + "***** Running training *****\n", + " Num examples = 100\n", + " Num Epochs = 3\n", + " Instantaneous batch size per device = 16\n", + " Total train batch size (w. parallel, distributed & accumulation) = 16\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 21\n", + " Number of trainable parameters = 66955010\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + " 0%| | 0/21 [00:00 2024-03-24 17:26:00,230 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia', 'logs_cmd': 'mlrun logs 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia'}\n", + "> 2024-03-24 17:26:00,231 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/hugging-face-trainer-avia/jobs/monitor/53252ce7aacb4b1aacf86bf3b862daa2/overview'}\n", + "> 2024-03-24 17:26:00,231 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
Run summary table (HTML output):
columns: project | uid | iter | start | state | name | labels | inputs | parameters | results | artifacts
project=hugging-face-trainer-avia, iter=0, start=Mar 24 17:24:39, state=completed, name=hugging-face-classifier-trainer-train
labels: v3io_user=avia, kind=job, owner=avia, mlrun/client_version=1.6.1, mlrun/client_python_version=3.9.16, host=hugging-face-classifier-trainer-train-dqqfr
parameters: hf_dataset=Shayanvsf/US_Airline_Sentiment, drop_columns=['airline_sentiment_confidence', 'negativereason_confidence'], pretrained_tokenizer=distilbert-base-uncased, pretrained_model=distilbert-base-uncased, model_class=transformers.AutoModelForSequenceClassification, label_name=airline_sentiment, num_of_train_samples=100, metrics=['accuracy', 'f1'], random_state=42, TRAIN_output_dir=finetuning-sentiment-model-3000-samples, TRAIN_learning_rate=2e-05, TRAIN_per_device_train_batch_size=16, TRAIN_per_device_eval_batch_size=16, TRAIN_num_train_epochs=3, TRAIN_weight_decay=0.01, TRAIN_push_to_hub=False, TRAIN_evaluation_strategy=epoch, TRAIN_eval_steps=1, TRAIN_logging_steps=1, CLASS_num_labels=2
results: loss=0.5215, learning_rate=0.0, eval_loss=0.4750453531742096, eval_accuracy=0.7916666666666666, eval_f1=0.0, eval_runtime=1.0524, eval_samples_per_second=22.806, eval_steps_per_second=1.9, train_runtime=55.1543, train_samples_per_second=5.439, train_steps_per_second=0.381, total_flos=3327208489680.0
artifacts: loss_plot, learning_rate_plot, eval_loss_plot, eval_accuracy_plot, eval_f1_plot, eval_runtime_plot, eval_samples_per_second_plot, eval_steps_per_second_plot, tokenizer, model
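A minimal sketch of reading these results programmatically from the returned run object; `.outputs` and `.logs()` appear elsewhere in this notebook and its tests, while `.artifact()` is assumed from the same MLRun client API:

    # Sketch only: `train_run` is the run object returned by the .run(...) call below.
    results = train_run.outputs                  # dict of results and artifact URIs ('loss', 'eval_accuracy', 'model', ...)
    print(results["eval_accuracy"])              # ~0.79 for the run shown above
    model_uri = train_run.outputs["model"]       # store URI of the logged Hugging Face model
    loss_plot = train_run.artifact("loss_plot")  # assumed: DataItem handle for the logged Plotly artifact
    train_run.logs()                             # replay the job logs if needed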
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-24 17:26:09,792 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" + ] + } + ], + "source": [ + "train_run = hugging_face_classifier_trainer.run(params={\n", + " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", + " \"drop_columns\": [\n", + " \"airline_sentiment_confidence\",\n", + " \"negativereason_confidence\",\n", + " ],\n", + " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", + " \"pretrained_model\": \"distilbert-base-uncased\",\n", + " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", + " \"label_name\": \"airline_sentiment\",\n", + " \"num_of_train_samples\": 100,\n", + " \"metrics\": [\"accuracy\", \"f1\"],\n", + " \"random_state\": 42,\n", + " **additional_parameters\n", + " },\n", + " handler=\"train\", \n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "[Back to the top](#top)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/src/hugging_face_classifier_trainer.py b/functions/master/hugging_face_classifier_trainer/0.3.0/src/hugging_face_classifier_trainer.py new file mode 100755 index 00000000..29d07039 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/src/hugging_face_classifier_trainer.py @@ -0,0 +1,832 @@ +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import mlrun +import mlrun.datastore +import mlrun.utils +import numpy as np +import pandas as pd +import transformers +from datasets import Dataset, load_dataset, load_metric +from mlrun import MLClientCtx +from mlrun import feature_store as fs +from mlrun.artifacts import Artifact, PlotlyArtifact +from mlrun.datastore import DataItem +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import create_class +from plotly import graph_objects as go +from sklearn.model_selection import train_test_split +from transformers import ( + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + PreTrainedModel, + PreTrainedTokenizer, + Trainer, + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) + + +# ----------------------from MLRUN-------------------------------- +class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): + """ + Interface for adding MLRun features for tensorflow keras API. 
+ """ + + # MLRun's context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to be inserted so the MLRun interface will be fully enabled. + _PROPERTIES = { + "_auto_log": False, + "_context": None, + "_model_name": "model", + "_tag": "", + "_labels": None, + "_extra_data": None, + } + _METHODS = ["enable_auto_logging"] + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "optimize", + ] + + @classmethod + def add_interface( + cls, + obj, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + """ + Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras + MLRun's features. + :param obj: The object to enrich his interface. + :param restoration: Restoration information tuple as returned from 'remove_interface' in order to + add the interface in a certain state. + """ + super(HFORTOptimizerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_optimize(cls): + """ + MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be + passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. + + raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. + """ + + def wrapper(self, *args, **kwargs): + save_dir = cls._get_function_argument( + self.optimize, + argument_name="save_dir", + passed_args=args, + passed_kwargs=kwargs, + )[0] + + # Call the original optimize method: + result = self.original_optimize(*args, **kwargs) + + if self._auto_log: + # Log the onnx model: + self._context.log_model( + key="model", + db_key=self._model_name, + model_file=f"{save_dir}/model_optimized.onnx", + tag=self._tag, + framework="ONNX", + labels=self._labels, + extra_data=self._extra_data, + ) + + return result + + return wrapper + + def enable_auto_logging( + self, + context: mlrun.MLClientCtx, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + self._auto_log = True + + self._context = context + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data + + +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + """ + Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras + MLRuns features. + :param obj: The object to enrich his interface. + :param restoration: Restoration information tuple as returned from 'remove_interface' in order to + add the interface in a certain state. + """ + + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + + """ + MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be + passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. 
+ + raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. + """ + + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + Callback for collecting logs during training / evaluation of the `Trainer` API. + """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + self._log_metrics() + + temp_directory = tempfile.gettempdir() + + # Save and log the tokenizer: + if tokenizer is not None: + # Save tokenizer: + tokenizer_dir = os.path.join(temp_directory, "tokenizer") + tokenizer.save_pretrained(save_directory=tokenizer_dir) + # Zip the tokenizer directory: + tokenizer_zip = shutil.make_archive( + base_name="tokenizer", + format="zip", + root_dir=tokenizer_dir, + ) + # Log the zip file: + self._artifacts["tokenizer"] = self._context.log_artifact( + item="tokenizer", local_path=tokenizer_zip + ) + + # Save the model: + model_dir = os.path.join(temp_directory, "model") + model.save_pretrained(save_directory=model_dir) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=model_dir, + ) + + # Log the model: + self._context.log_model( + key="model", + db_key=self._model_name, + 
model_file="model.zip", + tag=self._tag, + framework="Hugging Face", + labels=self._labels, + extra_data={**self._artifacts, **self._extra_data}, + ) + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._log_metrics() + + if self._is_training: + return + + # TODO: Update the model object + + def _log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self._log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def _log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def _apply_mlrun_on_trainer( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +def _apply_mlrun_on_optimizer( + optimizer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx( + HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME + ) + + HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) + + if auto_log: + optimizer.enable_auto_logging( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + + +def apply_mlrun( + huggingface_object, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + Wrap the given model with MLRun's interface providing it with mlrun's additional features. + :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. + :param model_name: The model name to use for storing the model artifact. Default: "model". + :param tag: The model's tag to log with. + :param context: MLRun context to work with. If no context is given it will be retrieved via + 'mlrun.get_or_create_ctx(None)' + :param auto_log: Whether to enable MLRun's auto logging. Default: True. 
+ """ + + if isinstance(huggingface_object, transformers.Trainer): + return _apply_mlrun_on_trainer( + trainer=huggingface_object, + model_name=model_name, + tag=tag, + context=context, + auto_log=auto_log, + labels=labels, + extra_data=extra_data, + ) + import optimum.onnxruntime as optimum_ort + + if isinstance(huggingface_object, optimum_ort.ORTOptimizer): + return _apply_mlrun_on_optimizer( + optimizer=huggingface_object, + model_name=model_name, + tag=tag, + context=context, + auto_log=auto_log, + labels=labels, + extra_data=extra_data, + ) + raise mlrun.errors.MLRunInvalidArgumentError + + +# ---------------------- from auto_trainer-------------------------------- +class KWArgsPrefixes: + MODEL_CLASS = "CLASS_" + FIT = "FIT_" + TRAIN = "TRAIN_" + PREDICT = "PREDICT_" + + +def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: + """ + Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these + keys. + + :param src: The source dict to extract the values from. + :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this + prefix. + """ + return { + key.replace(prefix_key, ""): val + for key, val in src.items() + if key.startswith(prefix_key) + } + + +def _get_dataframe( + context: MLClientCtx, + dataset: DataItem, + label_columns: Optional[Union[str, List[str]]] = None, + drop_columns: Union[str, List[str], int, List[int]] = None, +) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: + """ + Getting the DataFrame of the dataset and drop the columns accordingly. + + :param context: MLRun context. + :param dataset: The dataset to train the model on. + Can be either a list of lists, dict, URI or a FeatureVector. + :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or + Classification tasks. + :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. + """ + if isinstance(dataset, (list, dict)): + dataset = pd.DataFrame(dataset) + # Checking if drop_columns provided by integer type: + if drop_columns: + if isinstance(drop_columns, str) or ( + isinstance(drop_columns, list) + and any(isinstance(col, str) for col in drop_columns) + ): + context.logger.error( + "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" + ) + raise ValueError + dataset.drop(drop_columns, axis=1, inplace=True) + + return dataset, label_columns + + store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) + if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: + # feature-vector case: + label_columns = label_columns or dataset.meta.status.label_column + dataset = fs.get_offline_features( + dataset.meta.uri, drop_columns=drop_columns + ).to_dataframe() + + context.logger.info(f"label columns: {label_columns}") + else: + # simple URL case: + dataset = dataset.as_df() + if drop_columns: + if all(col in dataset for col in drop_columns): + dataset = dataset.drop(drop_columns, axis=1) + else: + context.logger.info( + "not all of the columns to drop in the dataset, drop columns process skipped" + ) + return dataset, label_columns + + +# ---------------------- Hugging Face Trainer -------------------------------- + + +def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: + """ + This function create and returns a function that will be used to compute metrics at evaluation. 
+ :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. + + :returns: Function that will be used to compute metrics at evaluation. + Must take a [`EvalPrediction`] and return a dictionary string to metric values. + """ + + def _compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + metric_dict_results = {} + for metric in metrics: + load_met = load_metric(metric) + metric_res = load_met.compute(predictions=predictions, references=labels)[ + metric + ] + metric_dict_results[metric] = metric_res + + return metric_dict_results + + return _compute_metrics + + +def _edit_columns( + dataset: Dataset, + drop_columns: List[str] = None, + rename_columns: [str, str] = None, +) -> Dataset: + """ + Drop and renames that columns of the given dataset + :param dataset: Dataset to process + :param drop_columns: The columns to drop from the dataset. + :param rename_columns: Dict of columns ro rename : {: , ...} + + :returns: The dataset after the desired process + """ + if drop_columns: + dataset = dataset.remove_columns(drop_columns) + if rename_columns: + dataset = dataset.rename_columns(rename_columns) + return dataset + + +def _prepare_dataset( + context: MLClientCtx, + dataset_name: str, + label_name: str = None, + drop_columns: Optional[List[str]] = None, + num_of_train_samples: int = None, + train_test_split_size: float = None, + random_state: int = None, +) -> Tuple[Dataset, Dataset]: + """ + Loading the dataset and editing the columns + + :param context: MLRun contex + :param dataset_name: The name of the dataset to get from the HuggingFace hub + :param label_name: The target label of the column in the dataset. + :param drop_columns: The columns to drop from the dataset. + :param num_of_train_samples: Max number of training samples, for debugging. + :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include + in the test split. 
+ :param random_state: Random state for train_test_split + + """ + + context.logger.info( + f"Loading and editing {dataset_name} dataset from Hugging Face hub" + ) + rename_cols = {label_name: "labels"} + + # Loading and editing dataset: + dataset = load_dataset(dataset_name) + + # train set + train_dataset = dataset["train"] + if num_of_train_samples: + train_dataset = train_dataset.shuffle(seed=random_state).select( + list(range(num_of_train_samples)) + ) + train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) + + # test set + test_dataset = dataset["test"] + if train_test_split_size or num_of_train_samples: + train_test_split_size = train_test_split_size or 0.2 + num_of_test_samples = int( + (train_dataset.num_rows * train_test_split_size) + // (1 - train_test_split_size) + ) + test_dataset = test_dataset.shuffle(seed=random_state).select( + list(range(num_of_test_samples)) + ) + test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) + + return train_dataset, test_dataset + + +def train( + context: MLClientCtx, + hf_dataset: str = None, + dataset: DataItem = None, + test_set: DataItem = None, + drop_columns: Optional[List[str]] = None, + pretrained_tokenizer: str = None, + pretrained_model: str = None, + model_class: str = None, + model_name: str = "huggingface-model", + label_name: str = "labels", + text_col: str = "text", + num_of_train_samples: int = None, + train_test_split_size: float = None, + metrics: List[str] = None, + random_state: int = None, +): + """ + Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. + The dataset can be either be the name of the dataset that contains in the HuggingFace hub, + or a URI or a FeatureVector + + :param context: MLRun context + :param hf_dataset: The name of the dataset to get from the HuggingFace hub + :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector + :param test_set: The test set to train the model with. + :param drop_columns: The columns to drop from the dataset. + :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. + :param pretrained_model: The name of the pretrained model from the HuggingFace hub. + :param model_name: The model's name to use for storing the model artifact, default to 'model' + :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` + :param label_name: The target label of the column in the dataset. + :param text_col: The input text column un the dataset. + :param num_of_train_samples: Max number of training samples, for debugging. + :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include + in the test split. + :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. 
+ :param random_state: Random state for train_test_split + """ + + if train_test_split_size is None and test_set is None: + context.logger.info( + "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" + ) + train_test_split_size = 0.2 + + # Creating tokenizer: + tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) + + def preprocess_function(examples): + return tokenizer(examples[text_col], truncation=True) + + # prepare data for training + if hf_dataset: + train_dataset, test_dataset = _prepare_dataset( + context, + hf_dataset, + label_name, + drop_columns, + num_of_train_samples, + train_test_split_size, + random_state=random_state, + ) + elif dataset: + # Get DataFrame by URL or by FeatureVector: + train_dataset, label_name = _get_dataframe( + context=context, + dataset=dataset, + label_columns=label_name, + drop_columns=drop_columns, + ) + if test_set: + test_dataset, _ = _get_dataframe( + context=context, + dataset=test_set, + label_columns=label_name, + drop_columns=drop_columns, + ) + else: + train_dataset, test_dataset = train_test_split( + train_dataset, + test_size=train_test_split_size, + random_state=random_state, + ) + train_dataset = Dataset.from_pandas(train_dataset) + test_dataset = Dataset.from_pandas(test_dataset) + else: + raise mlrun.errors.MLRunInvalidArgumentError( + "Training data was not provided. A training dataset is mandatory for training." + " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." + ) + + # Mapping datasets with the tokenizer: + tokenized_train = train_dataset.map(preprocess_function, batched=True) + tokenized_test = test_dataset.map(preprocess_function, batched=True) + + # Creating data collator for batching: + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Parsing kwargs: + train_kwargs = _get_sub_dict_by_prefix( + src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN + ) + model_class_kwargs = _get_sub_dict_by_prefix( + src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS + ) + + # Loading our pretrained model: + model_class_kwargs["pretrained_model_name_or_path"] = ( + model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model + ) + train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer + if not model_class_kwargs["pretrained_model_name_or_path"]: + raise mlrun.errors.MLRunRuntimeError( + "Must provide pretrained_model name as " + "function argument or in extra params" + ) + model = create_class(model_class).from_pretrained(**model_class_kwargs) + + # Preparing training arguments: + training_args = TrainingArguments( + **train_kwargs, + ) + + compute_metrics = _create_compute_metrics(metrics) if metrics else None + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train, + eval_dataset=tokenized_test, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + apply_mlrun(trainer, model_name=model_name) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + +def _get_model_dir(model_uri: str): + model_file, _, _ = mlrun.artifacts.get_model(model_uri) + model_dir = tempfile.gettempdir() + # Unzip the Model: + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_dir) + + return model_dir + + +def optimize( + model_path: str, + model_name: str = "optimized_model", + target_dir: str = "./optimized", + optimization_level: int = 1, +): + """ + Optimizing the 
transformer model using ONNX optimization. + + + :param model_path: The path of the model to optimize. + :param model_name: Name of the optimized model. + :param target_dir: The directory to save the ONNX model. + :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) + """ + # We import these in the function scope so ONNX won't be mandatory for the other handlers: + from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer + from optimum.onnxruntime.configuration import OptimizationConfig + + model_dir = _get_model_dir(model_uri=model_path) + # Creating configuration for optimization step: + optimization_config = OptimizationConfig(optimization_level=optimization_level) + + # Converting our pretrained model to an ONNX-Runtime model: + ort_model = ORTModelForSequenceClassification.from_pretrained( + model_dir, from_transformers=True + ) + + # Creating an ONNX-Runtime optimizer from ONNX model: + optimizer = ORTOptimizer.from_pretrained(ort_model) + + apply_mlrun(optimizer, model_name=model_name) + # Optimizing and saving the ONNX model: + optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config) diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/src/item.yaml b/functions/master/hugging_face_classifier_trainer/0.3.0/src/item.yaml new file mode 100755 index 00000000..332902b3 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/src/item.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +categories: +- deep-learning +- huggingface +- machine-learning +- model-training +description: Automatic train and optimize functions for HuggingFace framework +doc: '' +example: hugging_face_classifier_trainer.ipynb +generationDate: 2022-08-28:17-25 +hidden: false +icon: '' +labels: + author: davids +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.6.1 +name: hugging_face_classifier_trainer +platformVersion: 3.5.5 +spec: + filename: hugging_face_classifier_trainer.py + handler: train + image: mlrun/mlrun + kind: job + requirements: + - onnx~=1.14.1 + - onnxruntime~=1.16.1 + - optimum~=1.6.4 + - transformers~=4.26.1 + - datasets~=2.10.1 + - scikit-learn~=1.0.2 +url: '' +version: 0.3.0 diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/src/requirements.txt b/functions/master/hugging_face_classifier_trainer/0.3.0/src/requirements.txt new file mode 100644 index 00000000..9d0db7b4 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/src/requirements.txt @@ -0,0 +1,6 @@ +onnx~=1.14.1 +onnxruntime~=1.16.1 +optimum~=1.6.4 +transformers~=4.26.1 +datasets~=2.10.1 +scikit-learn~=1.0.2 \ No newline at end of file diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/src/test_hugging_face_classifier_trainer.py b/functions/master/hugging_face_classifier_trainer/0.3.0/src/test_hugging_face_classifier_trainer.py new file mode 100644 index 00000000..a5e0fee9 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/src/test_hugging_face_classifier_trainer.py @@ -0,0 +1,145 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os + +import mlrun +import pytest +from mlrun import import_function + +REQUIRED_ENV_VARS = [ + "MLRUN_DBPATH", + "MLRUN_ARTIFACT_PATH", + "V3IO_USERNAME", + "V3IO_API", + "V3IO_ACCESS_KEY", +] + +ADDITIONAL_PARAM_FOR_TRAIN = { + "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples", + "TRAIN_learning_rate": 2e-5, + "TRAIN_per_device_train_batch_size": 16, + "TRAIN_per_device_eval_batch_size": 16, + "TRAIN_num_train_epochs": 2, + "TRAIN_weight_decay": 0.01, + "TRAIN_push_to_hub": False, + "TRAIN_evaluation_strategy": "epoch", + "TRAIN_eval_steps": 1, + "TRAIN_logging_steps": 1, + "CLASS_num_labels": 2, +} + + +def _validate_environment_variables() -> bool: + """ + Checks that all required Environment variables are set. + """ + environment_keys = os.environ.keys() + return all(key in environment_keys for key in REQUIRED_ENV_VARS) + + +def _set_environment(env_file=None): + if env_file: + mlrun.set_env_from_file(env_file) + mlrun.get_or_create_project( + "hugging-face-classifier-trainer-test", context="./", user_project=True + ) + + +@pytest.mark.skipif( + condition=not _validate_environment_variables(), + reason="Project's environment variables are not set", +) +def test_train_sequence_classification(): + _set_environment() + + # Importing function: + fn = import_function("function.yaml") + + train_run = None + + try: + train_run = fn.run( + params={ + "hf_dataset": "Shayanvsf/US_Airline_Sentiment", + "drop_columns": [ + "airline_sentiment_confidence", + "negativereason_confidence", + ], + "pretrained_tokenizer": "distilbert-base-uncased", + "pretrained_model": "distilbert-base-uncased", + "model_class": "transformers.AutoModelForSequenceClassification", + "label_name": "airline_sentiment", + "num_of_train_samples": 100, + "metrics": ["accuracy", "f1"], + "random_state": 42, + **ADDITIONAL_PARAM_FOR_TRAIN, + }, + handler="train", + local=True, + ) + except Exception as exception: + print(f"- The test failed - raised the following error:\n- {exception}") + assert train_run and all( + key in train_run.outputs for key in ["model", "loss"] + ), "outputs should include more data" + + +@pytest.mark.skipif( + condition=not _validate_environment_variables(), + reason="Project's environment variables are not set", +) +def test_train_and_optimize_sequence_classification(): + _set_environment() + + # Importing function: + fn = import_function("function.yaml") + + train_run = None + optimize_run = None + + try: + train_run = fn.run( + params={ + "hf_dataset": "Shayanvsf/US_Airline_Sentiment", + "drop_columns": [ + "airline_sentiment_confidence", + "negativereason_confidence", + ], + "pretrained_tokenizer": "distilbert-base-uncased", + "pretrained_model": "distilbert-base-uncased", + "model_class": "transformers.AutoModelForSequenceClassification", + "label_name": "airline_sentiment", + "num_of_train_samples": 100, + "metrics": ["accuracy", "f1"], + "random_state": 42, + **ADDITIONAL_PARAM_FOR_TRAIN, + }, + handler="train", + local=True, + ) + + optimize_run = fn.run( + params={"model_path": train_run.outputs["model"]}, + handler="optimize", + local=True, + ) + except 
Exception as exception: + print(f"- The test failed - raised the following error:\n- {exception}") + assert train_run and all( + key in train_run.outputs for key in ["model", "loss"] + ), "outputs should include more data" + assert optimize_run and all( + key in optimize_run.outputs for key in ["model"] + ), "outputs should include more data" diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/static/documentation.html b/functions/master/hugging_face_classifier_trainer/0.3.0/static/documentation.html new file mode 100644 index 00000000..1652c838 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/static/documentation.html @@ -0,0 +1,394 @@ + + + + + + + +hugging_face_classifier_trainer package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ +
+

hugging_face_classifier_trainer package

+ +
+ +
+
+
+
+
+

hugging_face_classifier_trainer package#

+
+

Submodules#

+
+
+

hugging_face_classifier_trainer.hugging_face_classifier_trainer module#

+
+
+class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFORTOptimizerMLRunInterface(*args: Any, **kwargs: Any)[source]#
+

Bases: mlrun.frameworks._common., abc.ABC

+

Interface for adding MLRun features to the HuggingFace ONNX Runtime optimizer API.

+
+
+DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
+
+
+
+classmethod add_interface(obj, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
+

Enrich the object with this interface's properties, methods and functions, so that it will have these +MLRun features. +:param obj: The object whose interface is to be enriched. +:param restoration: Restoration information tuple as returned from 'remove_interface' in order to

+
+

add the interface in a certain state.

+
+
+
+
+enable_auto_logging(context: mlrun.execution.MLClientCtx, model_name: str = 'model', tag: str = '', labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None)[source]#
+
+
+
+classmethod mlrun_optimize()[source]#
+

MLRun’s tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be +passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

+

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

+
+
+
+
+class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFTrainerMLRunInterface(*args: Any, **kwargs: Any)[source]#
+

Bases: mlrun.frameworks._common., abc.ABC

+

Interface for adding MLRun features to the HuggingFace Trainer API.

+
+
+DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
+
+
+
+classmethod add_interface(obj: transformers.Trainer, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
+

Enrich the object with this interface's properties, methods and functions, so that it will have these +MLRun features. +:param obj: The object whose interface is to be enriched. +:param restoration: Restoration information tuple as returned from 'remove_interface' in order to

+
+

add the interface in a certain state.

+
+
+
+
+classmethod mlrun_train()[source]#
+

MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be +passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

+

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

+
+
+
+
+class hugging_face_classifier_trainer.hugging_face_classifier_trainer.KWArgsPrefixes[source]#
+

Bases: object

+
+
+FIT = 'FIT_'#
+
+
+
+MODEL_CLASS = 'CLASS_'#
+
+
+
+PREDICT = 'PREDICT_'#
+
+
+
+TRAIN = 'TRAIN_'#
+
+
+
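These prefixes route extra run parameters to the right consumer inside the train handler: TRAIN_-prefixed keys are collected into transformers.TrainingArguments, and CLASS_-prefixed keys are passed to the model class constructor. A minimal sketch of that routing; the prefix-stripping logic is written out here purely for illustration (the module's own helper is _get_sub_dict_by_prefix):

def sub_dict_by_prefix(src: dict, prefix: str) -> dict:
    # Keep entries whose key starts with the prefix and drop the prefix itself,
    # e.g. {"TRAIN_learning_rate": 2e-5} -> {"learning_rate": 2e-5}
    return {key[len(prefix):]: value for key, value in src.items() if key.startswith(prefix)}

params = {"TRAIN_learning_rate": 2e-5, "TRAIN_num_train_epochs": 3, "CLASS_num_labels": 2}
train_kwargs = sub_dict_by_prefix(params, "TRAIN_")  # -> TrainingArguments(**train_kwargs)
class_kwargs = sub_dict_by_prefix(params, "CLASS_")  # -> model_class.from_pretrained(**class_kwargs)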
+
+class hugging_face_classifier_trainer.hugging_face_classifier_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
+

Bases: transformers.

+

Callback for collecting logs during training / evaluation of the Trainer API.

+
+
+on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
+
+
+
+on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
+
+
+
+
+hugging_face_classifier_trainer.hugging_face_classifier_trainer.apply_mlrun(huggingface_object, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
+

Wrap the given model with MLRun’s interface providing it with mlrun’s additional features. +:param huggingface_object: The model to wrap. Can be loaded from the model path given as well. +:param model_name: The model name to use for storing the model artifact. Default: “model”. +:param tag: The model’s tag to log with. +:param context: MLRun context to work with. If no context is given it will be retrieved via

+
+

‘mlrun.get_or_create_ctx(None)’

+
+
+
Parameters
+

auto_log – Whether to enable MLRun’s auto logging. Default: True.

+
+
+
+
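A minimal, hedged usage sketch of apply_mlrun with a transformers.Trainer, mirroring the call made inside the train handler. The model name, toy dataset and training arguments below are illustrative assumptions, not values required by the function:

from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Tiny toy dataset, tokenized the same way the train handler tokenizes its inputs:
data = Dataset.from_dict({"text": ["good flight", "lost my bag"], "labels": [1, 0]})
data = data.map(lambda batch: tokenizer(batch["text"], truncation=True), batched=True)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="model_out", num_train_epochs=1),
    train_dataset=data,
    eval_dataset=data,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
apply_mlrun(trainer, model_name="huggingface-model")  # wrap the trainer so metrics and the model are auto-logged
trainer.train()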
+
+hugging_face_classifier_trainer.hugging_face_classifier_trainer.optimize(model_path: str, model_name: str = 'optimized_model', target_dir: str = './optimized', optimization_level: int = 1)[source]#
+

Optimizing the transformer model using ONNX optimization.

+
+
Parameters
+
    +
  • model_path – The path of the model to optimize.

  • +
  • model_name – Name of the optimized model.

  • +
  • target_dir – The directory to save the ONNX model.

  • +
  • optimization_level – Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)

  • +
+
+
+
+
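A short, hedged sketch of calling optimize directly; the store URI below is a hypothetical placeholder for a model logged by the train handler:

model_uri = "store://artifacts/<project>/huggingface-model@<run-uid>"  # hypothetical placeholder URI
optimize(
    model_path=model_uri,
    model_name="optimized_model",
    target_dir="./optimized",
    optimization_level=1,
)

When run as an MLRun job (as in the example notebook), the same arguments are passed through params with handler="optimize".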
+
+hugging_face_classifier_trainer.hugging_face_classifier_trainer.train(context: mlrun.execution.MLClientCtx, hf_dataset: Optional[str] = None, dataset: Optional[mlrun.datastore.base.DataItem] = None, test_set: Optional[mlrun.datastore.base.DataItem] = None, drop_columns: Optional[List[str]] = None, pretrained_tokenizer: Optional[str] = None, pretrained_model: Optional[str] = None, model_class: Optional[str] = None, model_name: str = 'huggingface-model', label_name: str = 'labels', text_col: str = 'text', num_of_train_samples: Optional[int] = None, train_test_split_size: Optional[float] = None, metrics: Optional[List[str]] = None, random_state: Optional[int] = None)[source]#
+

Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. +The dataset can be either the name of a dataset hosted on the HuggingFace hub, +or a URI or a FeatureVector.

+
+
Parameters
+
    +
  • context – MLRun context

  • +
  • hf_dataset – The name of the dataset to get from the HuggingFace hub

  • +
  • dataset – The dataset to train the model on. Can be either a URI or a FeatureVector

  • +
  • test_set – The test set to evaluate the trained model with.

  • +
  • drop_columns – The columns to drop from the dataset.

  • +
  • pretrained_tokenizer – The name of the pretrained tokenizer from the HuggingFace hub.

  • +
  • pretrained_model – The name of the pretrained model from the HuggingFace hub.

  • +
  • model_name – The model's name to use for storing the model artifact; defaults to 'huggingface-model'.

  • +
  • model_class – The class of the model, e.g. transformers.AutoModelForSequenceClassification

  • +
  • label_name – The name of the target label column in the dataset.

  • +
  • text_col – The input text column in the dataset.

  • +
  • num_of_train_samples – Max number of training samples, for debugging.

  • +
  • train_test_split_size – Should be between 0.0 and 1.0 and represent the proportion of the dataset to include +in the test split.

  • +
  • metrics – List of metrics to evaluate the model with, such as f1, accuracy, etc.

  • +
  • random_state – Random state for train_test_split

  • +
+
+
+
+
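A condensed, hedged sketch of running the train handler as an MLRun job, following the example notebook; the dataset, model names and prefixed parameters are the notebook's values and can be swapped freely:

import mlrun

fn = mlrun.import_function("hub://hugging_face_classifier_trainer")
train_run = fn.run(
    params={
        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
        "pretrained_tokenizer": "distilbert-base-uncased",
        "pretrained_model": "distilbert-base-uncased",
        "model_class": "transformers.AutoModelForSequenceClassification",
        "label_name": "airline_sentiment",
        "metrics": ["accuracy", "f1"],
        "random_state": 42,
        "TRAIN_output_dir": "model_out",  # TRAIN_-prefixed keys become TrainingArguments fields
        "CLASS_num_labels": 2,            # CLASS_-prefixed keys go to the model class constructor
    },
    handler="train",
    local=True,
)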
+
+

Module contents#

+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/static/example.html b/functions/master/hugging_face_classifier_trainer/0.3.0/static/example.html new file mode 100644 index 00000000..5fdd60e5 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/static/example.html @@ -0,0 +1,2406 @@ + + + + + + + +MLRun Hugging Face Classifier Trainer Tutorial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ + +
+
+

+
+

MLRun Hugging Face Classifier Trainer Tutorial#

+

This notebook shows how to use the handlers of the Hugging Face classifier trainer. +The available handlers are:

+
    +
  • train

  • +
  • optimize

  • +
+

All you need is an HF model type and an HF dataset name.

+
+
+
%pip install -r requirements.txt
+
+
+
+
+
Requirement already satisfied: onnx~=1.14.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (1.14.1)
+Requirement already satisfied: onnxruntime==1.16.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.16.1)
+Requirement already satisfied: optimum~=1.6.4 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.6.4)
+Requirement already satisfied: transformers~=4.26.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (4.26.1)
+Requirement already satisfied: datasets~=2.10.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (2.10.1)
+Requirement already satisfied: scikit-learn~=1.0.2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.0.2)
+Requirement already satisfied: coloredlogs in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (15.0.1)
+Requirement already satisfied: flatbuffers in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)
+Requirement already satisfied: numpy>=1.21.6 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.23.5)
+Requirement already satisfied: packaging in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (21.3)
+Requirement already satisfied: protobuf in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.20.2)
+Requirement already satisfied: sympy in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)
+Requirement already satisfied: typing-extensions>=3.6.2.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnx~=1.14.1->-r requirements.txt (line 1)) (4.7.1)
+Requirement already satisfied: torch>=1.9 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.2)
+Requirement already satisfied: huggingface-hub>=0.8.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (0.20.1)
+Requirement already satisfied: filelock in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (3.13.1)
+Requirement already satisfied: pyyaml>=5.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (5.4.1)
+Requirement already satisfied: regex!=2019.12.17 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2023.12.25)
+Requirement already satisfied: requests in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2.31.0)
+Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (0.13.3)
+Requirement already satisfied: tqdm>=4.27 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (4.65.0)
+Requirement already satisfied: pyarrow>=6.0.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (11.0.0)
+Requirement already satisfied: dill<0.3.7,>=0.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.3.6)
+Requirement already satisfied: pandas in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.4)
+Requirement already satisfied: xxhash in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.3.0)
+Requirement already satisfied: multiprocess in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.70.14)
+Requirement already satisfied: fsspec>=2021.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from fsspec[http]>=2021.11.1->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.9.2)
+Requirement already satisfied: aiohttp in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.9.1)
+Requirement already satisfied: responses<0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.18.0)
+Requirement already satisfied: scipy>=1.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.11.4)
+Requirement already satisfied: joblib>=0.11 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.3.2)
+Requirement already satisfied: threadpoolctl>=2.0.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (3.2.0)
+Requirement already satisfied: attrs>=17.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (19.1.0)
+Requirement already satisfied: multidict<7.0,>=4.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (6.0.4)
+Requirement already satisfied: yarl<2.0,>=1.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.9.2)
+Requirement already satisfied: frozenlist>=1.1.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.0)
+Requirement already satisfied: aiosignal>=1.1.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.3.1)
+Requirement already satisfied: async-timeout<5.0,>=4.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (4.0.3)
+Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from packaging->onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.1.1)
+Requirement already satisfied: charset-normalizer<4,>=2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2.1.1)
+Requirement already satisfied: idna<4,>=2.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (3.4)
+Requirement already satisfied: urllib3<3,>=1.21.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (1.26.16)
+Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2023.7.22)
+Requirement already satisfied: networkx in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.2.1)
+Requirement already satisfied: jinja2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.1.3)
+Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
+Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
+Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
+Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (8.9.2.26)
+Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.3.1)
+Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.0.2.54)
+Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (10.3.2.106)
+Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.4.5.107)
+Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.0.106)
+Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.18.1)
+Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
+Requirement already satisfied: triton==2.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.0)
+Requirement already satisfied: nvidia-nvjitlink-cu12 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.3.101)
+Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers[sentencepiece]>=4.26.0->optimum~=1.6.4->-r requirements.txt (line 3)) (0.2.0)
+Requirement already satisfied: humanfriendly>=9.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from coloredlogs->onnxruntime==1.16.1->-r requirements.txt (line 2)) (9.2)
+Requirement already satisfied: python-dateutil>=2.8.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2.8.2)
+Requirement already satisfied: pytz>=2020.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.3.post1)
+Requirement already satisfied: mpmath>=0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from sympy->onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.3.0)
+Requirement already satisfied: six>=1.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (1.16.0)
+Requirement already satisfied: MarkupSafe>=2.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from jinja2->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.3)
+Note: you may need to restart the kernel to use updated packages.
+
+
+
+
+
+
+
import mlrun
+
+
+
+
+
+
+
project = mlrun.get_or_create_project('hugging-face-trainer', context="./", user_project=True)
+
+
+
+
+
> 2024-03-24 17:10:17,091 [info] Project loaded successfully: {'project_name': 'hugging-face-trainer'}
+
+
+
+
+
+

Importing the hugging_face_classifier_trainer function from the Marketplace#

+
+
+
hugging_face_classifier_trainer = mlrun.import_function("hub://hugging_face_classifier_trainer")
+
+
+
+
+
+
+

Training a model#

+

Choosing the train handler

+
+

Define task parameters#

+
    +
  • Class parameters should contain the prefix CLASS_

  • +
  • Train parameters should contain the prefix TRAIN_

  • +
+
+
+
model_class = "transformers.AutoModelForSequenceClassification"
+additional_parameters = {
+    "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples",
+    "TRAIN_learning_rate": 2e-5,
+    "TRAIN_per_device_train_batch_size": 16,
+    "TRAIN_per_device_eval_batch_size": 16,
+    "TRAIN_num_train_epochs": 3,
+    "TRAIN_weight_decay": 0.01,
+    "TRAIN_push_to_hub": False,
+    "TRAIN_evaluation_strategy": "epoch",
+    "TRAIN_eval_steps": 1,
+    "TRAIN_logging_steps": 1,
+    "CLASS_num_labels": 2
+}
+
+
+
+
+
+
+

Running the Training job with the “train” handler#

+
+
+
train_run = hugging_face_classifier_trainer.run(params={
+                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
+                                                        "drop_columns": [
+                                                            "airline_sentiment_confidence",
+                                                            "negativereason_confidence",
+                                                        ],
+                                                        "pretrained_tokenizer": "distilbert-base-uncased",
+                                                        "pretrained_model": "distilbert-base-uncased",
+                                                        "model_class": "transformers.AutoModelForSequenceClassification",
+                                                        "label_name": "airline_sentiment",
+                                                        "num_of_train_samples": 100,
+                                                        "metrics": ["accuracy", "f1"],
+                                                        "random_state": 42,
+                                                        **additional_parameters
+                                                    },
+                                                    handler="train",
+                                                    local=True,
+                                                )
+
+
+
+
+
> 2024-03-24 17:10:21,025 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '514d8d5530c842238b1cc81983cd943e', 'db': 'http://mlrun-api:8080'}
+> 2024-03-24 17:11:03,727 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2
+> 2024-03-24 17:11:03,882 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub
+
+
+
Found cached dataset parquet (/igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
+
+
Loading cached shuffled indices for dataset at /igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ec18d1773cfb9bb5.arrow
+Loading cached shuffled indices for dataset at /igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e0c54c494a578ee6.arrow
+
+
+
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
+- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+
+
+
> 2024-03-24 17:11:08,938 [info] training 'huggingface-model'
+
+
+
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
+***** Running training *****
+  Num examples = 100
+  Num Epochs = 3
+  Instantaneous batch size per device = 16
+  Total train batch size (w. parallel, distributed & accumulation) = 16
+  Gradient Accumulation steps = 1
+  Total optimization steps = 21
+  Number of trainable parameters = 66955010
+You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
+
+
+
+
+ + [21/21 00:15, Epoch 3/3] +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Epoch | Training Loss | Validation Loss | Accuracy | F1
1     | 0.738900      | 0.515311        | 0.791667 | 0.000000
2     | 0.525900      | 0.481563        | 0.791667 | 0.000000
3     | 0.490800      | 0.471675        | 0.791667 | 0.000000

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 24
+  Batch size = 16
+/tmp/tmp0c1aawrq.py:561: FutureWarning:
+
+load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
+
+The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 24
+  Batch size = 16
+The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 24
+  Batch size = 16
+
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json
+Special tokens file saved in /tmp/tokenizer/special_tokens_map.json
+Configuration saved in /tmp/model/config.json
+Model weights saved in /tmp/model/pytorch_model.bin
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
project: hugging-face-trainer-avia | uid: … | iter: 0 | start: Mar 24 17:10:21 | state: completed | name: hugging-face-classifier-trainer-train
labels / inputs / parameters / results / artifacts:
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.4908
learning_rate=0.0
eval_loss=0.47167453169822693
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=0.5186
eval_samples_per_second=46.276
eval_steps_per_second=3.856
train_runtime=17.6054
train_samples_per_second=17.04
train_steps_per_second=1.193
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:12:01,880 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
+
+
+
+
+
+
+

The result of the train run#

+
+
+
train_run.outputs
+
+
+
+
+
{'loss': 0.4908,
+ 'learning_rate': 0.0,
+ 'eval_loss': 0.47167453169822693,
+ 'eval_accuracy': 0.7916666666666666,
+ 'eval_f1': 0.0,
+ 'eval_runtime': 0.5186,
+ 'eval_samples_per_second': 46.276,
+ 'eval_steps_per_second': 3.856,
+ 'train_runtime': 17.6054,
+ 'train_samples_per_second': 17.04,
+ 'train_steps_per_second': 1.193,
+ 'total_flos': 3327208489680.0,
+ 'loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',
+ 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',
+ 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',
+ 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',
+ 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',
+ 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',
+ 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',
+ 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',
+ 'tokenizer': 'store://artifacts/hugging-face-trainer-avia/hugging-face-classifier-trainer-train_tokenizer@514d8d5530c842238b1cc81983cd943e',
+ 'model': 'store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e'}
+
+
+
+
+
+
+
train_run.artifact('loss_plot').show()
+
+
+
+
+
+ + +
+
+ +
+
+
+
+

Getting the model for evaluating and predicting#

+
+
+
model_path = train_run.outputs['model']
+
+
+
+
+
+
+
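A hedged sketch of pulling the logged model down for local prediction. It follows the same pattern as the function's internal _get_model_dir helper: the model artifact is a zip that unpacks into a transformers directory (the tokenizer is logged as a separate artifact):

import tempfile
import zipfile

import mlrun
from transformers import AutoModelForSequenceClassification

# model_path is the store URI taken from train_run.outputs['model'] above
model_file, model_artifact, extra_data = mlrun.artifacts.get_model(model_path)
model_dir = tempfile.mkdtemp()
with zipfile.ZipFile(model_file, "r") as zip_file:
    zip_file.extractall(model_dir)  # unpack the logged transformers model
model = AutoModelForSequenceClassification.from_pretrained(model_dir)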
+

Optimize the model#

+

Choosing the optimize handler

+

The result of using this handler is an ONNX-optimized model.

+
+
+
optimize_run = hugging_face_classifier_trainer.run(params={
+                                                        "model_path": str(model_path)
+                                                    },
+                                                    handler="optimize",
+                                                    local=True,
+                                                )
+
+
+
+
+
> 2024-03-24 17:12:02,020 [info] Storing function: {'name': 'hugging-face-classifier-trainer-optimize', 'uid': 'fbee1ead18444824a4b5c0308a677bf4', 'db': 'http://mlrun-api:8080'}
+
+
+
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/optimum/onnxruntime/configuration.py:726: FutureWarning:
+
+disable_embed_layer_norm will be deprecated soon, use disable_embed_layer_norm_fusion instead, disable_embed_layer_norm_fusion is set to True.
+
+loading configuration file /tmp/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/config.json",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+loading configuration file /tmp/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+loading weights file /tmp/pytorch_model.bin
+All model checkpoint weights were used when initializing DistilBertForSequenceClassification.
+
+All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at /tmp.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
+/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py:218: TracerWarning:
+
+torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
+
+Configuration saved in /tmp/tmp79wjp8m8/config.json
+Could not locate the tokenizer configuration file, will try to use the model config instead.
+loading configuration file /tmp/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+loading configuration file /tmp/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+Could not locate the tokenizer configuration file, will try to use the model config instead.
+loading configuration file /tmp/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+Could not locate the tokenizer configuration file, will try to use the model config instead.
+loading configuration file /tmp/tmp79wjp8m8/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/tmp79wjp8m8",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+loading configuration file /tmp/tmp79wjp8m8/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/tmp79wjp8m8",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+Could not locate the tokenizer configuration file, will try to use the model config instead.
+loading configuration file /tmp/tmp79wjp8m8/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/tmp79wjp8m8",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+Configuration saved in optimized/config.json
+Could not locate the tokenizer configuration file, will try to use the model config instead.
+loading configuration file /tmp/tmp79wjp8m8/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/tmp79wjp8m8",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+loading configuration file /tmp/tmp79wjp8m8/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/tmp79wjp8m8",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+Could not locate the tokenizer configuration file, will try to use the model config instead.
+loading configuration file /tmp/tmp79wjp8m8/config.json
+Model config DistilBertConfig {
+  "_name_or_path": "/tmp/tmp79wjp8m8",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "vocab_size": 30522
+}
+
+Failed to remove node input: "/distilbert/transformer/layer.0/attention/Transpose_output_0"
+input: "/distilbert/transformer/layer.0/attention/Constant_11_output_0"
+output: "/distilbert/transformer/layer.0/attention/Div_output_0"
+name: "/distilbert/transformer/layer.0/attention/Div"
+op_type: "Div"
+
+Failed to remove node input: "/distilbert/transformer/layer.1/attention/Transpose_output_0"
+input: "/distilbert/transformer/layer.1/attention/Constant_11_output_0"
+output: "/distilbert/transformer/layer.1/attention/Div_output_0"
+name: "/distilbert/transformer/layer.1/attention/Div"
+op_type: "Div"
+
+Failed to remove node input: "/distilbert/transformer/layer.2/attention/Transpose_output_0"
+input: "/distilbert/transformer/layer.2/attention/Constant_11_output_0"
+output: "/distilbert/transformer/layer.2/attention/Div_output_0"
+name: "/distilbert/transformer/layer.2/attention/Div"
+op_type: "Div"
+
+Failed to remove node input: "/distilbert/transformer/layer.3/attention/Transpose_output_0"
+input: "/distilbert/transformer/layer.3/attention/Constant_11_output_0"
+output: "/distilbert/transformer/layer.3/attention/Div_output_0"
+name: "/distilbert/transformer/layer.3/attention/Div"
+op_type: "Div"
+
+Failed to remove node input: "/distilbert/transformer/layer.4/attention/Transpose_output_0"
+input: "/distilbert/transformer/layer.4/attention/Constant_11_output_0"
+output: "/distilbert/transformer/layer.4/attention/Div_output_0"
+name: "/distilbert/transformer/layer.4/attention/Div"
+op_type: "Div"
+
+Failed to remove node input: "/distilbert/transformer/layer.5/attention/Transpose_output_0"
+input: "/distilbert/transformer/layer.5/attention/Constant_11_output_0"
+output: "/distilbert/transformer/layer.5/attention/Div_output_0"
+name: "/distilbert/transformer/layer.5/attention/Div"
+op_type: "Div"
+
+Configuration saved in optimized/config.json
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
project: hugging-face-trainer-avia | uid: … | iter: 0 | start: Mar 24 17:12:02 | state: completed | name: hugging-face-classifier-trainer-optimize
labels / inputs / parameters / results / artifacts:
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
model_path=store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e
model
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:12:22,721 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-optimize'}
+
+
+
+
+
+
+
optimize_run.outputs
+
+
+
+
+
{'model': 'store://artifacts/hugging-face-trainer-avia/optimized_model@fbee1ead18444824a4b5c0308a677bf4'}
+
+
+
+
+
+
+

Running the training remotely#

+
+
+
project.build_function("hugging-face-classifier-trainer",with_mlrun=True)
+
+
+
+
+
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlrun/projects/operations.py:276: OverwriteBuildParamsWarning:
+
+The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.
+
+
+
> 2024-03-24 17:14:22,792 [info] Started building image: .mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest
+INFO[0000] Retrieving image manifest mlrun/mlrun:1.6.1  
+INFO[0000] Retrieving image mlrun/mlrun:1.6.1 from registry index.docker.io 
+INFO[0000] Built cross stage deps: map[]                
+INFO[0000] Retrieving image manifest mlrun/mlrun:1.6.1  
+INFO[0000] Returning cached image manifest              
+INFO[0000] Executing 0 build triggers                   
+INFO[0000] Building stage 'mlrun/mlrun:1.6.1' [idx: '0', base-idx: '-1'] 
+INFO[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. 
+INFO[0047] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt 
+INFO[0047] Initializing snapshotter ...                 
+INFO[0047] Taking snapshot of full filesystem...        
+INFO[0074] Cmd: /bin/sh                                 
+INFO[0074] Args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 
+INFO[0074] Running: [/bin/sh -c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 
+Installing /empty/requirements.txt...
+mlrun[complete]==1.6.1
+onnx~=1.14.1
+onnxruntime~=1.16.1
+optimum~=1.6.4
+transformers~=4.26.1
+datasets~=2.10.1
+scikit-learn~=1.0.2
+INFO[0074] Taking snapshot of full filesystem...        
+INFO[0078] No files were changed, appending empty layer to config. No layer added to image. 
+INFO[0078] RUN python -m pip install -r /empty/requirements.txt 
+INFO[0078] Cmd: /bin/sh                                 
+INFO[0078] Args: [-c python -m pip install -r /empty/requirements.txt] 
+INFO[0078] Running: [/bin/sh -c python -m pip install -r /empty/requirements.txt] 
+Requirement already satisfied: mlrun[complete]==1.6.1 in /opt/conda/lib/python3.9/site-packages (from -r /empty/requirements.txt (line 1)) (1.6.1)
+Collecting onnx~=1.14.1 (from -r /empty/requirements.txt (line 2))
+  Obtaining dependency information for onnx~=1.14.1 from https://files.pythonhosted.org/packages/ff/24/0e522fdcadf0e15fc304145a5b6e5d7246d7f2c507fd9bfe6e1fafb2aa95/onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
+Collecting onnxruntime~=1.16.1 (from -r /empty/requirements.txt (line 3))
+  Obtaining dependency information for onnxruntime~=1.16.1 from https://files.pythonhosted.org/packages/de/ab/ed3ae0d649cee41e870f8b1653cf4a1c1fc321e0ded4e3e1a3d4a25c0131/onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
+Collecting optimum~=1.6.4 (from -r /empty/requirements.txt (line 4))
+  Obtaining dependency information for optimum~=1.6.4 from https://files.pythonhosted.org/packages/31/72/a7e3b2c57d6368c5f4bb6fba54a85cbf07d25c385a2db3f1a638f3c0ddb2/optimum-1.6.4-py3-none-any.whl.metadata
+  Downloading optimum-1.6.4-py3-none-any.whl.metadata (17 kB)
+Collecting transformers~=4.26.1 (from -r /empty/requirements.txt (line 5))
+  Obtaining dependency information for transformers~=4.26.1 from https://files.pythonhosted.org/packages/1e/e2/60c3f4691b16d126ee9cfe28f598b13c424b60350ab339aba81aef054b8f/transformers-4.26.1-py3-none-any.whl.metadata
+  Downloading transformers-4.26.1-py3-none-any.whl.metadata (100 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.3/100.3 kB 6.2 MB/s eta 0:00:00
+Collecting datasets~=2.10.1 (from -r /empty/requirements.txt (line 6))
+  Obtaining dependency information for datasets~=2.10.1 from https://files.pythonhosted.org/packages/fe/17/5825fdf034ff1a315becdbb9b6fe5a2bd9d8e724464535f18809593bf9c2/datasets-2.10.1-py3-none-any.whl.metadata
+  Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)
+Collecting scikit-learn~=1.0.2 (from -r /empty/requirements.txt (line 7))
+  Obtaining dependency information for scikit-learn~=1.0.2 from https://files.pythonhosted.org/packages/57/aa/483fbe6b5314bce2d49801e6cec1f2139a9c220d0d51494788fff47233b3/scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
+Requirement already satisfied: urllib3<1.27,>=1.26.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.18)
+Requirement already satisfied: GitPython>=3.1.41,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.42)
+Requirement already satisfied: aiohttp~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.3)
+Requirement already satisfied: aiohttp-retry~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.3)
+Requirement already satisfied: click~=8.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.1.7)
+Requirement already satisfied: kfp~=1.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.22)
+Requirement already satisfied: nest-asyncio~=1.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.0)
+Requirement already satisfied: ipython~=8.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.18.1)
+Requirement already satisfied: nuclio-jupyter~=0.9.15 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.16)
+Requirement already satisfied: numpy<1.27.0,>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.4)
+Requirement already satisfied: pandas<2.2,>=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.4)
+Requirement already satisfied: pyarrow<15,>=10.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (14.0.2)
+Requirement already satisfied: pyyaml~=5.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.4.1)
+Requirement already satisfied: requests~=2.31 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.31.0)
+Requirement already satisfied: tabulate~=0.8.6 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.10)
+Requirement already satisfied: v3io~=0.5.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.23)
+Requirement already satisfied: pydantic>=1.10.8,~=1.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.10.14)
+Requirement already satisfied: mergedeep~=1.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.4)
+Requirement already satisfied: v3io-frames~=0.10.12 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.13)
+Requirement already satisfied: semver~=3.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
+Requirement already satisfied: dependency-injector~=4.41 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.41.0)
+Requirement already satisfied: fsspec==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
+Requirement already satisfied: v3iofs~=0.1.17 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.18)
+Requirement already satisfied: storey~=1.6.18 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.18)
+Requirement already satisfied: inflection~=0.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
+Requirement already satisfied: python-dotenv~=0.17.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.17.1)
+Requirement already satisfied: setuptools~=68.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (68.2.2)
+Requirement already satisfied: deprecated~=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.14)
+Requirement already satisfied: jinja2>=3.1.3,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.3)
+Requirement already satisfied: anyio~=3.7 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.7.1)
+Requirement already satisfied: orjson~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.15)
+Requirement already satisfied: adlfs==2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.0)
+Requirement already satisfied: aiobotocore<2.8,>=2.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5.4)
+Requirement already satisfied: avro~=1.11 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.11.3)
+Requirement already satisfied: azure-core~=1.24 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.30.0)
+Requirement already satisfied: azure-identity~=1.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.15.0)
+Requirement already satisfied: azure-keyvault-secrets~=4.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.8.0)
+Requirement already satisfied: boto3<1.29.0,>=1.28.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.28.17)
+Requirement already satisfied: dask~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)
+Requirement already satisfied: databricks-sdk~=0.13.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.13.0)
+Requirement already satisfied: distributed~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)
+Requirement already satisfied: gcsfs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
+Requirement already satisfied: google-cloud-bigquery[bqstorage,pandas]==3.14.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.14.1)
+Requirement already satisfied: graphviz~=0.20.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.1)
+Requirement already satisfied: kafka-python~=2.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.2)
+Requirement already satisfied: mlflow~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.10.2)
+Requirement already satisfied: msrest~=0.6.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.21)
+Requirement already satisfied: plotly<5.12.0,~=5.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.11.0)
+Requirement already satisfied: pyopenssl>=23 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.0)
+Requirement already satisfied: redis~=4.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.6.0)
+Requirement already satisfied: s3fs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
+Requirement already satisfied: sqlalchemy~=1.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.51)
+Requirement already satisfied: azure-datalake-store<0.1,>=0.0.46 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.0.53)
+Requirement already satisfied: azure-storage-blob>=12.12.0 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (12.19.0)
+Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.1.1)
+Requirement already satisfied: google-auth>=1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.28.1)
+Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
+Requirement already satisfied: google-cloud-storage in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.14.0)
+Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.1)
+Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)
+Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.7.0)
+Requirement already satisfied: packaging>=20.0.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1)
+Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)
+Requirement already satisfied: db-dtypes<2.0.0dev,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
+Requirement already satisfied: google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.24.0)
+Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)
+Requirement already satisfied: protobuf>=3.20.2 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (3.20.3)
+Requirement already satisfied: typing-extensions>=3.6.2.1 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (4.10.0)
+Collecting coloredlogs (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
+  Obtaining dependency information for coloredlogs from https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata
+  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
+Collecting flatbuffers (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
+  Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/bf/45/c961e3cb6ddad76b325c163d730562bb6deb1ace5acbed0306f5fbefb90e/flatbuffers-24.3.7-py2.py3-none-any.whl.metadata
+  Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)
+Collecting sympy (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
+  Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/d2/05/e6600db80270777c4a64238a98d442f0fd07cc8915be2a1c16da7f2b9e74/sympy-1.12-py3-none-any.whl.metadata
+  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
+Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0a/fd/280f4385e76f3c1890efc15fa93f7206134fefad6351397e1bfab6d0d0de/transformers-4.39.1-py3-none-any.whl.metadata
+  Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 40.1 MB/s eta 0:00:00
+Collecting torch>=1.9 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for torch>=1.9 from https://files.pythonhosted.org/packages/98/04/95a12556d068786d6505c609daf2805bed91c9210c5185499a7c121eba47/torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata
+  Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata (25 kB)
+Collecting numpy<1.27.0,>=1.16.5 (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1))
+  Obtaining dependency information for numpy<1.27.0,>=1.16.5 from https://files.pythonhosted.org/packages/4c/b9/038abd6fbd67b05b03cb1af590cfc02b7f1e5a37af7ac6a868f5093c29f5/numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
+Collecting huggingface-hub>=0.8.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for huggingface-hub>=0.8.0 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata
+  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
+Collecting filelock (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
+  Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata
+  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
+Collecting regex!=2019.12.17 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
+  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/05/9e/80c20f1151432a6025690c9c2037053039b028a7b236fa81d7e7ac9dec60/regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 217.5 MB/s eta 0:00:00
+Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
+  Obtaining dependency information for tokenizers!=0.11.3,<0.14,>=0.11.1 from https://files.pythonhosted.org/packages/d6/27/07a337087dd507170a1b20fed3bbf8da81401185a7130a6e74e440c52040/tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
+Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.9/site-packages (from transformers~=4.26.1->-r /empty/requirements.txt (line 5)) (4.65.0)
+Collecting dill<0.3.7,>=0.3.0 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
+  Obtaining dependency information for dill<0.3.7,>=0.3.0 from https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl.metadata
+  Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)
+Requirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets~=2.10.1->-r /empty/requirements.txt (line 6)) (3.4.1)
+Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
+  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl.metadata
+  Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
+Collecting responses<0.19 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
+  Obtaining dependency information for responses<0.19 from https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl.metadata
+  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
+Requirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.12.0)
+Requirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.3.2)
+Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (3.3.0)
+Requirement already satisfied: botocore<1.31.18,>=1.31.17 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.31.17)
+Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
+Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.11.0)
+Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
+Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.2.0)
+Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)
+Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.0.5)
+Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.9.4)
+Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.3)
+Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.4)
+Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
+Requirement already satisfied: exceptiongroup in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
+Requirement already satisfied: six>=1.11.0 in /opt/conda/lib/python3.9/site-packages (from azure-core~=1.24->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
+Requirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (42.0.2)
+Requirement already satisfied: msal<2.0.0,>=1.24.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.27.0)
+Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.1.0)
+Requirement already satisfied: isodate>=0.6.1 in /opt/conda/lib/python3.9/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.1)
+Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.1)
+Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.2)
+Requirement already satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.2.1)
+Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)
+Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.0)
+Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.1)
+Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
+Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.7)
+Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.8)
+Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)
+Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)
+Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.4)
+Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)
+Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.9/site-packages (from GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.11)
+Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.19.1)
+Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.6)
+Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.43)
+Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.2)
+Requirement already satisfied: stack-data in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.3)
+Requirement already satisfied: traitlets>=5 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.14.1)
+Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9.0)
+Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from jinja2>=3.1.3,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.5)
+Requirement already satisfied: absl-py<2,>=0.9 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.0)
+Requirement already satisfied: kubernetes<26,>=8.0.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (25.3.0)
+Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.12.11)
+Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.1)
+Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.5)
+Requirement already satisfied: jsonschema<5,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.21.1)
+Requirement already satisfied: strip-hints<1,>=0.1.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.10)
+Requirement already satisfied: docstring-parser<1,>=0.7.3 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.15)
+Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.16 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.16)
+Requirement already satisfied: fire<1,>=0.3.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.0)
+Requirement already satisfied: uritemplate<4,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)
+Requirement already satisfied: typer<1.0,>=0.3.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
+Requirement already satisfied: entrypoints<1 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4)
+Requirement already satisfied: pytz<2024 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.4)
+Requirement already satisfied: sqlparse<1,>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4.4)
+Requirement already satisfied: alembic!=1.10.0,<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13.1)
+Requirement already satisfied: docker<8,>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.0)
+Requirement already satisfied: Flask<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
+Requirement already satisfied: querystring-parser<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.4)
+Requirement already satisfied: markdown<4,>=3.3 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.5.2)
+Requirement already satisfied: matplotlib<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.8.3)
+Requirement already satisfied: gunicorn<22 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)
+Requirement already satisfied: requests-oauthlib>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
+Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.2.2)
+Requirement already satisfied: nbconvert>=6.4.5 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.16.1)
+Requirement already satisfied: notebook<7.0.0,>=6.4 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.5.6)
+Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.9/site-packages (from pandas<2.2,>=1.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.1)
+Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly<5.12.0,~=5.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.2.3)
+Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.9/site-packages (from requests~=2.31->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.4)
+Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.9/site-packages (from sqlalchemy~=1.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.3)
+Requirement already satisfied: nuclio-sdk>=0.5.3 in /opt/conda/lib/python3.9/site-packages (from storey~=1.6.18->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.9)
+Collecting networkx (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for networkx from https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl.metadata
+  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
+Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cuda-nvrtc-cu12==12.1.105 from https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
+Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cuda-runtime-cu12==12.1.105 from https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
+Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cuda-cupti-cu12==12.1.105 from https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
+Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cudnn-cu12==8.9.2.26 from https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
+Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cublas-cu12==12.1.3.1 from https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
+Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cufft-cu12==11.0.2.54 from https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
+Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-curand-cu12==10.3.2.106 from https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
+Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cusolver-cu12==11.4.5.107 from https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
+Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-cusparse-cu12==12.1.0.106 from https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
+Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-nccl-cu12==2.19.3 from https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)
+Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-nvtx-cu12==12.1.105 from https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
+  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)
+Collecting triton==2.2.0 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for triton==2.2.0 from https://files.pythonhosted.org/packages/6a/5c/01d9f062f719581cf6e60053e1a005d666ec67dcb59630fffaa3a3e5c9d8/triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
+Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for nvidia-nvjitlink-cu12 from https://files.pythonhosted.org/packages/58/d1/d1c80553f9d5d07b6072bc132607d75a0ef3600e28e1890e11c0f55d7346/nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata
+  Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
+INFO: pip is looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.
+Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/a4/73/f620d76193954e16db3d5c53a07d956d7b9c800e570758d3bff91906d4a4/transformers-4.39.0-py3-none-any.whl.metadata
+  Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 115.9 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata
+  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 126.3 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3e/6b/1b589f7b69aaea8193cf5bc91cf97410284aecd97b6312cdb08baedbdffe/transformers-4.38.1-py3-none-any.whl.metadata
+  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 138.2 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/91/89/5416dc364c7ef0711c564fd61a69b03d1e40eeb5c506c38e53ba8a969e79/transformers-4.38.0-py3-none-any.whl.metadata
+  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 186.3 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/85/f6/c5065913119c41ecad148c34e3a861f719e16b89a522287213698da911fc/transformers-4.37.2-py3-none-any.whl.metadata
+  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 236.8 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ad/67/b4d6a51dcaf988cb45b31e26c6e33fb169fe34ba5fb168b086309bd7c028/transformers-4.37.1-py3-none-any.whl.metadata
+  Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 156.4 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3c/45/52133ce6bce49a099cc865599803bf1fad93de887276f728e56848d77a70/transformers-4.37.0-py3-none-any.whl.metadata
+  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 102.0 MB/s eta 0:00:00
+INFO: pip is still looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata
+  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 108.8 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/fc/04/0aad491cd98b09236c54ab849863ee85421eeda5138bbf9d33ecc594652b/transformers-4.36.1-py3-none-any.whl.metadata
+  Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 140.6 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0f/12/d8e27a190ca67811f81deea3183b528d9169f10b74d827e0b9211520ecfa/transformers-4.36.0-py3-none-any.whl.metadata
+  Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 267.8 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
+  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 130.2 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/92/ba/cfff7e01f7070d9fca3964bf42b2257b86964c3e6763b8d5435436cc1d77/transformers-4.35.1-py3-none-any.whl.metadata
+  Downloading transformers-4.35.1-py3-none-any.whl.metadata (123 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 183.6 MB/s eta 0:00:00
+INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata
+  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 177.3 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata
+  Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 270.5 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata
+  Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 133.4 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata
+  Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 163.1 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata
+  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.9 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/13/30/54b59e73400df3de506ad8630284e9fd63f4b94f735423d55fc342181037/transformers-4.33.1-py3-none-any.whl.metadata
+  Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.2 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata
+  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 185.9 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl.metadata
+  Downloading transformers-4.32.1-py3-none-any.whl.metadata (118 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 144.4 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata
+  Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 150.3 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata
+  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.9/116.9 kB 156.7 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/5b/0b/e45d26ccd28568013523e04f325432ea88a442b4e3020b757cf4361f0120/transformers-4.30.2-py3-none-any.whl.metadata
+  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.7 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b8/df/b01b5e67cde3883757c9212455cbb9169385dcab5858b7172199126b756d/transformers-4.30.1-py3-none-any.whl.metadata
+  Downloading transformers-4.30.1-py3-none-any.whl.metadata (113 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.8 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e2/72/1af3d38e98fdcceb3876de4567ac395a66c26976e259fe2d46266e052d61/transformers-4.30.0-py3-none-any.whl.metadata
+  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 266.5 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/17/aa/a89864288afe45abe1ab79f002140a20348140e86836d96096d8f8a3bac0/transformers-4.29.2-py3-none-any.whl.metadata
+  Downloading transformers-4.29.2-py3-none-any.whl.metadata (112 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 272.7 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl.metadata
+  Downloading transformers-4.29.1-py3-none-any.whl.metadata (112 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 262.3 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/45/e4/4914b11df70954d95a7c36b74bf9010c8594fcec960471479449b0deb4f7/transformers-4.29.0-py3-none-any.whl.metadata
+  Downloading transformers-4.29.0-py3-none-any.whl.metadata (111 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.9/111.9 kB 269.5 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl.metadata
+  Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 245.6 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/8b/13/1ce598763b3669d43f192a7911bf2bf730a328012ab8801b93187a4f70d0/transformers-4.28.0-py3-none-any.whl.metadata
+  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 256.3 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/87/f0/2a152ed10ab8601431e87a606d397f7473c5fa4f8162f4ec5bda6ddb2df4/transformers-4.27.4-py3-none-any.whl.metadata
+  Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 254.4 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/52/ac/9dc5a17ba60bc354d99250d9d1629f99d76f6729cee438fa91c8cc74bc5d/transformers-4.27.3-py3-none-any.whl.metadata
+  Downloading transformers-4.27.3-py3-none-any.whl.metadata (106 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 251.5 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/73/f0/4a795505387a3e7cd7f0c2a2a87f876658f9a07947a38fb67bffceff9246/transformers-4.27.2-py3-none-any.whl.metadata
+  Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 246.1 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/6d/9b/2f536f9e73390209e0b27b74691355dac494b7ec8154f3012fdc6debbae7/transformers-4.27.1-py3-none-any.whl.metadata
+  Downloading transformers-4.27.1-py3-none-any.whl.metadata (106 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 114.0 MB/s eta 0:00:00
+  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/4d/3e/1378ed266cf991f5ab5fcb29e953d97d793c7f9242ea5dc52f856415ea3a/transformers-4.27.0-py3-none-any.whl.metadata
+  Downloading transformers-4.27.0-py3-none-any.whl.metadata (106 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 247.2 MB/s eta 0:00:00
+Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
+  Obtaining dependency information for sentencepiece!=0.1.92,>=0.1.91 from https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
+  Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
+Collecting protobuf>=3.20.2 (from onnx~=1.14.1->-r /empty/requirements.txt (line 2))
+  Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/38/b1/d9b615dceb67ac38e13cbd7680c27182b40154996022cbb244ba1ac7d30f/protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata
+  Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)
+Requirement already satisfied: future>=0.18.2 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
+Requirement already satisfied: ujson>=3 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.0)
+Requirement already satisfied: googleapis-common-protos>=1.5.3 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)
+Requirement already satisfied: grpcio-tools!=1.34.0,<1.49,>=1.30 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)
+Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
+  Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata
+  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
+INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
+Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
+  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/c6/c9/820b5ab056f4ada76fbe05bd481a948f287957d6cbfd59e2dd2618b408c1/multiprocess-0.70.15-py39-none-any.whl.metadata
+  Downloading multiprocess-0.70.15-py39-none-any.whl.metadata (7.2 kB)
+  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/6a/f4/fbeb03ef7abdda54db4a6a75c971b88ab73d724ff09e3275cc1e99f1c946/multiprocess-0.70.14-py39-none-any.whl.metadata
+  Downloading multiprocess-0.70.14-py39-none-any.whl.metadata (6.6 kB)
+Collecting mpmath>=0.19 (from sympy->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
+  Obtaining dependency information for mpmath>=0.19 from https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl.metadata
+  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
+Requirement already satisfied: Mako in /opt/conda/lib/python3.9/site-packages (from alembic!=1.10.0,<2->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.2)
+Requirement already satisfied: cffi in /opt/conda/lib/python3.9/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
+Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire<1,>=0.3.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)
+Requirement already satisfied: Werkzeug>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)
+Requirement already satisfied: itsdangerous>=2.1.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.2)
+Requirement already satisfied: blinker>=1.6.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)
+Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.0.1)
+Requirement already satisfied: httplib2<1dev,>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.22.0)
+Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)
+Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.3.3)
+Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)
+Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9)
+Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery-storage<3.0.0dev,>=2.6.0->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.23.0)
+Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-storage->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.0)
+Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata>=4.13.0->dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.17.0)
+Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from jedi>=0.16->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.3)
+Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.12.1)
+Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.33.0)
+Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)
+Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.9/site-packages (from kubernetes<26,>=8.0.0->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)
+Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
+Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.1)
+Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.49.0)
+Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.5)
+Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (10.2.0)
+Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.1)
+Requirement already satisfied: importlib-resources>=3.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.2)
+Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from msal<2.0.0,>=1.24.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.0)
+Requirement already satisfied: portalocker<3,>=1.0 in /opt/conda/lib/python3.9/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)
+Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.12.3)
+Requirement already satisfied: bleach!=5.0.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.0)
+Requirement already satisfied: defusedxml in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.1)
+Requirement already satisfied: jupyter-core>=4.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.7.1)
+Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)
+Requirement already satisfied: mistune<4,>=2.0.3 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
+Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
+Requirement already satisfied: nbformat>=5.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.2)
+Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)
+Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.1)
+Requirement already satisfied: pyzmq<25,>=17 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.1)
+Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1.0)
+Requirement already satisfied: jupyter-client<8,>=5.3.4 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.4.9)
+Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)
+Requirement already satisfied: ipykernel in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.29.3)
+Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.2)
+Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)
+Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.0)
+Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
+Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.0)
+Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.13)
+Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from requests-oauthlib>=0.5.0->msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.2.2)
+Requirement already satisfied: wheel in /opt/conda/lib/python3.9/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.41.2)
+Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.1)
+Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)
+Requirement already satisfied: pure-eval in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.2)
+Requirement already satisfied: webencodings in /opt/conda/lib/python3.9/site-packages (from bleach!=5.0.0->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
+Requirement already satisfied: pycparser in /opt/conda/lib/python3.9/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.21)
+Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /opt/conda/lib/python3.9/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)
+Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.9/site-packages (from jupyter-core>=4.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.10.0)
+Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.12.5)
+Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.4)
+Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.9/site-packages (from nbformat>=5.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.19.1)
+Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
+Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.9/site-packages (from argon2-cffi->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)
+Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.9/site-packages (from beautifulsoup4->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5)
+Requirement already satisfied: comm>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.1)
+Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.1)
+Requirement already satisfied: jupyter-events>=0.9.0 in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
+Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.2)
+Requirement already satisfied: overrides in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.7.0)
+Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.7)
+Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.4)
+Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.1)
+Requirement already satisfied: fqdn in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)
+Requirement already satisfied: isoduration in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (20.11.0)
+Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1)
+Requirement already satisfied: uri-template in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)
+Requirement already satisfied: webcolors>=1.11 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13)
+Requirement already satisfied: arrow>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)
+Requirement already satisfied: types-python-dateutil>=2.8.10 in /opt/conda/lib/python3.9/site-packages (from arrow>=0.15.0->isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.19.20240106)
+Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 274.2 MB/s eta 0:00:00
+Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 277.9 MB/s eta 0:00:00
+Downloading optimum-1.6.4-py3-none-any.whl (227 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.8/227.8 kB 291.3 MB/s eta 0:00:00
+Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 242.4 MB/s eta 0:00:00
+Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 469.0/469.0 kB 185.9 MB/s eta 0:00:00
+Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 275.9 MB/s eta 0:00:00
+Downloading dill-0.3.6-py3-none-any.whl (110 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 282.3 MB/s eta 0:00:00
+Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 346.4/346.4 kB 311.7 MB/s eta 0:00:00
+Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 269.6 MB/s eta 0:00:00
+Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 773.4/773.4 kB 311.9 MB/s eta 0:00:00
+Downloading responses-0.18.0-py3-none-any.whl (38 kB)
+Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 264.1 MB/s eta 0:00:00
+Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.5/755.5 MB 204.0 MB/s eta 0:00:00
+Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 40.3 MB/s eta 0:00:00
+Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 43.0 MB/s eta 0:00:00
+Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 46.9 MB/s eta 0:00:00
+Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 51.0 MB/s eta 0:00:00
+Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 58.2 MB/s eta 0:00:00
+Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 69.0 MB/s eta 0:00:00
+Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 36.0 MB/s eta 0:00:00
+Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 52.8 MB/s eta 0:00:00
+Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 45.9 MB/s eta 0:00:00
+Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.0/166.0 MB 19.6 MB/s eta 0:00:00
+Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 27.7 MB/s eta 0:00:00
+Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.9/167.9 MB 41.3 MB/s eta 0:00:00
+Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 42.8 MB/s eta 0:00:00
+Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 192.0 MB/s eta 0:00:00
+Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
+Downloading flatbuffers-24.3.7-py2.py3-none-any.whl (26 kB)
+Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 kB 100.7 MB/s eta 0:00:00
+Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 41.4 MB/s eta 0:00:00
+Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 253.7 MB/s eta 0:00:00
+Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 45.4 MB/s eta 0:00:00
+Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 46.1 MB/s eta 0:00:00
+Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 43.7 MB/s eta 0:00:00
+Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 43.8 MB/s eta 0:00:00
+Installing collected packages: tokenizers, sentencepiece, mpmath, flatbuffers, sympy, regex, protobuf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, humanfriendly, filelock, dill, triton, responses, onnx, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, coloredlogs, transformers, scikit-learn, onnxruntime, nvidia-cusolver-cu12, torch, datasets, optimum
+  Attempting uninstall: protobuf
+    Found existing installation: protobuf 3.20.3
+    Uninstalling protobuf-3.20.3:
+      Successfully uninstalled protobuf-3.20.3
+  Attempting uninstall: numpy
+    Found existing installation: numpy 1.26.4
+    Uninstalling numpy-1.26.4:
+      Successfully uninstalled numpy-1.26.4
+  Attempting uninstall: scikit-learn
+    Found existing installation: scikit-learn 1.4.1.post1
+    Uninstalling scikit-learn-1.4.1.post1:
+      Successfully uninstalled scikit-learn-1.4.1.post1
+Successfully installed coloredlogs-15.0.1 datasets-2.10.1 dill-0.3.6 filelock-3.13.1 flatbuffers-24.3.7 huggingface-hub-0.21.4 humanfriendly-10.0 mpmath-1.3.0 multiprocess-0.70.14 networkx-3.2.1 numpy-1.23.5 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 onnx-1.14.1 onnxruntime-1.16.3 optimum-1.6.4 protobuf-3.20.2 regex-2023.12.25 responses-0.18.0 scikit-learn-1.0.2 sentencepiece-0.2.0 sympy-1.12 tokenizers-0.13.3 torch-2.2.1 transformers-4.26.1 triton-2.2.0
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+INFO[0238] Taking snapshot of full filesystem...        
+INFO[0463] Pushing image to docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest 
+INFO[0493] Pushed docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer@sha256:691d0bb3c23487b4b5d2f84ab323c24735626ee81681475f53a4158b72d4cfee 
+
+
+
BuildStatus(ready=True, outputs={'image': '.mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest'})
train_run = hugging_face_classifier_trainer.run(
    params={
        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
        "drop_columns": [
            "airline_sentiment_confidence",
            "negativereason_confidence",
        ],
        "pretrained_tokenizer": "distilbert-base-uncased",
        "pretrained_model": "distilbert-base-uncased",
        "model_class": "transformers.AutoModelForSequenceClassification",
        "label_name": "airline_sentiment",
        "num_of_train_samples": 100,
        "metrics": ["accuracy", "f1"],
        "random_state": 42,
        **additional_parameters,
    },
    handler="train",
)
> 2024-03-24 17:22:42,252 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '53252ce7aacb4b1aacf86bf3b862daa2', 'db': 'http://mlrun-api:8080'}
+> 2024-03-24 17:22:42,536 [info] Job is running in the background, pod: hugging-face-classifier-trainer-train-dqqfr
+> 2024-03-24 17:24:43,288 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2
+> 2024-03-24 17:24:43,847 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub
+Downloading metadata: 100%|██████████| 1.03k/1.03k [00:00<00:00, 6.77MB/s]
+Downloading and preparing dataset None/None (download: 265.13 KiB, generated: 1.50 MiB, post-processed: Unknown size, total: 1.76 MiB) to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...
+Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]
+Downloading data: 100%|██████████| 92.6k/92.6k [00:00<00:00, 59.3MB/s]
+Downloading data files:  33%|███▎      | 1/3 [00:00<00:00,  6.42it/s]
+Downloading data: 100%|██████████| 605k/605k [00:00<00:00, 81.8MB/s]
+Downloading data files:  67%|██████▋   | 2/3 [00:00<00:00,  6.59it/s]
+Downloading data: 100%|██████████| 179k/179k [00:00<00:00, 50.9MB/s]
+Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  6.62it/s]
+Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1263.34it/s]
+Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.
+100%|██████████| 3/3 [00:00<00:00, 978.99it/s]                              
+Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
+- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+> 2024-03-24 17:24:47,076 [info] training 'huggingface-model'
+The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
+***** Running training *****
+  Num examples = 100
+  Num Epochs = 3
+  Instantaneous batch size per device = 16
+  Total train batch size (w. parallel, distributed & accumulation) = 16
+  Gradient Accumulation steps = 1
+  Total optimization steps = 21
+  Number of trainable parameters = 66955010
+huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
+To disable this warning, you can either:
+	- Avoid using `tokenizers` before the fork if possible
+	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+  0%|          | 0/21 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
+ 33%|███▎      | 7/21 [00:16<00:28,  2.02s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 24
+  Batch size = 16
+
+{'loss': 0.7005, 'learning_rate': 1.904761904761905e-05, 'epoch': 0.14}
+{'loss': 0.6528, 'learning_rate': 1.8095238095238097e-05, 'epoch': 0.29}
+{'loss': 0.6468, 'learning_rate': 1.7142857142857142e-05, 'epoch': 0.43}
+{'loss': 0.5877, 'learning_rate': 1.6190476190476193e-05, 'epoch': 0.57}
+{'loss': 0.6694, 'learning_rate': 1.523809523809524e-05, 'epoch': 0.71}
+{'loss': 0.5219, 'learning_rate': 1.4285714285714287e-05, 'epoch': 0.86}
+{'loss': 0.7052, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}
+  0%|          | 0/2 [00:00<?, ?it/s]
+100%|██████████| 2/2 [00:00<00:00,  4.86it/s]main.py:561: FutureWarning:
+
+load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
+
+
+
+Downloading builder script: 4.21kB [00:00, 11.4MB/s]                   
+
+
+Downloading builder script: 6.50kB [00:00, 21.8MB/s]                   
+                                              
+ 33%|███▎      | 7/21 [00:18<00:28,  2.02s/it]
+100%|██████████| 2/2 [00:00<00:00,  4.86it/s]
+ 67%|██████▋   | 14/21 [00:34<00:14,  2.07s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 24
+  Batch size = 16
+{'eval_loss': 0.5350419878959656, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.5536, 'eval_samples_per_second': 15.448, 'eval_steps_per_second': 1.287, 'epoch': 1.0}
+{'loss': 0.5942, 'learning_rate': 1.2380952380952383e-05, 'epoch': 1.14}
+{'loss': 0.5899, 'learning_rate': 1.1428571428571429e-05, 'epoch': 1.29}
+{'loss': 0.5317, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}
+{'loss': 0.4516, 'learning_rate': 9.523809523809525e-06, 'epoch': 1.57}
+{'loss': 0.5121, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.71}
+{'loss': 0.5264, 'learning_rate': 7.61904761904762e-06, 'epoch': 1.86}
+{'loss': 0.539, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}
+
+  0%|          | 0/2 [00:00<?, ?it/s]
+ 67%|██████▋   | 14/21 [00:35<00:14,  2.07s/it]
+100%|██████████| 2/2 [00:00<00:00,  4.95it/s]
+100%|██████████| 21/21 [00:52<00:00,  2.05s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 24
+  Batch size = 16
+{'eval_loss': 0.4877033233642578, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.1789, 'eval_samples_per_second': 20.357, 'eval_steps_per_second': 1.696, 'epoch': 2.0}
+{'loss': 0.4059, 'learning_rate': 5.7142857142857145e-06, 'epoch': 2.14}
+{'loss': 0.5851, 'learning_rate': 4.761904761904762e-06, 'epoch': 2.29}
+{'loss': 0.4135, 'learning_rate': 3.80952380952381e-06, 'epoch': 2.43}
+{'loss': 0.6571, 'learning_rate': 2.8571428571428573e-06, 'epoch': 2.57}
+{'loss': 0.4883, 'learning_rate': 1.904761904761905e-06, 'epoch': 2.71}
+{'loss': 0.5114, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}
+{'loss': 0.5215, 'learning_rate': 0.0, 'epoch': 3.0}
+
+  0%|          | 0/2 [00:00<?, ?it/s]
+100%|██████████| 21/21 [00:54<00:00,  2.05s/it]
+100%|██████████| 2/2 [00:00<00:00,  6.38it/s]
+                                             
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+100%|██████████| 21/21 [00:55<00:00,  2.62s/it]
+tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json
+Special tokens file saved in /tmp/tokenizer/special_tokens_map.json
+Configuration saved in /tmp/model/config.json
+Model weights saved in /tmp/model/pytorch_model.bin
+{'eval_loss': 0.4750453531742096, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.0524, 'eval_samples_per_second': 22.806, 'eval_steps_per_second': 1.9, 'epoch': 3.0}
+{'train_runtime': 55.1543, 'train_samples_per_second': 5.439, 'train_steps_per_second': 0.381, 'train_loss': 0.5624780683290391, 'epoch': 3.0}
+> 2024-03-24 17:26:00,230 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia', 'logs_cmd': 'mlrun logs 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia'}
+> 2024-03-24 17:26:00,231 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/hugging-face-trainer-avia/jobs/monitor/53252ce7aacb4b1aacf86bf3b862daa2/overview'}
+> 2024-03-24 17:26:00,231 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
Run summary:
    project:    hugging-face-trainer-avia
    uid:        53252ce7aacb4b1aacf86bf3b862daa2
    iter:       0
    start:      Mar 24 17:24:39
    state:      completed
    name:       hugging-face-classifier-trainer-train
    labels:     v3io_user=avia, kind=job, owner=avia, mlrun/client_version=1.6.1, mlrun/client_python_version=3.9.16, host=hugging-face-classifier-trainer-train-dqqfr
    inputs:     (none)
    parameters: hf_dataset=Shayanvsf/US_Airline_Sentiment, drop_columns=['airline_sentiment_confidence', 'negativereason_confidence'], pretrained_tokenizer=distilbert-base-uncased, pretrained_model=distilbert-base-uncased, model_class=transformers.AutoModelForSequenceClassification, label_name=airline_sentiment, num_of_train_samples=100, metrics=['accuracy', 'f1'], random_state=42, TRAIN_output_dir=finetuning-sentiment-model-3000-samples, TRAIN_learning_rate=2e-05, TRAIN_per_device_train_batch_size=16, TRAIN_per_device_eval_batch_size=16, TRAIN_num_train_epochs=3, TRAIN_weight_decay=0.01, TRAIN_push_to_hub=False, TRAIN_evaluation_strategy=epoch, TRAIN_eval_steps=1, TRAIN_logging_steps=1, CLASS_num_labels=2
    results:    loss=0.5215, learning_rate=0.0, eval_loss=0.4750453531742096, eval_accuracy=0.7916666666666666, eval_f1=0.0, eval_runtime=1.0524, eval_samples_per_second=22.806, eval_steps_per_second=1.9, train_runtime=55.1543, train_samples_per_second=5.439, train_steps_per_second=0.381, total_flos=3327208489680.0
    artifacts:  loss_plot, learning_rate_plot, eval_loss_plot, eval_accuracy_plot, eval_f1_plot, eval_runtime_plot, eval_samples_per_second_plot, eval_steps_per_second_plot, tokenizer, model

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:26:09,792 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
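(Added note, not part of the original notebook output: once the run completes, its results could be read back with MLRun's standard RunObject accessors, as in the hedged sketch below.)

train_run.outputs                        # dict of logged results, e.g. {'loss': 0.5215, 'eval_accuracy': 0.7916666666666666, ...}
train_run.artifact("model")              # DataItem pointing at the logged model artifact
train_run.artifact("loss_plot").show()   # display one of the logged Plotly metric plots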
\ No newline at end of file
diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/static/function.html b/functions/master/hugging_face_classifier_trainer/0.3.0/static/function.html new file mode 100644 index 00000000..08fe9f21 --- /dev/null +++ b/functions/master/hugging_face_classifier_trainer/0.3.0/static/function.html @@ -0,0 +1,392 @@
+Source
+        
+kind: job
+metadata:
+  name: hugging-face-classifier-trainer
+  tag: ''
+  hash: f9d8aa4a2c66e24fa418bb163829adc3e2ada06c
+  project: ''
+  labels:
+    author: davids
+  categories:
+  - deep-learning
+  - huggingface
+  - machine-learning
+  - model-training
+spec:
+  command: ''
+  args: []
+  image: ''
+  build:
+    functionSourceCode: import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset, load_dataset, load_metric
from mlrun import MLClientCtx
from mlrun import feature_store as fs
from mlrun.artifacts import Artifact, PlotlyArtifact
from mlrun.datastore import DataItem
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import create_class
from plotly import graph_objects as go
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)


# ----------------------from MLRUN--------------------------------
class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
    """
    Interface for adding MLRun features to the HuggingFace ONNX Runtime (optimum ORT) optimizer API.
    """

    # MLRun's context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to be inserted so the MLRun interface will be fully enabled.
    _PROPERTIES = {
        "_auto_log": False,
        "_context": None,
        "_model_name": "model",
        "_tag": "",
        "_labels": None,
        "_extra_data": None,
    }
    _METHODS = ["enable_auto_logging"]
    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "optimize",
    ]

    @classmethod
    def add_interface(
        cls,
        obj,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        """
        Enrich the object with this interface's properties, methods and functions, so it will have
        the MLRun features for the HuggingFace ONNX Runtime optimizer.
        :param obj:                     The object whose interface is enriched.
        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
                                        add the interface in a certain state.
        """
        super(HFORTOptimizerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_optimize(cls):
        """
        MLRun's wrapper for the ORT optimizer's `optimize` method. It calls the original optimize method
        and, when auto-logging is enabled, logs the optimized ONNX model to the MLRun context.
        """

        def wrapper(self, *args, **kwargs):
            save_dir = cls._get_function_argument(
                self.optimize,
                argument_name="save_dir",
                passed_args=args,
                passed_kwargs=kwargs,
            )[0]

            # Call the original optimize method:
            result = self.original_optimize(*args, **kwargs)

            if self._auto_log:
                # Log the onnx model:
                self._context.log_model(
                    key="model",
                    db_key=self._model_name,
                    model_file=f"{save_dir}/model_optimized.onnx",
                    tag=self._tag,
                    framework="ONNX",
                    labels=self._labels,
                    extra_data=self._extra_data,
                )

            return result

        return wrapper

    def enable_auto_logging(
        self,
        context: mlrun.MLClientCtx,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        self._auto_log = True

        self._context = context
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data


class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    Interface for adding MLRun features to the HuggingFace transformers Trainer API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: Trainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        """
        Enrich the object with this interface's properties, methods and functions, so it will have
        the MLRun features for the HuggingFace Trainer.
        :param obj:                     The object whose interface is enriched.
        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
                                        add the interface in a certain state.
        """

        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):

        """
        MLRun's wrapper for `Trainer.train`. It calls the original train method unchanged so that the
        MLRun callback attached to the trainer can collect and log the training metrics and artifacts.
        """

        def wrapper(self: Trainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        self._log_metrics()

        temp_directory = tempfile.gettempdir()

        # Save and log the tokenizer:
        if tokenizer is not None:
            # Save tokenizer:
            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
            tokenizer.save_pretrained(save_directory=tokenizer_dir)
            # Zip the tokenizer directory:
            tokenizer_zip = shutil.make_archive(
                base_name="tokenizer",
                format="zip",
                root_dir=tokenizer_dir,
            )
            # Log the zip file:
            self._artifacts["tokenizer"] = self._context.log_artifact(
                item="tokenizer", local_path=tokenizer_zip
            )

        # Save the model:
        model_dir = os.path.join(temp_directory, "model")
        model.save_pretrained(save_directory=model_dir)

        # Zip the model directory:
        shutil.make_archive(
            base_name="model",
            format="zip",
            root_dir=model_dir,
        )

        # Log the model:
        self._context.log_model(
            key="model",
            db_key=self._model_name,
            model_file="model.zip",
            tag=self._tag,
            framework="Hugging Face",
            labels=self._labels,
            extra_data={**self._artifacts, **self._extra_data},
        )

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        self._log_metrics()

        if self._is_training:
            return

        # TODO: Update the model object

    def _log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self._log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def _log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
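    # Note (added comment, not in the original source): every metric that accumulates more than one
    # score is logged here as a "<metric_name>_plot" Plotly artifact; this is where artifacts such as
    # loss_plot, eval_accuracy_plot and eval_f1_plot in the run output above come from.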


def _apply_mlrun_on_trainer(
    trainer: transformers.Trainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


def _apply_mlrun_on_optimizer(
    optimizer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(
            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
        )

    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)

    if auto_log:
        optimizer.enable_auto_logging(
            context=context,
            model_name=model_name,
            tag=tag,
            labels=labels,
            extra_data=extra_data,
        )


def apply_mlrun(
    huggingface_object,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
    :param model_name:         The model name to use for storing the model artifact. Default: "model".
    :param tag:                The model's tag to log with.
    :param context:            MLRun context to work with. If no context is given it will be retrieved via
                               'mlrun.get_or_create_ctx(None)'
    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
    """

    if isinstance(huggingface_object, transformers.Trainer):
        return _apply_mlrun_on_trainer(
            trainer=huggingface_object,
            model_name=model_name,
            tag=tag,
            context=context,
            auto_log=auto_log,
            labels=labels,
            extra_data=extra_data,
        )
    import optimum.onnxruntime as optimum_ort

    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
        return _apply_mlrun_on_optimizer(
            optimizer=huggingface_object,
            model_name=model_name,
            tag=tag,
            context=context,
            auto_log=auto_log,
            labels=labels,
            extra_data=extra_data,
        )
    raise mlrun.errors.MLRunInvalidArgumentError
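# Illustrative usage (added comment, not part of the original source): given a configured
# transformers.Trainer instance named `trainer` and an MLRun context `context`, auto-logging
# could be enabled roughly like this:
#     apply_mlrun(trainer, model_name="huggingface-model", context=context)
#     trainer.train()  # metrics, plots, the tokenizer and the model are then logged via MLRunCallback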


# ---------------------- from auto_trainer--------------------------------
class KWArgsPrefixes:
    MODEL_CLASS = "CLASS_"
    FIT = "FIT_"
    TRAIN = "TRAIN_"
    PREDICT = "PREDICT_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that start with the given prefix and create a new dictionary with these
    keys, with the prefix stripped from each key.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }
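# Illustrative example (added comment, not part of the original source): with run parameters such as
# {"TRAIN_learning_rate": 2e-05, "TRAIN_num_train_epochs": 3, "CLASS_num_labels": 2},
# _get_sub_dict_by_prefix(src=params, prefix_key=KWArgsPrefixes.TRAIN) returns
# {"learning_rate": 2e-05, "num_train_epochs": 3}, i.e. only the TRAIN_-prefixed keys, prefix stripped.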


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Get the DataFrame of the dataset and drop the requested columns.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset, for regression or
                            classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
    """
    if isinstance(dataset, (list, dict)):
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                context.logger.error(
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                raise ValueError
            dataset.drop(drop_columns, axis=1, inplace=True)

        return dataset, label_columns

    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        # feature-vector case:
        label_columns = label_columns or dataset.meta.status.label_column
        dataset = fs.get_offline_features(
            dataset.meta.uri, drop_columns=drop_columns
        ).to_dataframe()

        context.logger.info(f"label columns: {label_columns}")
    else:
        # simple URL case:
        dataset = dataset.as_df()
        if drop_columns:
            if all(col in dataset for col in drop_columns):
                dataset = dataset.drop(drop_columns, axis=1)
            else:
                context.logger.info(
                    "not all of the columns to drop in the dataset, drop columns process skipped"
                )
    return dataset, label_columns
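# Illustrative example (added comment, not part of the original source): for a plain dataset URI,
# e.g. _get_dataframe(context, dataset=some_data_item, label_columns="airline_sentiment",
# drop_columns=["airline_sentiment_confidence"]) reads the item with as_df() and drops the given
# column before returning the frame and label name; FeatureVector URIs instead go through
# fs.get_offline_features(). `some_data_item` stands for a hypothetical mlrun DataItem.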


# ---------------------- Hugging Face Trainer --------------------------------


def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
    """
    This function creates and returns a function that will be used to compute metrics at evaluation.
    :param metrics: List of metrics for evaluating the model, such as f1, accuracy, etc.

    :returns: Function that will be used to compute metrics at evaluation.
             Must take a [`EvalPrediction`] and return a dictionary string to metric values.
    """

    def _compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        metric_dict_results = {}
        for metric in metrics:
            load_met = load_metric(metric)
            metric_res = load_met.compute(predictions=predictions, references=labels)[
                metric
            ]
            metric_dict_results[metric] = metric_res

        return metric_dict_results

    return _compute_metrics
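# Illustrative example (added comment, not part of the original source): with metrics=["accuracy"],
# the returned function maps an EvalPrediction with logits [[0.1, 0.9], [0.8, 0.2]] and labels [1, 0]
# to predictions [1, 0] via np.argmax and returns {"accuracy": 1.0} from load_metric("accuracy").compute().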


def _edit_columns(
    dataset: Dataset,
    drop_columns: List[str] = None,
    rename_columns: Dict[str, str] = None,
) -> Dataset:
    """
    Drop and rename columns of the given dataset.
    :param dataset:         Dataset to process.
    :param drop_columns:    The columns to drop from the dataset.
    :param rename_columns:  Dict of columns to rename: {<old_name>: <new_name>, ...}

    :returns: The dataset after the requested processing.
    """
    if drop_columns:
        dataset = dataset.remove_columns(drop_columns)
    if rename_columns:
        dataset = dataset.rename_columns(rename_columns)
    return dataset
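# Illustrative example (added comment, not part of the original source): in the run above this is
# effectively called as _edit_columns(train_dataset, ["airline_sentiment_confidence",
# "negativereason_confidence"], {"airline_sentiment": "labels"}), dropping the two confidence columns
# and renaming the label column to the "labels" name the Trainer expects.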


def _prepare_dataset(
    context: MLClientCtx,
    dataset_name: str,
    label_name: str = None,
    drop_columns: Optional[List[str]] = None,
    num_of_train_samples: int = None,
    train_test_split_size: float = None,
    random_state: int = None,
) -> Tuple[Dataset, Dataset]:
    """
    Loading the dataset and editing the columns

    :param context:                 MLRun context
    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
    :param label_name:              The target label of the column in the dataset.
    :param drop_columns:            The columns to drop from the dataset.
    :param num_of_train_samples:    Max number of training samples, for debugging.
    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split.
    :param random_state:            Random state for train_test_split

    """

    context.logger.info(
        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
    )
    rename_cols = {label_name: "labels"}

    # Loading and editing dataset:
    dataset = load_dataset(dataset_name)

    # train set
    train_dataset = dataset["train"]
    if num_of_train_samples:
        train_dataset = train_dataset.shuffle(seed=random_state).select(
            list(range(num_of_train_samples))
        )
    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)

    # test set
    test_dataset = dataset["test"]
    if train_test_split_size or num_of_train_samples:
        train_test_split_size = train_test_split_size or 0.2
        num_of_test_samples = int(
            (train_dataset.num_rows * train_test_split_size)
            // (1 - train_test_split_size)
        )
        test_dataset = test_dataset.shuffle(seed=random_state).select(
            list(range(num_of_test_samples))
        )
    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)

    return train_dataset, test_dataset
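# Worked example (added comment, not part of the original source): in the run above,
# num_of_train_samples=100 and no explicit split were given, so train_test_split_size becomes 0.2 and
# num_of_test_samples = int((100 * 0.2) // (1 - 0.2)) evaluates to 24 under float floor division
# (rather than the nominal 25), matching the "Num examples = 24" reported in the evaluation log.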


def train(
    context: MLClientCtx,
    hf_dataset: str = None,
    dataset: DataItem = None,
    test_set: DataItem = None,
    drop_columns: Optional[List[str]] = None,
    pretrained_tokenizer: str = None,
    pretrained_model: str = None,
    model_class: str = None,
    model_name: str = "huggingface-model",
    label_name: str = "labels",
    text_col: str = "text",
    num_of_train_samples: int = None,
    train_test_split_size: float = None,
    metrics: List[str] = None,
    random_state: int = None,
):
    """
    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
    The dataset can either be the name of a dataset hosted on the HuggingFace hub,
    or a URI or a FeatureVector.

    :param context:                 MLRun context
    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param test_set:                The test set to train the model with.
    :param drop_columns:            The columns to drop from the dataset.
    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
    :param model_name:              The model's name to use for storing the model artifact, defaults to 'huggingface-model'
    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
    :param label_name:              The target label of the column in the dataset.
    :param text_col:                The input text column in the dataset.
    :param num_of_train_samples:    Max number of training samples, for debugging.
    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split.
    :param metrics:                 List of metrics for evaluating the model, such as f1, accuracy, etc.
    :param random_state:            Random state for train_test_split
    """

    if train_test_split_size is None and test_set is None:
        context.logger.info(
            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
        )
        train_test_split_size = 0.2

    # Creating tokenizer:
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)

    def preprocess_function(examples):
        return tokenizer(examples[text_col], truncation=True)

    # prepare data for training
    if hf_dataset:
        train_dataset, test_dataset = _prepare_dataset(
            context,
            hf_dataset,
            label_name,
            drop_columns,
            num_of_train_samples,
            train_test_split_size,
            random_state=random_state,
        )
    elif dataset:
        # Get DataFrame by URL or by FeatureVector:
        train_dataset, label_name = _get_dataframe(
            context=context,
            dataset=dataset,
            label_columns=label_name,
            drop_columns=drop_columns,
        )
        if test_set:
            test_dataset, _ = _get_dataframe(
                context=context,
                dataset=test_set,
                label_columns=label_name,
                drop_columns=drop_columns,
            )
        else:
            train_dataset, test_dataset = train_test_split(
                train_dataset,
                test_size=train_test_split_size,
                random_state=random_state,
            )
        train_dataset = Dataset.from_pandas(train_dataset)
        test_dataset = Dataset.from_pandas(test_dataset)
    else:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "Training data was not provided. A training dataset is mandatory for training."
            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
        )

    # Mapping datasets with the tokenizer:
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    # Creating data collator for batching:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Parsing kwargs:
    train_kwargs = _get_sub_dict_by_prefix(
        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
    )
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Loading our pretrained model:
    model_class_kwargs["pretrained_model_name_or_path"] = (
        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
    )
    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
    if not model_class_kwargs["pretrained_model_name_or_path"]:
        raise mlrun.errors.MLRunRuntimeError(
            "Must provide pretrained_model name as "
            "function argument or in extra params"
        )
    model = create_class(model_class).from_pretrained(**model_class_kwargs)

    # Preparing training arguments:
    training_args = TrainingArguments(
        **train_kwargs,
    )

    compute_metrics = _create_compute_metrics(metrics) if metrics else None
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    apply_mlrun(trainer, model_name=model_name)

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()
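
# Example usage (an illustrative sketch only; the project name, dataset, model names
# and hyperparameter values below are assumptions, not part of this module):
#
#   import mlrun
#   project = mlrun.get_or_create_project("huggingface-demo", context="./")
#   hf_fn = project.set_function(
#       "hugging_face_classifier_trainer.py", name="hf-trainer",
#       kind="job", image="mlrun/mlrun", handler="train",
#   )
#   hf_fn.run(params={
#       "hf_dataset": "imdb",
#       "pretrained_tokenizer": "distilbert-base-uncased",
#       "pretrained_model": "distilbert-base-uncased",
#       "model_class": "transformers.AutoModelForSequenceClassification",
#       "metrics": ["accuracy", "f1"],
#       # TrainingArguments are read from context.parameters via the TRAIN_ prefix:
#       "TRAIN_output_dir": "./results",
#       "TRAIN_num_train_epochs": 1,
#       "TRAIN_evaluation_strategy": "epoch",
#       # Keyword arguments for the model class use the CLASS_ prefix:
#       "CLASS_num_labels": 2,
#   })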


def _get_model_dir(model_uri: str):
    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
    model_dir = tempfile.gettempdir()
    # Unzip the Model:
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_dir)

    return model_dir


def optimize(
    model_path: str,
    model_name: str = "optimized_model",
    target_dir: str = "./optimized",
    optimization_level: int = 1,
):
    """
    Optimizing the transformer model using ONNX optimization.


    :param model_path:          The path of the model to optimize.
    :param model_name:          Name of the optimized model.
    :param target_dir:          The directory to save the ONNX model.
    :param optimization_level:  Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)
    """
    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
    from optimum.onnxruntime.configuration import OptimizationConfig

    model_dir = _get_model_dir(model_uri=model_path)
    # Creating configuration for optimization step:
    optimization_config = OptimizationConfig(optimization_level=optimization_level)

    # Converting our pretrained model to an ONNX-Runtime model:
    ort_model = ORTModelForSequenceClassification.from_pretrained(
        model_dir, from_transformers=True
    )

    # Creating an ONNX-Runtime optimizer from ONNX model:
    optimizer = ORTOptimizer.from_pretrained(ort_model)

    apply_mlrun(optimizer, model_name=model_name)
    # Optimizing and saving the ONNX model:
    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
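
# Example usage (an illustrative sketch only; the model URI below is an assumption,
# in practice it would be the URI of the model artifact logged by the train handler):
#
#   optimize(
#       model_path="store://models/huggingface-demo/huggingface-model:latest",
#       model_name="optimized_model",
#       target_dir="./optimized",
#       optimization_level=1,
#   )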

+    base_image: mlrun/mlrun
+    commands: []
+    code_origin: ''
+    origin_filename: ''
+    requirements:
+    - onnx~=1.14.1
+    - onnxruntime~=1.16.1
+    - optimum~=1.6.4
+    - transformers~=4.26.1
+    - datasets~=2.10.1
+    - scikit-learn~=1.0.2
+  entry_points:
+    add_interface:
+      name: add_interface
+      doc: 'Enrich the object with this interface''s properties, methods and functions,
+        so it will have MLRun''s features.'
+      parameters:
+      - name: cls
+      - name: obj
+        type: Trainer
+        doc: The object to enrich his interface.
+      - name: restoration
+        type: MLRunInterfaceRestorationType
+        doc: Restoration information tuple as returned from 'remove_interface' in
+          order to add the interface in a certain state.
+        default: null
+      outputs: []
+      lineno: 146
+      has_varargs: false
+      has_kwargs: false
+    mlrun_optimize:
+      name: mlrun_optimize
+      doc: 'MLRun''s wrapper for ORTOptimizer.optimize. It calls the original optimize
+        method and, when auto logging is enabled, logs the resulting optimized ONNX
+        model to the MLRun context.'
+      parameters:
+      - name: cls
+      outputs: []
+      lineno: 79
+      has_varargs: false
+      has_kwargs: false
+    wrapper:
+      name: wrapper
+      doc: ''
+      parameters:
+      - name: self
+        type: Trainer
+      outputs: []
+      lineno: 173
+      has_varargs: true
+      has_kwargs: true
+    enable_auto_logging:
+      name: enable_auto_logging
+      doc: ''
+      parameters:
+      - name: self
+      - name: context
+        type: MLClientCtx
+      - name: model_name
+        type: str
+        default: model
+      - name: tag
+        type: str
+        default: ''
+      - name: labels
+        type: Dict[str, str]
+        default: null
+      - name: extra_data
+        type: dict
+        default: null
+      outputs: []
+      lineno: 114
+      has_varargs: false
+      has_kwargs: false
+    mlrun_train:
+      name: mlrun_train
+      doc: 'MLRun''s wrapper for Trainer.train. It calls the original train method;
+        MLRun logging is handled by the MLRunCallback that apply_mlrun adds to the
+        trainer.'
+      parameters:
+      - name: cls
+      outputs: []
+      lineno: 164
+      has_varargs: false
+      has_kwargs: false
+    on_epoch_begin:
+      name: on_epoch_begin
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 220
+      has_varargs: false
+      has_kwargs: true
+    on_epoch_end:
+      name: on_epoch_end
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 229
+      has_varargs: false
+      has_kwargs: true
+    on_log:
+      name: on_log
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      - name: logs
+        type: Dict[str, float]
+        default: null
+      outputs: []
+      lineno: 238
+      has_varargs: false
+      has_kwargs: true
+    on_train_begin:
+      name: on_train_begin
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 262
+      has_varargs: false
+      has_kwargs: true
+    on_train_end:
+      name: on_train_end
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      - name: model
+        type: PreTrainedModel
+        default: null
+      - name: tokenizer
+        type: PreTrainedTokenizer
+        default: null
+      outputs: []
+      lineno: 271
+      has_varargs: false
+      has_kwargs: true
+    on_evaluate:
+      name: on_evaluate
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 322
+      has_varargs: false
+      has_kwargs: true
+    apply_mlrun:
+      name: apply_mlrun
+      doc: Wrap the given model with MLRun's interface providing it with mlrun's additional
+        features.
+      parameters:
+      - name: huggingface_object
+        doc: The model to wrap. Can be loaded from the model path given as well.
+      - name: model_name
+        type: str
+        doc: 'The model name to use for storing the model artifact. Default: "model".'
+        default: null
+      - name: tag
+        type: str
+        doc: The model's tag to log with.
+        default: ''
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context to work with. If no context is given it will be retrieved
+          via 'mlrun.get_or_create_ctx(None)'
+        default: null
+      - name: auto_log
+        type: bool
+        doc: 'Whether to enable MLRun''s auto logging. Default: True.'
+        default: true
+      - name: labels
+        type: Dict[str, str]
+        default: null
+      - name: extra_data
+        type: dict
+        default: null
+      outputs: []
+      lineno: 421
+      has_varargs: false
+      has_kwargs: true
+    train:
+      name: train
+      doc: 'Training and evaluating a pretrained model with a pretrained tokenizer
+        over a dataset.
+
+        The dataset can either be the name of a dataset hosted on the HuggingFace
+        hub, a URI, or a FeatureVector.'
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context
+      - name: hf_dataset
+        type: str
+        doc: The name of the dataset to get from the HuggingFace hub
+        default: null
+      - name: dataset
+        type: DataItem
+        doc: The dataset to train the model on. Can be either a URI or a FeatureVector
+        default: null
+      - name: test_set
+        type: DataItem
+        doc: The test set to evaluate the model with.
+        default: null
+      - name: drop_columns
+        type: Optional[List[str]]
+        doc: The columns to drop from the dataset.
+        default: null
+      - name: pretrained_tokenizer
+        type: str
+        doc: The name of the pretrained tokenizer from the HuggingFace hub.
+        default: null
+      - name: pretrained_model
+        type: str
+        doc: The name of the pretrained model from the HuggingFace hub.
+        default: null
+      - name: model_class
+        type: str
+        doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
+        default: null
+      - name: model_name
+        type: str
+        doc: The model's name to use for storing the model artifact, defaults to 'huggingface-model'
+        default: huggingface-model
+      - name: label_name
+        type: str
+        doc: The target label of the column in the dataset.
+        default: labels
+      - name: text_col
+        type: str
+        doc: The input text column in the dataset.
+        default: text
+      - name: num_of_train_samples
+        type: int
+        doc: Max number of training samples, for debugging.
+        default: null
+      - name: train_test_split_size
+        type: float
+        doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset
+          to include in the test split.
+        default: null
+      - name: metrics
+        type: List[str]
+        doc: List of metrics for evaluating the model, such as f1, accuracy, etc.
+        default: null
+      - name: random_state
+        type: int
+        doc: Random state for train_test_split
+        default: null
+      outputs: []
+      lineno: 647
+      has_varargs: false
+      has_kwargs: false
+    preprocess_function:
+      name: preprocess_function
+      doc: ''
+      parameters:
+      - name: examples
+      outputs: []
+      lineno: 696
+      has_varargs: false
+      has_kwargs: false
+    optimize:
+      name: optimize
+      doc: Optimizing the transformer model using ONNX optimization.
+      parameters:
+      - name: model_path
+        type: str
+        doc: The path of the model to optimize.
+      - name: model_name
+        type: str
+        doc: Name of the optimized model.
+        default: optimized_model
+      - name: target_dir
+        type: str
+        doc: The directory to save the ONNX model.
+        default: ./optimized
+      - name: optimization_level
+        type: int
+        doc: Optimization level performed by ONNX Runtime of the loaded graph. (default
+          is 1)
+        default: 1
+      outputs: []
+      lineno: 799
+      has_varargs: false
+      has_kwargs: false
+  description: Automatic train and optimize functions for HuggingFace framework
+  default_handler: train
+  disable_auto_mount: false
+  clone_target_dir: ''
+  env: []
+  priority_class_name: ''
+  preemption_mode: prevent
+  affinity: null
+  tolerations: null
+  security_context: {}
+verbose: false
+
+        
+    
+
+
\ No newline at end of file
diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/static/hugging_face_classifier_trainer.html b/functions/master/hugging_face_classifier_trainer/0.3.0/static/hugging_face_classifier_trainer.html
new file mode 100644
index 00000000..99a105cb
--- /dev/null
+++ b/functions/master/hugging_face_classifier_trainer/0.3.0/static/hugging_face_classifier_trainer.html
@@ -0,0 +1,972 @@
+hugging_face_classifier_trainer.hugging_face_classifier_trainer
+Source code for hugging_face_classifier_trainer.hugging_face_classifier_trainer

+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import mlrun
+import mlrun.datastore
+import mlrun.utils
+import numpy as np
+import pandas as pd
+import transformers
+from datasets import Dataset, load_dataset, load_metric
+from mlrun import MLClientCtx
+from mlrun import feature_store as fs
+from mlrun.artifacts import Artifact, PlotlyArtifact
+from mlrun.datastore import DataItem
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import create_class
+from plotly import graph_objects as go
+from sklearn.model_selection import train_test_split
+from transformers import (
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    Trainer,
+    TrainerCallback,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+
+
+# ----------------------from MLRUN--------------------------------
+
[docs]class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): + """ + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRun's context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to be inserted so the MLRun interface will be fully enabled. + _PROPERTIES = { + "_auto_log": False, + "_context": None, + "_model_name": "model", + "_tag": "", + "_labels": None, + "_extra_data": None, + } + _METHODS = ["enable_auto_logging"] + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "optimize", + ] + +
[docs] @classmethod + def add_interface( + cls, + obj, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + """ + Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras + MLRun's features. + :param obj: The object to enrich his interface. + :param restoration: Restoration information tuple as returned from 'remove_interface' in order to + add the interface in a certain state. + """ + super(HFORTOptimizerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + )
+ +
[docs] @classmethod + def mlrun_optimize(cls): + """ + MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be + passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. + + raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. + """ + + def wrapper(self, *args, **kwargs): + save_dir = cls._get_function_argument( + self.optimize, + argument_name="save_dir", + passed_args=args, + passed_kwargs=kwargs, + )[0] + + # Call the original optimize method: + result = self.original_optimize(*args, **kwargs) + + if self._auto_log: + # Log the onnx model: + self._context.log_model( + key="model", + db_key=self._model_name, + model_file=f"{save_dir}/model_optimized.onnx", + tag=self._tag, + framework="ONNX", + labels=self._labels, + extra_data=self._extra_data, + ) + + return result + + return wrapper
+ +
[docs] def enable_auto_logging( + self, + context: mlrun.MLClientCtx, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + self._auto_log = True + + self._context = context + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data
+ + +
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + +
[docs] @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + """ + Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras + MLRuns features. + :param obj: The object to enrich his interface. + :param restoration: Restoration information tuple as returned from 'remove_interface' in order to + add the interface in a certain state. + """ + + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + )
+ +
[docs] @classmethod + def mlrun_train(cls): + + """ + MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be + passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. + + raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. + """ + + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper
+ + +
[docs]class MLRunCallback(TrainerCallback): + """ + Callback for collecting logs during training / evaluation of the `Trainer` API. + """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + +
[docs] def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._steps.append([])
+ +
[docs] def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._log_metrics()
+ +
[docs] def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score)
+ +
[docs] def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._is_training = True
+ +
[docs] def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + self._log_metrics() + + temp_directory = tempfile.gettempdir() + + # Save and log the tokenizer: + if tokenizer is not None: + # Save tokenizer: + tokenizer_dir = os.path.join(temp_directory, "tokenizer") + tokenizer.save_pretrained(save_directory=tokenizer_dir) + # Zip the tokenizer directory: + tokenizer_zip = shutil.make_archive( + base_name="tokenizer", + format="zip", + root_dir=tokenizer_dir, + ) + # Log the zip file: + self._artifacts["tokenizer"] = self._context.log_artifact( + item="tokenizer", local_path=tokenizer_zip + ) + + # Save the model: + model_dir = os.path.join(temp_directory, "model") + model.save_pretrained(save_directory=model_dir) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=model_dir, + ) + + # Log the model: + self._context.log_model( + key="model", + db_key=self._model_name, + model_file="model.zip", + tag=self._tag, + framework="Hugging Face", + labels=self._labels, + extra_data={**self._artifacts, **self._extra_data}, + )
+ +
[docs] def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + self._log_metrics() + + if self._is_training: + return
+ + # TODO: Update the model object + + def _log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self._log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def _log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact)
+ + +def _apply_mlrun_on_trainer( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +def _apply_mlrun_on_optimizer( + optimizer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx( + HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME + ) + + HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) + + if auto_log: + optimizer.enable_auto_logging( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + + +
[docs]def apply_mlrun( + huggingface_object, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + Wrap the given model with MLRun's interface providing it with mlrun's additional features. + :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. + :param model_name: The model name to use for storing the model artifact. Default: "model". + :param tag: The model's tag to log with. + :param context: MLRun context to work with. If no context is given it will be retrieved via + 'mlrun.get_or_create_ctx(None)' + :param auto_log: Whether to enable MLRun's auto logging. Default: True. + """ + + if isinstance(huggingface_object, transformers.Trainer): + return _apply_mlrun_on_trainer( + trainer=huggingface_object, + model_name=model_name, + tag=tag, + context=context, + auto_log=auto_log, + labels=labels, + extra_data=extra_data, + ) + import optimum.onnxruntime as optimum_ort + + if isinstance(huggingface_object, optimum_ort.ORTOptimizer): + return _apply_mlrun_on_optimizer( + optimizer=huggingface_object, + model_name=model_name, + tag=tag, + context=context, + auto_log=auto_log, + labels=labels, + extra_data=extra_data, + ) + raise mlrun.errors.MLRunInvalidArgumentError
+ + +# ---------------------- from auto_trainer-------------------------------- +
[docs]class KWArgsPrefixes: + MODEL_CLASS = "CLASS_" + FIT = "FIT_" + TRAIN = "TRAIN_" + PREDICT = "PREDICT_"
+ + +def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: + """ + Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these + keys. + + :param src: The source dict to extract the values from. + :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this + prefix. + """ + return { + key.replace(prefix_key, ""): val + for key, val in src.items() + if key.startswith(prefix_key) + } + + +def _get_dataframe( + context: MLClientCtx, + dataset: DataItem, + label_columns: Optional[Union[str, List[str]]] = None, + drop_columns: Union[str, List[str], int, List[int]] = None, +) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: + """ + Getting the DataFrame of the dataset and drop the columns accordingly. + + :param context: MLRun context. + :param dataset: The dataset to train the model on. + Can be either a list of lists, dict, URI or a FeatureVector. + :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or + Classification tasks. + :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. + """ + if isinstance(dataset, (list, dict)): + dataset = pd.DataFrame(dataset) + # Checking if drop_columns provided by integer type: + if drop_columns: + if isinstance(drop_columns, str) or ( + isinstance(drop_columns, list) + and any(isinstance(col, str) for col in drop_columns) + ): + context.logger.error( + "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" + ) + raise ValueError + dataset.drop(drop_columns, axis=1, inplace=True) + + return dataset, label_columns + + store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) + if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: + # feature-vector case: + label_columns = label_columns or dataset.meta.status.label_column + dataset = fs.get_offline_features( + dataset.meta.uri, drop_columns=drop_columns + ).to_dataframe() + + context.logger.info(f"label columns: {label_columns}") + else: + # simple URL case: + dataset = dataset.as_df() + if drop_columns: + if all(col in dataset for col in drop_columns): + dataset = dataset.drop(drop_columns, axis=1) + else: + context.logger.info( + "not all of the columns to drop in the dataset, drop columns process skipped" + ) + return dataset, label_columns + + +# ---------------------- Hugging Face Trainer -------------------------------- + + +def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: + """ + This function create and returns a function that will be used to compute metrics at evaluation. + :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. + + :returns: Function that will be used to compute metrics at evaluation. + Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
+ """ + + def _compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + metric_dict_results = {} + for metric in metrics: + load_met = load_metric(metric) + metric_res = load_met.compute(predictions=predictions, references=labels)[ + metric + ] + metric_dict_results[metric] = metric_res + + return metric_dict_results + + return _compute_metrics + + +def _edit_columns( + dataset: Dataset, + drop_columns: List[str] = None, + rename_columns: [str, str] = None, +) -> Dataset: + """ + Drop and renames that columns of the given dataset + :param dataset: Dataset to process + :param drop_columns: The columns to drop from the dataset. + :param rename_columns: Dict of columns ro rename : {<old_name>: <new_name>, ...} + + :returns: The dataset after the desired process + """ + if drop_columns: + dataset = dataset.remove_columns(drop_columns) + if rename_columns: + dataset = dataset.rename_columns(rename_columns) + return dataset + + +def _prepare_dataset( + context: MLClientCtx, + dataset_name: str, + label_name: str = None, + drop_columns: Optional[List[str]] = None, + num_of_train_samples: int = None, + train_test_split_size: float = None, + random_state: int = None, +) -> Tuple[Dataset, Dataset]: + """ + Loading the dataset and editing the columns + + :param context: MLRun contex + :param dataset_name: The name of the dataset to get from the HuggingFace hub + :param label_name: The target label of the column in the dataset. + :param drop_columns: The columns to drop from the dataset. + :param num_of_train_samples: Max number of training samples, for debugging. + :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include + in the test split. + :param random_state: Random state for train_test_split + + """ + + context.logger.info( + f"Loading and editing {dataset_name} dataset from Hugging Face hub" + ) + rename_cols = {label_name: "labels"} + + # Loading and editing dataset: + dataset = load_dataset(dataset_name) + + # train set + train_dataset = dataset["train"] + if num_of_train_samples: + train_dataset = train_dataset.shuffle(seed=random_state).select( + list(range(num_of_train_samples)) + ) + train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) + + # test set + test_dataset = dataset["test"] + if train_test_split_size or num_of_train_samples: + train_test_split_size = train_test_split_size or 0.2 + num_of_test_samples = int( + (train_dataset.num_rows * train_test_split_size) + // (1 - train_test_split_size) + ) + test_dataset = test_dataset.shuffle(seed=random_state).select( + list(range(num_of_test_samples)) + ) + test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) + + return train_dataset, test_dataset + + +
[docs]def train( + context: MLClientCtx, + hf_dataset: str = None, + dataset: DataItem = None, + test_set: DataItem = None, + drop_columns: Optional[List[str]] = None, + pretrained_tokenizer: str = None, + pretrained_model: str = None, + model_class: str = None, + model_name: str = "huggingface-model", + label_name: str = "labels", + text_col: str = "text", + num_of_train_samples: int = None, + train_test_split_size: float = None, + metrics: List[str] = None, + random_state: int = None, +): + """ + Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. + The dataset can be either be the name of the dataset that contains in the HuggingFace hub, + or a URI or a FeatureVector + + :param context: MLRun context + :param hf_dataset: The name of the dataset to get from the HuggingFace hub + :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector + :param test_set: The test set to train the model with. + :param drop_columns: The columns to drop from the dataset. + :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. + :param pretrained_model: The name of the pretrained model from the HuggingFace hub. + :param model_name: The model's name to use for storing the model artifact, default to 'model' + :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` + :param label_name: The target label of the column in the dataset. + :param text_col: The input text column un the dataset. + :param num_of_train_samples: Max number of training samples, for debugging. + :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include + in the test split. + :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. + :param random_state: Random state for train_test_split + """ + + if train_test_split_size is None and test_set is None: + context.logger.info( + "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" + ) + train_test_split_size = 0.2 + + # Creating tokenizer: + tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) + + def preprocess_function(examples): + return tokenizer(examples[text_col], truncation=True) + + # prepare data for training + if hf_dataset: + train_dataset, test_dataset = _prepare_dataset( + context, + hf_dataset, + label_name, + drop_columns, + num_of_train_samples, + train_test_split_size, + random_state=random_state, + ) + elif dataset: + # Get DataFrame by URL or by FeatureVector: + train_dataset, label_name = _get_dataframe( + context=context, + dataset=dataset, + label_columns=label_name, + drop_columns=drop_columns, + ) + if test_set: + test_dataset, _ = _get_dataframe( + context=context, + dataset=test_set, + label_columns=label_name, + drop_columns=drop_columns, + ) + else: + train_dataset, test_dataset = train_test_split( + train_dataset, + test_size=train_test_split_size, + random_state=random_state, + ) + train_dataset = Dataset.from_pandas(train_dataset) + test_dataset = Dataset.from_pandas(test_dataset) + else: + raise mlrun.errors.MLRunInvalidArgumentError( + "Training data was not provided. A training dataset is mandatory for training." + " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." 
+ ) + + # Mapping datasets with the tokenizer: + tokenized_train = train_dataset.map(preprocess_function, batched=True) + tokenized_test = test_dataset.map(preprocess_function, batched=True) + + # Creating data collator for batching: + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Parsing kwargs: + train_kwargs = _get_sub_dict_by_prefix( + src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN + ) + model_class_kwargs = _get_sub_dict_by_prefix( + src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS + ) + + # Loading our pretrained model: + model_class_kwargs["pretrained_model_name_or_path"] = ( + model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model + ) + train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer + if not model_class_kwargs["pretrained_model_name_or_path"]: + raise mlrun.errors.MLRunRuntimeError( + "Must provide pretrained_model name as " + "function argument or in extra params" + ) + model = create_class(model_class).from_pretrained(**model_class_kwargs) + + # Preparing training arguments: + training_args = TrainingArguments( + **train_kwargs, + ) + + compute_metrics = _create_compute_metrics(metrics) if metrics else None + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train, + eval_dataset=tokenized_test, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + apply_mlrun(trainer, model_name=model_name) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train()
+ + +def _get_model_dir(model_uri: str): + model_file, _, _ = mlrun.artifacts.get_model(model_uri) + model_dir = tempfile.gettempdir() + # Unzip the Model: + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_dir) + + return model_dir + + +
[docs]def optimize( + model_path: str, + model_name: str = "optimized_model", + target_dir: str = "./optimized", + optimization_level: int = 1, +): + """ + Optimizing the transformer model using ONNX optimization. + + + :param model_path: The path of the model to optimize. + :param model_name: Name of the optimized model. + :param target_dir: The directory to save the ONNX model. + :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) + """ + # We import these in the function scope so ONNX won't be mandatory for the other handlers: + from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer + from optimum.onnxruntime.configuration import OptimizationConfig + + model_dir = _get_model_dir(model_uri=model_path) + # Creating configuration for optimization step: + optimization_config = OptimizationConfig(optimization_level=optimization_level) + + # Converting our pretrained model to an ONNX-Runtime model: + ort_model = ORTModelForSequenceClassification.from_pretrained( + model_dir, from_transformers=True + ) + + # Creating an ONNX-Runtime optimizer from ONNX model: + optimizer = ORTOptimizer.from_pretrained(ort_model) + + apply_mlrun(optimizer, model_name=model_name) + # Optimizing and saving the ONNX model: + optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
+
+
\ No newline at end of file
diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/static/item.html b/functions/master/hugging_face_classifier_trainer/0.3.0/static/item.html
new file mode 100644
index 00000000..70a200c7
--- /dev/null
+++ b/functions/master/hugging_face_classifier_trainer/0.3.0/static/item.html
@@ -0,0 +1,55 @@
+Source
+        
+apiVersion: v1
+categories:
+- deep-learning
+- huggingface
+- machine-learning
+- model-training
+description: Automatic train and optimize functions for HuggingFace framework
+doc: ''
+example: hugging_face_classifier_trainer.ipynb
+generationDate: 2022-08-28:17-25
+hidden: false
+icon: ''
+labels:
+  author: davids
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.6.1
+name: hugging_face_classifier_trainer
+platformVersion: 3.5.5
+spec:
+  filename: hugging_face_classifier_trainer.py
+  handler: train
+  image: mlrun/mlrun
+  kind: job
+  requirements:
+  - onnx~=1.14.1
+  - onnxruntime~=1.16.1
+  - optimum~=1.6.4
+  - transformers~=4.26.1
+  - datasets~=2.10.1
+  - scikit-learn~=1.0.2
+url: ''
+version: 0.3.0
+
+        
+    
+
+
\ No newline at end of file
diff --git a/functions/master/hugging_face_classifier_trainer/0.3.0/static/source.html b/functions/master/hugging_face_classifier_trainer/0.3.0/static/source.html
new file mode 100644
index 00000000..6eee51f5
--- /dev/null
+++ b/functions/master/hugging_face_classifier_trainer/0.3.0/static/source.html
@@ -0,0 +1,854 @@
+Source
+        
+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import mlrun
+import mlrun.datastore
+import mlrun.utils
+import numpy as np
+import pandas as pd
+import transformers
+from datasets import Dataset, load_dataset, load_metric
+from mlrun import MLClientCtx
+from mlrun import feature_store as fs
+from mlrun.artifacts import Artifact, PlotlyArtifact
+from mlrun.datastore import DataItem
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import create_class
+from plotly import graph_objects as go
+from sklearn.model_selection import train_test_split
+from transformers import (
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    Trainer,
+    TrainerCallback,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+
+
+# ----------------------from MLRUN--------------------------------
+class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
+    """
+    Interface for adding MLRun features to the HuggingFace ONNX Runtime optimizer (ORTOptimizer) API.
+    """
+
+    # MLRun's context default name:
+    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
+
+    # Attributes to be inserted so the MLRun interface will be fully enabled.
+    _PROPERTIES = {
+        "_auto_log": False,
+        "_context": None,
+        "_model_name": "model",
+        "_tag": "",
+        "_labels": None,
+        "_extra_data": None,
+    }
+    _METHODS = ["enable_auto_logging"]
+    # Attributes to replace so the MLRun interface will be fully enabled.
+    _REPLACED_METHODS = [
+        "optimize",
+    ]
+
+    @classmethod
+    def add_interface(
+        cls,
+        obj,
+        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
+    ):
+        """
+        Enrich the object with this interface's properties, methods and functions, so it will have MLRun's
+        features.
+        :param obj:                     The object whose interface will be enriched.
+        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
+                                        add the interface in a certain state.
+        """
+        super(HFORTOptimizerMLRunInterface, cls).add_interface(
+            obj=obj, restoration=restoration
+        )
+
+    @classmethod
+    def mlrun_optimize(cls):
+        """
+        MLRun's wrapper for ORTOptimizer.optimize. It calls the original optimize method and, when auto logging
+        is enabled, logs the resulting optimized ONNX model to the MLRun context.
+        """
+
+        def wrapper(self, *args, **kwargs):
+            save_dir = cls._get_function_argument(
+                self.optimize,
+                argument_name="save_dir",
+                passed_args=args,
+                passed_kwargs=kwargs,
+            )[0]
+
+            # Call the original optimize method:
+            result = self.original_optimize(*args, **kwargs)
+
+            if self._auto_log:
+                # Log the onnx model:
+                self._context.log_model(
+                    key="model",
+                    db_key=self._model_name,
+                    model_file=f"{save_dir}/model_optimized.onnx",
+                    tag=self._tag,
+                    framework="ONNX",
+                    labels=self._labels,
+                    extra_data=self._extra_data,
+                )
+
+            return result
+
+        return wrapper
+
+    def enable_auto_logging(
+        self,
+        context: mlrun.MLClientCtx,
+        model_name: str = "model",
+        tag: str = "",
+        labels: Dict[str, str] = None,
+        extra_data: dict = None,
+    ):
+        self._auto_log = True
+
+        self._context = context
+        self._model_name = model_name
+        self._tag = tag
+        self._labels = labels
+        self._extra_data = extra_data
+
+
+class HFTrainerMLRunInterface(MLRunInterface, ABC):
+    """
+    Interface for adding MLRun features to the HuggingFace Trainer API.
+    """
+
+    # MLRuns context default name:
+    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
+
+    # Attributes to replace so the MLRun interface will be fully enabled.
+    _REPLACED_METHODS = [
+        "train",
+        # "evaluate"
+    ]
+
+    @classmethod
+    def add_interface(
+        cls,
+        obj: Trainer,
+        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
+    ):
+        """
+        Enrich the object with this interface's properties, methods and functions, so it will have MLRun's
+        features.
+        :param obj:                     The object whose interface will be enriched.
+        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
+                                        add the interface in a certain state.
+        """
+
+        super(HFTrainerMLRunInterface, cls).add_interface(
+            obj=obj, restoration=restoration
+        )
+
+    @classmethod
+    def mlrun_train(cls):
+
+        """
+        MLRun's wrapper for Trainer.train. It calls the original train method; MLRun logging is handled by the
+        MLRunCallback that apply_mlrun adds to the trainer.
+        """
+
+        def wrapper(self: Trainer, *args, **kwargs):
+            # Restore the evaluation method as `train` will use it:
+            # cls._restore_attribute(obj=self, attribute_name="evaluate")
+
+            # Call the original fit method:
+            result = self.original_train(*args, **kwargs)
+
+            # Replace the evaluation method again:
+            # cls._replace_function(obj=self, function_name="evaluate")
+
+            return result
+
+        return wrapper
+
+
+class MLRunCallback(TrainerCallback):
+    """
+    Callback for collecting logs during training / evaluation of the `Trainer` API.
+    """
+
+    def __init__(
+        self,
+        context: mlrun.MLClientCtx = None,
+        model_name: str = "model",
+        tag: str = "",
+        labels: Dict[str, str] = None,
+        extra_data: dict = None,
+    ):
+        super().__init__()
+
+        # Store the configurations:
+        self._context = (
+            context
+            if context is not None
+            else mlrun.get_or_create_ctx("./mlrun-huggingface")
+        )
+        self._model_name = model_name
+        self._tag = tag
+        self._labels = labels
+        self._extra_data = extra_data if extra_data is not None else {}
+
+        # Set up the logging mode:
+        self._is_training = False
+        self._steps: List[List[int]] = []
+        self._metric_scores: Dict[str, List[float]] = {}
+        self._artifacts: Dict[str, Artifact] = {}
+
+    def on_epoch_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        self._steps.append([])
+
+    def on_epoch_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        self._log_metrics()
+
+    def on_log(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        logs: Dict[str, float] = None,
+        **kwargs,
+    ):
+        recent_logs = state.log_history[-1].copy()
+
+        recent_logs.pop("epoch")
+        current_step = int(recent_logs.pop("step"))
+        if current_step not in self._steps[-1]:
+            self._steps[-1].append(current_step)
+
+        for metric_name, metric_score in recent_logs.items():
+            if metric_name.startswith("train_"):
+                if metric_name.split("train_")[1] not in self._metric_scores:
+                    self._metric_scores[metric_name] = [metric_score]
+                continue
+            if metric_name not in self._metric_scores:
+                self._metric_scores[metric_name] = []
+            self._metric_scores[metric_name].append(metric_score)
+
+    def on_train_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        self._is_training = True
+
+    def on_train_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        model: PreTrainedModel = None,
+        tokenizer: PreTrainedTokenizer = None,
+        **kwargs,
+    ):
+        self._log_metrics()
+
+        temp_directory = tempfile.gettempdir()
+
+        # Save and log the tokenizer:
+        if tokenizer is not None:
+            # Save tokenizer:
+            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
+            tokenizer.save_pretrained(save_directory=tokenizer_dir)
+            # Zip the tokenizer directory:
+            tokenizer_zip = shutil.make_archive(
+                base_name="tokenizer",
+                format="zip",
+                root_dir=tokenizer_dir,
+            )
+            # Log the zip file:
+            self._artifacts["tokenizer"] = self._context.log_artifact(
+                item="tokenizer", local_path=tokenizer_zip
+            )
+
+        # Save the model:
+        model_dir = os.path.join(temp_directory, "model")
+        model.save_pretrained(save_directory=model_dir)
+
+        # Zip the model directory:
+        shutil.make_archive(
+            base_name="model",
+            format="zip",
+            root_dir=model_dir,
+        )
+
+        # Log the model:
+        self._context.log_model(
+            key="model",
+            db_key=self._model_name,
+            model_file="model.zip",
+            tag=self._tag,
+            framework="Hugging Face",
+            labels=self._labels,
+            extra_data={**self._artifacts, **self._extra_data},
+        )
+
+    def on_evaluate(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        self._log_metrics()
+
+        if self._is_training:
+            return
+
+        # TODO: Update the model object
+
+    def _log_metrics(self):
+        for metric_name, metric_scores in self._metric_scores.items():
+            self._context.log_result(key=metric_name, value=metric_scores[-1])
+            if len(metric_scores) > 1:
+                self._log_metric_plot(name=metric_name, scores=metric_scores)
+        self._context.commit(completed=False)
+
+    def _log_metric_plot(self, name: str, scores: List[float]):
+        # Initialize a plotly figure:
+        metric_figure = go.Figure()
+
+        # Add titles:
+        metric_figure.update_layout(
+            title=name.capitalize().replace("_", " "),
+            xaxis_title="Samples",
+            yaxis_title="Scores",
+        )
+
+        # Draw:
+        metric_figure.add_trace(
+            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
+        )
+
+        # Create the plotly artifact:
+        artifact_name = f"{name}_plot"
+        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
+        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
+
+
+def _apply_mlrun_on_trainer(
+    trainer: transformers.Trainer,
+    model_name: str = None,
+    tag: str = "",
+    context: mlrun.MLClientCtx = None,
+    auto_log: bool = True,
+    labels: Dict[str, str] = None,
+    extra_data: dict = None,
+    **kwargs,
+):
+    # Get parameters defaults:
+    if context is None:
+        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
+
+    HFTrainerMLRunInterface.add_interface(obj=trainer)
+
+    if auto_log:
+        trainer.add_callback(
+            MLRunCallback(
+                context=context,
+                model_name=model_name,
+                tag=tag,
+                labels=labels,
+                extra_data=extra_data,
+            )
+        )
+
+
+def _apply_mlrun_on_optimizer(
+    optimizer,
+    model_name: str = None,
+    tag: str = "",
+    context: mlrun.MLClientCtx = None,
+    auto_log: bool = True,
+    labels: Dict[str, str] = None,
+    extra_data: dict = None,
+    **kwargs,
+):
+    # Get parameters defaults:
+    if context is None:
+        context = mlrun.get_or_create_ctx(
+            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
+        )
+
+    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)
+
+    if auto_log:
+        optimizer.enable_auto_logging(
+            context=context,
+            model_name=model_name,
+            tag=tag,
+            labels=labels,
+            extra_data=extra_data,
+        )
+
+
+def apply_mlrun(
+    huggingface_object,
+    model_name: str = None,
+    tag: str = "",
+    context: mlrun.MLClientCtx = None,
+    auto_log: bool = True,
+    labels: Dict[str, str] = None,
+    extra_data: dict = None,
+    **kwargs,
+):
+    """
+    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
+    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
+    :param model_name:         The model name to use for storing the model artifact. Default: "model".
+    :param tag:                The model's tag to log with.
+    :param context:            MLRun context to work with. If no context is given it will be retrieved via
+                               'mlrun.get_or_create_ctx(None)'
+    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
+    """
+
+    if isinstance(huggingface_object, transformers.Trainer):
+        return _apply_mlrun_on_trainer(
+            trainer=huggingface_object,
+            model_name=model_name,
+            tag=tag,
+            context=context,
+            auto_log=auto_log,
+            labels=labels,
+            extra_data=extra_data,
+        )
+    import optimum.onnxruntime as optimum_ort
+
+    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
+        return _apply_mlrun_on_optimizer(
+            optimizer=huggingface_object,
+            model_name=model_name,
+            tag=tag,
+            context=context,
+            auto_log=auto_log,
+            labels=labels,
+            extra_data=extra_data,
+        )
+    raise mlrun.errors.MLRunInvalidArgumentError(
+        "'huggingface_object' must be a `transformers.Trainer` or an `optimum.onnxruntime.ORTOptimizer`"
+    )
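+
+
+# A minimal usage sketch (not part of the module's API; the model name below is a hypothetical
+# placeholder, and it assumes a `transformers.Trainer` was already built):
+#
+#     trainer = transformers.Trainer(model=model, args=training_args, train_dataset=train_ds)
+#     apply_mlrun(trainer, model_name="sentiment-classifier")  # attaches the MLRun callback
+#     trainer.train()  # metrics and the model artifact are logged to the MLRun context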
+
+
+# ---------------------- from auto_trainer--------------------------------
+class KWArgsPrefixes:
+    MODEL_CLASS = "CLASS_"
+    FIT = "FIT_"
+    TRAIN = "TRAIN_"
+    PREDICT = "PREDICT_"
+
+
+def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
+    """
+    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
+    keys.
+
+    :param src:         The source dict to extract the values from.
+    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
+                        prefix.
+    """
+    return {
+        key.replace(prefix_key, ""): val
+        for key, val in src.items()
+        if key.startswith(prefix_key)
+    }
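+
+
+# Illustrative example (a sketch; the parameter values are hypothetical): given context parameters
+# such as {"TRAIN_num_train_epochs": 3, "CLASS_num_labels": 2}, calling
+# _get_sub_dict_by_prefix(src=params, prefix_key=KWArgsPrefixes.TRAIN) returns
+# {"num_train_epochs": 3}, ready to be passed to `transformers.TrainingArguments`.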
+
+
+def _get_dataframe(
+    context: MLClientCtx,
+    dataset: DataItem,
+    label_columns: Optional[Union[str, List[str]]] = None,
+    drop_columns: Union[str, List[str], int, List[int]] = None,
+) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
+    """
+    Getting the DataFrame of the dataset and drop the columns accordingly.
+
+    :param context:         MLRun context.
+    :param dataset:         The dataset to train the model on.
+                            Can be either a list of lists, dict, URI or a FeatureVector.
+    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
+                            Classification tasks.
+    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
+    """
+    if isinstance(dataset, (list, dict)):
+        dataset = pd.DataFrame(dataset)
+        # Checking if drop_columns provided by integer type:
+        if drop_columns:
+            if isinstance(drop_columns, str) or (
+                isinstance(drop_columns, list)
+                and any(isinstance(col, str) for col in drop_columns)
+            ):
+                context.logger.error(
+                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
+                )
+                raise ValueError
+            dataset.drop(drop_columns, axis=1, inplace=True)
+
+        return dataset, label_columns
+
+    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
+    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
+        # feature-vector case:
+        label_columns = label_columns or dataset.meta.status.label_column
+        dataset = fs.get_offline_features(
+            dataset.meta.uri, drop_columns=drop_columns
+        ).to_dataframe()
+
+        context.logger.info(f"label columns: {label_columns}")
+    else:
+        # simple URL case:
+        dataset = dataset.as_df()
+        if drop_columns:
+            if all(col in dataset for col in drop_columns):
+                dataset = dataset.drop(drop_columns, axis=1)
+            else:
+                context.logger.info(
+                    "not all of the requested columns to drop exist in the dataset, skipping the drop columns step"
+                )
+    return dataset, label_columns
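+
+
+# Illustrative call (a sketch; the column names are hypothetical):
+#
+#     df, label_columns = _get_dataframe(
+#         context=context,
+#         dataset=dataset,        # a DataItem holding a URI / FeatureVector, or an in-memory dict/list
+#         label_columns="label",
+#         drop_columns=["id"],
+#     )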
+
+
+# ---------------------- Hugging Face Trainer --------------------------------
+
+
+def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
+    """
+    This function create and returns a function that will be used to compute metrics at evaluation.
+    :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc.
+
+    :returns: Function that will be used to compute metrics at evaluation.
+             Must take a [`EvalPrediction`] and return a dictionary string to metric values.
+    """
+
+    def _compute_metrics(eval_pred):
+        logits, labels = eval_pred
+        predictions = np.argmax(logits, axis=-1)
+        metric_dict_results = {}
+        for metric in metrics:
+            load_met = load_metric(metric)
+            metric_res = load_met.compute(predictions=predictions, references=labels)[
+                metric
+            ]
+            metric_dict_results[metric] = metric_res
+
+        return metric_dict_results
+
+    return _compute_metrics
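+
+
+# Illustrative sketch of how the returned callable is used (the metric names are hypothetical):
+#
+#     compute_metrics = _create_compute_metrics(["accuracy", "f1"])
+#     trainer = Trainer(..., compute_metrics=compute_metrics)
+#     # during `trainer.evaluate()` each metric is loaded with `load_metric` and computed on the
+#     # argmax of the logits against the reference labels.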
+
+
+def _edit_columns(
+    dataset: Dataset,
+    drop_columns: List[str] = None,
+    rename_columns: Dict[str, str] = None,
+) -> Dataset:
+    """
+    Drop and rename the columns of the given dataset.
+
+    :param dataset:         Dataset to process.
+    :param drop_columns:    The columns to drop from the dataset.
+    :param rename_columns:  Dict of columns to rename: {<old_name>: <new_name>, ...}
+
+    :returns: The dataset after the requested edits.
+    """
+    if drop_columns:
+        dataset = dataset.remove_columns(drop_columns)
+    if rename_columns:
+        dataset = dataset.rename_columns(rename_columns)
+    return dataset
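+
+
+# Example (a sketch; the column names are hypothetical):
+#
+#     dataset = _edit_columns(dataset, drop_columns=["idx"], rename_columns={"sentiment": "labels"})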
+
+
+def _prepare_dataset(
+    context: MLClientCtx,
+    dataset_name: str,
+    label_name: str = None,
+    drop_columns: Optional[List[str]] = None,
+    num_of_train_samples: int = None,
+    train_test_split_size: float = None,
+    random_state: int = None,
+) -> Tuple[Dataset, Dataset]:
+    """
+    Loading the dataset and editing the columns
+
+    :param context:                 MLRun contex
+    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
+    :param label_name:              The target label of the column in the dataset.
+    :param drop_columns:            The columns to drop from the dataset.
+    :param num_of_train_samples:    Max number of training samples, for debugging.
+    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
+                                    in the test split.
+    :param random_state:            Random state for train_test_split
+
+    """
+
+    context.logger.info(
+        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
+    )
+    rename_cols = {label_name: "labels"}
+
+    # Loading and editing dataset:
+    dataset = load_dataset(dataset_name)
+
+    # train set
+    train_dataset = dataset["train"]
+    if num_of_train_samples:
+        train_dataset = train_dataset.shuffle(seed=random_state).select(
+            list(range(num_of_train_samples))
+        )
+    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)
+
+    # test set
+    test_dataset = dataset["test"]
+    if train_test_split_size or num_of_train_samples:
+        train_test_split_size = train_test_split_size or 0.2
+        num_of_test_samples = int(
+            (train_dataset.num_rows * train_test_split_size)
+            // (1 - train_test_split_size)
+        )
+        test_dataset = test_dataset.shuffle(seed=random_state).select(
+            list(range(num_of_test_samples))
+        )
+    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)
+
+    return train_dataset, test_dataset
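+
+
+# Illustrative call (a sketch - "imdb" and the column name below are hypothetical examples):
+#
+#     train_ds, test_ds = _prepare_dataset(
+#         context,
+#         dataset_name="imdb",
+#         label_name="label",           # renamed to "labels" for the Trainer
+#         num_of_train_samples=1000,    # subsample for quick debugging
+#         random_state=42,
+#     )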
+
+
+def train(
+    context: MLClientCtx,
+    hf_dataset: str = None,
+    dataset: DataItem = None,
+    test_set: DataItem = None,
+    drop_columns: Optional[List[str]] = None,
+    pretrained_tokenizer: str = None,
+    pretrained_model: str = None,
+    model_class: str = None,
+    model_name: str = "huggingface-model",
+    label_name: str = "labels",
+    text_col: str = "text",
+    num_of_train_samples: int = None,
+    train_test_split_size: float = None,
+    metrics: List[str] = None,
+    random_state: int = None,
+):
+    """
+    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
+    The dataset can be either be the name of the dataset that contains in the HuggingFace hub,
+    or a URI or a FeatureVector
+
+    :param context:                 MLRun context
+    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
+    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
+    :param test_set:                The test set to train the model with.
+    :param drop_columns:            The columns to drop from the dataset.
+    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
+    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
+    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
+    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
+    :param label_name:              The target label of the column in the dataset.
+    :param text_col:                The input text column un the dataset.
+    :param num_of_train_samples:    Max number of training samples, for debugging.
+    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
+                                    in the test split.
+    :param metrics:                 List of different metrics for evaluate the model such as f1, accuracy etc.
+    :param random_state:            Random state for train_test_split
+    """
+
+    if train_test_split_size is None and test_set is None:
+        context.logger.info(
+            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
+        )
+        train_test_split_size = 0.2
+
+    # Creating tokenizer:
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)
+
+    def preprocess_function(examples):
+        return tokenizer(examples[text_col], truncation=True)
+
+    # prepare data for training
+    if hf_dataset:
+        train_dataset, test_dataset = _prepare_dataset(
+            context,
+            hf_dataset,
+            label_name,
+            drop_columns,
+            num_of_train_samples,
+            train_test_split_size,
+            random_state=random_state,
+        )
+    elif dataset:
+        # Get DataFrame by URL or by FeatureVector:
+        train_dataset, label_name = _get_dataframe(
+            context=context,
+            dataset=dataset,
+            label_columns=label_name,
+            drop_columns=drop_columns,
+        )
+        if test_set:
+            test_dataset, _ = _get_dataframe(
+                context=context,
+                dataset=test_set,
+                label_columns=label_name,
+                drop_columns=drop_columns,
+            )
+        else:
+            train_dataset, test_dataset = train_test_split(
+                train_dataset,
+                test_size=train_test_split_size,
+                random_state=random_state,
+            )
+        train_dataset = Dataset.from_pandas(train_dataset)
+        test_dataset = Dataset.from_pandas(test_dataset)
+    else:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "Training data was not provided. A training dataset is mandatory for training."
+            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
+        )
+
+    # Mapping datasets with the tokenizer:
+    tokenized_train = train_dataset.map(preprocess_function, batched=True)
+    tokenized_test = test_dataset.map(preprocess_function, batched=True)
+
+    # Creating data collator for batching:
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # Parsing kwargs:
+    train_kwargs = _get_sub_dict_by_prefix(
+        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
+    )
+    model_class_kwargs = _get_sub_dict_by_prefix(
+        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
+    )
+
+    # Loading our pretrained model:
+    model_class_kwargs["pretrained_model_name_or_path"] = (
+        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
+    )
+    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
+    if not model_class_kwargs["pretrained_model_name_or_path"]:
+        raise mlrun.errors.MLRunRuntimeError(
+            "Must provide pretrained_model name as "
+            "function argument or in extra params"
+        )
+    model = create_class(model_class).from_pretrained(**model_class_kwargs)
+
+    # Preparing training arguments:
+    training_args = TrainingArguments(
+        **train_kwargs,
+    )
+
+    compute_metrics = _create_compute_metrics(metrics) if metrics else None
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_test,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+    )
+
+    apply_mlrun(trainer, model_name=model_name)
+
+    # Apply training with evaluation:
+    context.logger.info(f"training '{model_name}'")
+    trainer.train()
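+
+
+# A minimal sketch of running this handler through MLRun (the hub URL, dataset and model names are
+# hypothetical examples; exact parameters depend on your project setup). Note how `TRAIN_` and
+# `CLASS_` prefixed params are forwarded to `TrainingArguments` and to the model class respectively:
+#
+#     hf_trainer = mlrun.import_function("hub://hugging_face_classifier_trainer")
+#     hf_trainer.run(
+#         handler="train",
+#         params={
+#             "hf_dataset": "imdb",
+#             "pretrained_tokenizer": "distilbert-base-uncased",
+#             "pretrained_model": "distilbert-base-uncased",
+#             "model_class": "transformers.AutoModelForSequenceClassification",
+#             "label_name": "label",
+#             "metrics": ["accuracy"],
+#             "TRAIN_output_dir": "./results",
+#             "TRAIN_num_train_epochs": 1,
+#             "CLASS_num_labels": 2,
+#         },
+#         local=True,
+#     )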
+
+
+def _get_model_dir(model_uri: str):
+    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
+    model_dir = tempfile.gettempdir()
+    # Unzip the Model:
+    with zipfile.ZipFile(model_file, "r") as zip_file:
+        zip_file.extractall(model_dir)
+
+    return model_dir
+
+
+def optimize(
+    model_path: str,
+    model_name: str = "optimized_model",
+    target_dir: str = "./optimized",
+    optimization_level: int = 1,
+):
+    """
+    Optimizing the transformer model using ONNX optimization.
+
+
+    :param model_path:          The path of the model to optimize.
+    :param model_name:          Name of the optimized model.
+    :param target_dir:          The directory to save the ONNX model.
+    :param optimization_level:  Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)
+    """
+    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
+    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
+    from optimum.onnxruntime.configuration import OptimizationConfig
+
+    model_dir = _get_model_dir(model_uri=model_path)
+    # Creating configuration for optimization step:
+    optimization_config = OptimizationConfig(optimization_level=optimization_level)
+
+    # Converting our pretrained model to an ONNX-Runtime model:
+    ort_model = ORTModelForSequenceClassification.from_pretrained(
+        model_dir, from_transformers=True
+    )
+
+    # Creating an ONNX-Runtime optimizer from ONNX model:
+    optimizer = ORTOptimizer.from_pretrained(ort_model)
+
+    apply_mlrun(optimizer, model_name=model_name)
+    # Optimizing and saving the ONNX model:
+    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
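+
+
+# Illustrative follow-up (a sketch; the store URI below is a hypothetical placeholder): after
+# training, the logged model artifact can be optimized by calling the `optimize` handler with
+# the artifact's URI:
+#
+#     optimize(
+#         model_path="store://models/my-project/huggingface-model:latest",
+#         model_name="optimized_model",
+#         target_dir="./optimized",
+#     )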
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/hugging_face_classifier_trainer/latest/src/function.yaml b/functions/master/hugging_face_classifier_trainer/latest/src/function.yaml index eb223b2b..65f5aeb1 100644 --- a/functions/master/hugging_face_classifier_trainer/latest/src/function.yaml +++ b/functions/master/hugging_face_classifier_trainer/latest/src/function.yaml @@ -2,11 +2,13 @@ kind: job metadata: name: hugging-face-classifier-trainer tag: '' - hash: e8113e81f04c96fc9a8a94e717dea81ee3e05a18 + hash: f9d8aa4a2c66e24fa418bb163829adc3e2ada06c project: '' labels: author: davids categories: + - deep-learning + - huggingface - machine-learning - model-training spec: diff --git a/functions/master/hugging_face_classifier_trainer/latest/src/item.yaml b/functions/master/hugging_face_classifier_trainer/latest/src/item.yaml index 3c087765..332902b3 100755 --- a/functions/master/hugging_face_classifier_trainer/latest/src/item.yaml +++ b/functions/master/hugging_face_classifier_trainer/latest/src/item.yaml @@ -1,5 +1,7 @@ apiVersion: v1 categories: +- deep-learning +- huggingface - machine-learning - model-training description: Automatic train and optimize functions for HuggingFace framework @@ -28,4 +30,4 @@ spec: - datasets~=2.10.1 - scikit-learn~=1.0.2 url: '' -version: 0.2.0 +version: 0.3.0 diff --git a/functions/master/hugging_face_classifier_trainer/latest/static/function.html b/functions/master/hugging_face_classifier_trainer/latest/static/function.html index 2bf1ffb9..08fe9f21 100644 --- a/functions/master/hugging_face_classifier_trainer/latest/static/function.html +++ b/functions/master/hugging_face_classifier_trainer/latest/static/function.html @@ -19,11 +19,13 @@ metadata: name: hugging-face-classifier-trainer tag: '' - hash: e8113e81f04c96fc9a8a94e717dea81ee3e05a18 + hash: f9d8aa4a2c66e24fa418bb163829adc3e2ada06c project: '' labels: author: davids categories: + - deep-learning + - huggingface - machine-learning - model-training spec: diff --git a/functions/master/hugging_face_classifier_trainer/latest/static/item.html b/functions/master/hugging_face_classifier_trainer/latest/static/item.html index 7db7e49b..70a200c7 100644 --- a/functions/master/hugging_face_classifier_trainer/latest/static/item.html +++ b/functions/master/hugging_face_classifier_trainer/latest/static/item.html @@ -17,6 +17,8 @@ apiVersion: v1 categories: +- deep-learning +- huggingface - machine-learning - model-training description: Automatic train and optimize functions for HuggingFace framework @@ -45,7 +47,7 @@ - datasets~=2.10.1 - scikit-learn~=1.0.2 url: '' -version: 0.2.0 +version: 0.3.0 diff --git a/functions/master/hugging_face_serving/1.1.0/src/function.yaml b/functions/master/hugging_face_serving/1.1.0/src/function.yaml new file mode 100644 index 00000000..764fc1cf --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/src/function.yaml @@ -0,0 +1,46 @@ +kind: serving +metadata: + name: hugging-face-serving + tag: '' + hash: 1a489a57da861f129eb26e933f34e58927e41195 + project: '' + labels: + author: yonish + categories: + - huggingface + - genai + - model-serving + - machine-learning +spec: + command: '' + args: [] + image: mlrun/ml-models + build: + functionSourceCode: 
IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGh
lciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgIC
AgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - transformers==4.21.3 + - tensorflow==2.9.2 + description: Generic Hugging Face model server. + default_handler: '' + disable_auto_mount: false + clone_target_dir: '' + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled + priority_class_name: '' + preemption_mode: prevent + min_replicas: 1 + max_replicas: 4 + source: '' + function_kind: serving_v2 + function_handler: hugging_face_serving:handler + base_image_pull: false + default_class: HuggingFaceModelServer + secret_sources: [] + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/hugging_face_serving/1.1.0/src/hugging_face_serving.ipynb b/functions/master/hugging_face_serving/1.1.0/src/hugging_face_serving.ipynb new file mode 100644 index 00000000..94baf9ff --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/src/hugging_face_serving.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hugging Face 🤗 Serving" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing the Hugging Face 🤗 model serving function" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "serving_function = mlrun.import_function('function.yaml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding a pretrained model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "serving_function.add_model(\n", + " 'mymodel',\n", + " class_name='HuggingFaceModelServer',\n", + " model_path='123', # This is not used, just for enabling the process.\n", + " \n", + " task=\"sentiment-analysis\",\n", + " model_class=\"AutoModelForSequenceClassification\",\n", + " model_name=\"nlptown/bert-base-multilingual-uncased-sentiment\",\n", + " tokenizer_class=\"AutoTokenizer\",\n", + " tokenizer_name=\"nlptown/bert-base-multilingual-uncased-sentiment\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing the pipeline locally" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-07 08:54:42,419 [info] model mymodel was loaded\n", + "> 2022-09-07 08:54:42,420 [info] Loaded ['mymodel']\n" + ] + } + ], + "source": [ + "server = serving_function.to_mock_server()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prediction: [{'label': '5 stars', 
'score': 0.7272651791572571}]\n" + ] + } + ], + "source": [ + "result = server.test(\n", + " '/v2/models/mymodel',\n", + " body={\"inputs\": [\"Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.\"]}\n", + ")\n", + "print(f\"prediction: {result['outputs']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding a default model from 🤗" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "serving_function.add_model(\n", + " 'default-model',\n", + " class_name='HuggingFaceModelServer',\n", + " model_path='123', # This is not used, just for enabling the process.\n", + " \n", + " task=\"sentiment-analysis\",\n", + " framework='pt', # Use `pt` for pytorch and `tf` for tensorflow.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the pipeline to our k8s cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-07 08:54:42,487 [info] Starting remote function deploy\n", + "2022-09-07 08:54:43 (info) Deploying function\n", + "2022-09-07 08:54:43 (info) Building\n", + "2022-09-07 08:54:44 (info) Staging files and preparing base images\n", + "2022-09-07 08:54:44 (info) Building processor image\n", + "2022-09-07 08:56:29 (info) Build complete\n", + "> 2022-09-07 08:57:09,536 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-hugging-face-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-hugging-face-serving-default.default-tenant.app.yh43.iguazio-cd1.com/']}\n" + ] + }, + { + "data": { + "text/plain": [ + "'http://default-hugging-face-serving-default.default-tenant.app.yh43.iguazio-cd1.com/'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "serving_function.deploy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Infer our sentences through our model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-07 08:57:09,616 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-default-hugging-face-serving.default-tenant.svc.cluster.local:8080/v2/models/default-model/predict'}\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': 'f7753a17-fa84-44fa-9264-1dc65172d05c',\n", + " 'model_name': 'default-model',\n", + " 'outputs': [{'label': 'POSITIVE', 'score': 0.9993784427642822}]}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "serving_function.invoke(\n", + " path='v2/models/default-model/predict',\n", + " body={\"inputs\": [\"We are delighted that we can serve 🤗 Transformers with MLRun.\"]})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/functions/master/hugging_face_serving/1.1.0/src/hugging_face_serving.py b/functions/master/hugging_face_serving/1.1.0/src/hugging_face_serving.py new file mode 100644 index 00000000..06dc4207 --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/src/hugging_face_serving.py @@ -0,0 +1,129 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from abc import ABC +from importlib import import_module +from typing import List + +from transformers import pipeline + +import mlrun.serving + +PACKAGE_MODULE = "transformers" +SERIALIZABLE_TYPES = [dict, list, tuple, str, int, float] + + +class HuggingFaceModelServer(mlrun.serving.V2ModelServer, ABC): + """ + Hugging Face Model serving class, inheriting the V2ModelServer class for being initialized automatically by the + model server and be able to run locally as part of a nuclio serverless function, or as part of a real-time pipeline. + """ + + def __init__( + self, + context: mlrun.MLClientCtx, + name: str, + task: str, + model_path: str = None, + model_name: str = None, + model_class: str = None, + tokenizer_name: str = None, + tokenizer_class: str = None, + framework: str = None, + **class_args, + ): + """ + Initialize a serving class for a Hugging face model. + + :param context: The mlrun context to work with + :param name: The name of this server to be initialized + :param model_path: Not in use. When adding a model pass any string value + :param model_name: The model's name in the Hugging Face hub + e.g., `nlptown/bert-base-multilingual-uncased-sentiment` + :param model_class: The model's class type object which can be passed as the class's name (string). + Must be provided and to be matched with `model_name`. + e.g., `AutoModelForSequenceClassification` + :param tokenizer_name: The tokenizer's name in the Hugging Face hub + e.g., `nlptown/bert-base-multilingual-uncased-sentiment` + :param tokenizer_class: The model's class type object which can be passed as the class's name (string). + Must be provided and to be matched with `model_name`. + e.g., `AutoTokenizer` + :param framework: The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified + framework must be installed. + If no framework is specified, will default to the one currently installed. + If no framework is specified and both frameworks are installed, will default to the + framework of the `model`, or to PyTorch if no model is provided. 
+ :param class_args: - + """ + super(HuggingFaceModelServer, self).__init__( + context=context, + name=name, + model_path=model_path, + **class_args, + ) + self.task = task + self.model = None + self.tokenizer = None + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.framework = framework + self.pipe = None + + def load(self): + """load and initialize the model and/or other elements""" + if self.model_class: + model_object = getattr(import_module(PACKAGE_MODULE), self.model_class) + self.model = model_object.from_pretrained(self.model_name) + if self.tokenizer_class: + tokenizer_object = getattr( + import_module(PACKAGE_MODULE), self.tokenizer_class + ) + self.tokenizer = tokenizer_object.from_pretrained(self.tokenizer_name) + self.pipe = pipeline( + task=self.task, + model=self.model or self.model_name, + tokenizer=self.tokenizer, + framework=self.framework, + ) + + def predict(self, body: dict) -> List: + """Generate model predictions from sample.""" + if self.pipe is None: + raise ValueError("Please use `.load()`") + try: + if isinstance(body["inputs"][0], dict): + result = [self.pipe(**_input) for _input in body["inputs"]] + else: + result = self.pipe(body["inputs"]) + # replace list of lists of dicts into a list of dicts: + if all(isinstance(res, list) for res in result): + new_result = [res[0] for res in result] + result = new_result + + non_serializable_types = [] + for res in result: + for key, val in res.items(): + if type(val) not in SERIALIZABLE_TYPES: + non_serializable_types.append(str(type(val))) + res[key] = str(val) + if non_serializable_types: + self.context.logger.info( + f"Non-serializable types: {non_serializable_types} were casted to strings" + ) + except Exception as e: + raise Exception("Failed to predict %s" % e) + return result diff --git a/functions/master/hugging_face_serving/1.1.0/src/item.yaml b/functions/master/hugging_face_serving/1.1.0/src/item.yaml new file mode 100644 index 00000000..d1f78769 --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/src/item.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +categories: +- huggingface +- genai +- model-serving +- machine-learning +description: Generic Hugging Face model server. 
+doc: '' +example: hugging_face_serving.ipynb +generationDate: 2022-09-05:17-00 +hidden: false +icon: '' +labels: + author: yonish +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.1.0 +name: hugging_face_serving +platformVersion: '' +spec: + customFields: + default_class: HuggingFaceModelServer + filename: hugging_face_serving.py + handler: handler + image: mlrun/ml-models + kind: serving + requirements: + - transformers==4.21.3 + - tensorflow==2.9.2 +url: '' +version: 1.1.0 +test_valid: false \ No newline at end of file diff --git a/functions/master/hugging_face_serving/1.1.0/src/requirements.txt b/functions/master/hugging_face_serving/1.1.0/src/requirements.txt new file mode 100644 index 00000000..56d9116d --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/src/requirements.txt @@ -0,0 +1,2 @@ +transformers +numpy diff --git a/functions/master/hugging_face_serving/1.1.0/src/test_hugging_face_serving.py b/functions/master/hugging_face_serving/1.1.0/src/test_hugging_face_serving.py new file mode 100644 index 00000000..6fdc02dd --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/src/test_hugging_face_serving.py @@ -0,0 +1,119 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import numpy as np +import pytest + +import mlrun + +CLASS_NAME = "HuggingFaceModelServer" + +PIPELINES = [ + { + "task": "sentiment-analysis", + "example": "We are very happy to show you the 🤗 Transformers library.", + "result_keys": ["label", "score"], + }, + { + "task": "text-generation", + "example": { + "text_inputs": "Hello, I'm a language model", + "max_length": 20, + "num_return_sequences": 1, + }, + "result_keys": ["generated_text"], + }, + { + "task": "ner", + "example": "My name is Wolfgang", + "result_keys": ["entity", "score", "index", "word", "start", "end"], + }, + { + "task": "question-answering", + "example": { + "question": "Where do I live?", + "context": "My name is Merve and I live in İstanbul.", + }, + "result_keys": ["score", "start", "end", "answer"], + }, + { + "task": "fill-mask", + "example": "Paris is the of France.", + "result_keys": ["score", "token", "token_str", "sequence"], + }, + { + "task": "summarization", + "example": "Paris is the capital and most populous city of France," + " with an estimated population of 2,175,601 residents as of 2018," + " in an area of more than 105 square kilometres (41 square miles)." 
+ " The City of Paris is the centre and seat of government of the region" + " and province of Île-de-France, or Paris Region, which has an estimated population of 12,174,880," + " or about 18 percent of the population of France as of 2017.", + "result_keys": ["summary_text"], + }, + { + "task": "translation_en_to_fr", + "example": "How old are you?", + "result_keys": ["translation_text"], + }, +] + + +@pytest.mark.parametrize("pipeline", PIPELINES) +def test_default_models(pipeline): + serving_function = mlrun.import_function("function.yaml") + serving_function.add_model( + pipeline["task"], + class_name=CLASS_NAME, + model_path="123", # This is not used, just for enabling the process. + task=pipeline["task"], + ) + server = serving_function.to_mock_server() + result = server.test( + f'/v2/models/{pipeline["task"]}', body={"inputs": [pipeline["example"]]} + ) + prediction = result["outputs"][0] + assert all( + result_key in prediction.keys() for result_key in pipeline["result_keys"] + ) + + +def test_local_model_serving(): + + serving_function = mlrun.import_function("function.yaml") + + # Adding model: + serving_function.add_model( + "model1", + class_name=CLASS_NAME, + model_path="123", # This is not used, just for enabling the process. + task="sentiment-analysis", + model_class="TFAutoModelForSequenceClassification", + model_name="nlptown/bert-base-multilingual-uncased-sentiment", + tokenizer_class="AutoTokenizer", + tokenizer_name="nlptown/bert-base-multilingual-uncased-sentiment", + ) + + server = serving_function.to_mock_server() + result = server.test( + "/v2/models/model1", + body={ + "inputs": [ + "Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers." + ] + }, + ) + + prediction = result["outputs"][0] + assert prediction["label"] == "5 stars" and np.isclose(prediction["score"], 0.72727) diff --git a/functions/master/hugging_face_serving/1.1.0/static/documentation.html b/functions/master/hugging_face_serving/1.1.0/static/documentation.html new file mode 100644 index 00000000..ab2b2bea --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/static/documentation.html @@ -0,0 +1,239 @@ + + + + + + + +hugging_face_serving package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ +
+

hugging_face_serving package

+ +
+ +
+
+
+
+
+

hugging_face_serving package#

+
+

Submodules#

+
+
+

hugging_face_serving.hugging_face_serving module#

+
+
+class hugging_face_serving.hugging_face_serving.HuggingFaceModelServer(context: mlrun.execution.MLClientCtx, name: str, task: str, model_path: Optional[str] = None, model_name: Optional[str] = None, model_class: Optional[str] = None, tokenizer_name: Optional[str] = None, tokenizer_class: Optional[str] = None, framework: Optional[str] = None, **class_args)[source]#
+

Bases: mlrun.serving.v2_serving.V2ModelServer, abc.ABC

+

Hugging Face Model serving class, inheriting the V2ModelServer class for being initialized automatically by the +model server and be able to run locally as part of a nuclio serverless function, or as part of a real-time pipeline.

+
+
+load()[source]#
+

load and initialize the model and/or other elements

+
+
+
+predict(body: dict)List[source]#
+

Generate model predictions from sample.

+
+
+
+
+

Module contents#

+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/hugging_face_serving/1.1.0/static/example.html b/functions/master/hugging_face_serving/1.1.0/static/example.html new file mode 100644 index 00000000..19e1859b --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/static/example.html @@ -0,0 +1,395 @@ + + + + + + + +Hugging Face 🤗 Serving + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ + +
+
+
+

Hugging Face 🤗 Serving#

+
+
+
import mlrun
+
+
+
+
+
+

Importing the Hugging Face 🤗 model serving function#

+
+
+
serving_function = mlrun.import_function('function.yaml')
+
+
+
+
+
+
+

Adding a pretrained model#

+
+
+
serving_function.add_model(
+    'mymodel',
+    class_name='HuggingFaceModelServer',
+    model_path='123',  # This is not used, just for enabling the process.
+    
+    task="sentiment-analysis",
+    model_class="AutoModelForSequenceClassification",
+    model_name="nlptown/bert-base-multilingual-uncased-sentiment",
+    tokenizer_class="AutoTokenizer",
+    tokenizer_name="nlptown/bert-base-multilingual-uncased-sentiment",
+)
+
+
+
+
+
<mlrun.serving.states.TaskStep at 0x7fc3ec3a7a50>
+
+
+
+
+
+
+

Testing the pipeline locally#

+
+
+
server = serving_function.to_mock_server()
+
+
+
+
+
> 2022-09-07 08:54:42,419 [info] model mymodel was loaded
+> 2022-09-07 08:54:42,420 [info] Loaded ['mymodel']
+
+
+
+
+
+
+
result = server.test(
+    '/v2/models/mymodel',
+    body={"inputs": ["Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers."]}
+)
+print(f"prediction: {result['outputs']}")
+
+
+
+
+
prediction: [{'label': '5 stars', 'score': 0.7272651791572571}]
+
+
+
+
+
+
+

Adding a default model from 🤗#

+
+
+
serving_function.add_model(
+    'default-model',
+    class_name='HuggingFaceModelServer',
+    model_path='123',  # This is not used, just for enabling the process.
+    
+    task="sentiment-analysis",
+    framework='pt', # Use `pt` for pytorch and `tf` for tensorflow.
+)
+
+
+
+
+
<mlrun.serving.states.TaskStep at 0x7fc2d3472f10>
+
+
+
+
+
+
+

Deploy the pipeline to our k8s cluster#

+
+
+
serving_function.deploy()
+
+
+
+
+
> 2022-09-07 08:54:42,487 [info] Starting remote function deploy
+2022-09-07 08:54:43  (info) Deploying function
+2022-09-07 08:54:43  (info) Building
+2022-09-07 08:54:44  (info) Staging files and preparing base images
+2022-09-07 08:54:44  (info) Building processor image
+2022-09-07 08:56:29  (info) Build complete
+> 2022-09-07 08:57:09,536 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-hugging-face-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-hugging-face-serving-default.default-tenant.app.yh43.iguazio-cd1.com/']}
+
+
+
'http://default-hugging-face-serving-default.default-tenant.app.yh43.iguazio-cd1.com/'
+
+
+
+
+
+
+

Infer our sentences through our model#

+
+
+
serving_function.invoke(
+    path='v2/models/default-model/predict',
+    body={"inputs": ["We are delighted that we can serve 🤗 Transformers with MLRun."]})
+
+
+
+
+
> 2022-09-07 08:57:09,616 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-default-hugging-face-serving.default-tenant.svc.cluster.local:8080/v2/models/default-model/predict'}
+
+
+
{'id': 'f7753a17-fa84-44fa-9264-1dc65172d05c',
+ 'model_name': 'default-model',
+ 'outputs': [{'label': 'POSITIVE', 'score': 0.9993784427642822}]}
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/hugging_face_serving/1.1.0/static/function.html b/functions/master/hugging_face_serving/1.1.0/static/function.html new file mode 100644 index 00000000..5163d6a0 --- /dev/null +++ b/functions/master/hugging_face_serving/1.1.0/static/function.html @@ -0,0 +1,68 @@ + + + + + + + + + + + Source + + + + +
+        
+kind: serving
+metadata:
+  name: hugging-face-serving
+  tag: ''
+  hash: 1a489a57da861f129eb26e933f34e58927e41195
+  project: ''
+  labels:
+    author: yonish
+  categories:
+  - huggingface
+  - genai
+  - model-serving
+  - machine-learning
+spec:
+  command: ''
+  args: []
+  image: mlrun/ml-models
+  build:
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW
1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQub
G9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK
+    commands: []
+    code_origin: ''
+    origin_filename: ''
+    requirements:
+    - transformers==4.21.3
+    - tensorflow==2.9.2
+  description: Generic Hugging Face model server.
+  default_handler: ''
+  disable_auto_mount: false
+  clone_target_dir: ''
+  env:
+  - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK
+    value: enabled
+  priority_class_name: ''
+  preemption_mode: prevent
+  min_replicas: 1
+  max_replicas: 4
+  source: ''
+  function_kind: serving_v2
+  function_handler: hugging_face_serving:handler
+  base_image_pull: false
+  default_class: HuggingFaceModelServer
+  secret_sources: []
+  affinity: null
+  tolerations: null
+  security_context: {}
+verbose: false
+
+        
+    
\ No newline at end of file
diff --git a/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html b/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html
new file mode 100644
index 00000000..b87689f0
--- /dev/null
+++ b/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html
@@ -0,0 +1,269 @@
+<!-- Rendered documentation page "Source code for hugging_face_serving.hugging_face_serving"; navigation and layout markup omitted -->
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from abc import ABC
+from importlib import import_module
+from typing import List
+
+from transformers import pipeline
+
+import mlrun.serving
+
+PACKAGE_MODULE = "transformers"
+SERIALIZABLE_TYPES = [dict, list, tuple, str, int, float]
+
+
+
+<!-- Page body: the HuggingFaceModelServer class (__init__, load, predict), identical to the
+     function source rendered in static/source.html later in this diff -->
\ No newline at end of file
diff --git a/functions/master/hugging_face_serving/1.1.0/static/item.html b/functions/master/hugging_face_serving/1.1.0/static/item.html
new file mode 100644
index 00000000..6dc1e112
--- /dev/null
+++ b/functions/master/hugging_face_serving/1.1.0/static/item.html
@@ -0,0 +1,53 @@
+<!-- Rendered "Source" item page; navigation and layout markup omitted. The embedded item spec follows: -->
+apiVersion: v1
+categories:
+- huggingface
+- genai
+- model-serving
+- machine-learning
+description: Generic Hugging Face model server.
+doc: ''
+example: hugging_face_serving.ipynb
+generationDate: 2022-09-05:17-00
+hidden: false
+icon: ''
+labels:
+  author: yonish
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.1.0
+name: hugging_face_serving
+platformVersion: ''
+spec:
+  customFields:
+    default_class: HuggingFaceModelServer
+  filename: hugging_face_serving.py
+  handler: handler
+  image: mlrun/ml-models
+  kind: serving
+  requirements:
+  - transformers==4.21.3
+  - tensorflow==2.9.2
+url: ''
+version: 1.1.0
+test_valid: false
\ No newline at end of file
diff --git a/functions/master/hugging_face_serving/1.1.0/static/source.html b/functions/master/hugging_face_serving/1.1.0/static/source.html
new file mode 100644
index 00000000..2826f166
--- /dev/null
+++ b/functions/master/hugging_face_serving/1.1.0/static/source.html
@@ -0,0 +1,151 @@
+<!-- Rendered "Source" page; navigation and layout markup omitted. The embedded function source follows: -->
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from abc import ABC
+from importlib import import_module
+from typing import List
+
+from transformers import pipeline
+
+import mlrun.serving
+
+PACKAGE_MODULE = "transformers"
+SERIALIZABLE_TYPES = [dict, list, tuple, str, int, float]
+
+
+class HuggingFaceModelServer(mlrun.serving.V2ModelServer, ABC):
+    """
+    Hugging Face model serving class. It inherits the V2ModelServer class so it is initialized automatically by the
+    model server and can run locally as part of a Nuclio serverless function, or as part of a real-time pipeline.
+    """
+
+    def __init__(
+        self,
+        context: mlrun.MLClientCtx,
+        name: str,
+        task: str,
+        model_path: str = None,
+        model_name: str = None,
+        model_class: str = None,
+        tokenizer_name: str = None,
+        tokenizer_class: str = None,
+        framework: str = None,
+        **class_args,
+    ):
+        """
+        Initialize a serving class for a Hugging face model.
+
+        :param context:         The mlrun context to work with
+        :param name:            The name of this server to be initialized
+        :param model_path:      Not in use. When adding a model pass any string value
+        :param model_name:      The model's name in the Hugging Face hub
+                                e.g., `nlptown/bert-base-multilingual-uncased-sentiment`
+        :param model_class:     The model's class type object which can be passed as the class's name (string).
+                                Must be provided and must match `model_name`.
+                                e.g., `AutoModelForSequenceClassification`
+        :param tokenizer_name:  The tokenizer's name in the Hugging Face hub
+                                e.g., `nlptown/bert-base-multilingual-uncased-sentiment`
+        :param tokenizer_class: The tokenizer's class type object which can be passed as the class's name (string).
+                                Must be provided and must match `tokenizer_name`.
+                                e.g., `AutoTokenizer`
+        :param framework:       The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified
+                                framework must be installed.
+                                If no framework is specified, will default to the one currently installed.
+                                If no framework is specified and both frameworks are installed, will default to the
+                                framework of the `model`, or to PyTorch if no model is provided.
+        :param class_args:      -
+        """
+        super(HuggingFaceModelServer, self).__init__(
+            context=context,
+            name=name,
+            model_path=model_path,
+            **class_args,
+        )
+        self.task = task
+        self.model = None
+        self.tokenizer = None
+        self.model_name = model_name
+        self.tokenizer_name = tokenizer_name
+        self.model_class = model_class
+        self.tokenizer_class = tokenizer_class
+        self.framework = framework
+        self.pipe = None
+
+    def load(self):
+        """load and initialize the model and/or other elements"""
+        if self.model_class:
+            model_object = getattr(import_module(PACKAGE_MODULE), self.model_class)
+            self.model = model_object.from_pretrained(self.model_name)
+        if self.tokenizer_class:
+            tokenizer_object = getattr(
+                import_module(PACKAGE_MODULE), self.tokenizer_class
+            )
+            self.tokenizer = tokenizer_object.from_pretrained(self.tokenizer_name)
+        self.pipe = pipeline(
+            task=self.task,
+            model=self.model or self.model_name,
+            tokenizer=self.tokenizer,
+            framework=self.framework,
+        )
+
+    def predict(self, body: dict) -> List:
+        """Generate model predictions from sample."""
+        if self.pipe is None:
+            raise ValueError("Please use `.load()`")
+        try:
+            if isinstance(body["inputs"][0], dict):
+                result = [self.pipe(**_input) for _input in body["inputs"]]
+            else:
+                result = self.pipe(body["inputs"])
+            # flatten a list of lists of dicts into a list of dicts:
+            if all(isinstance(res, list) for res in result):
+                new_result = [res[0] for res in result]
+                result = new_result
+
+            non_serializable_types = []
+            for res in result:
+                for key, val in res.items():
+                    if type(val) not in SERIALIZABLE_TYPES:
+                        non_serializable_types.append(str(type(val)))
+                        res[key] = str(val)
+            if non_serializable_types:
+                self.context.logger.info(
+                    f"Non-serializable types: {non_serializable_types} were casted to strings"
+                )
+        except Exception as e:
+            raise Exception("Failed to predict %s" % e)
+        return result
+
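+# --- Illustrative usage (editor's sketch, not part of the packaged function). Assumes an
+# MLRun environment where this hub function is available; the model key, model names and
+# request values below are examples only. ---
+#
+#   import mlrun
+#
+#   serving_fn = mlrun.import_function("hub://hugging_face_serving")
+#   serving_fn.add_model(
+#       "mymodel",
+#       class_name="HuggingFaceModelServer",
+#       model_path="any-string",  # not used by this server, but required when adding a model
+#       task="sentiment-analysis",
+#       model_name="nlptown/bert-base-multilingual-uncased-sentiment",
+#       model_class="AutoModelForSequenceClassification",
+#       tokenizer_name="nlptown/bert-base-multilingual-uncased-sentiment",
+#       tokenizer_class="AutoTokenizer",
+#   )
+#   server = serving_fn.to_mock_server()
+#   server.test("/v2/models/mymodel/infer", body={"inputs": ["MLRun is great!"]})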
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/hugging_face_serving/latest/src/function.yaml b/functions/master/hugging_face_serving/latest/src/function.yaml index e1bb3b0c..764fc1cf 100644 --- a/functions/master/hugging_face_serving/latest/src/function.yaml +++ b/functions/master/hugging_face_serving/latest/src/function.yaml @@ -2,11 +2,13 @@ kind: serving metadata: name: hugging-face-serving tag: '' - hash: 39bfca7b639022fa03f5ca87f85f9e17fc837b70 + hash: 1a489a57da861f129eb26e933f34e58927e41195 project: '' labels: author: yonish categories: + - huggingface + - genai - model-serving - machine-learning spec: @@ -14,37 +16,28 @@ spec: args: [] image: mlrun/ml-models build: - commands: - - python -m pip install transformers==4.21.3 tensorflow==2.9.2 - code_origin: https://github.com/mlrun/functions.git#250244b2527c5ce8a82438b4340df34de6e19dc3:/Users/yonatanshelach/yoni/projects/functions/hugging_face_serving/hugging_face_serving.py - origin_filename: /Users/yonatanshelach/yoni/projects/functions/hugging_face_serving/hugging_face_serving.py + functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICA
gICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC
5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - transformers==4.21.3 + - tensorflow==2.9.2 description: Generic Hugging Face model server. 
- default_handler: handler + default_handler: '' disable_auto_mount: false - env: [] + clone_target_dir: '' + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled priority_class_name: '' preemption_mode: prevent min_replicas: 1 max_replicas: 4 - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: hugging-face-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /Users/yonatanshelach/yoni/projects/functions/hugging_face_serving/hugging_face_serving.py - spec: - runtime: python - handler: hugging_face_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuL
CBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFs
bChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK source: '' function_kind: serving_v2 + function_handler: hugging_face_serving:handler + base_image_pull: false default_class: HuggingFaceModelServer secret_sources: [] affinity: null diff --git a/functions/master/hugging_face_serving/latest/src/item.yaml b/functions/master/hugging_face_serving/latest/src/item.yaml index f7fa9263..d1f78769 100644 --- a/functions/master/hugging_face_serving/latest/src/item.yaml +++ b/functions/master/hugging_face_serving/latest/src/item.yaml @@ -1,5 +1,7 @@ apiVersion: v1 categories: +- huggingface +- genai - model-serving - machine-learning description: Generic Hugging Face model server. 
@@ -26,4 +28,5 @@ spec: - transformers==4.21.3 - tensorflow==2.9.2 url: '' -version: 1.0.0 +version: 1.1.0 +test_valid: false \ No newline at end of file diff --git a/functions/master/hugging_face_serving/latest/static/function.html b/functions/master/hugging_face_serving/latest/static/function.html index c155e675..5163d6a0 100644 --- a/functions/master/hugging_face_serving/latest/static/function.html +++ b/functions/master/hugging_face_serving/latest/static/function.html @@ -19,11 +19,13 @@ metadata: name: hugging-face-serving tag: '' - hash: 39bfca7b639022fa03f5ca87f85f9e17fc837b70 + hash: 1a489a57da861f129eb26e933f34e58927e41195 project: '' labels: author: yonish categories: + - huggingface + - genai - model-serving - machine-learning spec: @@ -31,37 +33,28 @@ args: [] image: mlrun/ml-models build: - commands: - - python -m pip install transformers==4.21.3 tensorflow==2.9.2 - code_origin: https://github.com/mlrun/functions.git#250244b2527c5ce8a82438b4340df34de6e19dc3:/Users/yonatanshelach/yoni/projects/functions/hugging_face_serving/hugging_face_serving.py - origin_filename: /Users/yonatanshelach/yoni/projects/functions/hugging_face_serving/hugging_face_serving.py + functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICA
gICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZn
JvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - transformers==4.21.3 + - tensorflow==2.9.2 description: Generic Hugging Face model server. 
- default_handler: handler + default_handler: '' disable_auto_mount: false - env: [] + clone_target_dir: '' + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled priority_class_name: '' preemption_mode: prevent min_replicas: 1 max_replicas: 4 - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: hugging-face-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /Users/yonatanshelach/yoni/projects/functions/hugging_face_serving/hugging_face_serving.py - spec: - runtime: python - handler: hugging_face_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuL
CBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFs
bChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK source: '' function_kind: serving_v2 + function_handler: hugging_face_serving:handler + base_image_pull: false default_class: HuggingFaceModelServer secret_sources: [] affinity: null diff --git a/functions/master/hugging_face_serving/latest/static/item.html b/functions/master/hugging_face_serving/latest/static/item.html index 70d12db6..6dc1e112 100644 --- a/functions/master/hugging_face_serving/latest/static/item.html +++ b/functions/master/hugging_face_serving/latest/static/item.html @@ -17,6 +17,8 @@ apiVersion: v1 categories: +- huggingface +- genai - model-serving - machine-learning description: Generic Hugging Face model server. @@ -43,8 +45,8 @@ - transformers==4.21.3 - tensorflow==2.9.2 url: '' -version: 1.0.0 - +version: 1.1.0 +test_valid: false diff --git a/functions/master/huggingface_auto_trainer/1.1.0/src/function.yaml b/functions/master/huggingface_auto_trainer/1.1.0/src/function.yaml new file mode 100644 index 00000000..702a8401 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/src/function.yaml @@ -0,0 +1,327 @@ +kind: job +metadata: + name: huggingface-auto-trainer + tag: '' + hash: 55c9aa4a822780f7388819ccf633dfe26b31f02e + project: '' + labels: + author: Zeevr + categories: + - huggingface + - genai + - machine-learning + - model-training +spec: + command: '' + args: [] + image: mlrun/mlrun + build: + functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    lora = "lora"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    data_collator = "data_collator"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face `Trainer` API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: Trainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: Trainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: transformers.Trainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------
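
# Illustrative usage of `apply_mlrun` above (editor's sketch; the `model`, `training_args`
# and `dataset` objects are assumed to already exist, e.g. built with the transformers API):
#
#   trainer = transformers.Trainer(model=model, args=training_args, train_dataset=dataset)
#   apply_mlrun(trainer=trainer, model_name="my-llm", context=context)
#   trainer.train()  # metrics are now logged to the MLRun context via MLRunCallback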


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# Default configs.
# Used when the user passes `True` for the corresponding config argument.
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

LORA_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects all keys from the given dict that
    start with the prefix, and adds them to the appropriate config.

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use the default config
        # Can also be False or a config dict given from the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = LORA_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we get a boolean value; in that case there is no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # otherwise it is a config object, so update its attributes by name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})
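
# Illustrative example (a sketch, not executed in this module): keyword arguments whose
# names are prefixed with a config name are folded into that config by `_update_config`:
#
#   configs = {"lora": True, "training": {}}
#   _update_config(src={"lora_r": 16, "training_learning_rate": 2e-4}, dst=configs)
#   # configs["lora"] is now the default LORA_CONFIG with r == 16,
#   # and configs["training"] == {"learning_rate": 2e-4}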


def _get_class_object(class_path: str) -> type:
    """
    Given a full class path, return the corresponding class object.

    :param class_path: a full class path (e.g. 'transformers.AutoModelForCausalLM')

    :returns: the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)
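
# For example (illustrative):
#   _get_class_object("transformers.AutoModelForCausalLM") is transformers.AutoModelForCausalLM  # True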


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    lora_config: dict,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param lora_config: lora config or None, to load model in appropriate way
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model loading/training when using multiple GPUs

    :returns: model and tokenizer
    """
    # if the task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # if we don't get the model class, we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # the device map is only relevant when running on GPU
    device_map = device_map if use_cuda else None

    # load the pretrained model
    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If a quantization config is given the model was loaded quantized, so prepare it for k-bit training
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # If a LoRA config was given we want to do LoRA fine-tuning, so we wrap the model here
    if lora_config:
        model = peft.get_peft_model(model, lora_config)

    # if not specified we choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer
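
# Illustrative call (a sketch with inputs borrowed from the example notebook; nothing here
# is executed at import time):
#
#   model_name, model, tokenizer = _set_model_and_tokenizer(
#       model=["tiiuae/falcon-7b", "transformers.AutoModelForCausalLM"],
#       tokenizer="tiiuae/falcon-7b",
#       task="text-generation",
#       framework="pt",
#       lora_config=LORA_CONFIG,
#       quantization_config=None,
#       use_cuda=False,
#       tokenizer_pretrained_config={},
#       model_pretrained_config={"trust_remote_code": True},
#       device_map="auto",
#   )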


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a training dataset we load the "train" split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and
    returns them ready to use in training.

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # wrap the column name(s) in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given then we must check whether it contains both datasets or only one
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset
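
# Illustrative call (a sketch; the dataset name and column come from the example notebook):
#
#   tokenized_train, tokenized_eval = _prepare_dataset(
#       train_dataset="Abirate/english_quotes",
#       eval_dataset=None,
#       train_load_dataset_kwargs={},
#       eval_load_dataset_kwargs={},
#       tokenizer=tokenizer,
#       dataset_columns_to_train="quote",
#   )
#   # "Abirate/english_quotes" ships only a "train" split, so tokenized_eval is None here.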


def finetune_llm(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    lora_config: Union[dict, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param device_map: a device map for model loading/training when using multiple GPUs
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param lora_config: Configuration options for Low-Rank Adaptation (LoRA) (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: The NLP task the model is being fine-tuned for (used to choose a default model when one is not given).
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.lora: lora_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        lora_config=configs[ConfigKeys.lora],
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    # Use mkdtemp so the directory is not cleaned up before the model is saved into it:
    temp_directory = tempfile.mkdtemp()
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )
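
# Illustrative run (a sketch mirroring the example notebook; the project, function and image
# names are assumptions):
#
#   project = mlrun.get_or_create_project(name="auto-trainer-test", context="./", user_project=True)
#   project.set_function(
#       "huggingface_auto_trainer.py", name="auto-trainer", kind="job",
#       image="mlrun/mlrun", handler="finetune_llm",
#   )
#   training_run = mlrun.run_function(
#       function="auto-trainer",
#       name="auto-trainer",
#       local=True,
#       params={
#           "model": ("tiiuae/falcon-7b", "transformers.AutoModelForCausalLM"),
#           "tokenizer": "tiiuae/falcon-7b",
#           "train_dataset": "Abirate/english_quotes",
#           "training_config": {"per_device_train_batch_size": 4, "max_steps": 10},
#           "quantization_config": True,
#           "lora_config": True,
#           "dataset_columns_to_train": "quote",
#       },
#       handler="finetune_llm",
#       outputs=["model"],
#   )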


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 + commands: [] + code_origin: '' + origin_filename: '' + requirements: [] + entry_points: + add_interface: + name: add_interface + doc: '' + parameters: + - name: cls + - name: obj + type: Trainer + - name: restoration + type: MLRunInterfaceRestorationType + default: null + outputs: [] + lineno: 70 + has_varargs: false + has_kwargs: false + mlrun_train: + name: mlrun_train + doc: '' + parameters: + - name: cls + outputs: [] + lineno: 80 + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + parameters: + - name: self + type: Trainer + outputs: [] + lineno: 81 + has_varargs: true + has_kwargs: true + on_epoch_begin: + name: on_epoch_begin + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 129 + has_varargs: false + has_kwargs: true + on_epoch_end: + name: on_epoch_end + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 140 + has_varargs: false + has_kwargs: true + on_log: + name: on_log + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + - name: logs + type: Dict[str, float] + default: null + outputs: [] + lineno: 151 + has_varargs: false + has_kwargs: true + on_train_begin: + name: on_train_begin + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 177 + has_varargs: false + has_kwargs: true + on_train_end: + name: on_train_end + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + - name: model + type: PreTrainedModel + default: null + - name: tokenizer + type: PreTrainedTokenizer + default: null + outputs: [] + lineno: 188 + has_varargs: false + has_kwargs: true + on_evaluate: + name: on_evaluate + doc: '' + parameters: + - name: self + - name: args + type: TrainingArguments + - name: state + type: TrainerState + - name: control + type: TrainerControl + outputs: [] + lineno: 201 + has_varargs: false + has_kwargs: true + log_metrics: + name: log_metrics + doc: '' + parameters: + - name: self + outputs: [] + lineno: 215 + has_varargs: false + has_kwargs: false + log_metric_plot: + name: log_metric_plot + doc: '' + parameters: + - name: self + - name: name + type: str + - name: scores + type: List[float] + outputs: [] + lineno: 222 + has_varargs: false + has_kwargs: false + apply_mlrun: + name: apply_mlrun + doc: This is temporary and will be built in mlrun 1.5.0 + parameters: + - name: trainer + type: Trainer + - name: model_name + type: str + default: null + - name: tag + type: str + default: '' + - name: context + type: MLClientCtx + default: null + - name: auto_log + type: bool + default: true + - name: labels + type: Dict[str, str] + default: null + - name: extra_data + type: dict + default: null + outputs: [] + lineno: 244 + has_varargs: false + has_kwargs: true + finetune_llm: + name: finetune_llm + doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ + \ dataset.\n The function takes various configuration parameters to customize\ + \ the training process\n and adapt the model to specific tasks using a provided\ + \ dataset." 
+ parameters: + - name: context + type: MLClientCtx + doc: mlrun context in order to log trained model + - name: train_dataset + type: Union[str, mlrun.datastore.DataItem] + doc: The train dataset used for fine-tuning the language model. + - name: eval_dataset + type: str + doc: The eval dataset used for evaluate the language model during training. + default: null + - name: train_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: eval_load_dataset_kwargs + type: dict + doc: kwargs for dataset loading + default: {} + - name: dataset_columns_to_train + type: Union[str, list] + doc: which columns to pass to the model as inputs + default: text + - name: model + type: Union[str, List[str]] + doc: a tuple containing model name and class, or str with model name or path + default: huggingface-model + - name: tokenizer + type: Union[str, List[str]] + doc: a tuple containing tokenizer name and class, or str with tokenizer name + or path + default: null + - name: deepspeed_config + type: Union[dict, bool] + doc: Configuration options for DeepSpeed (optional). + default: false + - name: quantization_config + type: Union[dict, bool] + doc: Configuration options for model quantization (optional). + default: false + - name: lora_config + type: Union[dict, bool] + doc: Configuration options for Low-Rank Approximation (LoRA) (optional). + default: false + - name: training_config + type: dict + doc: Configuration options specific to the fine-tuning training process (optional). + default: {} + - name: model_pretrained_config + type: dict + doc: config to load the pretrained model + default: {} + - name: tokenizer_pretrained_config + type: dict + doc: config to load the pretrained tokenizer + default: {} + - name: data_collator_config + type: dict + doc: Configuration options for data collation during training (optional). + default: {} + - name: task + type: str + doc: A description of the specific task the model is being fine-tuned for. 
+ default: text-generation + - name: use_cuda + type: bool + doc: use gpu or not + default: true + - name: framework + type: str + doc: pt ot tf + default: pt + - name: device_map + type: str + default: auto + outputs: [] + lineno: 630 + has_varargs: false + has_kwargs: true + evaluate: + name: evaluate + doc: 'Evaluating the model using perplexity, for more information visit: + + https://huggingface.co/docs/transformers/perplexity' + parameters: + - name: context + doc: mlrun context + - name: model_path + doc: path to the model directory + - name: data + type: DataFrame + doc: the data to evaluate the model + - name: model_name + type: str + doc: name of base model + default: null + - name: tokenizer_name + type: str + doc: name of base tokenizer + default: null + outputs: [] + lineno: 784 + has_varargs: false + has_kwargs: false + description: fine-tune llm model with ease + default_handler: finetune_llm + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/huggingface_auto_trainer/1.1.0/src/huggingface_auto_trainer.ipynb b/functions/master/huggingface_auto_trainer/1.1.0/src/huggingface_auto_trainer.ipynb new file mode 100644 index 00000000..847fa98d --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/src/huggingface_auto_trainer.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a2c5dc6d-33d0-4e74-a875-6eab556e3b2d", + "metadata": {}, + "source": [ + "# Llm auto trainer" + ] + }, + { + "cell_type": "markdown", + "id": "cc7aa261-17b2-4362-bf6a-34af79b0230b", + "metadata": {}, + "source": [ + "## Notebook Introduction: Fine-Tuning a Large Language Model with Ease\n", + "\n", + "Welcome to this example notebook that demonstrates a simplified yet powerful approach to fine-tuning a Large Language Model (LLM) effortlessly. Fine-tuning is a crucial technique that allows you to adapt pre-trained language models to specific tasks, making them more contextually relevant and useful.\n", + "\n", + "In this notebook, we will walk you through a step-by-step process of fine-tuning a state-of-the-art language model using a user-friendly and efficient method. You don't need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness." 
+ ] + }, + { + "cell_type": "markdown", + "id": "425249e9-f43f-45e6-aa25-9f53099049cd", + "metadata": {}, + "source": [ + "### First, we will select the model we wish to fine-tune and take the matching tokenizer and appropriate config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3410e9c2-0557-4961-995e-0ef0cc07bf82", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig\n", + "from transformers import logging\n", + "\n", + "logging.set_verbosity(\"CRITICAL\")\n", + "\n", + "model_name = \"tiiuae/falcon-7b\"\n", + "tokenizer = model_name\n", + "generation_config = GenerationConfig.from_pretrained(model_name)" + ] + }, + { + "cell_type": "markdown", + "id": "f33f3c35-cf61-4b0f-8da9-1c30d3b53230", + "metadata": {}, + "source": [ + "### Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8ee7c35-adf7-4ed8-9e7e-e659b9461cd5", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "\n", + "project = mlrun.get_or_create_project(\n", + " name=\"auto-trainer-test\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " parameters={\n", + " \"default_image\": \"yonishelach/mlrun-llm\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d56b834f-adf6-4736-8de7-3348e050f561", + "metadata": {}, + "outputs": [], + "source": [ + "project.set_function(\n", + " \"auto-trainer.py\",\n", + " name=\"auto-trainer\",\n", + " kind=\"job\",\n", + " image=\"yonishelach/mlrun-llm\",\n", + " handler=\"finetune_llm\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "f42315db-6ddd-4dc1-89f3-c732f92d0d47", + "metadata": {}, + "source": [ + "### we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e62e577-15fb-477d-9c56-fa9fb4c2669b", + "metadata": {}, + "outputs": [], + "source": [ + "import transformers\n", + "\n", + "training_arguments = {\n", + " \"per_device_train_batch_size\": 4,\n", + " \"gradient_accumulation_steps\": 1,\n", + " \"warmup_steps\": 2,\n", + " \"max_steps\": 10,\n", + " \"learning_rate\": 2e-4,\n", + " \"fp16\": True,\n", + " \"logging_steps\": 1,\n", + " \"optim\": \"paged_adamw_8bit\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "284a5772-f88d-46c9-87bc-fc14e434c1b4", + "metadata": {}, + "source": [ + "### Now we simply run the function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11ab5888-5870-4bf8-9657-db930adecd77", + "metadata": {}, + "outputs": [], + "source": [ + "training_run = mlrun.run_function(\n", + " function=\"auto-trainer\",\n", + " name=\"auto-trainer\",\n", + " local=True,\n", + " params={\n", + " \"model\": (model_name, \"transformers.AutoModelForCausalLM\"),\n", + " \"tokenizer\": tokenizer,\n", + " \"train_dataset\": \"Abirate/english_quotes\",\n", + " \"training_config\": training_arguments,\n", + " \"quantization_config\": True,\n", + " \"lora_config\": True,\n", + " \"dataset_columns_to_train\": \"quote\",\n", + " \"lora_target_modules\": [\"query_key_value\"],\n", + " \"model_pretrained_config\": {\"trust_remote_code\": True, \"use_cache\": False},\n", + " },\n", + " handler=\"finetune_llm\",\n", + " outputs=[\"model\"],\n", + ")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "0e674d25-5f1f-4ea8-af02-7d22c2fb6760", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4dfe9b-407a-43c0-9c5e-56de106477ac", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/huggingface_auto_trainer/1.1.0/src/huggingface_auto_trainer.py b/functions/master/huggingface_auto_trainer/1.1.0/src/huggingface_auto_trainer.py new file mode 100644 index 00000000..d1166318 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/src/huggingface_auto_trainer.py @@ -0,0 +1,855 @@ +import importlib +import os +import shutil +import tempfile +import zipfile +from abc import ABC +from typing import Dict, List, Tuple, Union + +import mlrun +import numpy as np +import pandas as pd +import peft +import torch +import transformers +from datasets import Dataset, load_dataset +from mlrun.artifacts.manager import Artifact, PlotlyArtifact +from mlrun.datastore import is_store_uri +from mlrun.frameworks._common import CommonTypes, MLRunInterface +from mlrun.utils import logger +from peft import (LoraConfig, PeftModel, get_peft_model, + prepare_model_for_kbit_training) +from plotly import graph_objects as go +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, DataCollatorForLanguageModeling, + PreTrainedModel, PreTrainedTokenizer, Trainer, + TrainerCallback, TrainerControl, TrainerState, + TrainingArguments) + +supported_tasks = [ + "question-answering", + "summarization", + "table-question-answering", + "text2text-generation", + "text-classification", + "sentiment-analysis", + "text-generation", + "token-classification", + "translation", + "translation_xx_to_yy", +] + + +class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator" + + +# ----------------------from MLRUN-------------------------------- +class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. 
+ _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + + @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + ) + + @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper + + +class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. + """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([]) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return + + def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, 
value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False) + + def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact) + + +def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + ) + + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. 
+ """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + 
available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + 
if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. 
+ + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. + """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the 
trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! + ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + ) + + +def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. 
+ neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl) diff --git a/functions/master/huggingface_auto_trainer/1.1.0/src/item.yaml b/functions/master/huggingface_auto_trainer/1.1.0/src/item.yaml new file mode 100644 index 00000000..b7c9bbcc --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/src/item.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +categories: +- huggingface +- genai +- machine-learning +- model-training +description: fine-tune llm model with ease +doc: '' +example: huggingface_auto_trainer.ipynb +generationDate: 2023-08-21:17-25 +hidden: false +icon: '' +labels: + author: Zeevr +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.4.0 +name: huggingface-auto-trainer +platformVersion: 3.5.0 +spec: + filename: huggingface_auto_trainer.py + handler: finetune_llm + image: mlrun/mlrun + kind: job + requirements: [] +url: '' +version: 1.1.0 diff --git a/functions/master/huggingface_auto_trainer/1.1.0/src/requirements.txt b/functions/master/huggingface_auto_trainer/1.1.0/src/requirements.txt new file mode 100644 index 00000000..1376b1d0 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/src/requirements.txt @@ -0,0 +1,5 @@ +peft +transformers +torch +datasets +plotly diff --git a/functions/master/huggingface_auto_trainer/1.1.0/src/test_huggingface_auto_trainer.py b/functions/master/huggingface_auto_trainer/1.1.0/src/test_huggingface_auto_trainer.py new file mode 100644 index 00000000..53576e4e --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/src/test_huggingface_auto_trainer.py @@ -0,0 +1,42 @@ +import tempfile + +import mlrun + + +def test_train(): + + model_name = "distilgpt2" + tokenizer = model_name + auto_trainer = mlrun.import_function("function.yaml") + + training_arguments = { + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 1, + "warmup_steps": 2, + "max_steps": 10, + "learning_rate": 2e-4, + "logging_steps": 1, + } + + params = { + "model": (model_name, "transformers.AutoModelForCausalLM"), + "tokenizer": tokenizer, + "train_dataset": "Abirate/english_quotes", + "training_config": training_arguments, + "dataset_columns_to_train": "quote", + "model_pretrained_config": {"use_cache": False}, + "use_cuda": False, + } + + try: + with tempfile.TemporaryDirectory() as test_directory: + auto_trainer.run( + local=True, + params=params, + handler="finetune_llm", + returns=["model"], + workdir=test_directory, + ) + + except Exception as exception: + print(f"- The training failed - raised the following error:\n- {exception}") diff --git a/functions/master/huggingface_auto_trainer/1.1.0/static/documentation.html b/functions/master/huggingface_auto_trainer/1.1.0/static/documentation.html new file mode 100644 index 00000000..be893164 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/static/documentation.html @@ -0,0 +1,380 @@ + + + + + + + +huggingface_auto_trainer package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ +
+

huggingface_auto_trainer package

+ +
+ +
+
+
+
+
+

huggingface_auto_trainer package#

+
+

Submodules#

+
+
+

huggingface_auto_trainer.huggingface_auto_trainer module#

+
+
+class huggingface_auto_trainer.huggingface_auto_trainer.ConfigKeys[source]#
+

Bases: object

+
+
+data_collator = 'data_collator'#
+
+
+
+deepspeed = 'deepspeed'#
+
+
+
+lora = 'lora'#
+
+
+
+model_pretrained = 'model_pretrained'#
+
+
+
+quantization = 'quantization'#
+
+
+
+tokenizer_pretrained = 'tokenizer_pretrained'#
+
+
+
+training = 'training'#
+
+
+
+
+class huggingface_auto_trainer.huggingface_auto_trainer.HFTrainerMLRunInterface[source]#
+

Bases: abc.ABC, Generic[mlrun.frameworks._common.utils.MLRunInterfaceableType]

+

This is temporary and will be built in mlrun 1.5.0 +Interface for adding MLRun features to the Hugging Face transformers Trainer API.

+
+
+DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
+
+
+
+classmethod add_interface(obj: transformers.Trainer, restoration: Optional[Tuple[Dict[str, Any], Dict[str, Any], List[str]]] = None)[source]#
+

Enrich the object with this interface properties, methods and functions so it will have this framework MLRun’s +features.

+
+
Parameters
+
    +
  • obj – The object to enrich its interface.

  • +
  • restoration – Restoration information tuple as returned from ‘remove_interface’ in order to add the +interface in a certain state.

  • +
+
+
+
+
+
+classmethod mlrun_train()[source]#
+
+
+
+
+class huggingface_auto_trainer.huggingface_auto_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
+

Bases: transformers.TrainerCallback

+

This is temporary and will be built in mlrun 1.5.0 +Callback for collecting logs during training / evaluation of the Trainer API.

+
+
+log_metric_plot(name: str, scores: List[float])[source]#
+
+
+
+log_metrics()[source]#
+
+
+
+on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
+
+
+
+on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
+
+
+
+on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
+
+
+
+
+huggingface_auto_trainer.huggingface_auto_trainer.apply_mlrun(trainer: transformers.Trainer, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
+

This is temporary and will be built in mlrun 1.5.0

+
+
+
+huggingface_auto_trainer.huggingface_auto_trainer.evaluate(context, model_path, data: pandas.core.frame.DataFrame, model_name: Optional[str] = None, tokenizer_name: Optional[str] = None)[source]#
+

Evaluating the model using perplexity, for more information visit: +https://huggingface.co/docs/transformers/perplexity

+
+
Parameters
+
    +
  • context – mlrun context

  • +
  • model_path – path to the model directory

  • +
  • data – the data to evaluate the model

  • +
  • model_name – name of base model

  • +
  • tokenizer_name – name of base tokenizer

  • +
+
+
+
+
+
+huggingface_auto_trainer.huggingface_auto_trainer.finetune_llm(context: mlrun.execution.MLClientCtx, train_dataset: Union[str, mlrun.datastore.base.DataItem], eval_dataset: Optional[str] = None, train_load_dataset_kwargs: dict = {}, eval_load_dataset_kwargs: dict = {}, dataset_columns_to_train: Union[str, list] = 'text', model: Union[str, List[str]] = 'huggingface-model', tokenizer: Optional[Union[str, List[str]]] = None, deepspeed_config: Union[dict, bool] = False, quantization_config: Union[dict, bool] = False, lora_config: Union[dict, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, tokenizer_pretrained_config: dict = {}, data_collator_config: dict = {}, task: str = 'text-generation', use_cuda: bool = True, framework: str = 'pt', device_map: str = 'auto', **kwargs)[source]#
+
+
Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.

The function takes various configuration parameters to customize the training process +and adapt the model to specific tasks using a provided dataset.

+
+
+
+
Parameters
+
    +
  • context – mlrun context in order to log trained model

  • +
  • dataset_columns_to_train – which columns to pass to the model as inputs

  • +
  • eval_load_dataset_kwargs – kwargs for dataset loading

  • +
  • train_load_dataset_kwargs – kwargs for dataset loading

  • +
  • framework – pt or tf

  • +
  • use_cuda – use gpu or not

  • +
  • tokenizer_pretrained_config – config to load the pretrained tokenizer

  • +
  • model_pretrained_config – config to load the pretrained model

  • +
  • tokenizer – a tuple containing tokenizer name and class, or str with tokenizer name or path

  • +
  • model – a tuple containing model name and class, or str with model name or path

  • +
  • train_dataset – The train dataset used for fine-tuning the language model.

  • +
  • eval_dataset – The eval dataset used to evaluate the language model during training.

  • +
  • deepspeed_config – Configuration options for DeepSpeed (optional).

  • +
  • quantization_config – Configuration options for model quantization (optional).

  • +
  • lora_config – Configuration options for Low-Rank Approximation (LoRA) (optional).

  • +
  • training_config – Configuration options specific to the fine-tuning training process (optional).

  • +
  • data_collator_config – Configuration options for data collation during training (optional).

  • +
  • task – A description of the specific task the model is being fine-tuned for.

  • +
  • kwargs – Additional keyword arguments.

  • +
+
+
+
+
+
+

Module contents#

+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/huggingface_auto_trainer/1.1.0/static/example.html b/functions/master/huggingface_auto_trainer/1.1.0/static/example.html new file mode 100644 index 00000000..7ae9a6c4 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/static/example.html @@ -0,0 +1,351 @@ + + + + + + + +Llm auto trainer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + + + +
+
+ + +
+
+
+ + +
+
+
+

Llm auto trainer#

+
+

Notebook Introduction: Fine-Tuning a Large Language Model with Ease#

+

Welcome to this example notebook that demonstrates a simplified yet powerful approach to fine-tuning a Large Language Model (LLM) effortlessly. Fine-tuning is a crucial technique that allows you to adapt pre-trained language models to specific tasks, making them more contextually relevant and useful.

+

In this notebook, we will walk you through a step-by-step process of fine-tuning a state-of-the-art language model using a user-friendly and efficient method. You don’t need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness.

+
+

First, we will select the model we wish to fine-tune, along with the matching tokenizer and appropriate config#

+
+
+
import os
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers import logging
+
+logging.set_verbosity("CRITICAL")
+
+model_name = "tiiuae/falcon-7b"
+tokenizer = model_name
+generation_config = GenerationConfig.from_pretrained(model_name)
+
+
+
+
+
+
+

Then, in order to use it with mlrun, we will create an mlrun project and an mlrun function#

+
+
+
import mlrun
+
+project = mlrun.get_or_create_project(
+    name="auto-trainer-test",
+    context="./",
+    user_project=True,
+    parameters={
+        "default_image": "yonishelach/mlrun-llm",
+    },
+)
+
+
+
+
+
+
+
project.set_function(
+    "auto-trainer.py",
+    name="auto-trainer",
+    kind="job",
+    image="yonishelach/mlrun-llm",
+    handler="finetune_llm",
+)
+project.save()
+
+
+
+
+
+
+

We can set every config or parameter we want, including training arguments, hyperparameters and more, and pass them to the function#

+
+
+
import transformers
+
+training_arguments = {
+    "per_device_train_batch_size": 4,
+    "gradient_accumulation_steps": 1,
+    "warmup_steps": 2,
+    "max_steps": 10,
+    "learning_rate": 2e-4,
+    "fp16": True,
+    "logging_steps": 1,
+    "optim": "paged_adamw_8bit",
+}
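+
+# Note (illustrative): run parameters whose names start with a config prefix, such as
+# "lora_target_modules" in the run below, are merged by the auto-trainer's config-merging
+# logic into the matching default config (here the built-in LoraConfig), so single fields
+# can be overridden without passing a full config dict.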
+
+
+
+
+
+
+

Now we simply run the function#

+
+
+
training_run = mlrun.run_function(
+    function="auto-trainer",
+    name="auto-trainer",
+    local=True,
+    params={
+        "model": (model_name, "transformers.AutoModelForCausalLM"),
+        "tokenizer": tokenizer,
+        "train_dataset": "Abirate/english_quotes",
+        "training_config": training_arguments,
+        "quantization_config": True,
+        "lora_config": True,
+        "dataset_columns_to_train": "quote",
+        "lora_target_modules": ["query_key_value"],
+        "model_pretrained_config": {"trust_remote_code": True, "use_cache": False},
+    },
+    handler="finetune_llm",
+    outputs=["model"],
+)
+
+
+
+
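+Once the run completes, the fine-tuned model is logged as a run artifact. A minimal sketch of fetching it (assuming the run above finished successfully and the artifact key "model" set by the handler):
+
+model_uri = training_run.outputs["model"]
+print(model_uri)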
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/huggingface_auto_trainer/1.1.0/static/function.html b/functions/master/huggingface_auto_trainer/1.1.0/static/function.html new file mode 100644 index 00000000..a989880e --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/static/function.html @@ -0,0 +1,349 @@ + + + + + + + + + + + Source + + + + +
+        
+kind: job
+metadata:
+  name: huggingface-auto-trainer
+  tag: ''
+  hash: 55c9aa4a822780f7388819ccf633dfe26b31f02e
+  project: ''
+  labels:
+    author: Zeevr
+  categories:
+  - huggingface
+  - genai
+  - machine-learning
+  - model-training
+spec:
+  command: ''
+  args: []
+  image: mlrun/mlrun
+  build:
+    functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    lora = "lora"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    data_collator = "data_collator"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face transformers Trainer API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: Trainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: Trainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: transformers.Trainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# will be used if user provides "True" with config name as input
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

LORA_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to user input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects all keys from the given dict that start
    with the prefix, and adds them to the appropriate config.

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use default dict
        # Can also be False or a config dict given from the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = LORA_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    lora_config: dict,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param lora_config: lora config or None, to load model in appropriate way
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training if using number of gpu's

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # If a lora config was given we want to do LoRA fine-tuning, so we wrap the model here
    if lora_config:
        model = peft.get_peft_model(model, lora_config)

    # if not specified we choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a train dataset we load the "train" split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided) passes them through the tokenizer and
    returns them ready to use in training

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if two paths/names were provided we load each separately using the designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given then we must check whether it contains both datasets or only one should be used
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def finetune_llm(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    lora_config: Union[dict, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.lora: lora_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        lora_config=configs[ConfigKeys.lora],
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    temp_directory = tempfile.TemporaryDirectory().name
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluating the model using perplexity, for more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break
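
    # Perplexity is the exponential of the mean negative log-likelihood over all windows:
    # ppl = exp(mean(nlls))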

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)

+    commands: []
+    code_origin: ''
+    origin_filename: ''
+    requirements: []
+  entry_points:
+    add_interface:
+      name: add_interface
+      doc: ''
+      parameters:
+      - name: cls
+      - name: obj
+        type: Trainer
+      - name: restoration
+        type: MLRunInterfaceRestorationType
+        default: null
+      outputs: []
+      lineno: 70
+      has_varargs: false
+      has_kwargs: false
+    mlrun_train:
+      name: mlrun_train
+      doc: ''
+      parameters:
+      - name: cls
+      outputs: []
+      lineno: 80
+      has_varargs: false
+      has_kwargs: false
+    wrapper:
+      name: wrapper
+      doc: ''
+      parameters:
+      - name: self
+        type: Trainer
+      outputs: []
+      lineno: 81
+      has_varargs: true
+      has_kwargs: true
+    on_epoch_begin:
+      name: on_epoch_begin
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 129
+      has_varargs: false
+      has_kwargs: true
+    on_epoch_end:
+      name: on_epoch_end
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 140
+      has_varargs: false
+      has_kwargs: true
+    on_log:
+      name: on_log
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      - name: logs
+        type: Dict[str, float]
+        default: null
+      outputs: []
+      lineno: 151
+      has_varargs: false
+      has_kwargs: true
+    on_train_begin:
+      name: on_train_begin
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 177
+      has_varargs: false
+      has_kwargs: true
+    on_train_end:
+      name: on_train_end
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      - name: model
+        type: PreTrainedModel
+        default: null
+      - name: tokenizer
+        type: PreTrainedTokenizer
+        default: null
+      outputs: []
+      lineno: 188
+      has_varargs: false
+      has_kwargs: true
+    on_evaluate:
+      name: on_evaluate
+      doc: ''
+      parameters:
+      - name: self
+      - name: args
+        type: TrainingArguments
+      - name: state
+        type: TrainerState
+      - name: control
+        type: TrainerControl
+      outputs: []
+      lineno: 201
+      has_varargs: false
+      has_kwargs: true
+    log_metrics:
+      name: log_metrics
+      doc: ''
+      parameters:
+      - name: self
+      outputs: []
+      lineno: 215
+      has_varargs: false
+      has_kwargs: false
+    log_metric_plot:
+      name: log_metric_plot
+      doc: ''
+      parameters:
+      - name: self
+      - name: name
+        type: str
+      - name: scores
+        type: List[float]
+      outputs: []
+      lineno: 222
+      has_varargs: false
+      has_kwargs: false
+    apply_mlrun:
+      name: apply_mlrun
+      doc: This is temporary and will be built in mlrun 1.5.0
+      parameters:
+      - name: trainer
+        type: Trainer
+      - name: model_name
+        type: str
+        default: null
+      - name: tag
+        type: str
+        default: ''
+      - name: context
+        type: MLClientCtx
+        default: null
+      - name: auto_log
+        type: bool
+        default: true
+      - name: labels
+        type: Dict[str, str]
+        default: null
+      - name: extra_data
+        type: dict
+        default: null
+      outputs: []
+      lineno: 244
+      has_varargs: false
+      has_kwargs: true
+    finetune_llm:
+      name: finetune_llm
+      doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\
+        \ dataset.\n The function takes various configuration parameters to customize\
+        \ the training process\n and adapt the model to specific tasks using a provided\
+        \ dataset."
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: mlrun context in order to log trained model
+      - name: train_dataset
+        type: Union[str, mlrun.datastore.DataItem]
+        doc: The train dataset used for fine-tuning the language model.
+      - name: eval_dataset
+        type: str
+        doc: The eval dataset used to evaluate the language model during training.
+        default: null
+      - name: train_load_dataset_kwargs
+        type: dict
+        doc: kwargs for dataset loading
+        default: {}
+      - name: eval_load_dataset_kwargs
+        type: dict
+        doc: kwargs for dataset loading
+        default: {}
+      - name: dataset_columns_to_train
+        type: Union[str, list]
+        doc: which columns to pass to the model as inputs
+        default: text
+      - name: model
+        type: Union[str, List[str]]
+        doc: a tuple containing model name and class, or str with model name or path
+        default: huggingface-model
+      - name: tokenizer
+        type: Union[str, List[str]]
+        doc: a tuple containing tokenizer name and class, or str with tokenizer name
+          or path
+        default: null
+      - name: deepspeed_config
+        type: Union[dict, bool]
+        doc: Configuration options for DeepSpeed (optional).
+        default: false
+      - name: quantization_config
+        type: Union[dict, bool]
+        doc: Configuration options for model quantization (optional).
+        default: false
+      - name: lora_config
+        type: Union[dict, bool]
+        doc: Configuration options for Low-Rank Approximation (LoRA) (optional).
+        default: false
+      - name: training_config
+        type: dict
+        doc: Configuration options specific to the fine-tuning training process (optional).
+        default: {}
+      - name: model_pretrained_config
+        type: dict
+        doc: config to load the pretrained model
+        default: {}
+      - name: tokenizer_pretrained_config
+        type: dict
+        doc: config to load the pretrained tokenizer
+        default: {}
+      - name: data_collator_config
+        type: dict
+        doc: Configuration options for data collation during training (optional).
+        default: {}
+      - name: task
+        type: str
+        doc: A description of the specific task the model is being fine-tuned for.
+        default: text-generation
+      - name: use_cuda
+        type: bool
+        doc: use gpu or not
+        default: true
+      - name: framework
+        type: str
+        doc: pt or tf
+        default: pt
+      - name: device_map
+        type: str
+        default: auto
+      outputs: []
+      lineno: 630
+      has_varargs: false
+      has_kwargs: true
+    evaluate:
+      name: evaluate
+      doc: 'Evaluating the model using perplexity, for more information visit:
+
+        https://huggingface.co/docs/transformers/perplexity'
+      parameters:
+      - name: context
+        doc: mlrun context
+      - name: model_path
+        doc: path to the model directory
+      - name: data
+        type: DataFrame
+        doc: the data to evaluate the model
+      - name: model_name
+        type: str
+        doc: name of base model
+        default: null
+      - name: tokenizer_name
+        type: str
+        doc: name of base tokenizer
+        default: null
+      outputs: []
+      lineno: 784
+      has_varargs: false
+      has_kwargs: false
+  description: fine-tune llm model with ease
+  default_handler: finetune_llm
+  disable_auto_mount: false
+  clone_target_dir: ''
+  env: []
+  priority_class_name: ''
+  preemption_mode: prevent
+  affinity: null
+  tolerations: null
+  security_context: {}
+verbose: false
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/huggingface_auto_trainer/1.1.0/static/huggingface_auto_trainer.html b/functions/master/huggingface_auto_trainer/1.1.0/static/huggingface_auto_trainer.html new file mode 100644 index 00000000..2063d183 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/static/huggingface_auto_trainer.html @@ -0,0 +1,995 @@ + + + + + + + +huggingface_auto_trainer.huggingface_auto_trainer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + +
+ +
+
+
+
+
+
+ + +
+
+ +
+
+
+
+
+ +
+

+ +
+
+
+
+
+
+
+

Source code for huggingface_auto_trainer.huggingface_auto_trainer

+import importlib
+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Dict, List, Tuple, Union
+
+import mlrun
+import numpy as np
+import pandas as pd
+import peft
+import torch
+import transformers
+from datasets import Dataset, load_dataset
+from mlrun.artifacts.manager import Artifact, PlotlyArtifact
+from mlrun.datastore import is_store_uri
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import logger
+from peft import (LoraConfig, PeftModel, get_peft_model,
+                  prepare_model_for_kbit_training)
+from plotly import graph_objects as go
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
+                          PreTrainedModel, PreTrainedTokenizer, Trainer,
+                          TrainerCallback, TrainerControl, TrainerState,
+                          TrainingArguments)
+
+supported_tasks = [
+    "question-answering",
+    "summarization",
+    "table-question-answering",
+    "text2text-generation",
+    "text-classification",
+    "sentiment-analysis",
+    "text-generation",
+    "token-classification",
+    "translation",
+    "translation_xx_to_yy",
+]
+
+
+
[docs]class ConfigKeys: + deepspeed = "deepspeed" + quantization = "quantization" + lora = "lora" + training = "training" + tokenizer_pretrained = "tokenizer_pretrained" + model_pretrained = "model_pretrained" + data_collator = "data_collator"
+ + +# ----------------------from MLRUN-------------------------------- +
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): + """ + This is temporary and will be built in mlrun 1.5.0 + Interface for adding MLRun features for tensorflow keras API. + """ + + # MLRuns context default name: + DEFAULT_CONTEXT_NAME = "mlrun-huggingface" + + # Attributes to replace so the MLRun interface will be fully enabled. + _REPLACED_METHODS = [ + "train", + # "evaluate" + ] + +
[docs] @classmethod + def add_interface( + cls, + obj: Trainer, + restoration: CommonTypes.MLRunInterfaceRestorationType = None, + ): + super(HFTrainerMLRunInterface, cls).add_interface( + obj=obj, restoration=restoration + )
+ +
[docs] @classmethod + def mlrun_train(cls): + def wrapper(self: Trainer, *args, **kwargs): + # Restore the evaluation method as `train` will use it: + # cls._restore_attribute(obj=self, attribute_name="evaluate") + + # Call the original fit method: + result = self.original_train(*args, **kwargs) + + # Replace the evaluation method again: + # cls._replace_function(obj=self, function_name="evaluate") + + return result + + return wrapper
+ + +
[docs]class MLRunCallback(TrainerCallback): + """ + This is temporary and will be built in mlrun 1.5.0 + Callback for collecting logs during training / evaluation of the `Trainer` API. + """ + + def __init__( + self, + context: mlrun.MLClientCtx = None, + model_name: str = "model", + tag: str = "", + labels: Dict[str, str] = None, + extra_data: dict = None, + ): + super().__init__() + + # Store the configurations: + self._context = ( + context + if context is not None + else mlrun.get_or_create_ctx("./mlrun-huggingface") + ) + self._model_name = model_name + self._tag = tag + self._labels = labels + self._extra_data = extra_data if extra_data is not None else {} + + # Set up the logging mode: + self._is_training = False + self._steps: List[List[int]] = [] + self._metric_scores: Dict[str, List[float]] = {} + self._artifacts: Dict[str, Artifact] = {} + +
[docs] def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._steps.append([])
+ +
[docs] def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics()
+ +
[docs] def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Dict[str, float] = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + recent_logs = state.log_history[-1].copy() + + recent_logs.pop("epoch") + current_step = int(recent_logs.pop("step")) + if current_step not in self._steps[-1]: + self._steps[-1].append(current_step) + + for metric_name, metric_score in recent_logs.items(): + if metric_name.startswith("train_"): + if metric_name.split("train_")[1] not in self._metric_scores: + self._metric_scores[metric_name] = [metric_score] + continue + if metric_name not in self._metric_scores: + self._metric_scores[metric_name] = [] + self._metric_scores[metric_name].append(metric_score)
+ +
[docs] def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self._is_training = True
+ +
[docs] def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: PreTrainedModel = None, + tokenizer: PreTrainedTokenizer = None, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics()
+ +
[docs] def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if not state.is_world_process_zero: + return + self.log_metrics() + + if self._is_training: + return
+ +
[docs] def log_metrics(self): + for metric_name, metric_scores in self._metric_scores.items(): + self._context.log_result(key=metric_name, value=metric_scores[-1]) + if len(metric_scores) > 1: + self.log_metric_plot(name=metric_name, scores=metric_scores) + self._context.commit(completed=False)
+ +
[docs] def log_metric_plot(self, name: str, scores: List[float]): + # Initialize a plotly figure: + metric_figure = go.Figure() + + # Add titles: + metric_figure.update_layout( + title=name.capitalize().replace("_", " "), + xaxis_title="Samples", + yaxis_title="Scores", + ) + + # Draw: + metric_figure.add_trace( + go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") + ) + + # Create the plotly artifact: + artifact_name = f"{name}_plot" + artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) + self._artifacts[artifact_name] = self._context.log_artifact(artifact)
+ + +
[docs]def apply_mlrun( + trainer: transformers.Trainer, + model_name: str = None, + tag: str = "", + context: mlrun.MLClientCtx = None, + auto_log: bool = True, + labels: Dict[str, str] = None, + extra_data: dict = None, + **kwargs, +): + """ + This is temporary and will be built in mlrun 1.5.0 + """ + # Get parameters defaults: + if context is None: + context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) + + HFTrainerMLRunInterface.add_interface(obj=trainer) + + if auto_log: + trainer.add_callback( + MLRunCallback( + context=context, + model_name=model_name, + tag=tag, + labels=labels, + extra_data=extra_data, + ) + )
+ + +# ----------------------end from MLRUN-------------------------------- + + +def _print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" + f" {100 * trainable_params / all_param}" + ) + + +# default configs +# will be used if user provides "True" with config name as input +QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, +) + +LORA_CONFIG = peft.LoraConfig( + r=8, + lora_alpha=32, + target_modules=["query_key_value"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +DEEPSPEED_CONFIG = { + "train_micro_batch_size_per_gpu": "auto", + "fp16": {"enabled": True}, + "autotuning": { + "enabled": True, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps", + }, + }, + "zero_optimization": { + "stage": 2, + }, +} + + +def _update_config(src: dict, dst: dict): + """ + update configs according to user, this way the user can add/modify values in default configs for e.g. + + goes over all configs and corresponding prefixes, collect all the keys from the given dict that start + with the prefix and add them to appropriate config + + :param src: dict of all candidate values to update dict. + :param dst: dict containing all configs to update. + """ + + for config_name, config in dst.items(): + + # If given True we use default dict + # Can also be False or a config dict given from user, so we check specifically fo True + if config is True and config_name == "quantization": + config = QUANTIZATION_CONFIG + + if config is True and config_name == "lora": + config = LORA_CONFIG + + if config is True and config_name == "deepspeed": + config = DEEPSPEED_CONFIG + + # in some cases we can get a boolean value, in that case no need to look for args + if isinstance(config, bool): + config = None + + elif isinstance(config, dict): + for key, val in src.items(): + if key.startswith(config_name): + config[key.replace(f"{config_name}_", "")] = val + + # update by config name + else: + for key, val in src.items(): + if key.startswith(config_name): + setattr(config, key.replace(f"{config_name}_", ""), val) + + dst.update({config_name: config}) + + +def _get_class_object(class_path: str) -> type: + """ + given a full class name, this function returns the correct class + + :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') + + :return the wanted class object + """ + module_path, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _set_model_and_tokenizer( + model: Union[str, List[str]], + tokenizer: Union[str, List[str]], + task: str, + framework: str, + lora_config: dict, + quantization_config: dict, + use_cuda: bool, + tokenizer_pretrained_config, + model_pretrained_config, + device_map: str, +): + """ + get the correct model and tokenizer according to given user inputs + + :param model: a tuple containing model name and class, or str with model name or path + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param task: a supported nlp task, used to choose model if not provided + :param framework: pt or tf + :param lora_config: lora config or None, to load model in appropriate way + :param quantization_config: quantization config or None, to load model in appropriate way + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param device_map: a device map for model training if using number of gpu's + + :returns: model and tokenizer + """ + # if task is not supported and no model was given we can't choose one + if task and task not in supported_tasks and not model: + logger.error("unsupported task option chosen") + raise + + # load model from store + if isinstance(model, str) and is_store_uri(model): + pass + # TODO: load both model and tokenizer and return, need guy's help + + # if it's a tuple them we assume it contains of both name and class + if isinstance(model, list): + model_name, model_class = model + model_class = _get_class_object(model_class) + + # in the case we don't get the model class we need the task in order to choose the correct model + else: + if task is None: + logger.error("task must be chosen in order to determine the correct model") + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + _, available_classes, task_options = transformers.pipelines.check_task(task) + + if isinstance(model, str): + model_name = model + + # if model is not given, we take the default model for the given task + else: + model_name, _ = transformers.pipelines.get_default_model_and_revision( + available_classes, framework, task_options + ) + if not available_classes.get(framework, tuple()): + logger.error( + "given task's default model is not supported in specified framework" + ) + raise Exception( + "this function requires either a supported task or a model and model class to be chosen" + ) + + model_class = available_classes[framework][0] + + # load the pretrained model + if use_cuda: + device_map = device_map + else: + device_map = None + + model = model_class.from_pretrained( + model_name, + quantization_config=quantization_config, + device_map=device_map, + **model_pretrained_config, + ) + + # If quantization config is given we will load a quantized model, if not a regular one + if quantization_config: + model.gradient_checkpointing_enable() + model = peft.prepare_model_for_kbit_training(model) + + # If lora config was given we want to do lora fine tune, we update model here + if lora_config: + model = peft.get_peft_model(model, lora_config) + + # if not specified we choose the default tokenizer that corresponding to the model + if tokenizer is None: 
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + return model_name, model, tokenizer + + if isinstance(tokenizer, str): + tokenizer_name = tokenizer + tokenizer_class = transformers.AutoTokenizer + + # if it's not a str then it's a tuple of both name and class + else: + tokenizer_name, tokenizer_class = tokenizer + tokenizer_class = _get_class_object(tokenizer_class) + + tokenizer = tokenizer_class.from_pretrained( + tokenizer_name, **tokenizer_pretrained_config + ) + + tokenizer.pad_token = tokenizer.eos_token + + return model_name, model, tokenizer + + +def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: + """ + loads the specific dataset provided by the user + + :param dataset: name or path of dataset to load + :param is_train: bool that indicates the purpose of the dataset + :param kwargs: other kwargs for loading the dataset + + :returns: loaded dataset + """ + # if split in kwargs then the user decides how to split the dataset + if "split" in kwargs: + return load_dataset(dataset, **kwargs) + + # if it's a dataset for train we split with train + if is_train: + return load_dataset(dataset, split="train", **kwargs) + + # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them + dataset = load_dataset(dataset, **kwargs) + if "test" in dataset: + return dataset.get("test") + elif "eval" in dataset: + return dataset.get("eval") + elif "validation" in dataset: + return dataset.get("validation") + + +def _prepare_dataset( + train_dataset: str, + eval_dataset: str, + train_load_dataset_kwargs, + eval_load_dataset_kwargs, + tokenizer, + dataset_columns_to_train: Union[str, list], +) -> (Dataset, Union[Dataset, None]): + """ + Loads the train and eval datasets (if provided) passes them through the tokenizer and + returns them ready to use in training + + :param train_dataset: the name or path to the train dataset + :param eval_dataset: the name or path to the eval dataset + :param dataset_columns_to_train: which columns to pass to the model as inputs + (need to pass through the tokenizer first) + :param train_load_dataset_kwargs: kwargs for dataset loading + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param tokenizer: the tokenizer to pass the data through + + :returns: tokenized datasets + """ + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + # we take col name/s in a list for easy generalization + if isinstance(dataset_columns_to_train, str): + dataset_columns_to_train = [dataset_columns_to_train] + + if isinstance(train_dataset, mlrun.datastore.DataItem): + train_dataset = Dataset.from_pandas(train_dataset.as_df()) + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + + # Load datasets + # if provided two paths/names we load each separately using designated func + if eval_dataset: + train_dataset = _dataset_loader( + dataset=train_dataset, is_train=True, **train_load_dataset_kwargs + ) + eval_dataset = _dataset_loader( + dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs + ) + + # if only on path is given then we must check if it contains both dataset or if only one should be used + else: + dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) + if "train" in dataset: + train_dataset = dataset.get("train") + if "test" in dataset: + eval_dataset = dataset.get("test") + elif "eval" in dataset: + 
eval_dataset = dataset.get("eval") + elif "validation" in dataset: + eval_dataset = dataset.get("validation") + else: + # only train dataset given, tokenize and return it + return ( + train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ), + None, + ) + else: + logger.error("train dataset is mandatory") + raise KeyError("no train dataset found in given dataset") + + # Tokenize the data so the model can understand it + tokenized_train_dataset = train_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + tokenized_eval_dataset = eval_dataset.map( + lambda examples: tokenizer( + *[examples[col] for col in dataset_columns_to_train], + truncation=True, + padding=True, + ), + batched=True, + ) + + return tokenized_train_dataset, tokenized_eval_dataset + + +
[docs]def finetune_llm( + context: mlrun.MLClientCtx, + train_dataset: Union[str, mlrun.datastore.DataItem], + eval_dataset: str = None, + train_load_dataset_kwargs: dict = {}, + eval_load_dataset_kwargs: dict = {}, + dataset_columns_to_train: Union[str, list] = "text", + model: Union[str, List[str]] = "huggingface-model", + tokenizer: Union[str, List[str]] = None, + deepspeed_config: Union[dict, bool] = False, + quantization_config: Union[dict, bool] = False, + lora_config: Union[dict, bool] = False, + training_config: dict = {}, + model_pretrained_config: dict = {}, + tokenizer_pretrained_config: dict = {}, + data_collator_config: dict = {}, + task: str = "text-generation", + use_cuda: bool = True, + framework: str = "pt", + device_map: str = "auto", + **kwargs, +): + """ + Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. + The function takes various configuration parameters to customize the training process + and adapt the model to specific tasks using a provided dataset. + + :param context: mlrun context in order to log trained model + :param dataset_columns_to_train: which columns to pass to the model as inputs + :param eval_load_dataset_kwargs: kwargs for dataset loading + :param train_load_dataset_kwargs: kwargs for dataset loading + :param framework: pt ot tf + :param use_cuda: use gpu or not + :param tokenizer_pretrained_config: config to load the pretrained tokenizer + :param model_pretrained_config: config to load the pretrained model + :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path + :param model: a tuple containing model name and class, or str with model name or path + :param train_dataset: The train dataset used for fine-tuning the language model. + :param eval_dataset: The eval dataset used for evaluate the language model during training. + :param deepspeed_config: Configuration options for DeepSpeed (optional). + :param quantization_config: Configuration options for model quantization (optional). + :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). + :param training_config: Configuration options specific to the fine-tuning training process (optional). + :param data_collator_config: Configuration options for data collation during training (optional). + :param task: A description of the specific task the model is being fine-tuned for. + :param kwargs: Additional keyword arguments. 
+ """ + + # TODO: match forward.keyword to dataset.keyword - check if relevant in new design + # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design + + # Look for updates to configs given in kwargs + configs = { + ConfigKeys.deepspeed: deepspeed_config, + ConfigKeys.quantization: quantization_config, + ConfigKeys.lora: lora_config, + ConfigKeys.training: training_config, + ConfigKeys.model_pretrained: model_pretrained_config, + ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, + ConfigKeys.data_collator: data_collator_config, + } + _update_config(dst=configs, src=kwargs) + + # check gpu permission and availability + if use_cuda: + if torch.cuda.is_available(): + # Clean gpu cache + torch.cuda.empty_cache() + else: + logger.warning("'use_cuda' is set to True, but no cuda device is available") + + # get model and tokenizer + model_name, model, tokenizer = _set_model_and_tokenizer( + model=model, + tokenizer=tokenizer, + task=task, + framework=framework, + lora_config=configs[ConfigKeys.lora], + quantization_config=configs[ConfigKeys.quantization], + use_cuda=use_cuda, + tokenizer_pretrained_config=tokenizer_pretrained_config, + model_pretrained_config=configs[ConfigKeys.model_pretrained], + device_map=device_map, + ) + + # Load datasets + tokenized_train, tokenized_eval = _prepare_dataset( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + train_load_dataset_kwargs=train_load_dataset_kwargs, + eval_load_dataset_kwargs=eval_load_dataset_kwargs, + tokenizer=tokenizer, + dataset_columns_to_train=dataset_columns_to_train, + ) + + # Initialize the data collator for the trainer to use in order to create batches of data + data_collator = transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, **data_collator_config + ) + + # Initialize training kwargs from user kwargs: + train_kwargs = configs[ConfigKeys.training] + + # If deepspeed config given we add it to training kwargs + if configs[ConfigKeys.deepspeed]: + train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] + + # Take a look at the trainable parameters in the model + _print_trainable_parameters(model) + + # Preparing training arguments: + training_args = transformers.TrainingArguments( + output_dir=tempfile.mkdtemp(), + **train_kwargs, + ) + + trainer = transformers.Trainer( + model=model, + train_dataset=tokenized_train, + eval_dataset=tokenized_eval, + tokenizer=tokenizer, + data_collator=data_collator, + args=training_args, + ) + + apply_mlrun(trainer, model_name=model_name.split("/")[-1]) + model.config.use_cache = ( + False # silence the warnings. Please re-enable for inference! + ) + + # Apply training with evaluation: + context.logger.info(f"training '{model_name}'") + trainer.train() + + temp_directory = tempfile.TemporaryDirectory().name + trainer.save_model(temp_directory) + + # Zip the model directory: + shutil.make_archive( + base_name="model", + format="zip", + root_dir=temp_directory, + ) + + # Log the model: + context.log_model( + key="model", + db_key=model_name.split("/")[-1], + model_file="model.zip", + tag="", + framework="Hugging Face", + )
+ + +
[docs]def evaluate( + context, + model_path, + data: pd.DataFrame, + model_name: str = None, + tokenizer_name: str = None, +): + """ + Evaluating the model using perplexity, for more information visit: + https://huggingface.co/docs/transformers/perplexity + + :param context: mlrun context + :param model_path: path to the model directory + :param data: the data to evaluate the model + :param model_name: name of base model + :param tokenizer_name: name of base tokenizer + """ + # Get the model artifact and file: + ( + model_file, + model_artifact, + extra_data, + ) = mlrun.artifacts.get_model(model_path) + + # Read the name: + _model_name = model_artifact.spec.db_key + + # Extract logged model files: + model_directory = os.path.join(os.path.dirname(model_file), _model_name) + with zipfile.ZipFile(model_file, "r") as zip_file: + zip_file.extractall(model_directory) + + # Loading the saved pretrained tokenizer and model: + dataset = Dataset.from_pandas(data) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True + ) + model = PeftModel.from_pretrained(model, model_directory) + model.eval() + encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") + + max_length = 1024 + stride = 512 + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, stride): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids.cuda(), labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()).item() + context.log_result("perplexity", ppl)
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ + + + \ No newline at end of file diff --git a/functions/master/huggingface_auto_trainer/1.1.0/static/item.html b/functions/master/huggingface_auto_trainer/1.1.0/static/item.html new file mode 100644 index 00000000..be5b35b4 --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/static/item.html @@ -0,0 +1,49 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- huggingface
+- genai
+- machine-learning
+- model-training
+description: fine-tune llm model with ease
+doc: ''
+example: huggingface_auto_trainer.ipynb
+generationDate: 2023-08-21:17-25
+hidden: false
+icon: ''
+labels:
+  author: Zeevr
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.4.0
+name: huggingface-auto-trainer
+platformVersion: 3.5.0
+spec:
+  filename: huggingface_auto_trainer.py
+  handler: finetune_llm
+  image: mlrun/mlrun
+  kind: job
+  requirements: []
+url: ''
+version: 1.1.0
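+# Note (illustrative, not part of the generated spec): once published to a hub, an
+# item like this is typically imported with mlrun.import_function("hub://huggingface_auto_trainer")
+# and run with handler "finetune_llm"; the exact hub URI depends on your marketplace configuration.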
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/huggingface_auto_trainer/1.1.0/static/source.html b/functions/master/huggingface_auto_trainer/1.1.0/static/source.html new file mode 100644 index 00000000..7c445e5d --- /dev/null +++ b/functions/master/huggingface_auto_trainer/1.1.0/static/source.html @@ -0,0 +1,877 @@ + + + + + + + + + + + Source + + + + +
+        
+import importlib
+import os
+import shutil
+import tempfile
+import zipfile
+from abc import ABC
+from typing import Dict, List, Tuple, Union
+
+import mlrun
+import numpy as np
+import pandas as pd
+import peft
+import torch
+import transformers
+from datasets import Dataset, load_dataset
+from mlrun.artifacts.manager import Artifact, PlotlyArtifact
+from mlrun.datastore import is_store_uri
+from mlrun.frameworks._common import CommonTypes, MLRunInterface
+from mlrun.utils import logger
+from peft import (LoraConfig, PeftModel, get_peft_model,
+                  prepare_model_for_kbit_training)
+from plotly import graph_objects as go
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
+                          PreTrainedModel, PreTrainedTokenizer, Trainer,
+                          TrainerCallback, TrainerControl, TrainerState,
+                          TrainingArguments)
+
+supported_tasks = [
+    "question-answering",
+    "summarization",
+    "table-question-answering",
+    "text2text-generation",
+    "text-classification",
+    "sentiment-analysis",
+    "text-generation",
+    "token-classification",
+    "translation",
+    "translation_xx_to_yy",
+]
+
+
+class ConfigKeys:
+    deepspeed = "deepspeed"
+    quantization = "quantization"
+    lora = "lora"
+    training = "training"
+    tokenizer_pretrained = "tokenizer_pretrained"
+    model_pretrained = "model_pretrained"
+    data_collator = "data_collator"
+
+
+# ----------------------from MLRUN--------------------------------
+class HFTrainerMLRunInterface(MLRunInterface, ABC):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Interface for adding MLRun features to the HuggingFace Trainer API.
+    """
+
+    # MLRuns context default name:
+    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
+
+    # Attributes to replace so the MLRun interface will be fully enabled.
+    _REPLACED_METHODS = [
+        "train",
+        # "evaluate"
+    ]
+
+    @classmethod
+    def add_interface(
+        cls,
+        obj: Trainer,
+        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
+    ):
+        super(HFTrainerMLRunInterface, cls).add_interface(
+            obj=obj, restoration=restoration
+        )
+
+    @classmethod
+    def mlrun_train(cls):
+        def wrapper(self: Trainer, *args, **kwargs):
+            # Restore the evaluation method as `train` will use it:
+            # cls._restore_attribute(obj=self, attribute_name="evaluate")
+
+            # Call the original train method:
+            result = self.original_train(*args, **kwargs)
+
+            # Replace the evaluation method again:
+            # cls._replace_function(obj=self, function_name="evaluate")
+
+            return result
+
+        return wrapper
+
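+# Note (added for clarity): the MLRunInterface base class replaces each method listed in
+# _REPLACED_METHODS on the wrapped Trainer with its `mlrun_<name>` counterpart and keeps the
+# original available as `original_<name>`, which is why the wrapper above delegates to
+# `self.original_train`. The commented-out lines show how `evaluate` would be handled
+# if it were also listed.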
+
+class MLRunCallback(TrainerCallback):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    Callback for collecting logs during training / evaluation of the `Trainer` API.
+    """
+
+    def __init__(
+        self,
+        context: mlrun.MLClientCtx = None,
+        model_name: str = "model",
+        tag: str = "",
+        labels: Dict[str, str] = None,
+        extra_data: dict = None,
+    ):
+        super().__init__()
+
+        # Store the configurations:
+        self._context = (
+            context
+            if context is not None
+            else mlrun.get_or_create_ctx("./mlrun-huggingface")
+        )
+        self._model_name = model_name
+        self._tag = tag
+        self._labels = labels
+        self._extra_data = extra_data if extra_data is not None else {}
+
+        # Set up the logging mode:
+        self._is_training = False
+        self._steps: List[List[int]] = []
+        self._metric_scores: Dict[str, List[float]] = {}
+        self._artifacts: Dict[str, Artifact] = {}
+
+    def on_epoch_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if not state.is_world_process_zero:
+            return
+        self._steps.append([])
+
+    def on_epoch_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if not state.is_world_process_zero:
+            return
+        self.log_metrics()
+
+    def on_log(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        logs: Dict[str, float] = None,
+        **kwargs,
+    ):
+        if not state.is_world_process_zero:
+            return
+        recent_logs = state.log_history[-1].copy()
+
+        recent_logs.pop("epoch")
+        current_step = int(recent_logs.pop("step"))
+        if current_step not in self._steps[-1]:
+            self._steps[-1].append(current_step)
+
+        for metric_name, metric_score in recent_logs.items():
+            if metric_name.startswith("train_"):
+                if metric_name.split("train_")[1] not in self._metric_scores:
+                    self._metric_scores[metric_name] = [metric_score]
+                continue
+            if metric_name not in self._metric_scores:
+                self._metric_scores[metric_name] = []
+            self._metric_scores[metric_name].append(metric_score)
+
+    def on_train_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if not state.is_world_process_zero:
+            return
+        self._is_training = True
+
+    def on_train_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        model: PreTrainedModel = None,
+        tokenizer: PreTrainedTokenizer = None,
+        **kwargs,
+    ):
+        if not state.is_world_process_zero:
+            return
+        self.log_metrics()
+
+    def on_evaluate(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if not state.is_world_process_zero:
+            return
+        self.log_metrics()
+
+        if self._is_training:
+            return
+
+    def log_metrics(self):
+        for metric_name, metric_scores in self._metric_scores.items():
+            self._context.log_result(key=metric_name, value=metric_scores[-1])
+            if len(metric_scores) > 1:
+                self.log_metric_plot(name=metric_name, scores=metric_scores)
+        self._context.commit(completed=False)
+
+    def log_metric_plot(self, name: str, scores: List[float]):
+        # Initialize a plotly figure:
+        metric_figure = go.Figure()
+
+        # Add titles:
+        metric_figure.update_layout(
+            title=name.capitalize().replace("_", " "),
+            xaxis_title="Samples",
+            yaxis_title="Scores",
+        )
+
+        # Draw:
+        metric_figure.add_trace(
+            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
+        )
+
+        # Create the plotly artifact:
+        artifact_name = f"{name}_plot"
+        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
+        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
+
+
+def apply_mlrun(
+    trainer: transformers.Trainer,
+    model_name: str = None,
+    tag: str = "",
+    context: mlrun.MLClientCtx = None,
+    auto_log: bool = True,
+    labels: Dict[str, str] = None,
+    extra_data: dict = None,
+    **kwargs,
+):
+    """
+    This is temporary and will be built in mlrun 1.5.0
+    """
+    # Get parameters defaults:
+    if context is None:
+        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
+
+    HFTrainerMLRunInterface.add_interface(obj=trainer)
+
+    if auto_log:
+        trainer.add_callback(
+            MLRunCallback(
+                context=context,
+                model_name=model_name,
+                tag=tag,
+                labels=labels,
+                extra_data=extra_data,
+            )
+        )
+
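+# Illustrative usage sketch (the model name is a placeholder): `apply_mlrun` is called on an
+# already-built transformers.Trainer before training, exactly as `finetune_llm` does below, e.g.
+#
+#   apply_mlrun(trainer, model_name="my-model", context=context)
+#   trainer.train()
+#
+# which attaches MLRunCallback so metrics are logged to the MLRun context.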
+
+# ----------------------end from MLRUN--------------------------------
+
+
+def _print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
+        f" {100 * trainable_params / all_param}"
+    )
+
+
+# default configs
+# will be used if user provides "True" with config name as input
+QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+
+LORA_CONFIG = peft.LoraConfig(
+    r=8,
+    lora_alpha=32,
+    target_modules=["query_key_value"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+
+DEEPSPEED_CONFIG = {
+    "train_micro_batch_size_per_gpu": "auto",
+    "fp16": {"enabled": True},
+    "autotuning": {
+        "enabled": True,
+        "arg_mappings": {
+            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
+            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
+        },
+    },
+    "zero_optimization": {
+        "stage": 2,
+    },
+}
+
+
+def _update_config(src: dict, dst: dict):
+    """
+    Update the configs according to the user's kwargs, so the user can add or modify values in the default configs.
+
+    Goes over all configs and their corresponding prefixes, collects every key from the given dict that starts
+    with the prefix, and adds it to the appropriate config.
+
+    :param src: dict of all candidate values with which to update the configs.
+    :param dst: dict containing all configs to update.
+    """
+
+    for config_name, config in dst.items():
+
+        # If given True we use the default config
+        # Can also be False or a config dict given by the user, so we check specifically for True
+        if config is True and config_name == "quantization":
+            config = QUANTIZATION_CONFIG
+
+        if config is True and config_name == "lora":
+            config = LORA_CONFIG
+
+        if config is True and config_name == "deepspeed":
+            config = DEEPSPEED_CONFIG
+
+        # in some cases we can get a boolean value, in that case no need to look for args
+        if isinstance(config, bool):
+            config = None
+
+        elif isinstance(config, dict):
+            for key, val in src.items():
+                if key.startswith(config_name):
+                    config[key.replace(f"{config_name}_", "")] = val
+
+        # update by config name
+        else:
+            for key, val in src.items():
+                if key.startswith(config_name):
+                    setattr(config, key.replace(f"{config_name}_", ""), val)
+
+        dst.update({config_name: config})
+
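+# Illustrative note (an example, not part of the original module): passing `lora_config=True`
+# or `quantization_config=True` to `finetune_llm` makes `_update_config` substitute the default
+# LORA_CONFIG / QUANTIZATION_CONFIG above, while prefixed kwargs are routed into the matching
+# config, e.g.
+#
+#   finetune_llm(..., lora_config=True, lora_r=16, training_learning_rate=2e-4)
+#
+# sets `r=16` on the LoRA config and adds `learning_rate=2e-4` to the training kwargs that are
+# handed to transformers.TrainingArguments.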
+
+def _get_class_object(class_path: str) -> type:
+    """
+    given a full class name, this function returns the correct class
+
+    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')
+
+    :return: the requested class object
+    """
+    module_path, class_name = class_path.rsplit(".", 1)
+    module = importlib.import_module(module_path)
+    return getattr(module, class_name)
+
+
+def _set_model_and_tokenizer(
+    model: Union[str, List[str]],
+    tokenizer: Union[str, List[str]],
+    task: str,
+    framework: str,
+    lora_config: dict,
+    quantization_config: dict,
+    use_cuda: bool,
+    tokenizer_pretrained_config,
+    model_pretrained_config,
+    device_map: str,
+):
+    """
+    get the correct model and tokenizer according to given user inputs
+
+    :param model: a tuple containing model name and class, or str with model name or path
+    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
+    :param task: a supported nlp task, used to choose model if not provided
+    :param framework: pt or tf
+    :param lora_config: lora config or None, to load model in appropriate way
+    :param quantization_config: quantization config or None, to load model in appropriate way
+    :param use_cuda: use gpu or not
+    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
+    :param model_pretrained_config: config to load the pretrained model
+    :param device_map: a device map for model training when using multiple GPUs
+
+    :returns: model and tokenizer
+    """
+    # if task is not supported and no model was given we can't choose one
+    if task and task not in supported_tasks and not model:
+        logger.error("unsupported task option chosen")
+        raise ValueError("unsupported task option chosen")
+
+    # load model from store
+    if isinstance(model, str) and is_store_uri(model):
+        pass
+        # TODO: load both model and tokenizer and return, need guy's help
+
+    # if it's a list we assume it contains both the name and the class
+    if isinstance(model, list):
+        model_name, model_class = model
+        model_class = _get_class_object(model_class)
+
+    # in the case we don't get the model class we need the task in order to choose the correct model
+    else:
+        if task is None:
+            logger.error("task must be chosen in order to determine the correct model")
+            raise Exception(
+                "this function requires either a supported task or a model and model class to be chosen"
+            )
+
+        _, available_classes, task_options = transformers.pipelines.check_task(task)
+
+        if isinstance(model, str):
+            model_name = model
+
+        # if model is not given, we take the default model for the given task
+        else:
+            model_name, _ = transformers.pipelines.get_default_model_and_revision(
+                available_classes, framework, task_options
+            )
+        if not available_classes.get(framework, tuple()):
+            logger.error(
+                "given task's default model is not supported in specified framework"
+            )
+            raise Exception(
+                "this function requires either a supported task or a model and model class to be chosen"
+            )
+
+        model_class = available_classes[framework][0]
+
+    # load the pretrained model
+    if use_cuda:
+        device_map = device_map
+    else:
+        device_map = None
+
+    model = model_class.from_pretrained(
+        model_name,
+        quantization_config=quantization_config,
+        device_map=device_map,
+        **model_pretrained_config,
+    )
+
+    # If quantization config is given we will load a quantized model, if not a regular one
+    if quantization_config:
+        model.gradient_checkpointing_enable()
+        model = peft.prepare_model_for_kbit_training(model)
+
+    # If a lora config was given we want to do lora fine-tuning, so we wrap the model here
+    if lora_config:
+        model = peft.get_peft_model(model, lora_config)
+
+    # if not specified, we choose the default tokenizer corresponding to the model
+    if tokenizer is None:
+        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+        return model_name, model, tokenizer
+
+    if isinstance(tokenizer, str):
+        tokenizer_name = tokenizer
+        tokenizer_class = transformers.AutoTokenizer
+
+    # if it's not a str then it's a list of both the name and the class
+    else:
+        tokenizer_name, tokenizer_class = tokenizer
+        tokenizer_class = _get_class_object(tokenizer_class)
+
+    tokenizer = tokenizer_class.from_pretrained(
+        tokenizer_name, **tokenizer_pretrained_config
+    )
+
+    tokenizer.pad_token = tokenizer.eos_token
+
+    return model_name, model, tokenizer
+
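+# Illustrative note: `model` (and `tokenizer`) may be given either as a plain name/path,
+# e.g. "gpt2", in which case the class is resolved from `task`, or as a [name, class-path]
+# pair such as ["gpt2", "transformers.AutoModelForCausalLM"], in which case the class is
+# imported directly via `_get_class_object`. The model name here is only an example.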
+
+def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
+    """
+    loads the specific dataset provided by the user
+
+    :param dataset: name or path of dataset to load
+    :param is_train: bool that indicates the purpose of the dataset
+    :param kwargs: other kwargs for loading the dataset
+
+    :returns: loaded dataset
+    """
+    # if split in kwargs then the user decides how to split the dataset
+    if "split" in kwargs:
+        return load_dataset(dataset, **kwargs)
+
+    # if it's a training dataset we load the "train" split
+    if is_train:
+        return load_dataset(dataset, split="train", **kwargs)
+
+    # if it's an eval dataset, several split names are acceptable, so we check all of them
+    dataset = load_dataset(dataset, **kwargs)
+    if "test" in dataset:
+        return dataset.get("test")
+    elif "eval" in dataset:
+        return dataset.get("eval")
+    elif "validation" in dataset:
+        return dataset.get("validation")
+
+
+def _prepare_dataset(
+    train_dataset: str,
+    eval_dataset: str,
+    train_load_dataset_kwargs,
+    eval_load_dataset_kwargs,
+    tokenizer,
+    dataset_columns_to_train: Union[str, list],
+) -> (Dataset, Union[Dataset, None]):
+    """
+    Loads the train and eval datasets (if provided), passes them through the tokenizer, and
+    returns them ready to use in training.
+
+    :param train_dataset: the name or path to the train dataset
+    :param eval_dataset: the name or path to the eval dataset
+    :param dataset_columns_to_train: which columns to pass to the model as inputs
+                                        (need to pass through the tokenizer first)
+    :param train_load_dataset_kwargs: kwargs for dataset loading
+    :param eval_load_dataset_kwargs: kwargs for dataset loading
+    :param tokenizer: the tokenizer to pass the data through
+
+    :returns: tokenized datasets
+    """
+    if not tokenizer.pad_token:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # we take col name/s in a list for easy generalization
+    if isinstance(dataset_columns_to_train, str):
+        dataset_columns_to_train = [dataset_columns_to_train]
+
+    if isinstance(train_dataset, mlrun.datastore.DataItem):
+        train_dataset = Dataset.from_pandas(train_dataset.as_df())
+        return (
+            train_dataset.map(
+                lambda examples: tokenizer(
+                    *[examples[col] for col in dataset_columns_to_train],
+                    truncation=True,
+                    padding=True,
+                ),
+                batched=True,
+            ),
+            None,
+        )
+
+    # Load datasets
+    # if two paths/names are provided we load each separately using the designated function
+    if eval_dataset:
+        train_dataset = _dataset_loader(
+            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
+        )
+        eval_dataset = _dataset_loader(
+            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
+        )
+
+    # if only one path is given we must check whether it contains both datasets or only one
+    else:
+        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
+        if "train" in dataset:
+            train_dataset = dataset.get("train")
+            if "test" in dataset:
+                eval_dataset = dataset.get("test")
+            elif "eval" in dataset:
+                eval_dataset = dataset.get("eval")
+            elif "validation" in dataset:
+                eval_dataset = dataset.get("validation")
+            else:
+                # only train dataset given, tokenize and return it
+                return (
+                    train_dataset.map(
+                        lambda examples: tokenizer(
+                            *[examples[col] for col in dataset_columns_to_train],
+                            truncation=True,
+                            padding=True,
+                        ),
+                        batched=True,
+                    ),
+                    None,
+                )
+        else:
+            logger.error("train dataset is mandatory")
+            raise KeyError("no train dataset found in given dataset")
+
+    # Tokenize the data so the model can understand it
+    tokenized_train_dataset = train_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    tokenized_eval_dataset = eval_dataset.map(
+        lambda examples: tokenizer(
+            *[examples[col] for col in dataset_columns_to_train],
+            truncation=True,
+            padding=True,
+        ),
+        batched=True,
+    )
+
+    return tokenized_train_dataset, tokenized_eval_dataset
+
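+# Illustrative note: when a single dataset name is given (e.g. "imdb", used here purely as an
+# example), `_prepare_dataset` loads it once and takes the "train" split for training and the
+# first of "test"/"eval"/"validation" it finds for evaluation; `dataset_columns_to_train` names
+# the column(s) whose text is tokenized and fed to the model.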
+
+def finetune_llm(
+    context: mlrun.MLClientCtx,
+    train_dataset: Union[str, mlrun.datastore.DataItem],
+    eval_dataset: str = None,
+    train_load_dataset_kwargs: dict = {},
+    eval_load_dataset_kwargs: dict = {},
+    dataset_columns_to_train: Union[str, list] = "text",
+    model: Union[str, List[str]] = "huggingface-model",
+    tokenizer: Union[str, List[str]] = None,
+    deepspeed_config: Union[dict, bool] = False,
+    quantization_config: Union[dict, bool] = False,
+    lora_config: Union[dict, bool] = False,
+    training_config: dict = {},
+    model_pretrained_config: dict = {},
+    tokenizer_pretrained_config: dict = {},
+    data_collator_config: dict = {},
+    task: str = "text-generation",
+    use_cuda: bool = True,
+    framework: str = "pt",
+    device_map: str = "auto",
+    **kwargs,
+):
+    """
+    Fine-tunes a Large Language Model (LLM) on a specific task using the provided dataset.
+     The function takes various configuration parameters to customize the training process
+     and adapt the model to specific tasks using a provided dataset.
+
+    :param context: mlrun context in order to log trained model
+    :param dataset_columns_to_train: which columns to pass to the model as inputs
+    :param eval_load_dataset_kwargs: kwargs for dataset loading
+    :param train_load_dataset_kwargs: kwargs for dataset loading
+    :param framework: pt or tf
+    :param use_cuda: use gpu or not
+    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
+    :param model_pretrained_config: config to load the pretrained model
+    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
+    :param model: a tuple containing model name and class, or str with model name or path
+    :param train_dataset: The train dataset used for fine-tuning the language model.
+    :param eval_dataset: The eval dataset used to evaluate the language model during training.
+    :param deepspeed_config: Configuration options for DeepSpeed (optional).
+    :param quantization_config: Configuration options for model quantization (optional).
+    :param lora_config: Configuration options for Low-Rank Adaptation (LoRA) (optional).
+    :param training_config: Configuration options specific to the fine-tuning training process (optional).
+    :param data_collator_config: Configuration options for data collation during training (optional).
+    :param task: The huggingface pipeline task the model is being fine-tuned for (used to resolve the default model/class when none is given).
+    :param kwargs: Additional keyword arguments.
+    """
+
+    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
+    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design
+
+    # Look for updates to configs given in kwargs
+    configs = {
+        ConfigKeys.deepspeed: deepspeed_config,
+        ConfigKeys.quantization: quantization_config,
+        ConfigKeys.lora: lora_config,
+        ConfigKeys.training: training_config,
+        ConfigKeys.model_pretrained: model_pretrained_config,
+        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
+        ConfigKeys.data_collator: data_collator_config,
+    }
+    _update_config(dst=configs, src=kwargs)
+
+    # check gpu availability
+    if use_cuda:
+        if torch.cuda.is_available():
+            # Clean gpu cache
+            torch.cuda.empty_cache()
+        else:
+            logger.warning("'use_cuda' is set to True, but no cuda device is available")
+
+    # get model and tokenizer
+    model_name, model, tokenizer = _set_model_and_tokenizer(
+        model=model,
+        tokenizer=tokenizer,
+        task=task,
+        framework=framework,
+        lora_config=configs[ConfigKeys.lora],
+        quantization_config=configs[ConfigKeys.quantization],
+        use_cuda=use_cuda,
+        tokenizer_pretrained_config=tokenizer_pretrained_config,
+        model_pretrained_config=configs[ConfigKeys.model_pretrained],
+        device_map=device_map,
+    )
+
+    # Load datasets
+    tokenized_train, tokenized_eval = _prepare_dataset(
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        train_load_dataset_kwargs=train_load_dataset_kwargs,
+        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
+        tokenizer=tokenizer,
+        dataset_columns_to_train=dataset_columns_to_train,
+    )
+
+    # Initialize the data collator for the trainer to use in order to create batches of data
+    data_collator = transformers.DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm=False, **data_collator_config
+    )
+
+    # Initialize training kwargs from user kwargs:
+    train_kwargs = configs[ConfigKeys.training]
+
+    # If deepspeed config given we add it to training kwargs
+    if configs[ConfigKeys.deepspeed]:
+        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]
+
+    # Take a look at the trainable parameters in the model
+    _print_trainable_parameters(model)
+
+    # Preparing training arguments:
+    training_args = transformers.TrainingArguments(
+        output_dir=tempfile.mkdtemp(),
+        **train_kwargs,
+    )
+
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_eval,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        args=training_args,
+    )
+
+    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
+    model.config.use_cache = (
+        False  # silence the warnings. Please re-enable for inference!
+    )
+
+    # Apply training with evaluation:
+    context.logger.info(f"training '{model_name}'")
+    trainer.train()
+
+    temp_directory = tempfile.TemporaryDirectory().name
+    trainer.save_model(temp_directory)
+
+    # Zip the model directory:
+    shutil.make_archive(
+        base_name="model",
+        format="zip",
+        root_dir=temp_directory,
+    )
+
+    # Log the model:
+    context.log_model(
+        key="model",
+        db_key=model_name.split("/")[-1],
+        model_file="model.zip",
+        tag="",
+        framework="Hugging Face",
+    )
+
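+# Illustrative usage sketch (assumptions: the model, dataset and batch size below are
+# placeholders - substitute your own):
+#
+#   ctx = mlrun.get_or_create_ctx("llm-finetune-demo")
+#   finetune_llm(
+#       context=ctx,
+#       train_dataset="imdb",
+#       model=["tiiuae/falcon-7b", "transformers.AutoModelForCausalLM"],
+#       tokenizer="tiiuae/falcon-7b",
+#       lora_config=True,
+#       quantization_config=True,
+#       training_per_device_train_batch_size=1,
+#       training_num_train_epochs=1,
+#   )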
+
+def evaluate(
+    context,
+    model_path,
+    data: pd.DataFrame,
+    model_name: str = None,
+    tokenizer_name: str = None,
+):
+    """
+    Evaluating the model using perplexity. For more information visit:
+    https://huggingface.co/docs/transformers/perplexity
+
+    :param context:     mlrun context
+    :param model_path:  path to the model directory
+    :param data:        the data to evaluate the model
+    :param model_name:  name of base model
+    :param tokenizer_name: name of base tokenizer
+    """
+    # Get the model artifact and file:
+    (
+        model_file,
+        model_artifact,
+        extra_data,
+    ) = mlrun.artifacts.get_model(model_path)
+
+    # Read the name:
+    _model_name = model_artifact.spec.db_key
+
+    # Extract logged model files:
+    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
+    with zipfile.ZipFile(model_file, "r") as zip_file:
+        zip_file.extractall(model_directory)
+
+    # Loading the saved pretrained tokenizer and model:
+    dataset = Dataset.from_pandas(data)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    pad_token_id = tokenizer.eos_token_id
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
+    )
+    model = PeftModel.from_pretrained(model, model_directory)
+    model.eval()
+    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")
+
+    max_length = 1024
+    stride = 512
+    seq_len = encodings.input_ids.size(1)
+
+    nlls = []
+    prev_end_loc = 0
+    for begin_loc in range(0, seq_len, stride):
+        end_loc = min(begin_loc + max_length, seq_len)
+        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
+        input_ids = encodings.input_ids[:, begin_loc:end_loc]
+        target_ids = input_ids.clone()
+        target_ids[:, :-trg_len] = -100
+
+        with torch.no_grad():
+            outputs = model(input_ids.cuda(), labels=target_ids)
+
+            # loss is calculated using CrossEntropyLoss which averages over valid labels
+            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
+            # to the left by 1.
+            neg_log_likelihood = outputs.loss
+
+        nlls.append(neg_log_likelihood)
+
+        prev_end_loc = end_loc
+        if end_loc == seq_len:
+            break
+
+    ppl = torch.exp(torch.stack(nlls).mean()).item()
+    context.log_result("perplexity", ppl)
+
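+# Illustrative note: the perplexity reported above is exp(mean NLL) over the sliding windows.
+# A sketch of calling `evaluate` on the adapter logged by `finetune_llm` (the store URI and the
+# DataFrame below are assumptions for the example):
+#
+#   evaluate(
+#       context=ctx,
+#       model_path=train_run_model_uri,  # e.g. the "model" output of the training run
+#       data=pd.DataFrame({"text": ["some evaluation text ..."]}),
+#       model_name="tiiuae/falcon-7b",
+#       tokenizer_name="tiiuae/falcon-7b",
+#   )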
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/huggingface_auto_trainer/latest/src/function.yaml b/functions/master/huggingface_auto_trainer/latest/src/function.yaml index eff09b4c..702a8401 100644 --- a/functions/master/huggingface_auto_trainer/latest/src/function.yaml +++ b/functions/master/huggingface_auto_trainer/latest/src/function.yaml @@ -2,11 +2,13 @@ kind: job metadata: name: huggingface-auto-trainer tag: '' - hash: 4459f0b675c36a20c8f542126a96b98b0ac82271 + hash: 55c9aa4a822780f7388819ccf633dfe26b31f02e project: '' labels: author: Zeevr categories: + - huggingface + - genai - machine-learning - model-training spec: @@ -16,8 +18,8 @@ spec: build: functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    lora = "lora"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    data_collator = "data_collator"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the HuggingFace Trainer API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: Trainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: Trainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original train method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: transformers.Trainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------
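# Illustrative usage sketch (not part of the original module): how `apply_mlrun`
# would typically be attached to an existing `transformers.Trainer`. The trainer
# construction below is assumed and simplified.
#
#   trainer = transformers.Trainer(model=model, args=training_args, train_dataset=train_ds)
#   apply_mlrun(trainer, model_name="my-model", context=mlrun.get_or_create_ctx("hf-train"))
#   trainer.train()  # metrics are now logged to the MLRun context via MLRunCallback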


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# used if the user passes `True` for the corresponding config name
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

LORA_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to the user's input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects all keys from the given dict that start
    with the prefix, and adds them to the appropriate config.

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use the default config
        # Can also be False or a config dict given by the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = LORA_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})
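# Illustrative sketch (assumed example, not part of the original module) of the
# prefix-based update: a kwarg named "<config_name>_<key>" is moved into the matching
# config, and passing `True` selects the module-level default config.
#
#   configs = {"lora": True, "training": {}}
#   _update_config(src={"lora_r": 16, "training_num_train_epochs": 3}, dst=configs)
#   # -> configs["lora"] is LORA_CONFIG with r=16,
#   #    configs["training"] == {"num_train_epochs": 3}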


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return: the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)
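# For example (assumed, illustrative):
#   _get_class_object("transformers.AutoModelForCausalLM") returns the
#   transformers.AutoModelForCausalLM class object.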


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    lora_config: dict,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param lora_config: lora config or None, to load model in appropriate way
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training when using multiple GPUs

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # If lora config was given we want to do lora fine tune, we update model here
    if lora_config:
        model = peft.get_peft_model(model, lora_config)

    # if not specified, we choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer
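# Illustrative call sketch (assumed values, not part of the original module):
#
#   model_name, model, tokenizer = _set_model_and_tokenizer(
#       model="gpt2",              # any Hugging Face model name or path
#       tokenizer=None,            # falls back to the model's default tokenizer
#       task="text-generation",
#       framework="pt",
#       lora_config=None,
#       quantization_config=None,
#       use_cuda=False,            # device_map is dropped when CUDA is not used
#       tokenizer_pretrained_config={},
#       model_pretrained_config={},
#       device_map="auto",
#   )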


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a training dataset, we load the train split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and
    returns them ready to use in training

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given, we must check whether it contains both datasets or only one
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset
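# Illustrative sketch (assumed dataset and column names): tokenizing a single dataset
# that contains both "train" and "test" splits.
#
#   train_ds, eval_ds = _prepare_dataset(
#       train_dataset="imdb",
#       eval_dataset=None,
#       train_load_dataset_kwargs={},
#       eval_load_dataset_kwargs={},
#       tokenizer=tokenizer,
#       dataset_columns_to_train="text",
#   )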


def finetune_llm(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    lora_config: Union[dict, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.lora: lora_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        lora_config=configs[ConfigKeys.lora],
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    temp_directory = tempfile.TemporaryDirectory().name
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )
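# Illustrative usage sketch (assumed names and values, not part of the original module):
# running the fine-tuning handler as an MLRun job.
#
#   import mlrun
#   fn = mlrun.code_to_function("huggingface-auto-trainer", kind="job",
#                               filename="huggingface_auto_trainer.py", image="mlrun/mlrun")
#   fn.run(
#       handler="finetune_llm",
#       params={
#           "model": ["gpt2", "transformers.AutoModelForCausalLM"],
#           "train_dataset": "imdb",
#           "training_num_train_epochs": 1,  # prefixed kwargs update the training config
#           "use_cuda": False,
#       },
#   )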


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluate the model using perplexity. For more information, visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:     mlrun context
    :param model_path:  path to the model directory
    :param data:        the data to evaluate the model
    :param model_name:  name of base model
    :param tokenizer_name: name of base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
 commands: [] - code_origin: https://github.com/ZeevRispler/functions.git#a63a647cf6bc3015a8dcbd18903f9db44bdf0b66:/Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py - origin_filename: /Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py + code_origin: '' + origin_filename: '' requirements: [] entry_points: add_interface: @@ -25,183 +27,161 @@ spec: doc: '' parameters: - name: cls - default: '' - name: obj type: Trainer - default: '' - name: restoration type: MLRunInterfaceRestorationType default: null - outputs: - - default: '' + outputs: [] lineno: 70 + has_varargs: false + has_kwargs: false mlrun_train: name: mlrun_train doc: '' parameters: - name: cls - default: '' - outputs: - - default: '' + outputs: [] lineno: 80 + has_varargs: false + has_kwargs: false wrapper: name: wrapper doc: '' parameters: - name: self type: Trainer - default: '' - outputs: - - default: '' + outputs: [] lineno: 81 + has_varargs: true + has_kwargs: true on_epoch_begin: name: on_epoch_begin doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 129 + has_varargs: false + has_kwargs: true on_epoch_end: name: on_epoch_end doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 140 + has_varargs: false + has_kwargs: true on_log: name: on_log doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - name: logs type: Dict[str, float] default: null - outputs: - - default: '' + outputs: [] lineno: 151 + has_varargs: false + has_kwargs: true on_train_begin: name: on_train_begin doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 177 + has_varargs: false + has_kwargs: true on_train_end: name: on_train_end doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - name: model type: PreTrainedModel default: null - name: tokenizer type: PreTrainedTokenizer default: null - outputs: - - default: '' + outputs: [] lineno: 188 + has_varargs: false + has_kwargs: true on_evaluate: name: on_evaluate doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 201 + has_varargs: false + has_kwargs: true log_metrics: name: log_metrics doc: '' parameters: - name: self - default: '' - outputs: - - default: '' + outputs: [] lineno: 215 + has_varargs: false + has_kwargs: false log_metric_plot: name: log_metric_plot doc: '' parameters: - name: self - default: '' - name: name type: str - default: '' - name: scores type: List[float] - default: '' - outputs: - - default: '' + outputs: [] lineno: 222 + has_varargs: false + has_kwargs: 
false apply_mlrun: name: apply_mlrun doc: This is temporary and will be built in mlrun 1.5.0 parameters: - name: trainer type: Trainer - default: '' - name: model_name type: str default: null @@ -220,9 +200,10 @@ spec: - name: extra_data type: dict default: null - outputs: - - default: '' + outputs: [] lineno: 244 + has_varargs: false + has_kwargs: true finetune_llm: name: finetune_llm doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ @@ -233,11 +214,9 @@ spec: - name: context type: MLClientCtx doc: mlrun context in order to log trained model - default: '' - name: train_dataset type: Union[str, mlrun.datastore.DataItem] doc: The train dataset used for fine-tuning the language model. - default: '' - name: eval_dataset type: str doc: The eval dataset used for evaluate the language model during training. @@ -306,9 +285,10 @@ spec: - name: device_map type: str default: auto - outputs: - - default: '' + outputs: [] lineno: 630 + has_varargs: false + has_kwargs: true evaluate: name: evaluate doc: 'Evaluating the model using perplexity, for more information visit: @@ -317,14 +297,11 @@ spec: parameters: - name: context doc: mlrun context - default: '' - name: model_path doc: path to the model directory - default: '' - name: data type: DataFrame doc: the data to evaluate the model - default: '' - name: model_name type: str doc: name of base model @@ -333,9 +310,10 @@ spec: type: str doc: name of base tokenizer default: null - outputs: - - default: '' + outputs: [] lineno: 784 + has_varargs: false + has_kwargs: false description: fine-tune llm model with ease default_handler: finetune_llm disable_auto_mount: false diff --git a/functions/master/huggingface_auto_trainer/latest/src/item.yaml b/functions/master/huggingface_auto_trainer/latest/src/item.yaml index e556c11d..b7c9bbcc 100644 --- a/functions/master/huggingface_auto_trainer/latest/src/item.yaml +++ b/functions/master/huggingface_auto_trainer/latest/src/item.yaml @@ -1,5 +1,7 @@ apiVersion: v1 categories: +- huggingface +- genai - machine-learning - model-training description: fine-tune llm model with ease @@ -22,4 +24,4 @@ spec: kind: job requirements: [] url: '' -version: 1.0.0 +version: 1.1.0 diff --git a/functions/master/huggingface_auto_trainer/latest/static/function.html b/functions/master/huggingface_auto_trainer/latest/static/function.html index 9a1f2953..a989880e 100644 --- a/functions/master/huggingface_auto_trainer/latest/static/function.html +++ b/functions/master/huggingface_auto_trainer/latest/static/function.html @@ -19,11 +19,13 @@ metadata: name: huggingface-auto-trainer tag: '' - hash: 4459f0b675c36a20c8f542126a96b98b0ac82271 + hash: 55c9aa4a822780f7388819ccf633dfe26b31f02e project: '' labels: author: Zeevr categories: + - huggingface + - genai - machine-learning - model-training spec: @@ -33,8 +35,8 @@ build: functionSourceCode: import importlib
import os
import shutil
import tempfile
import zipfile
from abc import ABC
from typing import Dict, List, Tuple, Union

import mlrun
import numpy as np
import pandas as pd
import peft
import torch
import transformers
from datasets import Dataset, load_dataset
from mlrun.artifacts.manager import Artifact, PlotlyArtifact
from mlrun.datastore import is_store_uri
from mlrun.frameworks._common import CommonTypes, MLRunInterface
from mlrun.utils import logger
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from plotly import graph_objects as go
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          PreTrainedModel, PreTrainedTokenizer, Trainer,
                          TrainerCallback, TrainerControl, TrainerState,
                          TrainingArguments)

supported_tasks = [
    "question-answering",
    "summarization",
    "table-question-answering",
    "text2text-generation",
    "text-classification",
    "sentiment-analysis",
    "text-generation",
    "token-classification",
    "translation",
    "translation_xx_to_yy",
]


class ConfigKeys:
    deepspeed = "deepspeed"
    quantization = "quantization"
    lora = "lora"
    training = "training"
    tokenizer_pretrained = "tokenizer_pretrained"
    model_pretrained = "model_pretrained"
    data_collator = "data_collator"


# ----------------------from MLRUN--------------------------------
class HFTrainerMLRunInterface(MLRunInterface, ABC):
    """
    This is temporary and will be built in mlrun 1.5.0
    Interface for adding MLRun features to the Hugging Face `Trainer` API.
    """

    # MLRuns context default name:
    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"

    # Attributes to replace so the MLRun interface will be fully enabled.
    _REPLACED_METHODS = [
        "train",
        # "evaluate"
    ]

    @classmethod
    def add_interface(
        cls,
        obj: Trainer,
        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
    ):
        super(HFTrainerMLRunInterface, cls).add_interface(
            obj=obj, restoration=restoration
        )

    @classmethod
    def mlrun_train(cls):
        def wrapper(self: Trainer, *args, **kwargs):
            # Restore the evaluation method as `train` will use it:
            # cls._restore_attribute(obj=self, attribute_name="evaluate")

            # Call the original fit method:
            result = self.original_train(*args, **kwargs)

            # Replace the evaluation method again:
            # cls._replace_function(obj=self, function_name="evaluate")

            return result

        return wrapper


class MLRunCallback(TrainerCallback):
    """
    This is temporary and will be built in mlrun 1.5.0
    Callback for collecting logs during training / evaluation of the `Trainer` API.
    """

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        model_name: str = "model",
        tag: str = "",
        labels: Dict[str, str] = None,
        extra_data: dict = None,
    ):
        super().__init__()

        # Store the configurations:
        self._context = (
            context
            if context is not None
            else mlrun.get_or_create_ctx("./mlrun-huggingface")
        )
        self._model_name = model_name
        self._tag = tag
        self._labels = labels
        self._extra_data = extra_data if extra_data is not None else {}

        # Set up the logging mode:
        self._is_training = False
        self._steps: List[List[int]] = []
        self._metric_scores: Dict[str, List[float]] = {}
        self._artifacts: Dict[str, Artifact] = {}

    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._steps.append([])

    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float] = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        recent_logs = state.log_history[-1].copy()

        recent_logs.pop("epoch")
        current_step = int(recent_logs.pop("step"))
        if current_step not in self._steps[-1]:
            self._steps[-1].append(current_step)

        for metric_name, metric_score in recent_logs.items():
            if metric_name.startswith("train_"):
                if metric_name.split("train_")[1] not in self._metric_scores:
                    self._metric_scores[metric_name] = [metric_score]
                continue
            if metric_name not in self._metric_scores:
                self._metric_scores[metric_name] = []
            self._metric_scores[metric_name].append(metric_score)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self._is_training = True

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: PreTrainedModel = None,
        tokenizer: PreTrainedTokenizer = None,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if not state.is_world_process_zero:
            return
        self.log_metrics()

        if self._is_training:
            return

    def log_metrics(self):
        for metric_name, metric_scores in self._metric_scores.items():
            self._context.log_result(key=metric_name, value=metric_scores[-1])
            if len(metric_scores) > 1:
                self.log_metric_plot(name=metric_name, scores=metric_scores)
        self._context.commit(completed=False)

    def log_metric_plot(self, name: str, scores: List[float]):
        # Initialize a plotly figure:
        metric_figure = go.Figure()

        # Add titles:
        metric_figure.update_layout(
            title=name.capitalize().replace("_", " "),
            xaxis_title="Samples",
            yaxis_title="Scores",
        )

        # Draw:
        metric_figure.add_trace(
            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
        )

        # Create the plotly artifact:
        artifact_name = f"{name}_plot"
        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
        self._artifacts[artifact_name] = self._context.log_artifact(artifact)


def apply_mlrun(
    trainer: transformers.Trainer,
    model_name: str = None,
    tag: str = "",
    context: mlrun.MLClientCtx = None,
    auto_log: bool = True,
    labels: Dict[str, str] = None,
    extra_data: dict = None,
    **kwargs,
):
    """
    This is temporary and will be built in mlrun 1.5.0
    """
    # Get parameters defaults:
    if context is None:
        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)

    HFTrainerMLRunInterface.add_interface(obj=trainer)

    if auto_log:
        trainer.add_callback(
            MLRunCallback(
                context=context,
                model_name=model_name,
                tag=tag,
                labels=labels,
                extra_data=extra_data,
            )
        )


# ----------------------end from MLRUN--------------------------------


def _print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
        f" {100 * trainable_params / all_param}"
    )


# default configs
# used if the user passes `True` for the corresponding config name
QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

LORA_CONFIG = peft.LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

DEEPSPEED_CONFIG = {
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {"enabled": True},
    "autotuning": {
        "enabled": True,
        "arg_mappings": {
            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
        },
    },
    "zero_optimization": {
        "stage": 2,
    },
}


def _update_config(src: dict, dst: dict):
    """
    Update the configs according to the user's input, so the user can add or modify values in the default configs.

    Goes over all configs and their corresponding prefixes, collects all keys from the given dict that start
    with the prefix, and adds them to the appropriate config.

    :param src: dict of all candidate values to update dict.
    :param dst: dict containing all configs to update.
    """

    for config_name, config in dst.items():

        # If given True we use the default config
        # Can also be False or a config dict given by the user, so we check specifically for True
        if config is True and config_name == "quantization":
            config = QUANTIZATION_CONFIG

        if config is True and config_name == "lora":
            config = LORA_CONFIG

        if config is True and config_name == "deepspeed":
            config = DEEPSPEED_CONFIG

        # in some cases we can get a boolean value, in that case no need to look for args
        if isinstance(config, bool):
            config = None

        elif isinstance(config, dict):
            for key, val in src.items():
                if key.startswith(config_name):
                    config[key.replace(f"{config_name}_", "")] = val

        # update by config name
        else:
            for key, val in src.items():
                if key.startswith(config_name):
                    setattr(config, key.replace(f"{config_name}_", ""), val)

        dst.update({config_name: config})


def _get_class_object(class_path: str) -> type:
    """
    given a full class name, this function returns the correct class

    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')

    :return: the wanted class object
    """
    module_path, class_name = class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


def _set_model_and_tokenizer(
    model: Union[str, List[str]],
    tokenizer: Union[str, List[str]],
    task: str,
    framework: str,
    lora_config: dict,
    quantization_config: dict,
    use_cuda: bool,
    tokenizer_pretrained_config,
    model_pretrained_config,
    device_map: str,
):
    """
    get the correct model and tokenizer according to given user inputs

    :param model: a tuple containing model name and class, or str with model name or path
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param task: a supported nlp task, used to choose model if not provided
    :param framework: pt or tf
    :param lora_config: lora config or None, to load model in appropriate way
    :param quantization_config: quantization config or None, to load model in appropriate way
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param device_map: a device map for model training when using multiple GPUs

    :returns: model and tokenizer
    """
    # if task is not supported and no model was given we can't choose one
    if task and task not in supported_tasks and not model:
        logger.error("unsupported task option chosen")
        raise ValueError("unsupported task option chosen")

    # load model from store
    if isinstance(model, str) and is_store_uri(model):
        pass
        # TODO: load both model and tokenizer and return, need guy's help

    # if it's a list then we assume it contains both the name and the class
    if isinstance(model, list):
        model_name, model_class = model
        model_class = _get_class_object(model_class)

    # in the case we don't get the model class we need the task in order to choose the correct model
    else:
        if task is None:
            logger.error("task must be chosen in order to determine the correct model")
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        _, available_classes, task_options = transformers.pipelines.check_task(task)

        if isinstance(model, str):
            model_name = model

        # if model is not given, we take the default model for the given task
        else:
            model_name, _ = transformers.pipelines.get_default_model_and_revision(
                available_classes, framework, task_options
            )
        if not available_classes.get(framework, tuple()):
            logger.error(
                "given task's default model is not supported in specified framework"
            )
            raise Exception(
                "this function requires either a supported task or a model and model class to be chosen"
            )

        model_class = available_classes[framework][0]

    # load the pretrained model
    if use_cuda:
        device_map = device_map
    else:
        device_map = None

    model = model_class.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        **model_pretrained_config,
    )

    # If quantization config is given we will load a quantized model, if not a regular one
    if quantization_config:
        model.gradient_checkpointing_enable()
        model = peft.prepare_model_for_kbit_training(model)

    # If lora config was given we want to do lora fine tune, we update model here
    if lora_config:
        model = peft.get_peft_model(model, lora_config)

    # if not specified, we choose the default tokenizer corresponding to the model
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        return model_name, model, tokenizer

    if isinstance(tokenizer, str):
        tokenizer_name = tokenizer
        tokenizer_class = transformers.AutoTokenizer

    # if it's not a str then it's a tuple of both name and class
    else:
        tokenizer_name, tokenizer_class = tokenizer
        tokenizer_class = _get_class_object(tokenizer_class)

    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name, **tokenizer_pretrained_config
    )

    tokenizer.pad_token = tokenizer.eos_token

    return model_name, model, tokenizer


def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
    """
    loads the specific dataset provided by the user

    :param dataset: name or path of dataset to load
    :param is_train: bool that indicates the purpose of the dataset
    :param kwargs: other kwargs for loading the dataset

    :returns: loaded dataset
    """
    # if split in kwargs then the user decides how to split the dataset
    if "split" in kwargs:
        return load_dataset(dataset, **kwargs)

    # if it's a training dataset, we load the train split
    if is_train:
        return load_dataset(dataset, split="train", **kwargs)

    # if it's an eval dataset, several split names are acceptable, so we check all of them
    dataset = load_dataset(dataset, **kwargs)
    if "test" in dataset:
        return dataset.get("test")
    elif "eval" in dataset:
        return dataset.get("eval")
    elif "validation" in dataset:
        return dataset.get("validation")


def _prepare_dataset(
    train_dataset: str,
    eval_dataset: str,
    train_load_dataset_kwargs,
    eval_load_dataset_kwargs,
    tokenizer,
    dataset_columns_to_train: Union[str, list],
) -> (Dataset, Union[Dataset, None]):
    """
    Loads the train and eval datasets (if provided), passes them through the tokenizer, and
    returns them ready to use in training

    :param train_dataset: the name or path to the train dataset
    :param eval_dataset: the name or path to the eval dataset
    :param dataset_columns_to_train: which columns to pass to the model as inputs
                                        (need to pass through the tokenizer first)
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param tokenizer: the tokenizer to pass the data through

    :returns: tokenized datasets
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # we take col name/s in a list for easy generalization
    if isinstance(dataset_columns_to_train, str):
        dataset_columns_to_train = [dataset_columns_to_train]

    if isinstance(train_dataset, mlrun.datastore.DataItem):
        train_dataset = Dataset.from_pandas(train_dataset.as_df())
        return (
            train_dataset.map(
                lambda examples: tokenizer(
                    *[examples[col] for col in dataset_columns_to_train],
                    truncation=True,
                    padding=True,
                ),
                batched=True,
            ),
            None,
        )

    # Load datasets
    # if provided two paths/names we load each separately using designated func
    if eval_dataset:
        train_dataset = _dataset_loader(
            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
        )
        eval_dataset = _dataset_loader(
            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
        )

    # if only one path is given, we must check whether it contains both datasets or only one
    else:
        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
        if "train" in dataset:
            train_dataset = dataset.get("train")
            if "test" in dataset:
                eval_dataset = dataset.get("test")
            elif "eval" in dataset:
                eval_dataset = dataset.get("eval")
            elif "validation" in dataset:
                eval_dataset = dataset.get("validation")
            else:
                # only train dataset given, tokenize and return it
                return (
                    train_dataset.map(
                        lambda examples: tokenizer(
                            *[examples[col] for col in dataset_columns_to_train],
                            truncation=True,
                            padding=True,
                        ),
                        batched=True,
                    ),
                    None,
                )
        else:
            logger.error("train dataset is mandatory")
            raise KeyError("no train dataset found in given dataset")

    # Tokenize the data so the model can understand it
    tokenized_train_dataset = train_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    tokenized_eval_dataset = eval_dataset.map(
        lambda examples: tokenizer(
            *[examples[col] for col in dataset_columns_to_train],
            truncation=True,
            padding=True,
        ),
        batched=True,
    )

    return tokenized_train_dataset, tokenized_eval_dataset


def finetune_llm(
    context: mlrun.MLClientCtx,
    train_dataset: Union[str, mlrun.datastore.DataItem],
    eval_dataset: str = None,
    train_load_dataset_kwargs: dict = {},
    eval_load_dataset_kwargs: dict = {},
    dataset_columns_to_train: Union[str, list] = "text",
    model: Union[str, List[str]] = "huggingface-model",
    tokenizer: Union[str, List[str]] = None,
    deepspeed_config: Union[dict, bool] = False,
    quantization_config: Union[dict, bool] = False,
    lora_config: Union[dict, bool] = False,
    training_config: dict = {},
    model_pretrained_config: dict = {},
    tokenizer_pretrained_config: dict = {},
    data_collator_config: dict = {},
    task: str = "text-generation",
    use_cuda: bool = True,
    framework: str = "pt",
    device_map: str = "auto",
    **kwargs,
):
    """
    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
     The function takes various configuration parameters to customize the training process
     and adapt the model to specific tasks using a provided dataset.

    :param context: mlrun context in order to log trained model
    :param dataset_columns_to_train: which columns to pass to the model as inputs
    :param eval_load_dataset_kwargs: kwargs for dataset loading
    :param train_load_dataset_kwargs: kwargs for dataset loading
    :param framework: pt or tf
    :param use_cuda: use gpu or not
    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
    :param model_pretrained_config: config to load the pretrained model
    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
    :param model: a tuple containing model name and class, or str with model name or path
    :param train_dataset: The train dataset used for fine-tuning the language model.
    :param eval_dataset: The eval dataset used to evaluate the language model during training.
    :param deepspeed_config: Configuration options for DeepSpeed (optional).
    :param quantization_config: Configuration options for model quantization (optional).
    :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional).
    :param training_config: Configuration options specific to the fine-tuning training process (optional).
    :param data_collator_config: Configuration options for data collation during training (optional).
    :param task: A description of the specific task the model is being fine-tuned for.
    :param kwargs: Additional keyword arguments.
    """

    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design

    # Look for updates to configs given in kwargs
    configs = {
        ConfigKeys.deepspeed: deepspeed_config,
        ConfigKeys.quantization: quantization_config,
        ConfigKeys.lora: lora_config,
        ConfigKeys.training: training_config,
        ConfigKeys.model_pretrained: model_pretrained_config,
        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
        ConfigKeys.data_collator: data_collator_config,
    }
    _update_config(dst=configs, src=kwargs)

    # check gpu permission and availability
    if use_cuda:
        if torch.cuda.is_available():
            # Clean gpu cache
            torch.cuda.empty_cache()
        else:
            logger.warning("'use_cuda' is set to True, but no cuda device is available")

    # get model and tokenizer
    model_name, model, tokenizer = _set_model_and_tokenizer(
        model=model,
        tokenizer=tokenizer,
        task=task,
        framework=framework,
        lora_config=configs[ConfigKeys.lora],
        quantization_config=configs[ConfigKeys.quantization],
        use_cuda=use_cuda,
        tokenizer_pretrained_config=tokenizer_pretrained_config,
        model_pretrained_config=configs[ConfigKeys.model_pretrained],
        device_map=device_map,
    )

    # Load datasets
    tokenized_train, tokenized_eval = _prepare_dataset(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_load_dataset_kwargs=train_load_dataset_kwargs,
        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
        tokenizer=tokenizer,
        dataset_columns_to_train=dataset_columns_to_train,
    )

    # Initialize the data collator for the trainer to use in order to create batches of data
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, **data_collator_config
    )

    # Initialize training kwargs from user kwargs:
    train_kwargs = configs[ConfigKeys.training]

    # If deepspeed config given we add it to training kwargs
    if configs[ConfigKeys.deepspeed]:
        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]

    # Take a look at the trainable parameters in the model
    _print_trainable_parameters(model)

    # Preparing training arguments:
    training_args = transformers.TrainingArguments(
        output_dir=tempfile.mkdtemp(),
        **train_kwargs,
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
    )

    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )

    # Apply training with evaluation:
    context.logger.info(f"training '{model_name}'")
    trainer.train()

    temp_directory = tempfile.TemporaryDirectory().name
    trainer.save_model(temp_directory)

    # Zip the model directory:
    shutil.make_archive(
        base_name="model",
        format="zip",
        root_dir=temp_directory,
    )

    # Log the model:
    context.log_model(
        key="model",
        db_key=model_name.split("/")[-1],
        model_file="model.zip",
        tag="",
        framework="Hugging Face",
    )
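

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): one way to run the
# `finetune_llm` handler through MLRun. The model id, dataset name and
# hyper-parameter values below are hypothetical placeholders; consult the
# function docs for the exact formats expected by `model` and `tokenizer`.
def _example_finetune_run():
    import mlrun

    fn = mlrun.import_function("hub://huggingface_auto_trainer")
    return fn.run(
        handler="finetune_llm",
        params={
            "model": "gpt2",  # hypothetical base model
            "tokenizer": "gpt2",
            "train_dataset": "Abirate/english_quotes",  # hypothetical HF dataset
            "training_config": {
                "num_train_epochs": 1,
                "per_device_train_batch_size": 4,
            },
            "use_cuda": True,
        },
        local=True,
    )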


def evaluate(
    context,
    model_path,
    data: pd.DataFrame,
    model_name: str = None,
    tokenizer_name: str = None,
):
    """
    Evaluate the model using perplexity. For more information visit:
    https://huggingface.co/docs/transformers/perplexity

    :param context:        mlrun context
    :param model_path:     path to the model directory
    :param data:           the data to evaluate the model on
    :param model_name:     name of the base model
    :param tokenizer_name: name of the base tokenizer
    """
    # Get the model artifact and file:
    (
        model_file,
        model_artifact,
        extra_data,
    ) = mlrun.artifacts.get_model(model_path)

    # Read the name:
    _model_name = model_artifact.spec.db_key

    # Extract logged model files:
    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
    with zipfile.ZipFile(model_file, "r") as zip_file:
        zip_file.extractall(model_directory)

    # Loading the saved pretrained tokenizer and model:
    dataset = Dataset.from_pandas(data)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_directory)
    model.eval()
    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")

    max_length = 1024
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids.cuda(), labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    context.log_result("perplexity", ppl)
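

# Illustrative note (not part of the original handler): perplexity is the
# exponential of the mean negative log-likelihood over the strided windows,
# i.e. ppl = exp(mean(nll)). A minimal standalone sketch of that reduction on
# dummy loss values:
def _example_perplexity_from_nlls():
    dummy_nlls = [torch.tensor(2.1), torch.tensor(1.9), torch.tensor(2.0)]
    # exp(mean NLL); for a mean NLL of 2.0 this is exp(2.0) ~= 7.39
    return torch.exp(torch.stack(dummy_nlls).mean()).item()
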
 commands: [] - code_origin: https://github.com/ZeevRispler/functions.git#a63a647cf6bc3015a8dcbd18903f9db44bdf0b66:/Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py - origin_filename: /Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py + code_origin: '' + origin_filename: '' requirements: [] entry_points: add_interface: @@ -42,183 +44,161 @@ doc: '' parameters: - name: cls - default: '' - name: obj type: Trainer - default: '' - name: restoration type: MLRunInterfaceRestorationType default: null - outputs: - - default: '' + outputs: [] lineno: 70 + has_varargs: false + has_kwargs: false mlrun_train: name: mlrun_train doc: '' parameters: - name: cls - default: '' - outputs: - - default: '' + outputs: [] lineno: 80 + has_varargs: false + has_kwargs: false wrapper: name: wrapper doc: '' parameters: - name: self type: Trainer - default: '' - outputs: - - default: '' + outputs: [] lineno: 81 + has_varargs: true + has_kwargs: true on_epoch_begin: name: on_epoch_begin doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 129 + has_varargs: false + has_kwargs: true on_epoch_end: name: on_epoch_end doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 140 + has_varargs: false + has_kwargs: true on_log: name: on_log doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - name: logs type: Dict[str, float] default: null - outputs: - - default: '' + outputs: [] lineno: 151 + has_varargs: false + has_kwargs: true on_train_begin: name: on_train_begin doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 177 + has_varargs: false + has_kwargs: true on_train_end: name: on_train_end doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - name: model type: PreTrainedModel default: null - name: tokenizer type: PreTrainedTokenizer default: null - outputs: - - default: '' + outputs: [] lineno: 188 + has_varargs: false + has_kwargs: true on_evaluate: name: on_evaluate doc: '' parameters: - name: self - default: '' - name: args type: TrainingArguments - default: '' - name: state type: TrainerState - default: '' - name: control type: TrainerControl - default: '' - outputs: - - default: '' + outputs: [] lineno: 201 + has_varargs: false + has_kwargs: true log_metrics: name: log_metrics doc: '' parameters: - name: self - default: '' - outputs: - - default: '' + outputs: [] lineno: 215 + has_varargs: false + has_kwargs: false log_metric_plot: name: log_metric_plot doc: '' parameters: - name: self - default: '' - name: name type: str - default: '' - name: scores type: List[float] - default: '' - outputs: - - default: '' + outputs: [] lineno: 222 + has_varargs: false + has_kwargs: false 
apply_mlrun: name: apply_mlrun doc: This is temporary and will be built in mlrun 1.5.0 parameters: - name: trainer type: Trainer - default: '' - name: model_name type: str default: null @@ -237,9 +217,10 @@ - name: extra_data type: dict default: null - outputs: - - default: '' + outputs: [] lineno: 244 + has_varargs: false + has_kwargs: true finetune_llm: name: finetune_llm doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ @@ -250,11 +231,9 @@ - name: context type: MLClientCtx doc: mlrun context in order to log trained model - default: '' - name: train_dataset type: Union[str, mlrun.datastore.DataItem] doc: The train dataset used for fine-tuning the language model. - default: '' - name: eval_dataset type: str doc: The eval dataset used for evaluate the language model during training. @@ -323,9 +302,10 @@ - name: device_map type: str default: auto - outputs: - - default: '' + outputs: [] lineno: 630 + has_varargs: false + has_kwargs: true evaluate: name: evaluate doc: 'Evaluating the model using perplexity, for more information visit: @@ -334,14 +314,11 @@ parameters: - name: context doc: mlrun context - default: '' - name: model_path doc: path to the model directory - default: '' - name: data type: DataFrame doc: the data to evaluate the model - default: '' - name: model_name type: str doc: name of base model @@ -350,9 +327,10 @@ type: str doc: name of base tokenizer default: null - outputs: - - default: '' + outputs: [] lineno: 784 + has_varargs: false + has_kwargs: false description: fine-tune llm model with ease default_handler: finetune_llm disable_auto_mount: false diff --git a/functions/master/huggingface_auto_trainer/latest/static/item.html b/functions/master/huggingface_auto_trainer/latest/static/item.html index 2ca49a41..be5b35b4 100644 --- a/functions/master/huggingface_auto_trainer/latest/static/item.html +++ b/functions/master/huggingface_auto_trainer/latest/static/item.html @@ -17,6 +17,8 @@ apiVersion: v1 categories: +- huggingface +- genai - machine-learning - model-training description: fine-tune llm model with ease @@ -39,7 +41,7 @@ kind: job requirements: [] url: '' -version: 1.0.0 +version: 1.1.0 diff --git a/functions/master/pii_recognizer/0.3.0/src/data/config.csv b/functions/master/pii_recognizer/0.3.0/src/data/config.csv new file mode 100644 index 00000000..fe2c350e --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/data/config.csv @@ -0,0 +1,3 @@ +input_file,output_file +data/pii.txt,data/pii_out.txt +data/letter.txt,data/letter_out.txt diff --git a/functions/master/pii_recognizer/0.3.0/src/data/letter.txt b/functions/master/pii_recognizer/0.3.0/src/data/letter.txt new file mode 100644 index 00000000..59d25e78 --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/data/letter.txt @@ -0,0 +1,12 @@ +Dear Mr. John Doe, + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of Riviera. Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. 
+ +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. + +Best regards, + +Jane Smith +Customer Support Representative diff --git a/functions/master/pii_recognizer/0.3.0/src/data/output/letter_output.txt b/functions/master/pii_recognizer/0.3.0/src/data/output/letter_output.txt new file mode 100644 index 00000000..468533af --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/data/output/letter_output.txt @@ -0,0 +1,12 @@ +Dear Mr. , + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of . Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. + +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. + +Best regards, + + + Support diff --git a/functions/master/pii_recognizer/0.3.0/src/data/output/pii_output.txt b/functions/master/pii_recognizer/0.3.0/src/data/output/pii_output.txt new file mode 100644 index 00000000..1160e497 --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/data/output/pii_output.txt @@ -0,0 +1 @@ + is , connect him with or , he can pay you with \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.3.0/src/data/pii.txt b/functions/master/pii_recognizer/0.3.0/src/data/pii.txt new file mode 100644 index 00000000..8886cc08 --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/data/pii.txt @@ -0,0 +1 @@ +John smith's ssn is 182838483, connect him with John_smith@gmail.com or 6288389029, he can pay you with 41482929939393 \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.3.0/src/function.yaml b/functions/master/pii_recognizer/0.3.0/src/function.yaml new file mode 100644 index 00000000..069fa1ff --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/function.yaml @@ -0,0 +1,129 @@ +kind: job +metadata: + name: pii-recognizer + tag: '' + hash: 818930645d33704e9cada919769ee9d93cbb9434 + project: '' + labels: + author: pgw + categories: + - machine-learning + - data-preparation + - NLP +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import pathlib
import tempfile
import warnings
from typing import List, Set, Tuple, Union

import annotated_text.util as at_util
import mlrun
import nltk
import pandas as pd
import presidio_analyzer as pa
import presidio_anonymizer as pre_anonymizer
from presidio_anonymizer.entities import OperatorConfig
from tqdm import tqdm

try:
    import flair as fl
except ModuleNotFoundError:
    print("Flair is not installed")

# There is a conflict between Rust-based tokenizers' parallel processing
# and Python's fork operations during multiprocessing. To avoid this, we need
# the following two lines

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

logger = logging.getLogger("pii-recognizer")


# Add the constant classes of Models and Entities to govern the whole package
class Models:
    WHOLE = "whole"
    PATTERN = "pattern"
    SPACY = "spacy"
    FLAIR = "flair"


class Entities:
    CREDIT_CARD = "CREDIT_CARD"
    SSN = "SSN"
    PHONE = "PHONE"
    EMAIL = "EMAIL"
    LOCATION = "LOCATION"
    PERSON = "PERSON"
    NRP = "NRP"
    ORGANIZATION = "ORGANIZATION"
    DATE_TIME = "DATE_TIME"
    GPE = "GPE"
    MAC_ADDRESS = "MAC_ADDRESS"
    US_BANK_NUMBER = "US_BANK_NUMBER"
    IMEI = "IMEI"
    TITLE = "TITLE"
    LICENSE_PLATE = "LICENSE_PLATE"
    US_PASSPORT = "US_PASSPORT"
    CURRENCY = "CURRENCY"
    ROUTING_NUMBER = "ROUTING_NUMBER"
    US_ITIN = "US_ITIN"
    US_DRIVER_LICENSE = "US_DRIVER_LICENSE"
    AGE = "AGE"
    PASSWORD = "PASSWORD"
    SWIFT_CODE = "SWIFT_CODE"


class PatternRecognizerFactory:
    """
    Factory for creating pattern recognizers, it can be extended in the future to
    add more regex pattern for different entities. For the pattern recognizer to work,
    we need construct a list of regex patterns for each entity.
    """

    RECOGNIZABLE_ENTITIES = {
        "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)],
        "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)],
        "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)],
        "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)],
    }

    # create a list of pattern recognizers
    @classmethod
    def _create_pattern_recognizer(cls):
        """
        For each entity, create a list of patterns to recognize it

        :param cls: PatternRecognizerFactory class

        :returns: List of pattern recognizers
        """

        # Entities to recognize and their regex patterns

        return [
            pa.PatternRecognizer(supported_entity=entity, patterns=pattern)
            for entity, pattern in cls.RECOGNIZABLE_ENTITIES.items()
        ]
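

# Illustrative sketch (not part of the original module): extending the factory
# with an additional regex-based entity. The "IP_ADDRESS" pattern below is a
# hypothetical, intentionally simplistic example.
def _example_extend_pattern_recognizers():
    PatternRecognizerFactory.RECOGNIZABLE_ENTITIES["IP_ADDRESS"] = [
        pa.Pattern("IP_ADDRESS", r"\b(?:\d{1,3}\.){3}\d{1,3}\b", 0.5)
    ]
    # the factory now also builds a recognizer for the new entity
    return PatternRecognizerFactory._create_pattern_recognizer()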


class CustomSpacyRecognizer(pa.LocalRecognizer):
    """
    Custom Spacy Recognizer from Presidio Analyzer trained on Privy data.
    The privy data is generated using this https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy
    It can be used to recognize custom entities, Since we want to use Presidio's Registries to generate AnalyzerEngine,
    it inherits from Presidio Analyzer's LocalRecognizer class.
    """

    # Entities to recognize

    RECOGNIZABLE_ENTITIES = {
        "LOCATION",
        "PERSON",
        "NRP",
        "ORGANIZATION",
        "DATE_TIME",
    }

    # Default explanation for this recognizer

    _DEFAULT_EXPLANATION = (
        "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)"
    )

    # Label groups to check

    _DEFAULT_CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"DATE_TIME"}, {"DATE_TIME"}),
    ]

    # pretrained model for this recognizer

    _DEFAULT_MODEL_LANGUAGES = {
        "en": "beki/en_spacy_pii_distilbert",
    }

    _DEFAULT_PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NROP": "NRP",
        "DATE_TIME": "DATE_TIME",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: List[str] = None,
        check_label_groups: Tuple[Set, Set] = None,
        context: List[str] = None,
        ner_strength: float = 1,
    ):
        """
        Initialize Spacy Recognizer.

        :param supported_language: Language to use, default is English
        :param supported_entities: Entities to use for recognition
        :param check_label_groups: Label groups to check for the entities
        :param context:            Context to use if any
        :param ner_strength:       Default confidence for NER prediction

        :returns: SpacyRecognizer object
        """

        # Default confidence for NER prediction
        self.ner_strength = ner_strength

        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
        )

    # get the presidio explanation for the result

    def _build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> pa.AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation:    Explanation string

        :returns: Presidio AnalysisExplanation object
        """
        explanation = pa.AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    # main method for the recognizer
    def analyze(self, text: str, entities: List[str], nlp_artifacts=None):  # noqa D102
        """
        Analyze text using Spacy.

        :param text:          Text to analyze
        :param entities:      Entities to analyze
        :param nlp_artifacts: NLP artifacts to use

        :returns: List of Presidio RecognizerResult objects
        """
        results = []
        if not nlp_artifacts:
            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
            return results

        ner_entities = nlp_artifacts.entities

        # recognize the supported entities
        for entity in entities:
            if entity not in self.supported_entities:
                continue
            for ent in ner_entities:
                if not self.__check_label(entity, ent.label_, self.check_label_groups):
                    continue

                # string of the explanation saying the entity is recognized by spacy
                textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_)
                explanation = self._build_spacy_explanation(
                    self.ner_strength, textual_explanation
                )

                # create the standard result with the entity, start, end, score, and explanation
                spacy_result = pa.RecognizerResult(
                    entity_type=entity,
                    start=ent.start_char,
                    end=ent.end_char,
                    score=self.ner_strength,
                    analysis_explanation=explanation,
                    recognition_metadata={
                        pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name
                    },
                )
                results.append(spacy_result)

        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        """
        Check if the label is in the label group.

        :param entity:             Entity to check
        :param label:              Label to check
        :param check_label_groups: Label groups to check

        :returns: True if the label is in the label group, False otherwise
        """
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )


# Class to use Flair with Presidio as an external recognizer.
class FlairRecognizer(pa.EntityRecognizer):
    """
    Wrapper for a flair model, if needed to be used within Presidio Analyzer.
    This is to make sure the recognizer can be registered with Presidio registry.
    """

    RECOGNIZABLE_ENTITIES = {
        "LOCATION",
        "PERSON",
        "NRP",
        "GPE",
        "ORGANIZATION",
        "MAC_ADDRESS",
        "US_BANK_NUMBER",
        "IMEI",
        "TITLE",
        "LICENSE_PLATE",
        "US_PASSPORT",
        "CURRENCY",
        "ROUTING_NUMBER",
        "US_ITIN",
        "US_BANK_NUMBER",
        "US_DRIVER_LICENSE",
        "AGE",
        "PASSWORD",
        "SWIFT_CODE",
    }

    # This is used to construct the explanation for the result

    _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    _DEFAULT_CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"GPE"}, {"GPE"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}),
        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
        ({"IMEI"}, {"IMEI"}),
        ({"TITLE"}, {"TITLE"}),
        ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}),
        ({"US_PASSPORT"}, {"US_PASSPORT"}),
        ({"CURRENCY"}, {"CURRENCY"}),
        ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}),
        ({"AGE"}, {"AGE"}),
        ({"CURRENCY"}, {"CURRENCY"}),
        ({"SWIFT_CODE"}, {"SWIFT_CODE"}),
        ({"US_ITIN"}, {"US_ITIN"}),
        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
        ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}),
    ]

    _DEFAULT_MODEL_LANGUAGES = {
        "en": "beki/flair-pii-distilbert",
    }

    _DEFAULT_PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NROP": "NRP",
        "URL": "URL",
        "US_ITIN": "US_ITIN",
        "US_PASSPORT": "US_PASSPORT",
        "IBAN_CODE": "IBAN_CODE",
        "IP_ADDRESS": "IP_ADDRESS",
        "EMAIL_ADDRESS": "EMAIL",
        "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
        "US_BANK_NUMBER": "US_BANK_NUMBER",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: List[str] = None,
        check_label_groups: Tuple[Set, Set] = None,
    ):
        """
        Initialize the FlairRecognizer.

        :param supported_language: Language to use
        :param supported_entities: Entities to use
        :param check_label_groups: Label groups to check

        :returns: FlairRecognizer object

        """
        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS

        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
        self.model = fl.models.SequenceTagger.load(
            self._DEFAULT_MODEL_LANGUAGES.get(supported_language)
        )

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    # main method for the recognizer
    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: pa.nlp_engine.NlpArtifacts = None,
    ) -> List[pa.RecognizerResult]:
        """
        Analyze text and return the results.

        :param text:          The text for analysis.
        :param entities:      The list of entities to recognize.
        :param nlp_artifacts: Not used by this recognizer but needed for the interface.

        :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections.
        """

        results = []

        sentences = fl.data.Sentence(text)
        self.model.predict(sentences)

        # If no specific list of entities is given, look for all supported entities.
        if not entities:
            entities = self.supported_entities

        # Go over the entities and check if they are in the supported entities list.
        for entity in entities:
            if entity not in self.supported_entities:
                continue

            # Go over the sentences and check if the entity is in the sentence.
            for ent in sentences.get_spans("ner"):
                if not self.__check_label(
                    entity, ent.labels[0].value, self.check_label_groups
                ):
                    continue

                # If the entity is in the sentence, we will add it to the results.
                textual_explanation = self._DEFAULT_EXPLANATION.format(
                    ent.labels[0].value
                )

                # Build the explanation for the result
                explanation = self._build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )

                flair_result = self._convert_to_recognizer_result(ent, explanation)

                results.append(flair_result)

        return results

    def _convert_to_recognizer_result(
        self, entity: fl.data.Span, explanation: pa.AnalysisExplanation
    ) -> pa.RecognizerResult:
        """
        Convert a Flair result to a Presidio RecognizerResult.

        :param entity:      Flair Span entity
        :param explanation: Presidio AnalysisExplanation

        :returns: Presidio RecognizerResult
        """

        # Convert the entity type to Presidio entity type
        entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)

        # Convert the score to Presidio score
        flair_score = round(entity.score, 2)

        # Create the Presidio RecognizerResult from the Flair entity
        flair_results = pa.RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

        return flair_results

    def _build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> pa.AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation:    Explanation string

        :returns: Presidio AnalysisExplanation
        """

        # Create the Presidio AnalysisExplanation for the result
        explanation = pa.AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    # sanity check of the entity and label before recognition
    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )


# get the analyzer engine based on the model
def _get_analyzer_engine(
    model: str = None, entities: List[str] = None
) -> pa.AnalyzerEngine:
    """
    Return pa.AnalyzerEngine.

    :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole".
    :param entities: The list of entities to use.

    :returns: pa.AnalyzerEngine
    """
    # recognizer registry that can store multiple recognizers
    registry = pa.RecognizerRegistry()
    if model == Models.SPACY:
        # custom spacy recognizer
        spacy_recognizer = CustomSpacyRecognizer()
        # add the custom build spacy recognizer
        registry.add_recognizer(spacy_recognizer)
    elif model == Models.FLAIR:
        # pre-trained flair recognizer
        flair_recognizer = FlairRecognizer()
        # add the custom build flair recognizer
        registry.add_recognizer(flair_recognizer)
    elif model == Models.PATTERN:
        # add the pattern recognizer
        pattern_recognizer_factory = PatternRecognizerFactory()
        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
            registry.add_recognizer(recognizer)
    elif model == Models.WHOLE:
        spacy_recognizer = CustomSpacyRecognizer()
        flair_recognizer = FlairRecognizer()
        registry.add_recognizer(spacy_recognizer)
        registry.add_recognizer(flair_recognizer)
        # add the pattern recognizer
        pattern_recognizer_factory = PatternRecognizerFactory()
        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
            registry.add_recognizer(recognizer)
    elif not model and entities:
        if set(entities) & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES:
            spacy_recognizer = CustomSpacyRecognizer()
            registry.add_recognizer(spacy_recognizer)
        if set(entities) & FlairRecognizer.RECOGNIZABLE_ENTITIES:
            flair_recognizer = FlairRecognizer()
            registry.add_recognizer(flair_recognizer)
        # add the pattern recognizer
        if set(entities) & (set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())):
            pattern_recognizer_factory = PatternRecognizerFactory()
            for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
                registry.add_recognizer(recognizer)
    else:
        raise ValueError(
            "The 'model' and 'entities' arguments cannot both be None"
        )
    analyzer = pa.AnalyzerEngine(
        registry=registry,
        supported_languages=["en"],
    )

    supported_entities = analyzer.get_supported_entities()

    if entities and not all(item in supported_entities for item in entities):
        not_supported_entities = [
            item for item in entities if item not in supported_entities
        ]
        raise ValueError(
            f"The current model {model} doesn't support the following entities: {not_supported_entities}. "
            f"Supported entities are: {supported_entities}"
        )
    return analyzer
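

# Illustrative sketch (not part of the original module): building an analyzer
# for the regex-based recognizers only and running it on a hypothetical sample
# sentence.
def _example_pattern_analysis():
    analyzer = _get_analyzer_engine(model=Models.PATTERN)
    return analyzer.analyze(
        text="Contact me at jane.doe@example.com or 212-555-0187",
        language="en",
        entities=["EMAIL", "PHONE"],
        score_threshold=0.5,
    )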


def _get_anonymizer_engine() -> pre_anonymizer.AnonymizerEngine:
    """
    Return AnonymizerEngine.

    :returns: The AnonymizerEngine.
    """
    return pre_anonymizer.AnonymizerEngine()


def _anonymize(
    text: str,
    analyze_results: List[pa.RecognizerResult],
    entity_operator_map: dict = None,
    is_full_text: bool = True,
) -> str:
    """
    Anonymize identified input using Presidio Abonymizer.

    :param text:                The text for analysis.
    :param analyze_results:     The list of Presidio RecognizerResult constructed from
    :param entity_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    :param is_full_text:        Whether the text is full text or not.

    :returns: The anonymized text.
    """
    if not text:
        return ""

    anonymizer_engine = _get_anonymizer_engine()
    if not entity_operator_map:
        operators = None
    else:
        # Create OperatorConfig based on the entity_operator_map
        operators = {
            entity: OperatorConfig(operator_name, operator_params)
            for entity, (operator_name, operator_params) in entity_operator_map.items()
        }

    if is_full_text:
        # Anonymize the entire text
        return anonymizer_engine.anonymize(
            text=text, analyzer_results=analyze_results, operators=operators
        ).text
    # Tokenize the text to sentences
    sentences = nltk.sent_tokenize(text)
    anonymized_sentences = []
    current_idx = 0

    # Find the sentence that has pii entity
    for sentence in sentences:
        start_idx = current_idx
        end_idx = start_idx + len(sentence)

        # Get the entities that are in the sentence, and update the start_idx and end_idx
        sentence_results = [
            pa.RecognizerResult(
                result.entity_type,
                start=result.start - start_idx,
                end=result.end - start_idx,
                score=result.score,
            )
            for result in analyze_results
            if result.start >= start_idx and result.end <= end_idx
        ]

        # If PII is detected
        if sentence_results:
            anonymized_sentence = anonymizer_engine.anonymize(
                text=sentence, analyzer_results=sentence_results, operators=operators
            ).text
            anonymized_sentences.append(anonymized_sentence)

        current_idx = end_idx

    return " ".join(anonymized_sentences)


def _get_tokens(
    text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True
) -> List[str]:
    """
    Get the full tokens or only contains the entities that can form a sentence.

    :param text:            The text for analysis.
    :param analyze_results: The list of Presidio RecognizerResult constructed from
    :param is_full:         Whether return full tokens or just the tokens that only contains the entities that can form a sentence.

    :returns: The tokens.
    """

    tokens = []
    # sort by start index
    results = sorted(analyze_results, key=lambda x: x.start)
    for i, res in enumerate(results):
        if i == 0:
            tokens.append(text[: res.start])

        # append entity text and entity type
        tokens.append((text[res.start : res.end], res.entity_type))

        # if another entity coming i.e. we're not at the last results element,
        # add text up to next entity
        if i != len(results) - 1:
            tokens.append(text[res.end : results[i + 1].start])
        # if no more entities coming, add all remaining text
        else:
            tokens.append(text[res.end :])

    # get only the tokens of sentences that contain entities
    part_annotated_tokens = []
    if not is_full:
        last_end_sentence = 0
        for i, token in enumerate(tokens):
            if any(item in token for item in [".", "!", "?"]) and any(
                type(item) is tuple for item in tokens[last_end_sentence:i]
            ):
                part_annotated_tokens.append(tokens[last_end_sentence:i])
                last_end_sentence = i
        return part_annotated_tokens
    return tokens
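

# Illustrative sketch (not part of the original module): `_get_tokens` returns a
# list that alternates plain-text segments with (entity_text, entity_type)
# tuples, the shape expected by `annotated_text`. With the hypothetical input
# below it returns ["John smith's ssn is ", ("182838483", "SSN"), ", thanks"].
def _example_get_tokens():
    text = "John smith's ssn is 182838483, thanks"
    results = [pa.RecognizerResult(entity_type="SSN", start=20, end=29, score=0.5)]
    return _get_tokens(text, results, is_full=True)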


def _annotate(
    text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True
) -> List[str]:
    """
    Annotate identified input using Presidio Anonymizer.

    :param text:               The text for analysis.
    :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis.
    :param is_full_html:       Whether generate full html or not.

    :returns: The list of tokens with the identified entities.

    """
    return _get_tokens(text, st_analyze_results, is_full_html)


def _process(
    text: str,
    model: pa.AnalyzerEngine,
    score_threshold: float,
    entities: List[str] = None,
    entities_operator_map: dict = None,
    is_full_text: bool = True,
) -> Tuple[str, list]:
    """
    Process the text using the given analyzer model.

    :param text:                  Text to process
    :param model:                 Model to use for processing
    :param entities:              Entities to recognize
    :param entities_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    :param score_threshold:       The score threshold to use for recognition
    :param is_full_text:          Whether to return the full text or just the annotated text

    :returns: A tuple of:

              * the anonymized text
              * the list of Presidio RecognizerResult constructed from analysis
    """

    # get the analyzer engine
    analyzer = model

    # analyze the text that can be used for anonymization
    results = analyzer.analyze(
        text=text,
        language="en",
        entities=entities,
        score_threshold=score_threshold,
        return_decision_process=True,
    )

    # anonymize the text, replace the pii entities with the labels
    anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text)

    return anonymized_text, results


def _get_single_html(
    text: str, results: List[pa.RecognizerResult], is_full_html: bool = True
):
    """
    Generate the html for a single txt file.

    :param text:         The text for analysis.
    :param results:      The list of Presidio RecognizerResult constructed from analysis.
    :param is_full_html: Whether to generate the full html or not.

    :returns: The html string for a single txt file.
    """
    # convert the results to tokens to generate the html
    tokens = _annotate(text, results, is_full_html)
    html = at_util.get_annotated_html(*tokens)

    # replace newlines with <br> so they do not break the html rendering
    # (backslashes are not allowed inside f-string expressions, so the
    # replacement is done before building the string)
    html = html.replace("\n", "<br>")

    html_str = f"<p>{html}</p>"

    return html_str
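

# Illustrative sketch (not part of the original module): rendering a single
# hypothetical analysis result as annotated html.
def _example_single_html():
    text = "John smith's ssn is 182838483, thanks"
    results = [pa.RecognizerResult(entity_type="SSN", start=20, end=29, score=0.5)]
    return _get_single_html(text, results, is_full_html=True)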


def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True):
    """
    Generate the json for a single txt file.

    :param results:        The list of Presidio RecognizerResult constructed from analysis.
    :param is_full_report: Whether to generate the full json or not.

    :returns: The json string for a single txt file.
    """
    # generate the stats report if needed
    if not is_full_report:
        stats = []
        # simplify the stats by dropping the analysis explanation
        for item in results:
            item.analysis_explanation = None
            stats.append(item)
    else:
        stats = results

    return stats


def _get_all_html(
    txt_content: dict,
    res_dict: dict,
    is_full_html: bool = True,
):
    """
    Generate the html for all txt files.

    :param txt_content:  The dictionary of txt file name and content.
    :param res_dict:     The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis.
    :param is_full_html: Whether to generate the full html or not.

    :returns: The html string for all txt files.

    """
    # These are placeholders for the html string
    html_index = "<html><head><title>Highlighted Pii Entities</title></head><body><h1>Highlighted Pii Entities</h1><ul>"
    html_content = ""
    for txt_file, results in res_dict.items():
        txt = txt_content[txt_file]
        html_index += f"<li><a href='#{txt_file}'>{txt_file}</a></li>"
        html_content += f"<li><h2>{txt_file}</h2><p>{_get_single_html(txt, results, is_full_html)}</p></li>"
    html_index += "</ul>"
    html_res = f"{html_index}{html_content}</body></html>"

    return html_res


def _get_all_rpt(res_dict: dict, is_full_report: bool = True):
    """
    Generate the stats report for all txt files.

    :param res_dict:       The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis.
    :param is_full_report: Whether to generate the full report or not.

    :returns: The stats report for all txt files.
    """
    # These are placeholders for the json report
    stats_dict = {}
    for txt_file, results in res_dict.items():
        new_stats = []
        for item in _get_single_json(results, is_full_report):
            if is_full_report:
                item.analysis_explanation = item.analysis_explanation.to_dict()
                new_stats.append(item.to_dict())
            else:
                tmp_dict = item.to_dict()
                tmp_dict.pop("analysis_explanation")
                tmp_dict.pop("recognition_metadata")
                new_stats.append(tmp_dict)
        stats_dict[txt_file] = new_stats
    return stats_dict


def recognize_pii(
    context: mlrun.MLClientCtx,
    input_path: Union[str, pathlib.Path],
    html_key: str,
    score_threshold: float,
    output_directory: str = None,
    entities: List[
        str
    ] = None,  # List of entities to recognize, default is recognizing all
    entity_operator_map: dict = None,
    model: str = None,
    generate_json: bool = True,
    generate_html: bool = True,
    is_full_text: bool = True,
    is_full_html: bool = True,
    is_full_report: bool = True,
) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]:
    """
    Walk through the input path, recognize PII in the text files and store the anonymized text in the output path.
    Generate an html file with each entity highlighted in a different color, and a json report explaining the results.

    :param context:              The MLRun context. This is needed to log the artifacts.
    :param input_path:           The input path of the text files that need to be analyzed.
    :param html_key:             The html key for the artifact.
    :param score_threshold:      The score threshold to mark the recognition as trusted.
    :param output_directory:     The output directory path to store the anonymized text.
    :param entities:             The list of entities to recognize.
    :param entity_operator_map:  The map of entity to operator (mask, redact, replace, keep, hash, and its params)
    :param model:                The model to use. Can be "spacy", "flair", "pattern" or "whole".
    :param generate_json:        Whether to generate the json report of the explanation.
    :param generate_html:        Whether to generate the html report of the explanation.
    :param is_full_text:         Whether to return the full text or only the masked text.
    :param is_full_html:         Whether to return the full html or just the annotated text
    :param is_full_report:       Whether to return the full report or just the score and start, end index

    :returns: A tuple of:

              * Path to the output directory
              * A DataFrame of the original and anonymized file names
              * A dictionary of files that failed to process, mapped to their errors
              * The json report of the explanation (if generate_json is True)

    """

    # Set output directory
    if output_directory is None:
        output_directory = tempfile.mkdtemp()

    # Create the output directory:
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(parents=True, exist_ok=True)

    txt_files_directory = pathlib.Path(input_path)
    successes = []
    errors = {}

    res_dict = {}
    txt_content = {}
    # Load the model:
    analyzer = _get_analyzer_engine(model, entities)
    logger.info("Model loaded")
    # Go over the text files in the input path, analyze and anonymize them:
    for txt_file in tqdm(
        list(txt_files_directory.glob("*.txt")),
        desc="Processing files",
        unit="file",
    ):
        try:
            # Load the str from the text file
            text = txt_file.read_text()
            txt_content[str(txt_file)] = text
            # Process the text to recognize the pii entities in it
            anonymized_text, results = _process(
                text=text,
                model=analyzer,
                entities=entities,
                entities_operator_map=entity_operator_map,
                score_threshold=score_threshold,
                is_full_text=is_full_text,
            )
            res_dict[str(txt_file)] = results
            # Store the anonymized text in the output path
            output_file = output_directory / f"{txt_file.stem}.txt"
            output_file.parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w") as f:
                f.write(anonymized_text)
            successes.append([txt_file.name, output_file.name])
        except Exception as e:
            errors[str(txt_file)] = str(e)
            logger.error(f"Error processing {txt_file}: {e}")

    successes = pd.DataFrame(
        successes,
        columns=["original_file", "anonymized_file"],
    )

    if generate_html:
        # Generate the html report
        html_res = _get_all_html(txt_content, res_dict, is_full_html)
        # Store the html report in the context
        arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key)
        context.log_artifact(arti_html)
    if generate_json:
        # Generate the json report
        json_res = _get_all_rpt(res_dict, is_full_report)
        return str(output_directory), successes, errors, json_res
    return str(output_directory), successes, errors
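

# Illustrative sketch (not part of the original module): invoking the handler
# directly with a local MLRun context. The input directory, entities and score
# threshold below are hypothetical placeholders.
def _example_recognize_pii_local():
    context = mlrun.get_or_create_ctx("pii-demo")
    return recognize_pii(
        context=context,
        input_path="./data",
        html_key="highlighted",
        score_threshold=0.5,
        entities=["PERSON", "EMAIL", "PHONE"],
        model=Models.WHOLE,
    )
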
 + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - nltk + - pandas + - presidio-anonymizer + - presidio-analyzer + - torch + - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653 + - st-annotated-text + - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl + entry_points: + analyze: + name: analyze + doc: Analyze text and return the results. + parameters: + - name: self + - name: text + type: str + doc: The text for analysis. + - name: entities + type: List[str] + doc: The list of entities to recognize. + - name: nlp_artifacts + type: pa.nlp_engine.NlpArtifacts + doc: Not used by this recognizer but needed for the interface. + default: null + outputs: + - doc: The list of Presidio RecognizerResult constructed from the recognized + Flair detections. + type: List[pa.RecognizerResult] + lineno: 381 + has_varargs: false + has_kwargs: false + recognize_pii: + name: recognize_pii + doc: 'Walk through the input path, recognize PII in text and store the anonymized + text in the output path. + + Generate the html with different colors for each entity, json report of the + explanation.' + parameters: + - name: context + type: MLClientCtx + doc: The MLRun context. this is needed for log the artifacts. + - name: input_path + type: Union[str, Path] + doc: The input path of the text files needs to be analyzed. + - name: html_key + type: str + doc: The html key for the artifact. + - name: score_threshold + type: float + doc: The score threshold to mark the recognition as trusted. + - name: output_directory + type: str + doc: The output directory path to store the anonymized text. + default: null + - name: entities + type: List[str] + doc: The list of entities to recognize. + default: null + - name: entity_operator_map + type: dict + doc: The map of entity to operator (mask, redact, replace, keep, hash, and + its params) + default: null + - name: model + type: str + doc: The model to use. Can be "spacy", "flair", "pattern" or "whole". + default: null + - name: generate_json + type: bool + doc: Whether to generate the json report of the explanation. + default: true + - name: generate_html + type: bool + doc: Whether to generate the html report of the explanation. + default: true + - name: is_full_text + type: bool + doc: Whether to return the full text or only the masked text. 
+ default: true + - name: is_full_html + type: bool + doc: Whether to return the full html or just the annotated text + default: true + - name: is_full_report + type: bool + doc: Whether to return the full report or just the score and start, end index + default: true + outputs: + - doc: 'A tuple of:' + type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, + dict]] + lineno: 845 + has_varargs: false + has_kwargs: false + description: This function is used to recognize PII in a directory of text files + default_handler: recognize_pii + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/pii_recognizer/0.3.0/src/item.yaml b/functions/master/pii_recognizer/0.3.0/src/item.yaml new file mode 100644 index 00000000..41ead33b --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/item.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +categories: + - machine-learning + - data-preparation + - NLP +description: This function is used to recognize PII in a directory of text files +doc: '' +example: pii_recognizer.ipynb +generationDate: 2023-08-15:10-24 +hidden: false +icon: '' +labels: + author: pgw +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.4.0 +name: pii-recognizer +platformVersion: 3.5.3 +spec: + filename: pii_recognizer.py + handler: recognize_pii + image: mlrun/mlrun + kind: job + requirements: + - nltk + - pandas + - presidio-anonymizer + - presidio-analyzer + - torch + - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653 + - st-annotated-text + - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl +url: '' +version: 0.3.0 +test_valid: False diff --git a/functions/master/pii_recognizer/0.3.0/src/pii_recognizer.ipynb b/functions/master/pii_recognizer/0.3.0/src/pii_recognizer.ipynb new file mode 100644 index 00000000..48d1100d --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/pii_recognizer.ipynb @@ -0,0 +1,2015 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7412335f", + "metadata": {}, + "source": [ + "# PII Recognizer\n", + "\n", + "A function to detect pii data and anonymize the pii entity in the text. \n", + "\n", + "In this notebook we will go over the function's docs and outputs and see an end-to-end example of running it.\n", + "\n", + "1. [Documentation](#chapter1)\n", + "2. [Results](#chapter2)\n", + "3. [End-to-end Demo](#chapter3)" + ] + }, + { + "cell_type": "markdown", + "id": "0bb6c621", + "metadata": {}, + "source": [ + "\n", + "## 1. Documentation\n", + "\n", + "The function receive a directory path with all the text files in it. It walk through the directory, get all the text file. Then it detect the pii entity inside of the text file, apply the operator on the entity. Generate the html file with all pii entity highlighted. Generate the json report has the explaination of the process.\n" + ] + }, + { + "cell_type": "markdown", + "id": "de1a1349", + "metadata": {}, + "source": [ + "### 1.1. Parameters:\n", + "* **context**: `mlrun.MLClientCtx`\n", + " \n", + " The MLRun context\n", + " \n", + "* **input_path**: `str`\n", + " \n", + " The input directory with all the text files\n", + " \n", + "* **output_path**: `str`\n", + " \n", + " The directory that is used to store the anonymized text files. 
it is also used for mlrun to log the artifact as zip file\n", + " \n", + "* **output_suffix**: `str`\n", + " \n", + " The suffix will added to the input file. for example if the input text file is pii.txt, if output_suffix is \"anonymized\", the output file would be pii_anonymized.txt\n", + " \n", + "* **html_key**: `str`\n", + " \n", + " The artifact name of the html file \n", + " \n", + "* **entities**: `List[str]`\n", + " \n", + " The list of the entities to recognize. Please make sure the model you choose can recognize the entities. \n", + "\n", + "* **entity_operator_map**: `List[str]`\n", + " For different entity, we can apply different operator. Now supports Keep, Mask, Replace, Redact, Hash\n", + " \n", + "
\n",
+    "     entity_operator_map = {\n",
+    "        \"PERSON\": (\"keep\", {}),\n",
+    "        \"EMAIL\": (\"mask\", {\"masking_char\": \"#\", \"chars_to_mask\": 5, \"from_end\": False}),\n",
+    "        \"PHONE\": (\"hash\", {}),\n",
+    "        \"LOCATION\": (\"redact\", {}),\n",
+    "        \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n",
+    "        }\n",
+    "     
\n", + " \n", + " In this example:\n", + "\n", + " - \"PERSON\" entities are kept as they are using the \"keep\" operator. \n", + " - \"EMAIL_ADDRESS\" entities are masked with the \"#\" character, masking the first five characters. \n", + " - \"PHONE_NUMBER\" entities are replaced with their hashed value using the \"hash\" operator.\n", + " - \"LOCATION\" entities are completely removed using the \"redact\" operator.\n", + " - \"ORGANIZATION\" entities are replaced with the string \"Company XYZ\" using the \"replace\" operator.\n", + " \n", + "* **model**: `str`\n", + " \n", + " - \"whole\", \"spacy\", \"pattern\", \"flair\". The default is \"whole\".\n", + " \n", + " For each model, it can detect some entities. The \"whole\" model is combined all three models together. It can detect all the entities list below. \n", + " \n", + " \n", + " - \"spacy\" : [\"LOCATION\", \"PERSON\",\"NRP\",\"ORGANIZATION\",\"DATE_TIME\"]\n", + " \n", + " - \"pattern\": [\"CREDIT_CARD\", \"SSN\", \"PHONE\", \"EMAIL\"]\n", + " \n", + " - \"flair\": [ \"LOCATION\",\n", + " \"PERSON\",\n", + " \"NRP\",\n", + " \"GPE\",\n", + " \"ORGANIZATION\",\n", + " \"MAC_ADDRESS\",\n", + " \"US_BANK_NUMBER\",\n", + " \"IMEI\",\n", + " \"TITLE\",\n", + " \"LICENSE_PLATE\",\n", + " \"US_PASSPORT\",\n", + " \"CURRENCY\",\n", + " \"ROUTING_NUMBER\",\n", + " \"US_ITIN\",\n", + " \"US_BANK_NUMBER\",\n", + " \"US_DRIVER_LICENSE\",\n", + " \"AGE\",\n", + " \"PASSWORD\",\n", + " \"SWIFT_CODE\"\n", + " ]\n", + " \n", + "* **score_threshold**:\n", + " \n", + " Minimum confidence value, the default is 0 to align with presidio.AnalyzerEngine\n", + " \n", + "* **generate_json_rpt**:\n", + "\n", + " Whether to generate the json report of the explaination\n", + " \n", + "* **generate_html_rpt**:\n", + "\n", + " Whether to generate the html with highlighted pii entities or not\n", + " \n", + "* **is_full_text**:\n", + "\n", + " Whether to return the full text or just the sentences with pii entities.\n", + " \n", + "* **is_full_html**: `bool`\n", + " \n", + " Whether to return the full html or just the annotated html\n", + " \n", + "* **is_full_report**: `bool`\n", + " \n", + " Whether to return the full json report or just the score and start, end index\n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "83f616d2", + "metadata": {}, + "source": [ + "### 1.2. Outputs:\n", + "\n", + "There are two outputs of this function. \n", + "\n", + "* **output_path**: `str`\n", + " \n", + " The directory stored all the anonymized text files\n", + "\n", + "* **rpt_json**: `dict`\n", + "\n", + " A dict of reporting to explain how does the model detect the pii entity\n", + " \n", + "* **errors** : `dict`\n", + " A dict of errors when processing the text files if any\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "310de23a", + "metadata": {}, + "source": [ + "\n", + "## 2. Results\n", + "\n", + "The result of the function looks like the following: \n", + "\n", + "For example if the input string is \n", + "\n", + "`John Doe 's ssn is 182838483, connect john doe with john_doe@gmail.com or 6288389029, he can pay you with 41482929939393`\n", + "\n", + "The anonymized_text is \n", + "\n", + "`'s is , connect with or , he can pay you with `\n", + "\n", + "The html_str is\n", + "\n", + "

John Doe'sPERSON ssnORGANIZATION is 182838483SSN, connect me with john_doe@gmail.comPERSONjohn_doe@gmail.comEMAIL or 6288389029PHONE, he can pay you with 41482929939393CREDIT_CARD\n", + "

\n", + "\n", + "The json report that explain the output is\n", + "\n", + "```yaml\n", + "\n", + "[\n", + " {\n", + " \"entity_type\": \"PERSON\", # result of the labeling\n", + " \"start\": 0, # start positon of the entity\n", + " \"end\": 9, # end postion of the entity\n", + " \"score\": 0.99, # the confident score of the model + context_improvement\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\", # which recognizer is used to recognize this entity\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 0.99, # The original confident score from the pre-trained model\n", + " \"score\": 0.99, # the final score = original_score + score_context_improvement\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0, # The improvement from the context\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_5577088640\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " ....\n", + "]\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "ce2199fb", + "metadata": {}, + "source": [ + "\n", + "## 3. End-to-end Demo\n" + ] + }, + { + "cell_type": "markdown", + "id": "fc42debf-f363-48f9-9512-3951d352fb1d", + "metadata": {}, + "source": [ + "### 3.1. Recognition configurations \n", + " - model: which model you want to use.\n", + " - entities: What entities to recognize? \n", + " - score_threshold: From which score to mark the recogniztion as trusted?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "2a290d0f-15da-434d-b3fc-46ebb35be611", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:17:04,305 [info] Project loaded successfully: {'project_name': 'pii'}\n", + "> 2023-07-31 02:17:04,312 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}\n", + "> 2023-07-31 02:17:04,408 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}\n", + "> 2023-07-31 02:17:04,409 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '51b5ad8144004e52a1008c08850842c8', 'db': None}\n", + "2023-07-31 02:17:04,567 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin\n", + "2023-07-31 02:17:07,730 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP\n", + "Model loaded\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fd09fd6ee2844e13b5839e1fd20ef222", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00\n", + ".dictlist {\n", + " background-color: #4EC64B;\n", + " text-align: center;\n", + " margin: 4px;\n", + " border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;}\n", + ".artifact {\n", + " cursor: pointer;\n", + " background-color: #4EC64B;\n", + " text-align: left;\n", + " margin: 4px; border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;\n", + "}\n", + "div.block.hidden {\n", + " 
display: none;\n", + "}\n", + ".clickable {\n", + " cursor: pointer;\n", + "}\n", + ".ellipsis {\n", + " display: inline-block;\n", + " max-width: 60px;\n", + " white-space: nowrap;\n", + " overflow: hidden;\n", + " text-overflow: ellipsis;\n", + "}\n", + ".master-wrapper {\n", + " display: flex;\n", + " flex-flow: row nowrap;\n", + " justify-content: flex-start;\n", + " align-items: stretch;\n", + "}\n", + ".master-tbl {\n", + " flex: 3\n", + "}\n", + ".master-wrapper > div {\n", + " margin: 4px;\n", + " padding: 10px;\n", + "}\n", + "iframe.fileview {\n", + " border: 0 none;\n", + " height: 100%;\n", + " width: 100%;\n", + " white-space: pre-wrap;\n", + "}\n", + ".pane-header-title {\n", + " width: 80%;\n", + " font-weight: 500;\n", + "}\n", + ".pane-header {\n", + " line-height: 1;\n", + " background-color: #4EC64B;\n", + " padding: 3px;\n", + "}\n", + ".pane-header .close {\n", + " font-size: 20px;\n", + " font-weight: 700;\n", + " float: right;\n", + " margin-top: -5px;\n", + "}\n", + ".master-wrapper .right-pane {\n", + " border: 1px inset silver;\n", + " width: 40%;\n", + " min-height: 300px;\n", + " flex: 3\n", + " min-width: 500px;\n", + "}\n", + ".master-wrapper * {\n", + " box-sizing: border-box;\n", + "}\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
pii0Jul 31 02:17:04completedpii-recognizer-recognize-pii
v3io_user=pengw
kind=
owner=pengw
host=jupyter-pengw-5f99fb678d-mnvxl
model=whole
input_path=./data/
output_path=./data/output1/
entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
output_suffix=output
html_key=highlighted
score_threshold=0.5
highlighted
output_path
rpt_json
errors
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:17:12,403 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "artifact_path = \"./\"\n", + "proj = mlrun.get_or_create_project(\"pii\", \"./\")\n", + "fn = mlrun.code_to_function(\n", + " project=\"pii\",\n", + " name=\"pii_recognizer\",\n", + " filename=\"pii_recognizer.py\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + " handler=\"recognize_pii\",\n", + " description=\"This function is used to recognize PII in a given text\",\n", + ")\n", + "run_obj = fn.run(\n", + " artifact_path = artifact_path,\n", + " params= {\n", + " 'model': \"whole\", \n", + " 'input_path': \"./data/\",\n", + " 'output_path': \"./data/output1/\",\n", + " \"entities\": ['PERSON', \"EMAIL\", \"PHONE\", \"LOCATION\", \"ORGANIZATION\"], # the entities that needs to recognize\n", + " \"output_suffix\": \"output\",\n", + " \"html_key\": \"highlighted\",\n", + " \"score_threshold\" : 0.5, # the score threshold to mark the recognition as trusted\n", + " },\n", + " returns = [\"output_path: path\", \"rpt_json: file\", \"errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "38e1a44b-e045-4c50-a40f-fbc7e77d6c6b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c23dc77030224dfc825d7da86c6c1220", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of . Your flight tickets have been booked, and you will be departing on July 15th, 2023.\n", + "\n", + "Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.\n", + "\n", + "We look forward to creating unforgettable memories for you and your loved ones during your stay with us. 
If you have any questions or require further assistance, please don't hesitate to contact our customer support team.\n", + "\n", + " is 182838483, connect him with or , he can pay you with 9393\n" + ] + } + ], + "source": [ + "#get the mlrun context\n", + "context = mlrun.get_or_create_ctx('pii_ctx1')\n", + "import pathlib\n", + "from tqdm.auto import tqdm\n", + "for i, txt_file in enumerate(\n", + " tqdm(\n", + " list(pathlib.Path(\"./data/output1/\").glob(\"*.txt\")),\n", + " desc=\"Processing files\",\n", + " unit=\"file\",\n", + " )\n", + " ):\n", + " # Load the str from the text file\n", + " text = txt_file.read_text()\n", + " print(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "72c31b9c-47cc-4e73-8c76-041f78cfd305", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Highlighted Pii Entities

Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023.\n", + "\n", + "Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.\n", + "\n", + "We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.\n", + "

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#check the highlighted html \n", + "html_output = context.get_cached_artifact(\"highlighted\")\n", + "html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode(\"utf-8\")\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "d4c7fb04-af53-4e63-8b0a-e14e1184f973", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"data/letter.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 9,\n", + " \"end\": 17,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"LOCATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as LOC by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944219101936\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " }\n", + " ],\n", + " \"data/pii_data.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 0,\n", + " \"end\": 12,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " 
\"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 13,\n", + " \"end\": 16,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 53,\n", + " \"end\": 58,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944219101936\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 48,\n", + " \"end\": 52,\n", + " \"score\": 0.87,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 0.87,\n", + " \"score\": 0.87,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944219101936\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"EMAIL\",\n", + " \"start\": 48,\n", + " \"end\": 68,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"EMAIL\",\n", + " \"pattern\": \"\\\\S+@\\\\S+\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139944352474640\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 72,\n", + " \"end\": 82,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " 
\"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139944352476560\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 104,\n", + " \"end\": 114,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139944352476560\"\n", + " }\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "#check the json report about the explanation.\n", + "rpt_output1 = context.get_cached_artifact(\"rpt_json\")\n", + "rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode(\"utf-8\")\n", + "import json\n", + "obj = json.loads(rpt_str1)\n", + " \n", + "# Pretty Print JSON\n", + "json_formatted_str1 = json.dumps(obj, indent=4)\n", + "print(json_formatted_str1)" + ] + }, + { + "cell_type": "markdown", + "id": "1182c119", + "metadata": {}, + "source": [ + "### 3.2. Masking configurations \n", + " - entity_operator_map: it defined what to do with recognized tokens? Mask them? mask them with what? remove them? replace them?\n", + "
    \n",
    +    "     entity_operator_map = {\n",
    +    "        \"PERSON\": (\"keep\", {}),\n",
    +    "        \"EMAIL\": (\"mask\", {\"masking_char\": \"😀\", \"chars_to_mask\": 5, \"from_end\": False}),\n",
    +    "        \"PHONE\": (\"hash\", {}),\n",
    +    "        \"LOCATION\": (\"redact\", {}),\n",
    +    "        \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n",
    +    "        }\n",
    +    "     
    " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "df325ea8-4b01-4485-b835-e0196ffe83d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:20:40,550 [info] Project loaded successfully: {'project_name': 'pii'}\n", + "> 2023-07-31 02:20:40,556 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}\n", + "> 2023-07-31 02:20:40,649 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}\n", + "> 2023-07-31 02:20:40,649 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '2b43f80c7ca44b43b229760bb55f814d', 'db': None}\n", + "2023-07-31 02:20:40,812 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin\n", + "2023-07-31 02:20:44,130 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP\n", + "Model loaded\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5ad56413aad64e59b177666ca0a89a01", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00\n", + ".dictlist {\n", + " background-color: #4EC64B;\n", + " text-align: center;\n", + " margin: 4px;\n", + " border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;}\n", + ".artifact {\n", + " cursor: pointer;\n", + " background-color: #4EC64B;\n", + " text-align: left;\n", + " margin: 4px; border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;\n", + "}\n", + "div.block.hidden {\n", + " display: none;\n", + "}\n", + ".clickable {\n", + " cursor: pointer;\n", + "}\n", + ".ellipsis {\n", + " display: inline-block;\n", + " max-width: 60px;\n", + " white-space: nowrap;\n", + " overflow: hidden;\n", + " text-overflow: ellipsis;\n", + "}\n", + ".master-wrapper {\n", + " display: flex;\n", + " flex-flow: row nowrap;\n", + " justify-content: flex-start;\n", + " align-items: stretch;\n", + "}\n", + ".master-tbl {\n", + " flex: 3\n", + "}\n", + ".master-wrapper > div {\n", + " margin: 4px;\n", + " padding: 10px;\n", + "}\n", + "iframe.fileview {\n", + " border: 0 none;\n", + " height: 100%;\n", + " width: 100%;\n", + " white-space: pre-wrap;\n", + "}\n", + ".pane-header-title {\n", + " width: 80%;\n", + " font-weight: 500;\n", + "}\n", + ".pane-header {\n", + " line-height: 1;\n", + " background-color: #4EC64B;\n", + " padding: 3px;\n", + "}\n", + ".pane-header .close {\n", + " font-size: 20px;\n", + " font-weight: 700;\n", + " float: right;\n", + " margin-top: -5px;\n", + "}\n", + ".master-wrapper .right-pane {\n", + " border: 1px inset silver;\n", + " width: 40%;\n", + " min-height: 300px;\n", + " flex: 3\n", + " min-width: 500px;\n", + "}\n", + ".master-wrapper * {\n", + " box-sizing: border-box;\n", + "}\n", + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:20:40completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output2/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    highlighted
    output_path
    rpt_json
    errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:20:48,903 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "artifact_path = \"./\"\n", + "proj = mlrun.get_or_create_project(\"pii\", \"./\")\n", + "fn = mlrun.code_to_function(\n", + " project=\"pii\",\n", + " name=\"pii_recognizer\",\n", + " filename=\"pii_recognizer.py\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + " handler=\"recognize_pii\",\n", + " description=\"This function is used to recognize PII in a given text\",\n", + ")\n", + "\n", + "entity_operator_map = {\n", + " \"PERSON\": (\"keep\", {}),\n", + " \"EMAIL\": (\"mask\", {\"masking_char\": \"😀\", \"chars_to_mask\" : 100, \"from_end\": False}),\n", + " \"PHONE\": (\"hash\", {}),\n", + " \"LOCATION\": (\"redact\", {}),\n", + " \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n", + " }\n", + "run_obj = fn.run(\n", + " artifact_path = artifact_path,\n", + " params= {\n", + " 'model': \"whole\", \n", + " 'input_path': \"./data/\",\n", + " 'output_path': \"./data/output2/\",\n", + " \"entities\": ['PERSON', \"EMAIL\", \"PHONE\", \"LOCATION\", \"ORGANIZATION\"],\n", + " \"output_suffix\": \"output\",\n", + " \"html_key\": \"highlighted\",\n", + " \"score_threshold\" : 0.5,\n", + " \"entity_operator_map\": entity_operator_map,\n", + " \n", + " },\n", + " returns = [\"output_path: path\", \"rpt_json: file\", \"errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "2583e72b-8dda-4469-8e2b-f492851015af", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "552ad96fd23e497ea6e547936c7853a0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023.\n", + "\n", + "Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.\n", + "\n", + "We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.\n", + "

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#check the highlighted html \n", + "html_output = context.get_cached_artifact(\"highlighted\")\n", + "html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode(\"utf-8\")\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "3a087fb1-dde7-4ba9-9f53-a10f9099c769", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"data/letter.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 9,\n", + " \"end\": 17,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"LOCATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as LOC by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139943499301312\"\n", + " }\n", + " }\n", + " ],\n", + " \"data/pii_data.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 0,\n", + " \"end\": 12,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": 
\"CustomSpacyRecognizer_139943499301312\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 13,\n", + " \"end\": 16,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139943499301312\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 53,\n", + " \"end\": 58,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 48,\n", + " \"end\": 52,\n", + " \"score\": 0.87,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 0.87,\n", + " \"score\": 0.87,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"EMAIL\",\n", + " \"start\": 48,\n", + " \"end\": 68,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"EMAIL\",\n", + " \"pattern\": \"\\\\S+@\\\\S+\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139943864893792\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 72,\n", + " \"end\": 82,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": 
\"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139943864894128\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 104,\n", + " \"end\": 114,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139943864894128\"\n", + " }\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "#check the json report about the explanation.\n", + "rpt_output1 = context.get_cached_artifact(\"rpt_json\")\n", + "rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode(\"utf-8\")\n", + "import json\n", + "obj = json.loads(rpt_str1)\n", + " \n", + "# Pretty Print JSON\n", + "json_formatted_str1 = json.dumps(obj, indent=4)\n", + "print(json_formatted_str1)" + ] + }, + { + "cell_type": "markdown", + "id": "7c058fe3-000c-4566-a11e-80283426d945", + "metadata": {}, + "source": [ + "### 3.3 Output configurations \n", + " - is_full_text: whether produce full text or just the sentences have PII entities in it\n", + " - generate_html: whether to produce the html with highlighted pii entities\n", + " - generate_json: whether to proudce the json report with the explaination of the process\n", + " - is_full_html: whether produce full text with the pii entities highlighted or just sentences with pii entities.\n", + " - is_full_report: whether produce the json report with detailed information or just start, end index and scores." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "6a684769", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:22:57,789 [info] Project loaded successfully: {'project_name': 'pii'}\n", + "> 2023-07-31 02:22:57,799 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}\n", + "> 2023-07-31 02:22:57,891 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}\n", + "> 2023-07-31 02:22:57,892 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '3f6d701e423346b39026dc365698c15c', 'db': None}\n", + "2023-07-31 02:22:58,079 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin\n", + "2023-07-31 02:23:01,565 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP\n", + "Model loaded\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad05f59e8c604629a01f797dc84ec530", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00\n", + ".dictlist {\n", + " background-color: #4EC64B;\n", + " text-align: center;\n", + " margin: 4px;\n", + " border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;}\n", + ".artifact {\n", + " cursor: pointer;\n", + " background-color: #4EC64B;\n", + " text-align: left;\n", + " margin: 4px; border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;\n", + "}\n", + "div.block.hidden {\n", + " display: none;\n", + "}\n", + ".clickable {\n", + " cursor: pointer;\n", + "}\n", + ".ellipsis {\n", + " display: inline-block;\n", + " max-width: 60px;\n", + " white-space: nowrap;\n", + " overflow: hidden;\n", + " text-overflow: ellipsis;\n", + "}\n", + ".master-wrapper {\n", + " display: flex;\n", + " flex-flow: row nowrap;\n", + " justify-content: flex-start;\n", + " align-items: stretch;\n", + "}\n", + ".master-tbl {\n", + " flex: 3\n", + "}\n", + ".master-wrapper > div {\n", + " margin: 4px;\n", + " padding: 10px;\n", + "}\n", + "iframe.fileview {\n", + " border: 0 none;\n", + " height: 100%;\n", + " width: 100%;\n", + " white-space: pre-wrap;\n", + "}\n", + ".pane-header-title {\n", + " width: 80%;\n", + " font-weight: 500;\n", + "}\n", + ".pane-header {\n", + " line-height: 1;\n", + " background-color: #4EC64B;\n", + " padding: 3px;\n", + "}\n", + ".pane-header .close {\n", + " font-size: 20px;\n", + " font-weight: 700;\n", + " float: right;\n", + " margin-top: -5px;\n", + "}\n", + ".master-wrapper .right-pane {\n", + " border: 1px inset silver;\n", + " width: 40%;\n", + " min-height: 300px;\n", + " flex: 3\n", + " min-width: 500px;\n", + "}\n", + ".master-wrapper * {\n", + " box-sizing: border-box;\n", + "}\n", + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:22:57completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output3/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    is_full_text=False
    is_full_html=False
    is_full_report=False
    highlighted
    output_path
    rpt_json
    errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:23:06,096 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "artifact_path = \"./\"\n", + "proj = mlrun.get_or_create_project(\"pii\", \"./\")\n", + "fn = mlrun.code_to_function(\n", + " project=\"pii\",\n", + " name=\"pii_recognizer\",\n", + " filename=\"pii_recognizer.py\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + " handler=\"recognize_pii\",\n", + " description=\"This function is used to recognize PII in a given text\",\n", + ")\n", + "\n", + "entity_operator_map = {\n", + " \"PERSON\": (\"keep\", {}),\n", + " \"EMAIL\": (\"mask\", {\"masking_char\": \"😀\", \"chars_to_mask\" : 100, \"from_end\": False}),\n", + " \"PHONE\": (\"hash\", {}),\n", + " \"LOCATION\": (\"redact\", {}),\n", + " \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n", + " }\n", + "run_obj = fn.run(\n", + " artifact_path = artifact_path,\n", + " params= {\n", + " 'model': \"whole\", \n", + " 'input_path': \"./data/\",\n", + " 'output_path': \"./data/output3/\",\n", + " \"entities\": ['PERSON', \"EMAIL\", \"PHONE\", \"LOCATION\", \"ORGANIZATION\"],\n", + " \"output_suffix\": \"output\",\n", + " \"html_key\": \"highlighted\",\n", + " \"score_threshold\" : 0.5,\n", + " \"entity_operator_map\": entity_operator_map,\n", + " \"is_full_text\": False,\n", + " \"is_full_html\": False,\n", + " \"is_full_report\": False,\n", + " },\n", + " returns = [\"output_path: path\", \"rpt_json: file\", \"errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "0e10d8fa", + "metadata": {}, + "outputs": [], + "source": [ + "#get the mlrun context\n", + "context = mlrun.get_or_create_ctx('pii_ctx')\n", + "import pathlib\n", + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "fb303fef", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f478b2a3792e42beabad632b9523e169", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON

  • " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#check the highlighted html \n", + "html_output = context.get_cached_artifact(\"highlighted\")\n", + "html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode(\"utf-8\")\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "26f9e706", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"data/letter.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 9,\n", + " \"end\": 17,\n", + " \"score\": 1\n", + " },\n", + " {\n", + " \"entity_type\": \"LOCATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1.0\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1\n", + " }\n", + " ],\n", + " \"data/pii_data.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 0,\n", + " \"end\": 12,\n", + " \"score\": 1\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 13,\n", + " \"end\": 16,\n", + " \"score\": 1\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 53,\n", + " \"end\": 58,\n", + " \"score\": 1.0\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 48,\n", + " \"end\": 52,\n", + " \"score\": 0.87\n", + " },\n", + " {\n", + " \"entity_type\": \"EMAIL\",\n", + " \"start\": 48,\n", + " \"end\": 68,\n", + " \"score\": 0.5\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 72,\n", + " \"end\": 82,\n", + " \"score\": 0.5\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 104,\n", + " \"end\": 114,\n", + " \"score\": 0.5\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "#check the json report about the explanation.\n", + "rpt_output = context.get_cached_artifact(\"rpt_json\")\n", + "rpt_str = mlrun.get_dataitem(rpt_output.get_target_path()).get().decode(\"utf-8\")\n", + "import json\n", + "obj = json.loads(rpt_str)\n", + " \n", + "# Pretty Print JSON\n", + "json_formatted_str = json.dumps(obj, indent=4)\n", + "print(json_formatted_str)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pii", + "language": "python", + "name": "conda-env-.conda-pii-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/pii_recognizer/0.3.0/src/pii_recognizer.py b/functions/master/pii_recognizer/0.3.0/src/pii_recognizer.py new file mode 100644 index 00000000..0acc55dc --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/src/pii_recognizer.py @@ -0,0 +1,951 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import pathlib +import tempfile +import warnings +from typing import List, Set, Tuple, Union + +import annotated_text.util as at_util +import mlrun +import nltk +import pandas as pd +import presidio_analyzer as pa +import presidio_anonymizer as pre_anoymizer +from presidio_anonymizer.entities import OperatorConfig +from tqdm import tqdm + +try: + import flair as fl +except ModuleNotFoundError: + print("Flair is not installed") + +# There is a conflict between Rust-based tokenizers' parallel processing +# and Python's fork operations during multiprocessing. To avoid this, we need +# the following two lines + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +warnings.filterwarnings("ignore") + +logger = logging.getLogger("pii-recognizer") + + +# Add the constant classes of Models and Entities to govern the whole package +class Models: + WHOLE = "whole" + PATTERN = "pattern" + SPACY = "spacy" + FLAIR = "flair" + + +class Entities: + CREDIT_CARD = "CREDIT_CARD" + SSN = "SSN" + PHONE = "PHONE" + EMAIL = "EMAIL" + LOCATION = "LOCATION" + PERSON = "PERSON" + NRP = "NRP" + ORGANIZATION = "ORGANIZATION" + DATE_TIME = "DATE_TIME" + GPE = ("GPE",) + MAC_ADDRESS = "MAC_ADDRESS" + US_BANK_NUMBER = "US_BANK_NUMBER" + IMEI = "IMEI" + TITLE = "TITLE" + LICENSE_PLATE = "LICENSE_PLATE" + US_PASSPORT = "US_PASSPORT" + CURRENCY = "CURRENCY" + ROUTING_NUMBER = "ROUTING_NUMBER" + US_ITIN = "US_ITIN" + US_BANK_NUMBER = "US_BANK_NUMBER" + US_DRIVER_LICENSE = "US_DRIVER_LICENSE" + AGE = "AGE" + PASSWORD = "PASSWORD" + SWIFT_CODE = "SWIFT_CODE" + + +class PatternRecognizerFactory: + """ + Factory for creating pattern recognizers, it can be extended in the future to + add more regex pattern for different entities. For the pattern recognizer to work, + we need construct a list of regex patterns for each entity. + """ + + RECOGNIZABLE_ENTITIES = { + "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)], + "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)], + "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)], + "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)], + } + + # create a list of pattern recognizers + @classmethod + def _create_pattern_recognizer(cls): + """ + For each entity, create a list of patterns to recognize it + + :param cls: PatternRecognizerFactory class + + :returns: List of pattern recognizers + """ + + # Entities to recognize and their regex patterns + + return [ + pa.PatternRecognizer(supported_entity=entity, patterns=pattern) + for entity, pattern in cls.RECOGNIZABLE_ENTITIES.items() + ] + + +class CustomSpacyRecognizer(pa.LocalRecognizer): + """ + Custom Spacy Recognizer from Presidio Analyzer trained on Privy data. + The privy data is generated using this https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy + It can be used to recognize custom entities, Since we want to use Presidio's Registries to generate AnalyzerEngine, + it inherits from Presidio Analyzer's LocalRecognizer class. 
+ """ + + # Entities to recognize + + RECOGNIZABLE_ENTITIES = { + "LOCATION", + "PERSON", + "NRP", + "ORGANIZATION", + "DATE_TIME", + } + + # Default explanation for this recognizer + + _DEFAULT_EXPLANATION = ( + "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)" + ) + + # Label groups to check + + _DEFAULT_CHECK_LABEL_GROUPS = [ + ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}), + ({"PERSON"}, {"PER", "PERSON"}), + ({"NRP"}, {"NORP", "NRP"}), + ({"ORGANIZATION"}, {"ORG"}), + ({"DATE_TIME"}, {"DATE_TIME"}), + ] + + # pretrained model for this recognizer + + _DEFAULT_MODEL_LANGUAGES = { + "en": "beki/en_spacy_pii_distilbert", + } + + _DEFAULT_PRESIDIO_EQUIVALENCES = { + "PER": "PERSON", + "LOC": "LOCATION", + "ORG": "ORGANIZATION", + "NROP": "NRP", + "DATE_TIME": "DATE_TIME", + } + + def __init__( + self, + supported_language: str = "en", + supported_entities: List[str] = None, + check_label_groups: Tuple[Set, Set] = None, + context: List[str] = None, + ner_strength: float = 1, + ): + """ + Initialize Spacy Recognizer. + + :param supported_language: Language to use, default is English + :param supported_entities: Entities to use for recognition + :param check_label_groups: Label groups to check for the entities + :param context: Context to use if any + :param ner_strength: Default confidence for NER prediction + + :returns: SpacyRecognizer object + """ + + # Default confidence for NER prediction + self.ner_strength = ner_strength + + self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS + supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES + super().__init__( + supported_entities=supported_entities, + supported_language=supported_language, + ) + + # get the presidio explanation for the result + + def _build_spacy_explanation( + self, original_score: float, explanation: str + ) -> pa.AnalysisExplanation: + """ + Create explanation for why this result was detected. + + :param original_score: Score given by this recognizer + :param explanation: Explanation string + + :returns: Presidio AnalysisExplanation object + """ + explanation = pa.AnalysisExplanation( + recognizer=self.__class__.__name__, + original_score=original_score, + textual_explanation=explanation, + ) + return explanation + + # main method for the recognizer + def analyze(self, text: str, entities: List[str], nlp_artifacts=None): # noqa D102 + """ + Analyze text using Spacy. 
+ + :param text: Text to analyze + :param entities: Entities to analyze + :param nlp_artifacts: NLP artifacts to use + + :returns: List of Presidio RecognizerResult objects + """ + results = [] + if not nlp_artifacts: + logger.warning("Skipping SpaCy, nlp artifacts not provided...") + return results + + ner_entities = nlp_artifacts.entities + + # recognize the supported entities + for entity in entities: + if entity not in self.supported_entities: + continue + for ent in ner_entities: + if not self.__check_label(entity, ent.label_, self.check_label_groups): + continue + + # string of the explanation saying the entity is recognized by spacy + textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_) + explanation = self._build_spacy_explanation( + self.ner_strength, textual_explanation + ) + + # create the standard result with the entity, start, end, score, and explanation + spacy_result = pa.RecognizerResult( + entity_type=entity, + start=ent.start_char, + end=ent.end_char, + score=self.ner_strength, + analysis_explanation=explanation, + recognition_metadata={ + pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name + }, + ) + results.append(spacy_result) + + return results + + @staticmethod + def __check_label( + entity: str, label: str, check_label_groups: Tuple[Set, Set] + ) -> bool: + """ + Check if the label is in the label group. + + :param entity: Entity to check + :param label: Label to check + :param check_label_groups: Label groups to check + + :returns: True if the label is in the label group, False otherwise + """ + return any( + entity in egrp and label in lgrp for egrp, lgrp in check_label_groups + ) + + +# Class to use Flair with Presidio as an external recognizer. +class FlairRecognizer(pa.EntityRecognizer): + """ + Wrapper for a flair model, if needed to be used within Presidio Analyzer. + This is to make sure the recognizer can be registered with Presidio registry. 
+ """ + + RECOGNIZABLE_ENTITIES = { + "LOCATION", + "PERSON", + "NRP", + "GPE", + "ORGANIZATION", + "MAC_ADDRESS", + "US_BANK_NUMBER", + "IMEI", + "TITLE", + "LICENSE_PLATE", + "US_PASSPORT", + "CURRENCY", + "ROUTING_NUMBER", + "US_ITIN", + "US_BANK_NUMBER", + "US_DRIVER_LICENSE", + "AGE", + "PASSWORD", + "SWIFT_CODE", + } + + # This is used to construct the explanation for the result + + _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition" + + _DEFAULT_CHECK_LABEL_GROUPS = [ + ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}), + ({"PERSON"}, {"PER", "PERSON"}), + ({"NRP"}, {"NORP", "NRP"}), + ({"GPE"}, {"GPE"}), + ({"ORGANIZATION"}, {"ORG"}), + ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}), + ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}), + ({"IMEI"}, {"IMEI"}), + ({"TITLE"}, {"TITLE"}), + ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}), + ({"US_PASSPORT"}, {"US_PASSPORT"}), + ({"CURRENCY"}, {"CURRENCY"}), + ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}), + ({"AGE"}, {"AGE"}), + ({"CURRENCY"}, {"CURRENCY"}), + ({"SWIFT_CODE"}, {"SWIFT_CODE"}), + ({"US_ITIN"}, {"US_ITIN"}), + ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}), + ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}), + ] + + _DEFAULT_MODEL_LANGUAGES = { + "en": "beki/flair-pii-distilbert", + } + + _DEFAULT_PRESIDIO_EQUIVALENCES = { + "PER": "PERSON", + "LOC": "LOCATION", + "ORG": "ORGANIZATION", + "NROP": "NRP", + "URL": "URL", + "US_ITIN": "US_ITIN", + "US_PASSPORT": "US_PASSPORT", + "IBAN_CODE": "IBAN_CODE", + "IP_ADDRESS": "IP_ADDRESS", + "EMAIL_ADDRESS": "EMAIL", + "US_DRIVER_LICENSE": "US_DRIVER_LICENSE", + "US_BANK_NUMBER": "US_BANK_NUMBER", + } + + def __init__( + self, + supported_language: str = "en", + supported_entities: List[str] = None, + check_label_groups: Tuple[Set, Set] = None, + ): + """ + Initialize the FlairRecognizer. + + :param supported_language: Language to use + :param supported_entities: Entities to use + :param check_label_groups: Label groups to check + + :returns: FlairRecognizer object + + """ + self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS + + supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES + self.model = fl.models.SequenceTagger.load( + self._DEFAULT_MODEL_LANGUAGES.get(supported_language) + ) + + super().__init__( + supported_entities=supported_entities, + supported_language=supported_language, + name="Flair Analytics", + ) + + # main method for the recognizer + def analyze( + self, + text: str, + entities: List[str], + nlp_artifacts: pa.nlp_engine.NlpArtifacts = None, + ) -> List[pa.RecognizerResult]: + """ + Analyze text and return the results. + + :param text: The text for analysis. + :param entities: The list of entities to recognize. + :param nlp_artifacts: Not used by this recognizer but needed for the interface. + + :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections. + """ + + results = [] + + sentences = fl.data.Sentence(text) + self.model.predict(sentences) + + # If there are no specific list of entities, we will look for all of it. + if not entities: + entities = self.supported_entities + + # Go over the entities and check if they are in the supported entities list. + for entity in entities: + if entity not in self.supported_entities: + continue + + # Go over the sentences and check if the entity is in the sentence. 
+ for ent in sentences.get_spans("ner"): + if not self.__check_label( + entity, ent.labels[0].value, self.check_label_groups + ): + continue + + # If the entity is in the sentence, we will add it to the results. + textual_explanation = self._DEFAULT_EXPLANATION.format( + ent.labels[0].value + ) + + # Build the explanation for the result + explanation = self._build_flair_explanation( + round(ent.score, 2), textual_explanation + ) + + flair_result = self._convert_to_recognizer_result(ent, explanation) + + results.append(flair_result) + + return results + + def _convert_to_recognizer_result( + self, entity: fl.data.Span, explanation: str + ) -> pa.RecognizerResult: + """ + Convert Flair result to Presidio RecognizerResult. + + :param entity: Flair entity of Span + :param explanation: Presidio AnalysisExplanation + + :returns: Presidio RecognizerResult + """ + + # Convert the entity type to Presidio entity type + entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag) + + # Convert the score to Presidio score + flair_score = round(entity.score, 2) + + # Create the Presidio RecognizerResult from the Flair entity + flair_results = pa.RecognizerResult( + entity_type=entity_type, + start=entity.start_position, + end=entity.end_position, + score=flair_score, + analysis_explanation=explanation, + ) + + return flair_results + + def _build_flair_explanation( + self, original_score: float, explanation: str + ) -> pa.AnalysisExplanation: + """ + Create explanation for why this result was detected. + + :param original_score: Score given by this recognizer + :param explanation: Explanation string + + :returns: Presidio AnalysisExplanation + """ + + # Create the Presidio AnalysisExplanation for the result + explanation = pa.AnalysisExplanation( + recognizer=self.__class__.__name__, + original_score=original_score, + textual_explanation=explanation, + ) + return explanation + + # sanity check of the entity and label before recognition + @staticmethod + def __check_label( + entity: str, label: str, check_label_groups: Tuple[Set, Set] + ) -> bool: + return any( + entity in egrp and label in lgrp for egrp, lgrp in check_label_groups + ) + + +# get the analyzer engine based on the model +def _get_analyzer_engine( + model: str = None, entities: List[str] = None +) -> pa.AnalyzerEngine: + """ + Return pa.AnalyzerEngine. + + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param entities: The list of entities to use. 
+
+    :returns: pa.AnalyzerEngine
+    """
+    # recognizer registry that can store multiple recognizers
+    registry = pa.RecognizerRegistry()
+    if model == Models.SPACY:
+        # custom spacy recognizer
+        spacy_recognizer = CustomSpacyRecognizer()
+        # add the custom built spacy recognizer
+        registry.add_recognizer(spacy_recognizer)
+    elif model == Models.FLAIR:
+        # pre-trained flair recognizer
+        flair_recognizer = FlairRecognizer()
+        # add the custom built flair recognizer
+        registry.add_recognizer(flair_recognizer)
+    elif model == Models.PATTERN:
+        # add the pattern recognizers
+        pattern_recognizer_factory = PatternRecognizerFactory()
+        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
+            registry.add_recognizer(recognizer)
+    elif model == Models.WHOLE:
+        spacy_recognizer = CustomSpacyRecognizer()
+        flair_recognizer = FlairRecognizer()
+        registry.add_recognizer(spacy_recognizer)
+        registry.add_recognizer(flair_recognizer)
+        # add the pattern recognizers
+        pattern_recognizer_factory = PatternRecognizerFactory()
+        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
+            registry.add_recognizer(recognizer)
+    elif not model and entities:
+        if set(entities) & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES:
+            spacy_recognizer = CustomSpacyRecognizer()
+            registry.add_recognizer(spacy_recognizer)
+        if set(entities) & FlairRecognizer.RECOGNIZABLE_ENTITIES:
+            flair_recognizer = FlairRecognizer()
+            registry.add_recognizer(flair_recognizer)
+        # add the pattern recognizers
+        if set(entities) & (set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())):
+            pattern_recognizer_factory = PatternRecognizerFactory()
+            for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
+                registry.add_recognizer(recognizer)
+    else:
+        raise ValueError(
+            "`model` and `entities` cannot both be None; pass a model name or a list of entities"
+        )
+    analyzer = pa.AnalyzerEngine(
+        registry=registry,
+        supported_languages=["en"],
+    )
+
+    supported_entities = analyzer.get_supported_entities()
+
+    if entities and not all(item in supported_entities for item in entities):
+        not_supported_entities = [
+            item for item in entities if item not in supported_entities
+        ]
+        raise ValueError(
+            f"The current model {model} doesn't support the following entities: {not_supported_entities}. "
+            f"Supported entities are: {supported_entities}"
+        )
+    return analyzer
+
+
+def _get_anonymizer_engine() -> pre_anoymizer.AnonymizerEngine:
+    """
+    Return AnonymizerEngine.
+
+    :returns: The AnonymizerEngine.
+    """
+    return pre_anoymizer.AnonymizerEngine()
+
+
+def _anonymize(
+    text: str,
+    analyze_results: List[pa.RecognizerResult],
+    entity_operator_map: dict = None,
+    is_full_text: bool = True,
+) -> str:
+    """
+    Anonymize the identified input using the Presidio Anonymizer.
+
+    :param text: The text for analysis.
+    :param analyze_results: The list of Presidio RecognizerResult constructed from analysis.
+    :param entity_operator_map: A dictionary that maps each entity to an operator name and operator params.
+    :param is_full_text: Whether the text is full text or not.
+
+    :returns: The anonymized text.
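+
+    Illustrative example of an entity_operator_map (the keys and parameter values below
+    are examples only, not defaults of this function):
+
+        entity_operator_map = {
+            "PERSON": ("replace", {"new_value": "<PERSON>"}),
+            "EMAIL": ("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": False}),
+            "PHONE": ("hash", {}),
+        }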
+ """ + if not text: + return "" + + anonymizer_engine = _get_anonymizer_engine() + if not entity_operator_map: + operators = None + else: + # Create OperatorConfig based on the entity_operator_map + operators = { + entity: OperatorConfig(operator_name, operator_params) + for entity, (operator_name, operator_params) in entity_operator_map.items() + } + + if is_full_text: + # Anonymize the entire text + return anonymizer_engine.anonymize( + text=text, analyzer_results=analyze_results, operators=operators + ).text + # Tokenize the text to sentences + sentences = nltk.sent_tokenize(text) + anonymized_sentences = [] + current_idx = 0 + + # Find the sentence that has pii entity + for sentence in sentences: + start_idx = current_idx + end_idx = start_idx + len(sentence) + + # Get the entities that are in the sentence, update hte start_idx and end_idx + sentence_results = [ + pa.RecognizerResult( + result.entity_type, + start=result.start - start_idx, + end=result.end - start_idx, + score=result.score, + ) + for result in analyze_results + if result.start >= start_idx and result.end <= end_idx + ] + + # If PII is detected + if sentence_results: + anonymized_sentence = anonymizer_engine.anonymize( + text=sentence, analyzer_results=sentence_results, operators=operators + ).text + anonymized_sentences.append(anonymized_sentence) + + current_idx = end_idx + + return " ".join(anonymized_sentences) + + +def _get_tokens( + text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True +) -> List[str]: + """ + Get the full tokens or only contains the entities that can form a sentence. + + :param text: The text for analysis. + :param analyze_results: The list of Presidio RecognizerResult constructed from + :param is_full: Whether return full tokens or just the tokens that only contains the entities that can form a sentence. + + :returns: The tokens. + """ + + tokens = [] + # sort by start index + results = sorted(analyze_results, key=lambda x: x.start) + for i, res in enumerate(results): + if i == 0: + tokens.append(text[: res.start]) + + # append entity text and entity type + tokens.append((text[res.start : res.end], res.entity_type)) + + # if another entity coming i.e. we're not at the last results element, + # add text up to next entity + if i != len(results) - 1: + tokens.append(text[res.end : results[i + 1].start]) + # if no more entities coming, add all remaining text + else: + tokens.append(text[res.end :]) + + # get the tokens that only contains the entities that can form a sentence + part_annontated_tokens = [] + if not is_full: + last_end_sentence = 0 + for i, token in enumerate(tokens): + if any(item in token for item in [".", "!", "?"]) and any( + type(item) is tuple for item in tokens[last_end_sentence:i] + ): + part_annontated_tokens.append(tokens[last_end_sentence:i]) + last_end_sentence = i + return part_annontated_tokens + return tokens + + +def _annotate( + text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True +) -> List[str]: + """ + Annotate identified input using Presidio Anonymizer. + + :param text: The text for analysis. + :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The list of tokens with the identified entities. 
+ + """ + return _get_tokens(text, st_analyze_results, is_full_html) + + +def _process( + text: str, + model: pa.AnalyzerEngine, + score_threshold: float, + entities: List[str] = None, + entities_operator_map: dict = None, + is_full_text: bool = True, +) -> Tuple[str, list]: + """ + Process the text of str using the model. + + :param text: Text to process + :param model: Model to use for processing + :param entities: Entities to recognize + :param entities_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params. + :param score_threshold: The score threshold to use for recognition + :param is_full_text: Whether to return the full text or just the annotated text + + :returns: A tuple of: + + * the anonymized text + * the list of Presidio RecognizerResult constructed from analysis + """ + + # get the analyzer engine + analyzer = model + + # analyze the text that can be used for anonymization + results = analyzer.analyze( + text=text, + language="en", + entities=entities, + score_threshold=score_threshold, + return_decision_process=True, + ) + + # anonymize the text, replace the pii entities with the labels + anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text) + + return anonymized_text, results + + +def _get_single_html( + text: str, results: List[pa.RecognizerResult], is_full_html: bool = True +): + """ + Generate the html for a single txt file. + + :param text: The text for analysis. + :param results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The html string for a single txt file. + """ + # convert the results to tokens to generate the html + tokens = _annotate(text, results, is_full_html) + html = at_util.get_annotated_html(*tokens) + + # avoid the error during rendering of the \n in the html + backslash_char = "\\" + + html_str = f"

    {html.replace('{backslash_char}n', '<br>')}
    " + + return html_str + + +def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True): + """ + Generate the json for a single txt file. + + :param results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full json or not. + + :returns: The json string for a single txt file. + """ + # generate the stats report if needed + if not is_full_report: + stats = [] + # add the simplify stats logic here + for item in results: + item.analysis_explanation = None + stats.append(item) + else: + stats = results + + return stats + + +def _get_all_html( + txt_content: dict, + res_dict: dict, + is_full_html: bool = True, +): + """ + Generate the html for all txt files. + + :param txt_content: The dictionary of txt file name and content. + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The html string for all txt files. + + """ + # These are placeholder for the html string + html_index = "Highlighted Pii Entities

    Highlighted Pii Entities

      " + html_content = "" + for txt_file, results in res_dict.items(): + txt = txt_content[txt_file] + html_index += f"
    • {txt_file}
    • " + html_content += f"
    • {txt_file}

      {_get_single_html(txt, results, is_full_html)}

    • " + html_index += "
    " + html_res = f"{html_index}{html_content}" + + return html_res + + +def _get_all_rpt(res_dict: dict, is_full_report: bool = True): + """ + Generate the stats report for all txt files. + + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full report or not. + + :returns: The stats report for all txt files. + """ + # These are placeholder for the json report + stats_dict = {} + for txt_file, results in res_dict.items(): + new_stats = [] + for item in _get_single_json(results, is_full_report): + if is_full_report: + item.analysis_explanation = item.analysis_explanation.to_dict() + new_stats.append(item.to_dict()) + else: + tmp_dict = item.to_dict() + tmp_dict.pop("analysis_explanation") + tmp_dict.pop("recognition_metadata") + new_stats.append(tmp_dict) + stats_dict[txt_file] = new_stats + return stats_dict + + +def recognize_pii( + context: mlrun.MLClientCtx, + input_path: Union[str, pathlib.Path], + html_key: str, + score_threshold: float, + output_directory: str = None, + entities: List[ + str + ] = None, # List of entities to recognize, default is recognizing all + entity_operator_map: dict = None, + model: str = None, + generate_json: bool = True, + generate_html: bool = True, + is_full_text: bool = True, + is_full_html: bool = True, + is_full_report: bool = True, +) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]: + """ + Walk through the input path, recognize PII in text and store the anonymized text in the output path. + Generate the html with different colors for each entity, json report of the explanation. + + :param context: The MLRun context. this is needed for log the artifacts. + :param input_path: The input path of the text files needs to be analyzed. + :param html_key: The html key for the artifact. + :param score_threshold: The score threshold to mark the recognition as trusted. + :param output_directory: The output directory path to store the anonymized text. + :param entities: The list of entities to recognize. + :param entity_operator_map: The map of entity to operator (mask, redact, replace, keep, hash, and its params) + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param generate_json: Whether to generate the json report of the explanation. + :param generate_html: Whether to generate the html report of the explanation. + :param is_full_text: Whether to return the full text or only the masked text. 
+    :param is_full_html: Whether to return the full html or just the annotated text
+    :param is_full_report: Whether to return the full report or just the score and start, end index
+
+    :returns: A tuple of:
+
+              * Path to the output directory
+              * A dataframe of the original and anonymized file names
+              * A dictionary of the files that failed to process, with their errors
+              * The json report of the explanation (if generate_json is True)
+
+    """
+
+    # Set output directory
+    if output_directory is None:
+        output_directory = tempfile.mkdtemp()
+
+    # Create the output directory:
+    output_directory = pathlib.Path(output_directory)
+    if not output_directory.exists():
+        output_directory.mkdir(parents=True, exist_ok=True)
+
+    txt_files_directory = pathlib.Path(input_path)
+    successes = []
+    errors = {}
+
+    res_dict = {}
+    txt_content = {}
+    # Load the model:
+    analyzer = _get_analyzer_engine(model, entities)
+    logger.info("Model loaded")
+    # Go over the text files in the input path, analyze and anonymize them:
+    for txt_file in tqdm(
+        list(txt_files_directory.glob("*.txt")),
+        desc="Processing files",
+        unit="file",
+    ):
+        try:
+            # Load the str from the text file
+            text = txt_file.read_text()
+            txt_content[str(txt_file)] = text
+            # Process the text to recognize the pii entities in it
+            anonymized_text, results = _process(
+                text=text,
+                model=analyzer,
+                entities=entities,
+                entities_operator_map=entity_operator_map,
+                score_threshold=score_threshold,
+                is_full_text=is_full_text,
+            )
+            res_dict[str(txt_file)] = results
+            # Store the anonymized text in the output path
+            output_file = output_directory / f"{txt_file.stem}.txt"
+            output_file.parent.mkdir(parents=True, exist_ok=True)
+            with open(output_file, "w") as f:
+                f.write(anonymized_text)
+            successes.append([txt_file.name, output_file.name])
+        except Exception as e:
+            errors[str(txt_file)] = str(e)
+            logger.error(f"Error processing {txt_file}: {e}")
+
+    successes = pd.DataFrame(
+        successes,
+        columns=["original_file", "anonymized_file"],
+    )
+
+    if generate_html:
+        # Generate the html report
+        html_res = _get_all_html(txt_content, res_dict, is_full_html)
+        # Store the html report in the context
+        arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key)
+        context.log_artifact(arti_html)
+    if generate_json:
+        # Generate the json report
+        json_res = _get_all_rpt(res_dict, is_full_report)
+        return str(output_directory), successes, errors, json_res
+    return str(output_directory), successes, errors
diff --git a/functions/master/pii_recognizer/0.3.0/src/requirements.txt b/functions/master/pii_recognizer/0.3.0/src/requirements.txt
new file mode 100644
index 00000000..467565d4
--- /dev/null
+++ b/functions/master/pii_recognizer/0.3.0/src/requirements.txt
@@ -0,0 +1,12 @@
+faker
+nltk
+pandas
+streamlit
+presidio-anonymizer
+presidio-analyzer
+torch
+st-annotated-text
+streamlit
+git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653#egg=flair
+st-annotated-text
+https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
diff --git a/functions/master/pii_recognizer/0.3.0/src/test_pii_recognizer.py b/functions/master/pii_recognizer/0.3.0/src/test_pii_recognizer.py
new file mode 100644
index 00000000..81a16611
--- /dev/null
+++ b/functions/master/pii_recognizer/0.3.0/src/test_pii_recognizer.py
@@ -0,0 +1,251 @@
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pytest +import random +from faker import Faker +import mlrun +from pii_recognizer import ( + _process, + _get_analyzer_engine, + _anonymize, + _annotate, + recognize_pii_parallel, +) + + +def generate_routing_number(): + prefix = random.randint(0, 99) + identifier = random.randint(0, 9999999) + identifier_str = str(identifier).zfill(7) + weighted_sum = ( + 3 * (int(str(prefix).zfill(2)[0])) + + 7 * (int(str(prefix).zfill(2)[1])) + + 1 * (int(identifier_str[0])) + + 3 * (int(identifier_str[1])) + + 7 * (int(identifier_str[2])) + + 1 * (int(identifier_str[3])) + + 3 * (int(identifier_str[4])) + + 7 * (int(identifier_str[5])) + + 1 * (int(identifier_str[6])) + ) + check_digit = (10 - (weighted_sum % 10)) % 10 + + routing_number = f"{prefix:02d}{identifier_str}{check_digit}" + + return routing_number + + +def generate_us_itin(): + area_number = random.randint(900, 999) + group_number = random.randint(70, 99) + serial_number = random.randint(0, 9999) + + formatted_itin = f"{area_number:03d}-{group_number:02d}-{serial_number:04d}" + return formatted_itin + + +@pytest.fixture(scope="function") +def fake_data(request): + params = request.param if hasattr(request, "param") else {} + fake = Faker("en_US") + data = { + "name": fake.name(), + "email": fake.email(), + "address": fake.address(), + "phone": fake.phone_number(), + "ssn": fake.ssn(), + "credit_card": fake.credit_card_number(), + "organization": fake.company(), + "location": fake.street_address(), + "date_time": fake.date(), + "mac_address": fake.mac_address(), + "us_bank_number": fake.bban(), + "imei": "".join(str(fake.random_int(0, 9)) for _ in range(14)), + "title": fake.job(), + "license_plate": fake.license_plate(), + "us_passport": fake.passport_number(), + "currency": fake.currency_code(), + "routing_number": generate_routing_number(), + "us_itin": generate_us_itin(), + "age": fake.random_int(1, 100), + "password": fake.password(), + "swift_code": fake.swift(), + } + + data.update(params) + + yield data + + +@pytest.mark.skip() +def test_pattern_process(fake_data): + ENTITIES = { + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + } + + analyzer = _get_analyzer_engine(model="pattern") + text = f"He can be reached at {fake_data['email']} or {fake_data['phone']}. His credit card number is {fake_data['credit_card']} and his SSN is {fake_data['ssn']}." + res, results = _process(text, analyzer, score_threshold=0.5) + + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_spacy_process(fake_data): + ENTITIES = { + "PERSON": "name", + "ORGANIZATION": "organization", + } + + analyzer = _get_analyzer_engine(model="spacy") + text = f"{fake_data['name']}'s employer is {fake_data['organization']}." 
+ res, results = _process(text, analyzer, score_threshold=0.5) + + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_flair_process(fake_data): + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + + analyzer = _get_analyzer_engine(model="flair") + text = " ".join( + [item + " is " + str(fake_data[item]) for item in ENTITIES.values()] + ) + res, results = _process(text, analyzer, score_threshold=0.5) + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_whole_process(fake_data): + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + text = " ".join( + [item + " is " + str(fake_data[item]) for item in ENTITIES.values()] + ) + analyzer = _get_analyzer_engine(model="whole") + res, results = _process(text, analyzer, score_threshold=0.5) + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_only_entities(fake_data): + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + + text = " ".join( + [item + " is " + str(fake_data[item]) for item in ENTITIES.values()] + ) + analyzer = _get_analyzer_engine(entities=list(ENTITIES.keys())[:5]) + res, results = _process(text, analyzer, score_threshold=0.5) + assert any(entity in res for entity in ENTITIES.keys()) + + +def test_parallel(): + context = mlrun.get_or_create_ctx("test_parallel") + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + json_res, erros = recognize_pii_parallel( + context=context, + config_input_output="data/config.csv", + score_threshold=0.5, + html_key="test_parallel", + entities=list(ENTITIES.keys()), + 
model="whole", + ) + + assert len(json_res) == 2 diff --git a/functions/master/pii_recognizer/0.3.0/static/documentation.html b/functions/master/pii_recognizer/0.3.0/static/documentation.html new file mode 100644 index 00000000..68ab9f62 --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/static/documentation.html @@ -0,0 +1,444 @@ + + + + + + + +pii_recognizer package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +
    +
    + + + +
    + +
    +
    +
    +
    +
    +
    + + + + +
    +
    + + +
    +
    +
    + +
    +

    pii_recognizer package

    + +
    + +
    +
    +
    +
    +
    +

    pii_recognizer package#

    +
    +

    Submodules#

    +
    +
    +

    pii_recognizer.pii_recognizer module#

    +
    +
    +class pii_recognizer.pii_recognizer.CustomSpacyRecognizer(*args: Any, **kwargs: Any)[source]#
    +

    Bases: presidio_analyzer.

    +

    Custom Spacy Recognizer from Presidio Analyzer trained on Privy data. +The Privy data is generated using https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy +It can be used to recognize custom entities. Since we want to use Presidio’s registries to generate the AnalyzerEngine, +it inherits from Presidio Analyzer’s LocalRecognizer class.

    +
    +
    +RECOGNIZABLE_ENTITIES = {'DATE_TIME', 'LOCATION', 'NRP', 'ORGANIZATION', 'PERSON'}#
    +
    +
    +
    +analyze(text: str, entities: List[str], nlp_artifacts=None)[source]#
    +

    Analyze text using Spacy.

    +
    +
    Parameters
    +
      +
    • text – Text to analyze

    • +
    • entities – Entities to analyze

    • +
    • nlp_artifacts – NLP artifacts to use

    • +
    +
    +
    Returns
    +

    List of Presidio RecognizerResult objects

    +
    +
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.Entities[source]#
    +

    Bases: object

    +
    +
    +AGE = 'AGE'#
    +
    +
    +
    +CREDIT_CARD = 'CREDIT_CARD'#
    +
    +
    +
    +CURRENCY = 'CURRENCY'#
    +
    +
    +
    +DATE_TIME = 'DATE_TIME'#
    +
    +
    +
    +EMAIL = 'EMAIL'#
    +
    +
    +
    +GPE = ('GPE',)#
    +
    +
    +
    +IMEI = 'IMEI'#
    +
    +
    +
    +LICENSE_PLATE = 'LICENSE_PLATE'#
    +
    +
    +
    +LOCATION = 'LOCATION'#
    +
    +
    +
    +MAC_ADDRESS = 'MAC_ADDRESS'#
    +
    +
    +
    +NRP = 'NRP'#
    +
    +
    +
    +ORGANIZATION = 'ORGANIZATION'#
    +
    +
    +
    +PASSWORD = 'PASSWORD'#
    +
    +
    +
    +PERSON = 'PERSON'#
    +
    +
    +
    +PHONE = 'PHONE'#
    +
    +
    +
    +ROUTING_NUMBER = 'ROUTING_NUMBER'#
    +
    +
    +
    +SSN = 'SSN'#
    +
    +
    +
    +SWIFT_CODE = 'SWIFT_CODE'#
    +
    +
    +
    +TITLE = 'TITLE'#
    +
    +
    +
    +US_BANK_NUMBER = 'US_BANK_NUMBER'#
    +
    +
    +
    +US_DRIVER_LICENSE = 'US_DRIVER_LICENSE'#
    +
    +
    +
    +US_ITIN = 'US_ITIN'#
    +
    +
    +
    +US_PASSPORT = 'US_PASSPORT'#
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.FlairRecognizer(*args: Any, **kwargs: Any)[source]#
    +

    Bases: presidio_analyzer.

    +

    Wrapper for a flair model, if needed to be used within Presidio Analyzer. +This is to make sure the recognizer can be registered with Presidio registry.

    +
    +
    +RECOGNIZABLE_ENTITIES = {'AGE', 'CURRENCY', 'GPE', 'IMEI', 'LICENSE_PLATE', 'LOCATION', 'MAC_ADDRESS', 'NRP', 'ORGANIZATION', 'PASSWORD', 'PERSON', 'ROUTING_NUMBER', 'SWIFT_CODE', 'TITLE', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT'}#
    +
    +
    +
    +analyze(text: str, entities: List[str], nlp_artifacts: Optional[presidio_analyzer.nlp_engine.NlpArtifacts] = None)List[presidio_analyzer.RecognizerResult][source]#
    +

    Analyze text and return the results.

    +
    +
    Parameters
    +
      +
    • text – The text for analysis.

    • +
    • entities – The list of entities to recognize.

    • +
    • nlp_artifacts – Not used by this recognizer but needed for the interface.

    • +
    +
    +
    Returns
    +

    The list of Presidio RecognizerResult constructed from the recognized Flair detections.

    +
    +
    +
    +
    +
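    A minimal usage sketch (illustrative only; it mirrors how _get_analyzer_engine in this module wires the recognizer into Presidio, and assumes presidio_analyzer and flair are installed):
    +import presidio_analyzer as pa
    +from pii_recognizer.pii_recognizer import FlairRecognizer
    +
    +registry = pa.RecognizerRegistry()
    +registry.add_recognizer(FlairRecognizer())
    +analyzer = pa.AnalyzerEngine(registry=registry, supported_languages=["en"])
    +results = analyzer.analyze(
    +    text="John Doe lives in Boston", language="en", entities=["PERSON", "LOCATION"]
    +)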
    +
    +class pii_recognizer.pii_recognizer.Models[source]#
    +

    Bases: object

    +
    +
    +FLAIR = 'flair'#
    +
    +
    +
    +PATTERN = 'pattern'#
    +
    +
    +
    +SPACY = 'spacy'#
    +
    +
    +
    +WHOLE = 'whole'#
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.PatternRecognizerFactory[source]#
    +

    Bases: object

    +

    Factory for creating pattern recognizers; it can be extended in the future to +add more regex patterns for different entities. For a pattern recognizer to work, +a list of regex patterns must be constructed for each entity.

    +
    +
    +RECOGNIZABLE_ENTITIES = {'CREDIT_CARD': [presidio_analyzer.Pattern], 'EMAIL': [presidio_analyzer.Pattern], 'PHONE': [presidio_analyzer.Pattern], 'SSN': [presidio_analyzer.Pattern]}#
    +
    +
    +
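    A rough sketch of adding another regex-based entity with the public presidio_analyzer API (the entity name and pattern below are illustrative and not part of the shipped factory):
    +import presidio_analyzer as pa
    +
    +ip_pattern = pa.Pattern(name="IPV4", regex=r"\b(?:\d{1,3}\.){3}\d{1,3}\b", score=0.5)
    +ip_recognizer = pa.PatternRecognizer(supported_entity="IP_ADDRESS", patterns=[ip_pattern])
    +registry = pa.RecognizerRegistry()
    +registry.add_recognizer(ip_recognizer)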
    +
    +pii_recognizer.pii_recognizer.recognize_pii(context: mlrun.execution.MLClientCtx, input_path: Union[str, pathlib.Path], html_key: str, score_threshold: float, output_directory: Optional[str] = None, entities: Optional[List[str]] = None, entity_operator_map: Optional[dict] = None, model: Optional[str] = None, generate_json: bool = True, generate_html: bool = True, is_full_text: bool = True, is_full_html: bool = True, is_full_report: bool = True)Union[Tuple[str, pandas.core.frame.DataFrame, dict, dict], Tuple[str, pandas.core.frame.DataFrame, dict]][source]#
    +

    Walk through the input path, recognize PII in text and store the anonymized text in the output path. +Generate the html with different colors for each entity, json report of the explanation.

    +
    +
    Parameters
    +
      +
    • context – The MLRun context. this is needed for log the artifacts.

    • +
    • input_path – The input path of the text files needs to be analyzed.

    • +
    • html_key – The html key for the artifact.

    • +
    • score_threshold – The score threshold to mark the recognition as trusted.

    • +
    • output_directory – The output directory path to store the anonymized text.

    • +
    • entities – The list of entities to recognize.

    • +
    • entity_operator_map – The map of entity to operator (mask, redact, replace, keep, hash, and its params)

    • +
    • model – The model to use. Can be “spacy”, “flair”, “pattern” or “whole”.

    • +
    • generate_json – Whether to generate the json report of the explanation.

    • +
    • generate_html – Whether to generate the html report of the explanation.

    • +
    • is_full_text – Whether to return the full text or only the masked text.

    • +
    • is_full_html – Whether to return the full html or just the annotated text

    • +
    • is_full_report – Whether to return the full report or just the score and start, end index

    • +
    +
    +
    Returns
    +

    A tuple of:

    +
      +
    • Path to the output directory

    • +
    • A dataframe of the original and anonymized file names

    • +
    • A dictionary of the files that failed to process, with their errors

    • +
    • The json report of the explanation (if generate_json is True)

    • +
    +

    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.3.0/static/example.html b/functions/master/pii_recognizer/0.3.0/static/example.html new file mode 100644 index 00000000..2ae8f4ae --- /dev/null +++ b/functions/master/pii_recognizer/0.3.0/static/example.html @@ -0,0 +1,1922 @@ + + + + + + + +PII Recognizer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +
    +
    + + + +
    + +
    +
    +
    +
    +
    +
    + + + + +
    +
    + + +
    +
    +
    + + +
    +
    +
    +

    PII Recognizer#

    +

    A function to detect PII data and anonymize the PII entities in the text.

    +

    In this notebook we will go over the function’s docs and outputs and see an end-to-end example of running it.

    +
      +
    1. Documentation

    2. +
    3. Results

    4. +
    5. End-to-end Demo

    6. +
    +

    +
    +

    1. Documentation#

    +

    The function receives a directory path containing text files. It walks through the directory, collects all the text files, detects the PII entities inside each file, and applies the configured operator to each entity. It then generates an HTML file with all PII entities highlighted, and a JSON report that explains the process.

    +
    +

    1.1. Parameters:#

    +
      +
    • context: mlrun.MLClientCtx

      +

      The MLRun context

      +
    • +
    • input_path: str

      +

      The input directory with all the text files

      +
    • +
    • output_path: str

      +

      The directory that is used to store the anonymized text files. It is also used by mlrun to log the artifact as a zip file

      +
    • +
    • output_suffix: str

      +

      The suffix that will be added to the output file name. For example, if the input text file is pii.txt and output_suffix is “anonymized”, the output file will be pii_anonymized.txt

      +
    • +
    • html_key: str

      +

      The artifact name of the html file

      +
    • +
    • entities: List[str]

      +

      The list of the entities to recognize. Please make sure the model you choose can recognize the entities.

      +
    • +
    • entity_operator_map: dict +For each entity, a different operator can be applied. The supported operators are keep, mask, replace, redact, and hash

      +
      +   entity_operator_map = {
      +      "PERSON": ("keep", {}),
      +      "EMAIL": ("mask", {"masking_char": "#", "chars_to_mask": 5, "from_end": False}),
      +      "PHONE": ("hash", {}),
      +      "LOCATION": ("redact", {}),
      +      "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
      +      }
      +   
      +

      In this example:

      +
        +
      • “PERSON” entities are kept as they are using the “keep” operator.

      • +
      • “EMAIL” entities are masked with the “#” character, masking the first five characters.

      • +
      • “PHONE” entities are replaced with their hashed value using the “hash” operator.

      • +
      • “LOCATION” entities are completely removed using the “redact” operator.

      • +
      • “ORGANIZATION” entities are replaced with the string “Company XYZ” using the “replace” operator.

      • +
      +
    • +
    • model: str

      +
        +
      • “whole”, “spacy”, “pattern”, “flair”. The default is “whole”.

      • +
      +

      Each model can detect a specific set of entities. The “whole” model combines all three models together and can detect all of the entities listed below (a small mapping sketch follows this parameter list).

      +
        +
      • “spacy” : [“LOCATION”, “PERSON”,”NRP”,”ORGANIZATION”,”DATE_TIME”]

      • +
      • “pattern”: [“CREDIT_CARD”, “SSN”, “PHONE”, “EMAIL”]

      • +
      • “flair”: [ “LOCATION”, +“PERSON”, +“NRP”, +“GPE”, +“ORGANIZATION”, +“MAC_ADDRESS”, +“US_BANK_NUMBER”, +“IMEI”, +“TITLE”, +“LICENSE_PLATE”, +“US_PASSPORT”, +“CURRENCY”, +“ROUTING_NUMBER”, +“US_ITIN”, +“US_BANK_NUMBER”, +“US_DRIVER_LICENSE”, +“AGE”, +“PASSWORD”, +“SWIFT_CODE” +]

      • +
      +
    • +
    • score_threshold:

      +

      Minimum confidence value, the default is 0 to align with presidio.AnalyzerEngine

      +
    • +
    • generate_json_rpt:

      +

      Whether to generate the json report of the explanation

      +
    • +
    • generate_html_rpt:

      +

      Whether to generate the html with highlighted pii entities or not

      +
    • +
    • is_full_text:

      +

      Whether to return the full text or just the sentences with pii entities.

      +
    • +
    • is_full_html: bool

      +

      Whether to return the full html or just the annotated html

      +
    • +
    • is_full_report: bool

      +

      Whether to return the full json report or just the score and start, end index

      +
    • +
    +
    +
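    As a rough sketch of how the model choice maps to the recognizers that get registered (this mirrors _get_analyzer_engine in the source; the dictionary below is illustrative and not an object defined in the module):
    +MODEL_TO_RECOGNIZERS = {
    +    "spacy": ["CustomSpacyRecognizer"],
    +    "flair": ["FlairRecognizer"],
    +    "pattern": ["pattern recognizers for CREDIT_CARD, SSN, PHONE, EMAIL"],
    +    "whole": ["CustomSpacyRecognizer", "FlairRecognizer", "pattern recognizers"],
    +}
    +# If model is omitted but entities are given, the recognizers are picked automatically
    +# based on which of the requested entities each recognizer supports.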
    +

    1.2. Outputs:#

    +

    There are several outputs of this function.

    +
      +
    • output_path: str

      +

      The directory storing all the anonymized text files

      +
    • +
    • rpt_json: dict

      +

      A dict report explaining how the model detected each PII entity; see the call sketch after this list

      +
    • +
    • errors : dict +A dict of errors encountered while processing the text files, if any

    • +
    +
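    A rough sketch of unpacking these outputs when calling the handler directly (the variable names are illustrative; the tuple order follows recognize_pii in the source, and a dataframe of the processed file names is returned in addition to the outputs listed above):
    +output_dir, successes_df, errors, rpt_json = recognize_pii(
    +    context=context,
    +    input_path="./data/",
    +    html_key="highlighted",
    +    score_threshold=0.5,
    +)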

    +
    +
    +
    +

    2. Results#

    +

    The result of the function looks like the following:

    +

    For example if the input string is

    +

    John Doe 's ssn is 182838483, connect john doe with john_doe@gmail.com or 6288389029, he can pay you with 41482929939393

    +

    The anonymized_text is

    +

    <PERSON>'s <ORGANIZATION> is <SSN>, connect <PERSON> with <PERSON> <EMAIL> or <PHONE>, he can pay you with <CREDIT_CARD>

    +

    The html_str is

    +

    John Doe'sPERSON ssnORGANIZATION is 182838483SSN, connect me with john_doe@gmail.comPERSONjohn_doe@gmail.comEMAIL or 6288389029PHONE, he can pay you with 41482929939393CREDIT_CARD +

    +

    The json report that explain the output is

    +
    [
    +  {
    +    "entity_type": "PERSON", # result of the labeling
    +    "start": 0, # start positon of the entity
    +    "end": 9,  # end postion of the entity
    +    "score": 0.99, # the confident score of the model + context_improvement
    +    "analysis_explanation": {
    +      "recognizer": "FlairRecognizer", # which recognizer is used to recognize this entity
    +      "pattern_name": null,
    +      "pattern": null,
    +      "original_score": 0.99, # The original confident score from the pre-trained model
    +      "score": 0.99, # the final score = original_score + score_context_improvement
    +      "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +      "score_context_improvement": 0, # The improvement from the context
    +      "supportive_context_word": "",
    +      "validation_result": null
    +    },
    +    "recognition_metadata": {
    +      "recognizer_identifier": "Flair Analytics_5577088640",
    +      "recognizer_name": "Flair Analytics"
    +    }
    +  },
    +  ....
    +]
    +
    +
    +
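    When is_full_report is set to False, each report item is trimmed to its core fields (following _get_all_rpt in the source, analysis_explanation and recognition_metadata are dropped), leaving roughly:
    +{"entity_type": "PERSON", "start": 0, "end": 9, "score": 0.99}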

    +
    +
    +

    3. End-to-end Demo#

    +
    +

    3.1. Recognition configurations#

    +
      +
    • model: which model you want to use.

    • +
    • entities: What entities to recognize?

    • +
    • score_threshold: From which score to mark the recognition as trusted?

    • +
    +
    +
    +
    import mlrun
    +artifact_path = "./"
    +proj = mlrun.get_or_create_project("pii", "./")
    +fn = mlrun.code_to_function(
    +    project="pii",
    +    name="pii_recognizer",
    +    filename="pii_recognizer.py",
    +    kind="job",
    +    image="mlrun/mlrun",
    +    handler="recognize_pii",
    +    description="This function is used to recognize PII in a given text",
    +)
    +run_obj = fn.run(
    +    artifact_path = artifact_path,
    +    params= {
    +        'model': "whole", 
    +        'input_path': "./data/",
    +        'output_path': "./data/output1/",
    +        "entities": ['PERSON', "EMAIL", "PHONE", "LOCATION", "ORGANIZATION"], # the entities that needs to recognize
    +        "output_suffix": "output",
    +        "html_key": "highlighted",
    +        "score_threshold" : 0.5, # the score threshold to mark the recognition as trusted
    +    },
    +    returns = ["output_path: path", "rpt_json: file", "errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-07-31 02:17:04,305 [info] Project loaded successfully: {'project_name': 'pii'}
    +> 2023-07-31 02:17:04,312 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}
    +> 2023-07-31 02:17:04,408 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}
    +> 2023-07-31 02:17:04,409 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '51b5ad8144004e52a1008c08850842c8', 'db': None}
    +2023-07-31 02:17:04,567 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin
    +2023-07-31 02:17:07,730 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP
    +Model loaded
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:17:04completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output1/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    highlighted
    output_path
    rpt_json
    errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-07-31 02:17:12,403 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}
    +
    +
    +
    +
    +
    +
    +
    #get the mlrun context
    +context = mlrun.get_or_create_ctx('pii_ctx1')
    +import pathlib
    +from tqdm.auto import tqdm
    +for i, txt_file in enumerate(
    +        tqdm(
    +            list(pathlib.Path("./data/output1/").glob("*.txt")),
    +            desc="Processing files",
    +            unit="file",
    +        )
    +    ):
    +            # Load the str from the text file
    +        text = txt_file.read_text()
    +        print(text)
    +
    +
    +
    +
    +
    Dear Mr. <PERSON>,
    +
    +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of <ORGANIZATION>. Your flight tickets have been booked, and you will be departing on July 15th, 2023.
    +
    +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.
    +
    +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.
    +
    +<PERSON> <ORGANIZATION> is 182838483, connect him with <EMAIL> or <PHONE>, he can pay you with <PHONE>9393
    +
    +
    +
    +
    +
    +
    +
    #check the highlighted html 
    +html_output = context.get_cached_artifact("highlighted")
    +html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode("utf-8")
    +from IPython.core.display import display, HTML
    +display(HTML(html_str))
    +
    +
    +
    +
    +
    Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON, + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. + +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. +

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • +
    +
    +
    +
    #check the json report about the explanation.
    +rpt_output1 = context.get_cached_artifact("rpt_json")
    +rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode("utf-8")
    +import json
    +obj = json.loads(rpt_str1)
    + 
    +# Pretty Print JSON
    +json_formatted_str1 = json.dumps(obj, indent=4)
    +print(json_formatted_str1)
    +
    +
    +
    +
    +
    {
    +    "data/letter.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 9,
    +            "end": 17,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        },
    +        {
    +            "entity_type": "LOCATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as LOC by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944219101936",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        }
    +    ],
    +    "data/pii_data.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 0,
    +            "end": 12,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 13,
    +            "end": 16,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 53,
    +            "end": 58,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944219101936",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 48,
    +            "end": 52,
    +            "score": 0.87,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 0.87,
    +                "score": 0.87,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944219101936",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "EMAIL",
    +            "start": 48,
    +            "end": 68,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "EMAIL",
    +                "pattern": "\\S+@\\S+",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139944352474640"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 72,
    +            "end": 82,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139944352476560"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 104,
    +            "end": 114,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139944352476560"
    +            }
    +        }
    +    ]
    +}
    +
    +
    +
    +
    +
    +
    +

    3.2. Masking configurations#

    +
      +
    • entity_operator_map: defines what to do with the recognized tokens: keep them, mask them (and with what character), hash them, redact them, or replace them.

    • +
    +
    +     entity_operator_map = {
    +        "PERSON": ("keep", {}),
    +        "EMAIL": ("mask", {"masking_char": "😀", "chars_to_mask": 5, "from_end": False}),
    +        "PHONE": ("hash", {}),
    +        "LOCATION": ("redact", {}),
    +        "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
    +        }
    +     
    +
    +
    import mlrun
    +artifact_path = "./"
    +proj = mlrun.get_or_create_project("pii", "./")
    +fn = mlrun.code_to_function(
    +    project="pii",
    +    name="pii_recognizer",
    +    filename="pii_recognizer.py",
    +    kind="job",
    +    image="mlrun/mlrun",
    +    handler="recognize_pii",
    +    description="This function is used to recognize PII in a given text",
    +)
    +
    +entity_operator_map = {
    +        "PERSON": ("keep", {}),
    +        "EMAIL": ("mask", {"masking_char": "😀", "chars_to_mask" : 100, "from_end": False}),
    +        "PHONE": ("hash", {}),
    +        "LOCATION": ("redact", {}),
    +        "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
    +        }
    +run_obj = fn.run(
    +    artifact_path = artifact_path,
    +    params= {
    +        'model': "whole", 
    +        'input_path': "./data/",
    +        'output_path': "./data/output2/",
    +        "entities": ['PERSON', "EMAIL", "PHONE", "LOCATION", "ORGANIZATION"],
    +        "output_suffix": "output",
    +        "html_key": "highlighted",
    +        "score_threshold" : 0.5,
    +        "entity_operator_map": entity_operator_map,
    +        
    +    },
    +    returns = ["output_path: path", "rpt_json: file", "errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-07-31 02:20:40,550 [info] Project loaded successfully: {'project_name': 'pii'}
    +> 2023-07-31 02:20:40,556 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}
    +> 2023-07-31 02:20:40,649 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}
    +> 2023-07-31 02:20:40,649 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '2b43f80c7ca44b43b229760bb55f814d', 'db': None}
    +2023-07-31 02:20:40,812 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin
    +2023-07-31 02:20:44,130 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP
    +Model loaded
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:20:40completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output2/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    highlighted
    output_path
    rpt_json
    errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-07-31 02:20:48,903 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}
    +
    +
    +
    +
    +
    +
    +
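Under the hood, each entry in `entity_operator_map` is converted into a Presidio `OperatorConfig`, so the mapping above can be reproduced with the `presidio-anonymizer` package directly. The following is a minimal sketch only; the sample text, offsets, and detection results are made up for illustration and are not taken from the data files used in this notebook:

```python
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig, RecognizerResult

# Hypothetical detections; in pii_recognizer they come from the AnalyzerEngine.
results = [
    RecognizerResult(entity_type="PERSON", start=0, end=8, score=1.0),
    RecognizerResult(entity_type="EMAIL", start=27, end=47, score=0.5),
]

# Same shape as entity_operator_map: entity -> (operator name, operator params).
operators = {
    "PERSON": OperatorConfig("keep", {}),
    "EMAIL": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": False}),
}

engine = AnonymizerEngine()
anonymized = engine.anonymize(
    text="John Doe can be reached at John_smith@gmail.com",
    analyzer_results=results,
    operators=operators,
)
print(anonymized.text)  # the PERSON span is kept, the EMAIL span is masked with '*'
```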
    #get the mlrun context
    +context = mlrun.get_or_create_ctx('pii_ctx1')
    +import pathlib
    +from tqdm.auto import tqdm
+for i, txt_file in enumerate(
+    tqdm(
+        list(pathlib.Path("./data/output2/").glob("*.txt")),
+        desc="Processing files",
+        unit="file",
+    )
+):
+    # Load the str from the text file
+    text = txt_file.read_text()
+    print(text)
    +
    +
    +
    +
    +
    Dear Mr. John Doe,
    +
    +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of Company XYZ. Your flight tickets have been booked, and you will be departing on July 15th, 2023.
    +
    +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.
    +
    +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.
    +
    +John smith's Company XYZ is 182838483, connect him with 😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀 or 3990096a212e92850c3b3c8e57ab398252d482444a32def6b030cbac2d51efa3, he can pay you with a6983d9477e93eab115305afd124bd096699e6cb7d2ce72ec6e29a6378a4e8059393
    +
    +
    +
    +
    +
    +
    +
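Note how each operator shows up in the anonymized text above: `mask` replaced the e-mail with the 😀 masking character, `redact` removed the location, `replace` substituted "Company XYZ", and `hash` turned the phone numbers into 64-character hex digests, which is consistent with Presidio's default SHA-256 hashing. A quick sanity check of what the `hash` operator produces (the phone number below is hypothetical, not the value from `data/pii_data.txt`):

```python
import hashlib

# Presidio's "hash" operator defaults to SHA-256 over the entity text,
# producing 64 hex characters like the digests shown above.
phone = "212-555-0123"  # hypothetical value, for illustration only
print(hashlib.sha256(phone.encode("utf-8")).hexdigest())
```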
    #check the highlighted html 
    +html_output = context.get_cached_artifact("highlighted")
    +html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode("utf-8")
    +from IPython.core.display import display, HTML
    +display(HTML(html_str))
    +
    +
    +
    +
    +
    Highlighted Pii Entities


  • data/letter.txt

Dear Mr. John DoePERSON, We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023. Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

    +
    +
    +
    #check the json report about the explanation.
    +rpt_output1 = context.get_cached_artifact("rpt_json")
    +rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode("utf-8")
    +import json
    +obj = json.loads(rpt_str1)
    + 
    +# Pretty Print JSON
    +json_formatted_str1 = json.dumps(obj, indent=4)
    +print(json_formatted_str1)
    +
    +
    +
    +
    +
    {
    +    "data/letter.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 9,
    +            "end": 17,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "LOCATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as LOC by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139943499301312"
    +            }
    +        }
    +    ],
    +    "data/pii_data.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 0,
    +            "end": 12,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139943499301312"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 13,
    +            "end": 16,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139943499301312"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 53,
    +            "end": 58,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 48,
    +            "end": 52,
    +            "score": 0.87,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 0.87,
    +                "score": 0.87,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "EMAIL",
    +            "start": 48,
    +            "end": 68,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "EMAIL",
    +                "pattern": "\\S+@\\S+",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139943864893792"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 72,
    +            "end": 82,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139943864894128"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 104,
    +            "end": 114,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139943864894128"
    +            }
    +        }
    +    ]
    +}
    +
    +
    +
    +
    +
    +
    +
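Because the report is plain JSON, it is easy to post-process. For example, the `obj` dictionary loaded in the previous cell can be filtered down to the high-confidence detections (a small sketch based on the report structure shown above; the 0.9 threshold is arbitrary):

```python
# Keep only detections with a score of at least 0.9, per file.
high_confidence = {
    file_name: [
        {key: entity[key] for key in ("entity_type", "start", "end", "score")}
        for entity in entities
        if entity["score"] >= 0.9
    ]
    for file_name, entities in obj.items()
}
print(json.dumps(high_confidence, indent=4))
```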

3.3 Output configurations

• `is_full_text`: whether to produce the full anonymized text or only the sentences that contain PII entities

• `generate_html`: whether to produce the HTML output with the PII entities highlighted

• `generate_json`: whether to produce the JSON report explaining the recognition process

• `is_full_html`: whether the highlighted HTML contains the full text or only the sentences that contain PII entities

• `is_full_report`: whether the JSON report contains the detailed explanations or only the start/end indexes and the scores

The run below sets `is_full_text`, `is_full_html`, and `is_full_report` to False to produce the condensed outputs.
    +
    +
    +
    import mlrun
    +artifact_path = "./"
    +proj = mlrun.get_or_create_project("pii", "./")
    +fn = mlrun.code_to_function(
    +    project="pii",
    +    name="pii_recognizer",
    +    filename="pii_recognizer.py",
    +    kind="job",
    +    image="mlrun/mlrun",
    +    handler="recognize_pii",
    +    description="This function is used to recognize PII in a given text",
    +)
    +
    +entity_operator_map = {
    +        "PERSON": ("keep", {}),
    +        "EMAIL": ("mask", {"masking_char": "😀", "chars_to_mask" : 100, "from_end": False}),
    +        "PHONE": ("hash", {}),
    +        "LOCATION": ("redact", {}),
    +        "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
    +        }
    +run_obj = fn.run(
    +    artifact_path = artifact_path,
    +    params= {
    +        'model': "whole", 
    +        'input_path': "./data/",
    +        'output_path': "./data/output3/",
    +        "entities": ['PERSON', "EMAIL", "PHONE", "LOCATION", "ORGANIZATION"],
    +        "output_suffix": "output",
    +        "html_key": "highlighted",
    +        "score_threshold" : 0.5,
    +        "entity_operator_map": entity_operator_map,
    +        "is_full_text": False,
    +        "is_full_html": False,
    +        "is_full_report": False,
    +    },
    +    returns = ["output_path: path", "rpt_json: file", "errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-07-31 02:22:57,789 [info] Project loaded successfully: {'project_name': 'pii'}
    +> 2023-07-31 02:22:57,799 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}
    +> 2023-07-31 02:22:57,891 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}
    +> 2023-07-31 02:22:57,892 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '3f6d701e423346b39026dc365698c15c', 'db': None}
    +2023-07-31 02:22:58,079 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin
    +2023-07-31 02:23:01,565 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP
    +Model loaded
    +
    +
    +
    +
    +
project: pii | iter: 0 | start: Jul 31 02:22:57 | state: completed | name: pii-recognizer-recognize-pii
labels: v3io_user=pengw, kind=, owner=pengw, host=jupyter-pengw-5f99fb678d-mnvxl
parameters: model=whole, input_path=./data/, output_path=./data/output3/, entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION'], output_suffix=output, html_key=highlighted, score_threshold=0.5, entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}, is_full_text=False, is_full_html=False, is_full_report=False
artifacts: highlighted, output_path, rpt_json, errors
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-07-31 02:23:06,096 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}
    +
    +
    +
    +
    +
    +
    +
    #get the mlrun context
    +context = mlrun.get_or_create_ctx('pii_ctx')
    +import pathlib
    +from tqdm.auto import tqdm
    +
    +
    +
    +
    +
    +
    +
for i, txt_file in enumerate(
+    tqdm(
+        list(pathlib.Path("./data/output3/").glob("*.txt")),
+        desc="Processing files",
+        unit="file",
+    )
+):
+    # Load the str from the text file
+    text = txt_file.read_text()
+    print(text)
    +
    +
    +
    +
    +
    Dear Mr. John Doe,
    +
    +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway!
    +John smith's Company XYZ is 182838483, connect him with 😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀 or 3990096a212e92850c3b3c8e57ab398252d482444a32def6b030cbac2d51efa3, he can pay you with a6983d9477e93eab115305afd124bd096699e6cb7d2ce72ec6e29a6378a4e8059393
    +
    +
    +
    +
    +
    +
    +
    #check the highlighted html 
    +html_output = context.get_cached_artifact("highlighted")
    +html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode("utf-8")
    +from IPython.core.display import display, HTML
    +display(HTML(html_str))
    +
    +
    +
    +
    +
    Highlighted Pii Entities


  • data/letter.txt

Dear Mr. John DoePERSON, We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON

    +
    +
    +
    #check the json report about the explanation.
    +rpt_output = context.get_cached_artifact("rpt_json")
    +rpt_str = mlrun.get_dataitem(rpt_output.get_target_path()).get().decode("utf-8")
    +import json
    +obj = json.loads(rpt_str)
    + 
    +# Pretty Print JSON
    +json_formatted_str = json.dumps(obj, indent=4)
    +print(json_formatted_str)
    +
    +
    +
    +
    +
    {
    +    "data/letter.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 9,
    +            "end": 17,
    +            "score": 1
    +        },
    +        {
    +            "entity_type": "LOCATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1.0
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1
    +        }
    +    ],
    +    "data/pii_data.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 0,
    +            "end": 12,
    +            "score": 1
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 13,
    +            "end": 16,
    +            "score": 1
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 53,
    +            "end": 58,
    +            "score": 1.0
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 48,
    +            "end": 52,
    +            "score": 0.87
    +        },
    +        {
    +            "entity_type": "EMAIL",
    +            "start": 48,
    +            "end": 68,
    +            "score": 0.5
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 72,
    +            "end": 82,
    +            "score": 0.5
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 104,
    +            "end": 114,
    +            "score": 0.5
    +        }
    +    ]
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
\ No newline at end of file
diff --git a/functions/master/pii_recognizer/0.3.0/static/function.html b/functions/master/pii_recognizer/0.3.0/static/function.html
new file mode 100644
index 00000000..1d21cf1b
--- /dev/null
+++ b/functions/master/pii_recognizer/0.3.0/static/function.html
@@ -0,0 +1,151 @@
    +        
    +kind: job
    +metadata:
    +  name: pii-recognizer
    +  tag: ''
    +  hash: 818930645d33704e9cada919769ee9d93cbb9434
    +  project: ''
    +  labels:
    +    author: pgw
    +  categories:
    +  - machine-learning
    +  - data-preparation
    +  - NLP
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import pathlib
import tempfile
import warnings
from typing import List, Set, Tuple, Union

import annotated_text.util as at_util
import mlrun
import nltk
import pandas as pd
import presidio_analyzer as pa
import presidio_anonymizer as pre_anoymizer
from presidio_anonymizer.entities import OperatorConfig
from tqdm import tqdm

try:
    import flair as fl
except ModuleNotFoundError:
    print("Flair is not installed")

# There is a conflict between Rust-based tokenizers' parallel processing
# and Python's fork operations during multiprocessing. To avoid this, we need
# the following two lines

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

logger = logging.getLogger("pii-recognizer")


# Add the constant classes of Models and Entities to govern the whole package
class Models:
    WHOLE = "whole"
    PATTERN = "pattern"
    SPACY = "spacy"
    FLAIR = "flair"


class Entities:
    CREDIT_CARD = "CREDIT_CARD"
    SSN = "SSN"
    PHONE = "PHONE"
    EMAIL = "EMAIL"
    LOCATION = "LOCATION"
    PERSON = "PERSON"
    NRP = "NRP"
    ORGANIZATION = "ORGANIZATION"
    DATE_TIME = "DATE_TIME"
    GPE = ("GPE",)
    MAC_ADDRESS = "MAC_ADDRESS"
    US_BANK_NUMBER = "US_BANK_NUMBER"
    IMEI = "IMEI"
    TITLE = "TITLE"
    LICENSE_PLATE = "LICENSE_PLATE"
    US_PASSPORT = "US_PASSPORT"
    CURRENCY = "CURRENCY"
    ROUTING_NUMBER = "ROUTING_NUMBER"
    US_ITIN = "US_ITIN"
    US_BANK_NUMBER = "US_BANK_NUMBER"
    US_DRIVER_LICENSE = "US_DRIVER_LICENSE"
    AGE = "AGE"
    PASSWORD = "PASSWORD"
    SWIFT_CODE = "SWIFT_CODE"


class PatternRecognizerFactory:
    """
    Factory for creating pattern recognizers. It can be extended in the future to
    add more regex patterns for different entities. For the pattern recognizers to work,
    we need to construct a list of regex patterns for each entity.
    """

    RECOGNIZABLE_ENTITIES = {
        "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)],
        "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)],
        "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)],
        "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)],
    }

    # create a list of pattern recognizers
    @classmethod
    def _create_pattern_recognizer(cls):
        """
        For each entity, create a list of patterns to recognize it

        :param cls: PatternRecognizerFactory class

        :returns: List of pattern recognizers
        """

        # Entities to recognize and their regex patterns

        return [
            pa.PatternRecognizer(supported_entity=entity, patterns=pattern)
            for entity, pattern in cls.RECOGNIZABLE_ENTITIES.items()
        ]


class CustomSpacyRecognizer(pa.LocalRecognizer):
    """
    Custom Spacy Recognizer from Presidio Analyzer trained on Privy data.
    The Privy data is generated using https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy
    It can be used to recognize custom entities. Since we want to use Presidio's registries to generate the AnalyzerEngine,
    it inherits from Presidio Analyzer's LocalRecognizer class.
    """

    # Entities to recognize

    RECOGNIZABLE_ENTITIES = {
        "LOCATION",
        "PERSON",
        "NRP",
        "ORGANIZATION",
        "DATE_TIME",
    }

    # Default explanation for this recognizer

    _DEFAULT_EXPLANATION = (
        "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)"
    )

    # Label groups to check

    _DEFAULT_CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"DATE_TIME"}, {"DATE_TIME"}),
    ]

    # pretrained model for this recognizer

    _DEFAULT_MODEL_LANGUAGES = {
        "en": "beki/en_spacy_pii_distilbert",
    }

    _DEFAULT_PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NROP": "NRP",
        "DATE_TIME": "DATE_TIME",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: List[str] = None,
        check_label_groups: Tuple[Set, Set] = None,
        context: List[str] = None,
        ner_strength: float = 1,
    ):
        """
        Initialize Spacy Recognizer.

        :param supported_language: Language to use, default is English
        :param supported_entities: Entities to use for recognition
        :param check_label_groups: Label groups to check for the entities
        :param context:            Context to use if any
        :param ner_strength:       Default confidence for NER prediction

        :returns: SpacyRecognizer object
        """

        # Default confidence for NER prediction
        self.ner_strength = ner_strength

        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
        )

    # get the presidio explanation for the result

    def _build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> pa.AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation:    Explanation string

        :returns: Presidio AnalysisExplanation object
        """
        explanation = pa.AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    # main method for the recognizer
    def analyze(self, text: str, entities: List[str], nlp_artifacts=None):  # noqa D102
        """
        Analyze text using Spacy.

        :param text:          Text to analyze
        :param entities:      Entities to analyze
        :param nlp_artifacts: NLP artifacts to use

        :returns: List of Presidio RecognizerResult objects
        """
        results = []
        if not nlp_artifacts:
            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
            return results

        ner_entities = nlp_artifacts.entities

        # recognize the supported entities
        for entity in entities:
            if entity not in self.supported_entities:
                continue
            for ent in ner_entities:
                if not self.__check_label(entity, ent.label_, self.check_label_groups):
                    continue

                # string of the explanation saying the entity is recognized by spacy
                textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_)
                explanation = self._build_spacy_explanation(
                    self.ner_strength, textual_explanation
                )

                # create the standard result with the entity, start, end, score, and explanation
                spacy_result = pa.RecognizerResult(
                    entity_type=entity,
                    start=ent.start_char,
                    end=ent.end_char,
                    score=self.ner_strength,
                    analysis_explanation=explanation,
                    recognition_metadata={
                        pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name
                    },
                )
                results.append(spacy_result)

        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        """
        Check if the label is in the label group.

        :param entity:             Entity to check
        :param label:              Label to check
        :param check_label_groups: Label groups to check

        :returns: True if the label is in the label group, False otherwise
        """
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )


# Class to use Flair with Presidio as an external recognizer.
class FlairRecognizer(pa.EntityRecognizer):
    """
    Wrapper for a Flair model so that it can be used within Presidio Analyzer.
    This is to make sure the recognizer can be registered with Presidio registry.
    """

    RECOGNIZABLE_ENTITIES = {
        "LOCATION",
        "PERSON",
        "NRP",
        "GPE",
        "ORGANIZATION",
        "MAC_ADDRESS",
        "US_BANK_NUMBER",
        "IMEI",
        "TITLE",
        "LICENSE_PLATE",
        "US_PASSPORT",
        "CURRENCY",
        "ROUTING_NUMBER",
        "US_ITIN",
        "US_BANK_NUMBER",
        "US_DRIVER_LICENSE",
        "AGE",
        "PASSWORD",
        "SWIFT_CODE",
    }

    # This is used to construct the explanation for the result

    _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    _DEFAULT_CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"GPE"}, {"GPE"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}),
        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
        ({"IMEI"}, {"IMEI"}),
        ({"TITLE"}, {"TITLE"}),
        ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}),
        ({"US_PASSPORT"}, {"US_PASSPORT"}),
        ({"CURRENCY"}, {"CURRENCY"}),
        ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}),
        ({"AGE"}, {"AGE"}),
        ({"CURRENCY"}, {"CURRENCY"}),
        ({"SWIFT_CODE"}, {"SWIFT_CODE"}),
        ({"US_ITIN"}, {"US_ITIN"}),
        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
        ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}),
    ]

    _DEFAULT_MODEL_LANGUAGES = {
        "en": "beki/flair-pii-distilbert",
    }

    _DEFAULT_PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NROP": "NRP",
        "URL": "URL",
        "US_ITIN": "US_ITIN",
        "US_PASSPORT": "US_PASSPORT",
        "IBAN_CODE": "IBAN_CODE",
        "IP_ADDRESS": "IP_ADDRESS",
        "EMAIL_ADDRESS": "EMAIL",
        "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
        "US_BANK_NUMBER": "US_BANK_NUMBER",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: List[str] = None,
        check_label_groups: Tuple[Set, Set] = None,
    ):
        """
        Initialize the FlairRecognizer.

        :param supported_language: Language to use
        :param supported_entities: Entities to use
        :param check_label_groups: Label groups to check

        :returns: FlairRecognizer object

        """
        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS

        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
        self.model = fl.models.SequenceTagger.load(
            self._DEFAULT_MODEL_LANGUAGES.get(supported_language)
        )

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    # main method for the recognizer
    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: pa.nlp_engine.NlpArtifacts = None,
    ) -> List[pa.RecognizerResult]:
        """
        Analyze text and return the results.

        :param text:          The text for analysis.
        :param entities:      The list of entities to recognize.
        :param nlp_artifacts: Not used by this recognizer but needed for the interface.

        :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections.
        """

        results = []

        sentences = fl.data.Sentence(text)
        self.model.predict(sentences)

        # If no specific list of entities is given, we will look for all of the supported ones.
        if not entities:
            entities = self.supported_entities

        # Go over the entities and check if they are in the supported entities list.
        for entity in entities:
            if entity not in self.supported_entities:
                continue

            # Go over the sentences and check if the entity is in the sentence.
            for ent in sentences.get_spans("ner"):
                if not self.__check_label(
                    entity, ent.labels[0].value, self.check_label_groups
                ):
                    continue

                # If the entity is in the sentence, we will add it to the results.
                textual_explanation = self._DEFAULT_EXPLANATION.format(
                    ent.labels[0].value
                )

                # Build the explanation for the result
                explanation = self._build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )

                flair_result = self._convert_to_recognizer_result(ent, explanation)

                results.append(flair_result)

        return results

    def _convert_to_recognizer_result(
        self, entity: fl.data.Span, explanation: str
    ) -> pa.RecognizerResult:
        """
        Convert Flair result to Presidio RecognizerResult.

        :param entity:      Flair entity of Span
        :param explanation: Presidio AnalysisExplanation

        :returns: Presidio RecognizerResult
        """

        # Convert the entity type to Presidio entity type
        entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)

        # Convert the score to Presidio score
        flair_score = round(entity.score, 2)

        # Create the Presidio RecognizerResult from the Flair entity
        flair_results = pa.RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

        return flair_results

    def _build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> pa.AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation:    Explanation string

        :returns: Presidio AnalysisExplanation
        """

        # Create the Presidio AnalysisExplanation for the result
        explanation = pa.AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    # sanity check of the entity and label before recognition
    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    ) -> bool:
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )


# get the analyzer engine based on the model
def _get_analyzer_engine(
    model: str = None, entities: List[str] = None
) -> pa.AnalyzerEngine:
    """
    Return pa.AnalyzerEngine.

    :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole".
    :param entities: The list of entities to use.

    :returns: pa.AnalyzerEngine
    """
    # recognizer registry that can store multiple recognizers
    registry = pa.RecognizerRegistry()
    if model == Models.SPACY:
        # custom spacy recognizer
        spacy_recognizer = CustomSpacyRecognizer()
        # add the custom build spacy recognizer
        registry.add_recognizer(spacy_recognizer)
    elif model == Models.FLAIR:
        # pre-trained flair recognizer
        flair_recognizer = FlairRecognizer()
        # add the custom build flair recognizer
        registry.add_recognizer(flair_recognizer)
    elif model == Models.PATTERN:
        # add the pattern recognizer
        pattern_recognizer_factory = PatternRecognizerFactory()
        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
            registry.add_recognizer(recognizer)
    elif model == Models.WHOLE:
        spacy_recognizer = CustomSpacyRecognizer()
        flair_recognizer = FlairRecognizer()
        registry.add_recognizer(spacy_recognizer)
        registry.add_recognizer(flair_recognizer)
        # add the pattern recognizer
        pattern_recognizer_factory = PatternRecognizerFactory()
        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
            registry.add_recognizer(recognizer)
    elif not model and entities:
        if set(entities) & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES:
            spacy_recognizer = CustomSpacyRecognizer()
            registry.add_recognizer(spacy_recognizer)
        if set(entities) & FlairRecognizer.RECOGNIZABLE_ENTITIES:
            flair_recognizer = FlairRecognizer()
            registry.add_recognizer(flair_recognizer)
        # add the pattern recognizer
        if set(entities) & (set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())):
            pattern_recognizer_factory = PatternRecognizerFactory()
            for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
                registry.add_recognizer(recognizer)
    else:
        raise ValueError(
            f"argument of model and entities can not be None at the same time"
        )
    analyzer = pa.AnalyzerEngine(
        registry=registry,
        supported_languages=["en"],
    )

    supported_entities = analyzer.get_supported_entities()

    if entities and not all(item in supported_entities for item in entities):
        not_supported_entities = [
            item for item in entities if item not in supported_entities
        ]
        raise ValueError(
            f"The current model {model} doesn't support the following entities: {not_supported_entities}. "
            f"Supported entities are: {supported_entities}"
        )
    return analyzer


def _get_anonymizer_engine() -> pre_anoymizer.AnonymizerEngine:
    """
    Return AnonymizerEngine.

    :returns: The AnonymizerEngine.
    """
    return pre_anoymizer.AnonymizerEngine()


def _anonymize(
    text: str,
    analyze_results: List[pa.RecognizerResult],
    entity_operator_map: dict = None,
    is_full_text: bool = True,
) -> str:
    """
    Anonymize the identified PII in the input text using the Presidio Anonymizer.

    :param text:                The text for analysis.
    :param analyze_results:     The list of Presidio RecognizerResult constructed from analysis.
    :param entity_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    :param is_full_text:        Whether the text is full text or not.

    :returns: The anonymized text.
    """
    if not text:
        return ""

    anonymizer_engine = _get_anonymizer_engine()
    if not entity_operator_map:
        operators = None
    else:
        # Create OperatorConfig based on the entity_operator_map
        operators = {
            entity: OperatorConfig(operator_name, operator_params)
            for entity, (operator_name, operator_params) in entity_operator_map.items()
        }

    if is_full_text:
        # Anonymize the entire text
        return anonymizer_engine.anonymize(
            text=text, analyzer_results=analyze_results, operators=operators
        ).text
    # Tokenize the text to sentences
    sentences = nltk.sent_tokenize(text)
    anonymized_sentences = []
    current_idx = 0

    # Find the sentence that has pii entity
    for sentence in sentences:
        start_idx = current_idx
        end_idx = start_idx + len(sentence)

        # Get the entities that are in the sentence, update the start_idx and end_idx
        sentence_results = [
            pa.RecognizerResult(
                result.entity_type,
                start=result.start - start_idx,
                end=result.end - start_idx,
                score=result.score,
            )
            for result in analyze_results
            if result.start >= start_idx and result.end <= end_idx
        ]

        # If PII is detected
        if sentence_results:
            anonymized_sentence = anonymizer_engine.anonymize(
                text=sentence, analyzer_results=sentence_results, operators=operators
            ).text
            anonymized_sentences.append(anonymized_sentence)

        current_idx = end_idx

    return " ".join(anonymized_sentences)


def _get_tokens(
    text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True
) -> List[str]:
    """
    Get the full list of tokens, or only the tokens for sentences that contain PII entities.

    :param text:            The text for analysis.
    :param analyze_results: The list of Presidio RecognizerResult constructed from analysis.
    :param is_full:         Whether to return the full tokens or only the tokens for sentences that contain PII entities.

    :returns: The tokens.
    """

    tokens = []
    # sort by start index
    results = sorted(analyze_results, key=lambda x: x.start)
    for i, res in enumerate(results):
        if i == 0:
            tokens.append(text[: res.start])

        # append entity text and entity type
        tokens.append((text[res.start : res.end], res.entity_type))

        # if another entity coming i.e. we're not at the last results element,
        # add text up to next entity
        if i != len(results) - 1:
            tokens.append(text[res.end : results[i + 1].start])
        # if no more entities coming, add all remaining text
        else:
            tokens.append(text[res.end :])

    # get the tokens that only contains the entities that can form a sentence
    part_annontated_tokens = []
    if not is_full:
        last_end_sentence = 0
        for i, token in enumerate(tokens):
            if any(item in token for item in [".", "!", "?"]) and any(
                type(item) is tuple for item in tokens[last_end_sentence:i]
            ):
                part_annontated_tokens.append(tokens[last_end_sentence:i])
                last_end_sentence = i
        return part_annontated_tokens
    return tokens


def _annotate(
    text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True
) -> List[str]:
    """
    Annotate the identified PII entities in the text so they can be highlighted in the HTML output.

    :param text:               The text for analysis.
    :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis.
    :param is_full_html:       Whether generate full html or not.

    :returns: The list of tokens with the identified entities.

    """
    return _get_tokens(text, st_analyze_results, is_full_html)


def _process(
    text: str,
    model: pa.AnalyzerEngine,
    score_threshold: float,
    entities: List[str] = None,
    entities_operator_map: dict = None,
    is_full_text: bool = True,
) -> Tuple[str, list]:
    """
    Process the text of str using the model.

    :param text:                  Text to process
    :param model:                 Model to use for processing
    :param entities:              Entities to recognize
    :param entities_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    :param score_threshold:       The score threshold to use for recognition
    :param is_full_text:          Whether to return the full text or just the annotated text

    :returns: A tuple of:

              * the anonymized text
              * the list of Presidio RecognizerResult constructed from analysis
    """

    # get the analyzer engine
    analyzer = model

    # analyze the text that can be used for anonymization
    results = analyzer.analyze(
        text=text,
        language="en",
        entities=entities,
        score_threshold=score_threshold,
        return_decision_process=True,
    )

    # anonymize the text, replace the pii entities with the labels
    anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text)

    return anonymized_text, results


def _get_single_html(
    text: str, results: List[pa.RecognizerResult], is_full_html: bool = True
):
    """
    Generate the html for a single txt file.

    :param text:         The text for analysis.
    :param results:      The list of Presidio RecognizerResult constructed from analysis.
    :param is_full_html: Whether generate full html or not.

    :returns: The html string for a single txt file.
    """
    # convert the results to tokens to generate the html
    tokens = _annotate(text, results, is_full_html)
    html = at_util.get_annotated_html(*tokens)

    # avoid the error during rendering of the \n in the html
    backslash_char = "\\"

    html_str = f"<p>{html.replace(backslash_char + 'n', '<br>')}</p>"

    return html_str


def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True):
    """
    Generate the json for a single txt file.

    :param results:        The list of Presidio RecognizerResult constructed from analysis.
    :param is_full_report: Whether generate full json or not.

    :returns: The json string for a single txt file.
    """
    # generate the stats report if needed
    if not is_full_report:
        stats = []
        # add the simplify stats logic here
        for item in results:
            item.analysis_explanation = None
            stats.append(item)
    else:
        stats = results

    return stats


def _get_all_html(
    txt_content: dict,
    res_dict: dict,
    is_full_html: bool = True,
):
    """
    Generate the html for all txt files.

    :param txt_content:  The dictionary of txt file name and content.
    :param res_dict:     The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis.
    :param is_full_html: Whether generate full html or not.

    :returns: The html string for all txt files.

    """
    # These are placeholder for the html string
    html_index = "<html><head><title>Highlighted Pii Entities</title></head><body><h1>Highlighted Pii Entities</h1><ul>"
    html_content = ""
    for txt_file, results in res_dict.items():
        txt = txt_content[txt_file]
        html_index += f"<li><a href='#{txt_file}'>{txt_file}</a></li>"
        html_content += f"<li><h2>{txt_file}</h2><p>{_get_single_html(txt, results, is_full_html)}</p></li>"
    html_index += "</ul>"
    html_res = f"{html_index}{html_content}</body></html>"

    return html_res


def _get_all_rpt(res_dict: dict, is_full_report: bool = True):
    """
    Generate the stats report for all txt files.

    :param res_dict:       The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis.
    :param is_full_report: Whether generate full report or not.

    :returns: The stats report for all txt files.
    """
    # These are placeholder for the json report
    stats_dict = {}
    for txt_file, results in res_dict.items():
        new_stats = []
        for item in _get_single_json(results, is_full_report):
            if is_full_report:
                item.analysis_explanation = item.analysis_explanation.to_dict()
                new_stats.append(item.to_dict())
            else:
                tmp_dict = item.to_dict()
                tmp_dict.pop("analysis_explanation")
                tmp_dict.pop("recognition_metadata")
                new_stats.append(tmp_dict)
        stats_dict[txt_file] = new_stats
    return stats_dict


def recognize_pii(
    context: mlrun.MLClientCtx,
    input_path: Union[str, pathlib.Path],
    html_key: str,
    score_threshold: float,
    output_directory: str = None,
    entities: List[
        str
    ] = None,  # List of entities to recognize, default is recognizing all
    entity_operator_map: dict = None,
    model: str = None,
    generate_json: bool = True,
    generate_html: bool = True,
    is_full_text: bool = True,
    is_full_html: bool = True,
    is_full_report: bool = True,
) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]:
    """
    Walk through the input path, recognize PII in text and store the anonymized text in the output path.
    Generate the html with different colors for each entity, and a json report of the explanation.

    :param context:              The MLRun context. This is needed for logging the artifacts.
    :param input_path:           The input path of the text files that need to be analyzed.
    :param html_key:             The html key for the artifact.
    :param score_threshold:      The score threshold to mark the recognition as trusted.
    :param output_directory:     The output directory path to store the anonymized text.
    :param entities:             The list of entities to recognize.
    :param entity_operator_map:  The map of entity to operator (mask, redact, replace, keep, hash, and its params)
    :param model:                The model to use. Can be "spacy", "flair", "pattern" or "whole".
    :param generate_json:        Whether to generate the json report of the explanation.
    :param generate_html:        Whether to generate the html report of the explanation.
    :param is_full_text:         Whether to return the full text or only the masked text.
    :param is_full_html:         Whether to return the full html or just the annotated text
    :param is_full_report:       Whether to return the full report or just the score and start, end index

    :returns: A tuple of:

              * Path to the output directory
              * A DataFrame of the files that were successfully anonymized
              * A dictionary of the files that raised errors and were not processed
              * The json report of the explanation (if generate_json is True)

    """

    # Set output directory
    if output_directory is None:
        output_directory = tempfile.mkdtemp()

    # Create the output directory:
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(parents=True, exist_ok=True)

    txt_files_directory = pathlib.Path(input_path)
    successes = []
    errors = {}

    res_dict = {}
    txt_content = {}
    # Load the model:
    analyzer = _get_analyzer_engine(model, entities)
    logger.info("Model loaded")
    # Go over the text files in the input path, analyze and anonymize them:
    for txt_file in tqdm(
        list(txt_files_directory.glob("*.txt")),
        desc="Processing files",
        unit="file",
    ):
        try:
            # Load the str from the text file
            text = txt_file.read_text()
            txt_content[str(txt_file)] = text
            # Process the text to recognize the PII entities in it
            anonymized_text, results = _process(
                text=text,
                model=analyzer,
                entities=entities,
                entities_operator_map=entity_operator_map,
                score_threshold=score_threshold,
                is_full_text=is_full_text,
            )
            res_dict[str(txt_file)] = results
            # Store the anonymized text in the output path
            output_file = output_directory / f"{txt_file.stem}.txt"
            output_file.parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w") as f:
                f.write(anonymized_text)
            successes.append([txt_file.name, output_file.name])
        except Exception as e:
            errors[str(txt_file)] = str(e)
            logger.error(f"Error processing {txt_file}: {e}")

    successes = pd.DataFrame(
        successes,
        columns=["original_file", "anonymized_file"],
    )

    if generate_html:
        # Generate the html report
        html_res = _get_all_html(txt_content, res_dict, is_full_html)
        # Store the html report in the context
        arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key)
        context.log_artifact(arti_html)
    if generate_json:
        # Generate the json report
        json_res = _get_all_rpt(res_dict, is_full_report)
        return str(output_directory), successes, errors, json_res
    return str(output_directory), successes, errors

    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - nltk
    +    - pandas
    +    - presidio-anonymizer
    +    - presidio-analyzer
    +    - torch
    +    - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653
    +    - st-annotated-text
    +    - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
    +  entry_points:
    +    analyze:
    +      name: analyze
    +      doc: Analyze text and return the results.
    +      parameters:
    +      - name: self
    +      - name: text
    +        type: str
    +        doc: The text for analysis.
    +      - name: entities
    +        type: List[str]
    +        doc: The list of entities to recognize.
    +      - name: nlp_artifacts
    +        type: pa.nlp_engine.NlpArtifacts
    +        doc: Not used by this recognizer but needed for the interface.
    +        default: null
    +      outputs:
    +      - doc: The list of Presidio RecognizerResult constructed from the recognized
    +          Flair detections.
    +        type: List[pa.RecognizerResult]
    +      lineno: 381
    +      has_varargs: false
    +      has_kwargs: false
    +    recognize_pii:
    +      name: recognize_pii
    +      doc: 'Walk through the input path, recognize PII in text and store the anonymized
    +        text in the output path.
    +
    +        Generate the html with different colors for each entity, json report of the
    +        explanation.'
    +      parameters:
    +      - name: context
    +        type: MLClientCtx
    +        doc: The MLRun context. this is needed for log the artifacts.
    +      - name: input_path
    +        type: Union[str, Path]
    +        doc: The input path of the text files needs to be analyzed.
    +      - name: html_key
    +        type: str
    +        doc: The html key for the artifact.
    +      - name: score_threshold
    +        type: float
    +        doc: The score threshold to mark the recognition as trusted.
    +      - name: output_directory
    +        type: str
    +        doc: The output directory path to store the anonymized text.
    +        default: null
    +      - name: entities
    +        type: List[str]
    +        doc: The list of entities to recognize.
    +        default: null
    +      - name: entity_operator_map
    +        type: dict
    +        doc: The map of entity to operator name and its params (mask, redact, replace,
    +          keep, hash, etc.); see the usage sketch below.
    +        default: null
    +      - name: model
    +        type: str
    +        doc: The model to use. Can be "spacy", "flair", "pattern" or "whole".
    +        default: null
    +      - name: generate_json
    +        type: bool
    +        doc: Whether to generate the json report of the explanation.
    +        default: true
    +      - name: generate_html
    +        type: bool
    +        doc: Whether to generate the html report of the explanation.
    +        default: true
    +      - name: is_full_text
    +        type: bool
    +        doc: Whether to return the full text or only the masked text.
    +        default: true
    +      - name: is_full_html
    +        type: bool
    +        doc: Whether to return the full html or just the annotated text
    +        default: true
    +      - name: is_full_report
    +        type: bool
    +        doc: Whether to return the full report or just the score and the start/end indices.
    +        default: true
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame,
    +          dict]]
    +      lineno: 845
    +      has_varargs: false
    +      has_kwargs: false
    +  description: This function is used to recognize PII in a directory of text files
    +  default_handler: recognize_pii
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
    +
    +        
    +    
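A minimal usage sketch for the `recognize_pii` entry point documented above, assuming the function is imported from the MLRun hub as `hub://pii_recognizer`; the paths, entities, threshold, and operator settings are illustrative only (operator names and params follow Presidio's `OperatorConfig` conventions):

```python
import mlrun

# Import the hub function (assumes MLRun and hub access are configured)
fn = mlrun.import_function("hub://pii_recognizer")

# Run the default handler (recognize_pii) over a directory of .txt files
run = fn.run(
    params={
        "input_path": "./data/txt_files",         # directory containing *.txt files
        "html_key": "highlighted",                # artifact key for the HTML report
        "score_threshold": 0.8,
        "model": "whole",                         # "spacy" | "flair" | "pattern" | "whole"
        "entities": ["PERSON", "EMAIL", "PHONE"],
        "entity_operator_map": {
            "PERSON": ("replace", {"new_value": "<PERSON>"}),
            "EMAIL": ("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": True}),
        },
    },
    local=True,
)
```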
    \ No newline at end of file
    diff --git a/functions/master/pii_recognizer/0.3.0/static/item.html b/functions/master/pii_recognizer/0.3.0/static/item.html
    new file mode 100644
    index 00000000..6cc24fb1
    --- /dev/null
    +++ b/functions/master/pii_recognizer/0.3.0/static/item.html
    @@ -0,0 +1,57 @@
    +Source
    +        
    +apiVersion: v1
    +categories: 
    +  - machine-learning
    +  - data-preparation
    +  - NLP
    +description: This function is used to recognize PII in a directory of text files
    +doc: ''
    +example: pii_recognizer.ipynb
    +generationDate: 2023-08-15:10-24
    +hidden: false
    +icon: ''
    +labels:
    +  author: pgw
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.4.0
    +name: pii-recognizer
    +platformVersion: 3.5.3
    +spec:
    +  filename: pii_recognizer.py
    +  handler: recognize_pii
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +   - nltk
    +   - pandas
    +   - presidio-anonymizer
    +   - presidio-analyzer
    +   - torch
    +   - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653
    +   - st-annotated-text
    +   - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
    +url: ''
    +version: 0.3.0
    +test_valid: False
    +
    +        
    +    
    \ No newline at end of file
    diff --git a/functions/master/pii_recognizer/0.3.0/static/pii_recognizer.html b/functions/master/pii_recognizer/0.3.0/static/pii_recognizer.html
    new file mode 100644
    index 00000000..81545aed
    --- /dev/null
    +++ b/functions/master/pii_recognizer/0.3.0/static/pii_recognizer.html
    @@ -0,0 +1,1091 @@
    +pii_recognizer.pii_recognizer
    Source code for pii_recognizer.pii_recognizer

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import os
    +import pathlib
    +import tempfile
    +import warnings
    +from typing import List, Set, Tuple, Union
    +
    +import annotated_text.util as at_util
    +import mlrun
    +import nltk
    +import pandas as pd
    +import presidio_analyzer as pa
    +import presidio_anonymizer as pre_anoymizer
    +from presidio_anonymizer.entities import OperatorConfig
    +from tqdm import tqdm
    +
    +try:
    +    import flair as fl
    +except ModuleNotFoundError:
    +    print("Flair is not installed")
    +
    +# There is a conflict between Rust-based tokenizers' parallel processing
    +# and Python's fork operations during multiprocessing. To avoid this, we need
    +# the following two lines
    +
    +os.environ["TOKENIZERS_PARALLELISM"] = "false"
    +warnings.filterwarnings("ignore")
    +
    +logger = logging.getLogger("pii-recognizer")
    +
    +
    \ No newline at end of file
    diff --git a/functions/master/pii_recognizer/0.3.0/static/source.html b/functions/master/pii_recognizer/0.3.0/static/source.html
    new file mode 100644
    index 00000000..afaeb9dc
    --- /dev/null
    +++ b/functions/master/pii_recognizer/0.3.0/static/source.html
    @@ -0,0 +1,973 @@
    +Source
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import os
    +import pathlib
    +import tempfile
    +import warnings
    +from typing import List, Set, Tuple, Union
    +
    +import annotated_text.util as at_util
    +import mlrun
    +import nltk
    +import pandas as pd
    +import presidio_analyzer as pa
    +import presidio_anonymizer as pre_anoymizer
    +from presidio_anonymizer.entities import OperatorConfig
    +from tqdm import tqdm
    +
    +try:
    +    import flair as fl
    +except ModuleNotFoundError:
    +    print("Flair is not installed")
    +
    +# There is a conflict between Rust-based tokenizers' parallel processing
    +# and Python's fork operations during multiprocessing. To avoid this, we need
    +# the following two lines
    +
    +os.environ["TOKENIZERS_PARALLELISM"] = "false"
    +warnings.filterwarnings("ignore")
    +
    +logger = logging.getLogger("pii-recognizer")
    +
    +
    +# Add the constant classes of Models and Entities to govern the whole package
    +class Models:
    +    WHOLE = "whole"
    +    PATTERN = "pattern"
    +    SPACY = "spacy"
    +    FLAIR = "flair"
    +
    +
    +class Entities:
    +    CREDIT_CARD = "CREDIT_CARD"
    +    SSN = "SSN"
    +    PHONE = "PHONE"
    +    EMAIL = "EMAIL"
    +    LOCATION = "LOCATION"
    +    PERSON = "PERSON"
    +    NRP = "NRP"
    +    ORGANIZATION = "ORGANIZATION"
    +    DATE_TIME = "DATE_TIME"
    +    GPE = "GPE"
    +    MAC_ADDRESS = "MAC_ADDRESS"
    +    US_BANK_NUMBER = "US_BANK_NUMBER"
    +    IMEI = "IMEI"
    +    TITLE = "TITLE"
    +    LICENSE_PLATE = "LICENSE_PLATE"
    +    US_PASSPORT = "US_PASSPORT"
    +    CURRENCY = "CURRENCY"
    +    ROUTING_NUMBER = "ROUTING_NUMBER"
    +    US_ITIN = "US_ITIN"
    +    US_BANK_NUMBER = "US_BANK_NUMBER"
    +    US_DRIVER_LICENSE = "US_DRIVER_LICENSE"
    +    AGE = "AGE"
    +    PASSWORD = "PASSWORD"
    +    SWIFT_CODE = "SWIFT_CODE"
    +
    +
    +class PatternRecognizerFactory:
    +    """
    +    Factory for creating pattern recognizers. It can be extended in the future to
    +    add more regex patterns for different entities (see the sketch after this class).
    +    For the pattern recognizers to work, we need to construct a list of regex
    +    patterns for each entity.
    +    """
    +
    +    RECOGNIZABLE_ENTITIES = {
    +        "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)],
    +        "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)],
    +        "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)],
    +        "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)],
    +    }
    +
    +    # create a list of pattern recognizers
    +    @classmethod
    +    def _create_pattern_recognizer(cls):
    +        """
    +        For each entity, create a list of patterns to recognize it
    +
    +        :param cls: PatternRecognizerFactory class
    +
    +        :returns: List of pattern recognizers
    +        """
    +
    +        # Entities to recognize and their regex patterns
    +
    +        return [
    +            pa.PatternRecognizer(supported_entity=entity, patterns=pattern)
    +            for entity, pattern in cls.RECOGNIZABLE_ENTITIES.items()
    +        ]
    +
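    +# A hypothetical extension sketch (the IP_ADDRESS entity and regex below are
    +# illustrative, not part of the module's entity set): another regex-based entity
    +# could be added to the factory like this:
    +#
    +#     PatternRecognizerFactory.RECOGNIZABLE_ENTITIES["IP_ADDRESS"] = [
    +#         pa.Pattern("IP_ADDRESS", r"\b(?:\d{1,3}\.){3}\d{1,3}\b", 0.5)
    +#     ]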
    +
    +class CustomSpacyRecognizer(pa.LocalRecognizer):
    +    """
    +    Custom Spacy Recognizer from Presidio Analyzer trained on Privy data.
    +    The Privy data is generated using https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy.
    +    It can be used to recognize custom entities. Since we want to use Presidio's registries to generate the AnalyzerEngine,
    +    it inherits from Presidio Analyzer's LocalRecognizer class.
    +    """
    +
    +    # Entities to recognize
    +
    +    RECOGNIZABLE_ENTITIES = {
    +        "LOCATION",
    +        "PERSON",
    +        "NRP",
    +        "ORGANIZATION",
    +        "DATE_TIME",
    +    }
    +
    +    # Default explanation for this recognizer
    +
    +    _DEFAULT_EXPLANATION = (
    +        "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)"
    +    )
    +
    +    # Label groups to check
    +
    +    _DEFAULT_CHECK_LABEL_GROUPS = [
    +        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
    +        ({"PERSON"}, {"PER", "PERSON"}),
    +        ({"NRP"}, {"NORP", "NRP"}),
    +        ({"ORGANIZATION"}, {"ORG"}),
    +        ({"DATE_TIME"}, {"DATE_TIME"}),
    +    ]
    +
    +    # pretrained model for this recognizer
    +
    +    _DEFAULT_MODEL_LANGUAGES = {
    +        "en": "beki/en_spacy_pii_distilbert",
    +    }
    +
    +    _DEFAULT_PRESIDIO_EQUIVALENCES = {
    +        "PER": "PERSON",
    +        "LOC": "LOCATION",
    +        "ORG": "ORGANIZATION",
    +        "NROP": "NRP",
    +        "DATE_TIME": "DATE_TIME",
    +    }
    +
    +    def __init__(
    +        self,
    +        supported_language: str = "en",
    +        supported_entities: List[str] = None,
    +        check_label_groups: Tuple[Set, Set] = None,
    +        context: List[str] = None,
    +        ner_strength: float = 1,
    +    ):
    +        """
    +        Initialize Spacy Recognizer.
    +
    +        :param supported_language: Language to use, default is English
    +        :param supported_entities: Entities to use for recognition
    +        :param check_label_groups: Label groups to check for the entities
    +        :param context:            Context to use if any
    +        :param ner_strength:       Default confidence for NER prediction
    +
    +        :returns: SpacyRecognizer object
    +        """
    +
    +        # Default confidence for NER prediction
    +        self.ner_strength = ner_strength
    +
    +        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
    +        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
    +        super().__init__(
    +            supported_entities=supported_entities,
    +            supported_language=supported_language,
    +        )
    +
    +    # get the presidio explanation for the result
    +
    +    def _build_spacy_explanation(
    +        self, original_score: float, explanation: str
    +    ) -> pa.AnalysisExplanation:
    +        """
    +        Create explanation for why this result was detected.
    +
    +        :param original_score: Score given by this recognizer
    +        :param explanation:    Explanation string
    +
    +        :returns: Presidio AnalysisExplanation object
    +        """
    +        explanation = pa.AnalysisExplanation(
    +            recognizer=self.__class__.__name__,
    +            original_score=original_score,
    +            textual_explanation=explanation,
    +        )
    +        return explanation
    +
    +    # main method for the recognizer
    +    def analyze(self, text: str, entities: List[str], nlp_artifacts=None):  # noqa D102
    +        """
    +        Analyze text using Spacy.
    +
    +        :param text:          Text to analyze
    +        :param entities:      Entities to analyze
    +        :param nlp_artifacts: NLP artifacts to use
    +
    +        :returns: List of Presidio RecognizerResult objects
    +        """
    +        results = []
    +        if not nlp_artifacts:
    +            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
    +            return results
    +
    +        ner_entities = nlp_artifacts.entities
    +
    +        # recognize the supported entities
    +        for entity in entities:
    +            if entity not in self.supported_entities:
    +                continue
    +            for ent in ner_entities:
    +                if not self.__check_label(entity, ent.label_, self.check_label_groups):
    +                    continue
    +
    +                # string of the explanation saying the entity is recognized by spacy
    +                textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_)
    +                explanation = self._build_spacy_explanation(
    +                    self.ner_strength, textual_explanation
    +                )
    +
    +                # create the standard result with the entity, start, end, score, and explanation
    +                spacy_result = pa.RecognizerResult(
    +                    entity_type=entity,
    +                    start=ent.start_char,
    +                    end=ent.end_char,
    +                    score=self.ner_strength,
    +                    analysis_explanation=explanation,
    +                    recognition_metadata={
    +                        pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name
    +                    },
    +                )
    +                results.append(spacy_result)
    +
    +        return results
    +
    +    @staticmethod
    +    def __check_label(
    +        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    +    ) -> bool:
    +        """
    +        Check if the label is in the label group.
    +
    +        :param entity:             Entity to check
    +        :param label:              Label to check
    +        :param check_label_groups: Label groups to check
    +
    +        :returns: True if the label is in the label group, False otherwise
    +        """
    +        return any(
    +            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
    +        )
    +
    +
    +# Class to use Flair with Presidio as an external recognizer.
    +class FlairRecognizer(pa.EntityRecognizer):
    +    """
    +    Wrapper for a Flair model so that it can be used within Presidio Analyzer.
    +    This is to make sure the recognizer can be registered with the Presidio registry.
    +    """
    +
    +    RECOGNIZABLE_ENTITIES = {
    +        "LOCATION",
    +        "PERSON",
    +        "NRP",
    +        "GPE",
    +        "ORGANIZATION",
    +        "MAC_ADDRESS",
    +        "US_BANK_NUMBER",
    +        "IMEI",
    +        "TITLE",
    +        "LICENSE_PLATE",
    +        "US_PASSPORT",
    +        "CURRENCY",
    +        "ROUTING_NUMBER",
    +        "US_ITIN",
    +        "US_BANK_NUMBER",
    +        "US_DRIVER_LICENSE",
    +        "AGE",
    +        "PASSWORD",
    +        "SWIFT_CODE",
    +    }
    +
    +    # This is used to construct the explanation for the result
    +
    +    _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
    +
    +    _DEFAULT_CHECK_LABEL_GROUPS = [
    +        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
    +        ({"PERSON"}, {"PER", "PERSON"}),
    +        ({"NRP"}, {"NORP", "NRP"}),
    +        ({"GPE"}, {"GPE"}),
    +        ({"ORGANIZATION"}, {"ORG"}),
    +        ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}),
    +        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
    +        ({"IMEI"}, {"IMEI"}),
    +        ({"TITLE"}, {"TITLE"}),
    +        ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}),
    +        ({"US_PASSPORT"}, {"US_PASSPORT"}),
    +        ({"CURRENCY"}, {"CURRENCY"}),
    +        ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}),
    +        ({"AGE"}, {"AGE"}),
    +        ({"CURRENCY"}, {"CURRENCY"}),
    +        ({"SWIFT_CODE"}, {"SWIFT_CODE"}),
    +        ({"US_ITIN"}, {"US_ITIN"}),
    +        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
    +        ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}),
    +    ]
    +
    +    _DEFAULT_MODEL_LANGUAGES = {
    +        "en": "beki/flair-pii-distilbert",
    +    }
    +
    +    _DEFAULT_PRESIDIO_EQUIVALENCES = {
    +        "PER": "PERSON",
    +        "LOC": "LOCATION",
    +        "ORG": "ORGANIZATION",
    +        "NROP": "NRP",
    +        "URL": "URL",
    +        "US_ITIN": "US_ITIN",
    +        "US_PASSPORT": "US_PASSPORT",
    +        "IBAN_CODE": "IBAN_CODE",
    +        "IP_ADDRESS": "IP_ADDRESS",
    +        "EMAIL_ADDRESS": "EMAIL",
    +        "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
    +        "US_BANK_NUMBER": "US_BANK_NUMBER",
    +    }
    +
    +    def __init__(
    +        self,
    +        supported_language: str = "en",
    +        supported_entities: List[str] = None,
    +        check_label_groups: Tuple[Set, Set] = None,
    +    ):
    +        """
    +        Initialize the FlairRecognizer.
    +
    +        :param supported_language: Language to use
    +        :param supported_entities: Entities to use
    +        :param check_label_groups: Label groups to check
    +
    +        :returns: FlairRecognizer object
    +
    +        """
    +        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
    +
    +        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
    +        self.model = fl.models.SequenceTagger.load(
    +            self._DEFAULT_MODEL_LANGUAGES.get(supported_language)
    +        )
    +
    +        super().__init__(
    +            supported_entities=supported_entities,
    +            supported_language=supported_language,
    +            name="Flair Analytics",
    +        )
    +
    +    # main method for the recognizer
    +    def analyze(
    +        self,
    +        text: str,
    +        entities: List[str],
    +        nlp_artifacts: pa.nlp_engine.NlpArtifacts = None,
    +    ) -> List[pa.RecognizerResult]:
    +        """
    +        Analyze text and return the results.
    +
    +        :param text:          The text for analysis.
    +        :param entities:      The list of entities to recognize.
    +        :param nlp_artifacts: Not used by this recognizer but needed for the interface.
    +
    +        :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections.
    +        """
    +
    +        results = []
    +
    +        sentences = fl.data.Sentence(text)
    +        self.model.predict(sentences)
    +
    +        # If no specific list of entities was given, look for all supported entities.
    +        if not entities:
    +            entities = self.supported_entities
    +
    +        # Go over the entities and check if they are in the supported entities list.
    +        for entity in entities:
    +            if entity not in self.supported_entities:
    +                continue
    +
    +            # Go over the sentences and check if the entity is in the sentence.
    +            for ent in sentences.get_spans("ner"):
    +                if not self.__check_label(
    +                    entity, ent.labels[0].value, self.check_label_groups
    +                ):
    +                    continue
    +
    +                # If the entity is in the sentence, we will add it to the results.
    +                textual_explanation = self._DEFAULT_EXPLANATION.format(
    +                    ent.labels[0].value
    +                )
    +
    +                # Build the explanation for the result
    +                explanation = self._build_flair_explanation(
    +                    round(ent.score, 2), textual_explanation
    +                )
    +
    +                flair_result = self._convert_to_recognizer_result(ent, explanation)
    +
    +                results.append(flair_result)
    +
    +        return results
    +
    +    def _convert_to_recognizer_result(
    +        self, entity: fl.data.Span, explanation: str
    +    ) -> pa.RecognizerResult:
    +        """
    +        Convert Flair result to Presidio RecognizerResult.
    +
    +        :param entity:      Flair entity of Span
    +        :param explanation: Presidio AnalysisExplanation
    +
    +        :returns: Presidio RecognizerResult
    +        """
    +
    +        # Convert the entity type to Presidio entity type
    +        entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
    +
    +        # Convert the score to Presidio score
    +        flair_score = round(entity.score, 2)
    +
    +        # Create the Presidio RecognizerResult from the Flair entity
    +        flair_results = pa.RecognizerResult(
    +            entity_type=entity_type,
    +            start=entity.start_position,
    +            end=entity.end_position,
    +            score=flair_score,
    +            analysis_explanation=explanation,
    +        )
    +
    +        return flair_results
    +
    +    def _build_flair_explanation(
    +        self, original_score: float, explanation: str
    +    ) -> pa.AnalysisExplanation:
    +        """
    +        Create explanation for why this result was detected.
    +
    +        :param original_score: Score given by this recognizer
    +        :param explanation:    Explanation string
    +
    +        :returns: Presidio AnalysisExplanation
    +        """
    +
    +        # Create the Presidio AnalysisExplanation for the result
    +        explanation = pa.AnalysisExplanation(
    +            recognizer=self.__class__.__name__,
    +            original_score=original_score,
    +            textual_explanation=explanation,
    +        )
    +        return explanation
    +
    +    # sanity check of the entity and label before recognition
    +    @staticmethod
    +    def __check_label(
    +        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    +    ) -> bool:
    +        return any(
    +            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
    +        )
    +
    +
    +# get the analyzer engine based on the model
    +def _get_analyzer_engine(
    +    model: str = None, entities: List[str] = None
    +) -> pa.AnalyzerEngine:
    +    """
    +    Return pa.AnalyzerEngine.
    +
    +    :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole".
    +    :param entities: The list of entities to use.
    +
    +    :returns: pa.AnalyzerEngine
    +    """
    +    # recognizer registry that can store multiple recognizers
    +    registry = pa.RecognizerRegistry()
    +    if model == Models.SPACY:
    +        # custom spacy recognizer
    +        spacy_recognizer = CustomSpacyRecognizer()
    +        # add the custom build spacy recognizer
    +        registry.add_recognizer(spacy_recognizer)
    +    elif model == Models.FLAIR:
    +        # pre-trained flair recognizer
    +        flair_recognizer = FlairRecognizer()
    +        # add the custom build flair recognizer
    +        registry.add_recognizer(flair_recognizer)
    +    elif model == Models.PATTERN:
    +        # add the pattern recognizer
    +        pattern_recognizer_factory = PatternRecognizerFactory()
    +        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
    +            registry.add_recognizer(recognizer)
    +    elif model == Models.WHOLE:
    +        spacy_recognizer = CustomSpacyRecognizer()
    +        flair_recognizer = FlairRecognizer()
    +        registry.add_recognizer(spacy_recognizer)
    +        registry.add_recognizer(flair_recognizer)
    +        # add the pattern recognizer
    +        pattern_recognizer_factory = PatternRecognizerFactory()
    +        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
    +            registry.add_recognizer(recognizer)
    +    elif not model and entities:
    +        if set(entities) & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES:
    +            spacy_recognizer = CustomSpacyRecognizer()
    +            registry.add_recognizer(spacy_recognizer)
    +        if set(entities) & FlairRecognizer.RECOGNIZABLE_ENTITIES:
    +            flair_recognizer = FlairRecognizer()
    +            registry.add_recognizer(flair_recognizer)
    +        # add the pattern recognizer
    +        if set(entities) & (set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())):
    +            pattern_recognizer_factory = PatternRecognizerFactory()
    +            for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
    +                registry.add_recognizer(recognizer)
    +    else:
    +        raise ValueError(
    +            "The model and entities arguments cannot both be None"
    +        )
    +    analyzer = pa.AnalyzerEngine(
    +        registry=registry,
    +        supported_languages=["en"],
    +    )
    +
    +    supported_entities = analyzer.get_supported_entities()
    +
    +    if entities and not all(item in supported_entities for item in entities):
    +        not_supported_entities = [
    +            item for item in entities if item not in supported_entities
    +        ]
    +        raise ValueError(
    +            f"The current model {model} doesn't support the following entities: {not_supported_entities}. "
    +            f"Supported entities are: {supported_entities}"
    +        )
    +    return analyzer
    +
    +
    +def _get_anonymizer_engine() -> pre_anoymizer.AnonymizerEngine:
    +    """
    +    Return AnonymizerEngine.
    +
    +    :returns: The AnonymizerEngine.
    +    """
    +    return pre_anoymizer.AnonymizerEngine()
    +
    +
    +def _anonymize(
    +    text: str,
    +    analyze_results: List[pa.RecognizerResult],
    +    entity_operator_map: dict = None,
    +    is_full_text: bool = True,
    +) -> str:
    +    """
    +    Anonymize the identified input using Presidio Anonymizer.
    +
    +    :param text:                The text for analysis.
    +    :param analyze_results:     The list of Presidio RecognizerResult constructed from the analysis.
    +    :param entity_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    +    :param is_full_text:        Whether the text is full text or not.
    +
    +    :returns: The anonymized text.
    +    """
    +    if not text:
    +        return ""
    +
    +    anonymizer_engine = _get_anonymizer_engine()
    +    if not entity_operator_map:
    +        operators = None
    +    else:
    +        # Create OperatorConfig based on the entity_operator_map
    +        operators = {
    +            entity: OperatorConfig(operator_name, operator_params)
    +            for entity, (operator_name, operator_params) in entity_operator_map.items()
    +        }
    +
    +    if is_full_text:
    +        # Anonymize the entire text
    +        return anonymizer_engine.anonymize(
    +            text=text, analyzer_results=analyze_results, operators=operators
    +        ).text
    +    # Tokenize the text to sentences
    +    sentences = nltk.sent_tokenize(text)
    +    anonymized_sentences = []
    +    current_idx = 0
    +
    +    # Find the sentence that has pii entity
    +    for sentence in sentences:
    +        start_idx = current_idx
    +        end_idx = start_idx + len(sentence)
    +
+        # Get the entities that are in the sentence, update the start_idx and end_idx
    +        sentence_results = [
    +            pa.RecognizerResult(
    +                result.entity_type,
    +                start=result.start - start_idx,
    +                end=result.end - start_idx,
    +                score=result.score,
    +            )
    +            for result in analyze_results
    +            if result.start >= start_idx and result.end <= end_idx
    +        ]
    +
    +        # If PII is detected
    +        if sentence_results:
    +            anonymized_sentence = anonymizer_engine.anonymize(
    +                text=sentence, analyzer_results=sentence_results, operators=operators
    +            ).text
    +            anonymized_sentences.append(anonymized_sentence)
    +
    +        current_idx = end_idx
    +
    +    return " ".join(anonymized_sentences)
    +
    +
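+# --- Illustrative sketch (not part of the original module): an example
+# `entity_operator_map` for `_anonymize` above. Operator names and params follow the
+# Presidio Anonymizer operators ("replace", "mask", "redact", "hash", "keep"); the
+# entity names and parameter values here are made-up examples.
+def _example_anonymize_with_operators(text: str, analyze_results: List[pa.RecognizerResult]) -> str:
+    example_operator_map = {
+        # Replace any PERSON entity with a fixed placeholder.
+        "PERSON": ("replace", {"new_value": "<PERSON>"}),
+        # Mask the last 4 characters of any PHONE_NUMBER entity.
+        "PHONE_NUMBER": ("mask", {"masking_char": "*", "chars_to_mask": 4, "from_end": True}),
+    }
+    return _anonymize(text, analyze_results, example_operator_map, is_full_text=True)
+
+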
    +def _get_tokens(
    +    text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True
    +) -> List[str]:
    +    """
+    Get either the full list of tokens or only the tokens that contain entities and can form a sentence.
+
+    :param text:            The text for analysis.
+    :param analyze_results: The list of Presidio RecognizerResult constructed from analysis.
+    :param is_full:         Whether to return the full list of tokens or only the tokens that contain entities and can form a sentence.
    +
    +    :returns: The tokens.
    +    """
    +
    +    tokens = []
    +    # sort by start index
    +    results = sorted(analyze_results, key=lambda x: x.start)
    +    for i, res in enumerate(results):
    +        if i == 0:
    +            tokens.append(text[: res.start])
    +
    +        # append entity text and entity type
    +        tokens.append((text[res.start : res.end], res.entity_type))
    +
+        # if another entity is coming, i.e. we're not at the last results element,
    +        # add text up to next entity
    +        if i != len(results) - 1:
    +            tokens.append(text[res.end : results[i + 1].start])
    +        # if no more entities coming, add all remaining text
    +        else:
    +            tokens.append(text[res.end :])
    +
+    # keep only the tokens that contain entities and can form a sentence
+    part_annotated_tokens = []
+    if not is_full:
+        last_end_sentence = 0
+        for i, token in enumerate(tokens):
+            if any(item in token for item in [".", "!", "?"]) and any(
+                type(item) is tuple for item in tokens[last_end_sentence:i]
+            ):
+                part_annotated_tokens.append(tokens[last_end_sentence:i])
+                last_end_sentence = i
+        return part_annotated_tokens
    +    return tokens
    +
    +
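+# --- Illustrative sketch (not part of the original module): a worked example of the
+# token structure produced by `_get_tokens` above. The text and entity span are made up.
+def _example_get_tokens() -> List[str]:
+    example_results = [pa.RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.9)]
+    # Returns ["", ("John", "PERSON"), " lives in Paris."]: plain strings for
+    # non-entity text and (text, entity_type) tuples for recognized entities.
+    return _get_tokens("John lives in Paris.", example_results, is_full=True)
+
+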
    +def _annotate(
    +    text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True
    +) -> List[str]:
    +    """
+    Annotate the identified PII entities in the input text.
+
+    :param text:               The text for analysis.
+    :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis.
+    :param is_full_html:       Whether to generate the full html or not.
    +
    +    :returns: The list of tokens with the identified entities.
    +
    +    """
    +    return _get_tokens(text, st_analyze_results, is_full_html)
    +
    +
    +def _process(
    +    text: str,
    +    model: pa.AnalyzerEngine,
    +    score_threshold: float,
    +    entities: List[str] = None,
    +    entities_operator_map: dict = None,
    +    is_full_text: bool = True,
    +) -> Tuple[str, list]:
    +    """
+    Process the given text using the model.
    +
    +    :param text:                  Text to process
    +    :param model:                 Model to use for processing
    +    :param entities:              Entities to recognize
+    :param entities_operator_map: A dictionary that maps each entity to an operator name and its operator params.
+    :param score_threshold:       The score threshold to use for recognition
+    :param is_full_text:          Whether to anonymize the full text or only the sentences that contain PII
    +
    +    :returns: A tuple of:
    +
    +              * the anonymized text
    +              * the list of Presidio RecognizerResult constructed from analysis
    +    """
    +
    +    # get the analyzer engine
    +    analyzer = model
    +
    +    # analyze the text that can be used for anonymization
    +    results = analyzer.analyze(
    +        text=text,
    +        language="en",
    +        entities=entities,
    +        score_threshold=score_threshold,
    +        return_decision_process=True,
    +    )
    +
    +    # anonymize the text, replace the pii entities with the labels
    +    anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text)
    +
    +    return anonymized_text, results
    +
    +
    +def _get_single_html(
    +    text: str, results: List[pa.RecognizerResult], is_full_html: bool = True
    +):
    +    """
    +    Generate the html for a single txt file.
    +
    +    :param text:         The text for analysis.
    +    :param results:      The list of Presidio RecognizerResult constructed from analysis.
+    :param is_full_html: Whether to generate the full html or not.
    +
    +    :returns: The html string for a single txt file.
    +    """
    +    # convert the results to tokens to generate the html
    +    tokens = _annotate(text, results, is_full_html)
    +    html = at_util.get_annotated_html(*tokens)
    +
    +    # avoid the error during rendering of the \n in the html
    +    backslash_char = "\\"
    +
+    html_str = f"<p>{html.replace('{backslash_char}n', '<br>')}</p>
    " + + return html_str + + +def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True): + """ + Generate the json for a single txt file. + + :param results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full json or not. + + :returns: The json string for a single txt file. + """ + # generate the stats report if needed + if not is_full_report: + stats = [] + # add the simplify stats logic here + for item in results: + item.analysis_explanation = None + stats.append(item) + else: + stats = results + + return stats + + +def _get_all_html( + txt_content: dict, + res_dict: dict, + is_full_html: bool = True, +): + """ + Generate the html for all txt files. + + :param txt_content: The dictionary of txt file name and content. + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The html string for all txt files. + + """ + # These are placeholder for the html string + html_index = "Highlighted Pii Entities

    Highlighted Pii Entities

      " + html_content = "" + for txt_file, results in res_dict.items(): + txt = txt_content[txt_file] + html_index += f"
    • {txt_file}
    • " + html_content += f"
    • {txt_file}

      {_get_single_html(txt, results, is_full_html)}

    • " + html_index += "
    " + html_res = f"{html_index}{html_content}" + + return html_res + + +def _get_all_rpt(res_dict: dict, is_full_report: bool = True): + """ + Generate the stats report for all txt files. + + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full report or not. + + :returns: The stats report for all txt files. + """ + # These are placeholder for the json report + stats_dict = {} + for txt_file, results in res_dict.items(): + new_stats = [] + for item in _get_single_json(results, is_full_report): + if is_full_report: + item.analysis_explanation = item.analysis_explanation.to_dict() + new_stats.append(item.to_dict()) + else: + tmp_dict = item.to_dict() + tmp_dict.pop("analysis_explanation") + tmp_dict.pop("recognition_metadata") + new_stats.append(tmp_dict) + stats_dict[txt_file] = new_stats + return stats_dict + + +def recognize_pii( + context: mlrun.MLClientCtx, + input_path: Union[str, pathlib.Path], + html_key: str, + score_threshold: float, + output_directory: str = None, + entities: List[ + str + ] = None, # List of entities to recognize, default is recognizing all + entity_operator_map: dict = None, + model: str = None, + generate_json: bool = True, + generate_html: bool = True, + is_full_text: bool = True, + is_full_html: bool = True, + is_full_report: bool = True, +) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]: + """ + Walk through the input path, recognize PII in text and store the anonymized text in the output path. + Generate the html with different colors for each entity, json report of the explanation. + + :param context: The MLRun context. this is needed for log the artifacts. + :param input_path: The input path of the text files needs to be analyzed. + :param html_key: The html key for the artifact. + :param score_threshold: The score threshold to mark the recognition as trusted. + :param output_directory: The output directory path to store the anonymized text. + :param entities: The list of entities to recognize. + :param entity_operator_map: The map of entity to operator (mask, redact, replace, keep, hash, and its params) + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param generate_json: Whether to generate the json report of the explanation. + :param generate_html: Whether to generate the html report of the explanation. + :param is_full_text: Whether to return the full text or only the masked text. 
+ :param is_full_html: Whether to return the full html or just the annotated text + :param is_full_report: Whether to return the full report or just the score and start, end index + + :returns: A tuple of: + + * Path to the output directory + * The json report of the explanation (if generate_json is True) + * A dictionary of errors files that were not processed + + """ + + # Set output directory + if output_directory is None: + output_directory = tempfile.mkdtemp() + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + if not output_directory.exists(): + output_directory.mkdir(parents=True, exist_ok=True) + + txt_files_directory = pathlib.Path(input_path) + successes = [] + errors = {} + + res_dict = {} + txt_content = {} + # Load the model: + analyzer = _get_analyzer_engine(model, entities) + logger.info("Model loaded") + # Go over the text files in the input path, analyze and anonymize them: + for txt_file in tqdm( + list(txt_files_directory.glob("*.txt")), + desc="Processing files", + unit="file", + ): + try: + # Load the str from the text file + text = txt_file.read_text() + txt_content[str(txt_file)] = text + # Process the text to recoginze the pii entities in it + anonymized_text, results = _process( + text=text, + model=analyzer, + entities=entities, + entities_operator_map=entity_operator_map, + score_threshold=score_threshold, + is_full_text=is_full_text, + ) + res_dict[str(txt_file)] = results + # Store the anonymized text in the output path + output_file = output_directory / f"{txt_file.stem}.txt" + output_file.parent.mkdir(parents=True, exist_ok=True) + with open(output_file, "w") as f: + f.write(anonymized_text) + successes.append([txt_file.name, output_file.name]) + except Exception as e: + errors[str(txt_file)] = str(e) + logger.error(f"Error processing {txt_file}: {e}") + + successes = pd.DataFrame( + successes, + columns=["original_file", "anonymized_file"], + ) + + if generate_html: + # Generate the html report + html_res = _get_all_html(txt_content, res_dict, is_full_html) + # Store the html report in the context + arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key) + context.log_artifact(arti_html) + if generate_json: + # Generate the json report + json_res = _get_all_rpt(res_dict, is_full_report) + return str(output_directory), successes, errors, json_res + return str(output_directory), successes, errors + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/latest/src/function.yaml b/functions/master/pii_recognizer/latest/src/function.yaml index 54b448d9..069fa1ff 100644 --- a/functions/master/pii_recognizer/latest/src/function.yaml +++ b/functions/master/pii_recognizer/latest/src/function.yaml @@ -2,13 +2,14 @@ kind: job metadata: name: pii-recognizer tag: '' - hash: b09b7b9a4ffd55088d665a0191055411e9198a2f + hash: 818930645d33704e9cada919769ee9d93cbb9434 project: '' labels: author: pgw categories: - machine-learning - data-preparation + - NLP spec: command: '' args: [] diff --git a/functions/master/pii_recognizer/latest/src/item.yaml b/functions/master/pii_recognizer/latest/src/item.yaml index 2f618feb..41ead33b 100644 --- a/functions/master/pii_recognizer/latest/src/item.yaml +++ b/functions/master/pii_recognizer/latest/src/item.yaml @@ -2,6 +2,7 @@ apiVersion: v1 categories: - machine-learning - data-preparation + - NLP description: This function is used to recognize PII in a directory of text files doc: '' example: pii_recognizer.ipynb @@ -30,5 +31,5 @@ spec: - st-annotated-text - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl url: '' -version: 0.2.0 +version: 0.3.0 test_valid: False diff --git a/functions/master/pii_recognizer/latest/static/function.html b/functions/master/pii_recognizer/latest/static/function.html index 4cb31e5a..1d21cf1b 100644 --- a/functions/master/pii_recognizer/latest/static/function.html +++ b/functions/master/pii_recognizer/latest/static/function.html @@ -19,13 +19,14 @@ metadata: name: pii-recognizer tag: '' - hash: b09b7b9a4ffd55088d665a0191055411e9198a2f + hash: 818930645d33704e9cada919769ee9d93cbb9434 project: '' labels: author: pgw categories: - machine-learning - data-preparation + - NLP spec: command: '' args: [] diff --git a/functions/master/pii_recognizer/latest/static/item.html b/functions/master/pii_recognizer/latest/static/item.html index 0f8f593e..6cc24fb1 100644 --- a/functions/master/pii_recognizer/latest/static/item.html +++ b/functions/master/pii_recognizer/latest/static/item.html @@ -19,6 +19,7 @@ categories: - machine-learning - data-preparation + - NLP description: This function is used to recognize PII in a directory of text files doc: '' example: pii_recognizer.ipynb @@ -47,7 +48,7 @@ - st-annotated-text - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl url: '' -version: 0.2.0 +version: 0.3.0 test_valid: False diff --git a/functions/master/pyannote_audio/1.2.0/src/assets/test_data.wav b/functions/master/pyannote_audio/1.2.0/src/assets/test_data.wav new file mode 100644 index 00000000..a3a993c2 Binary files /dev/null and b/functions/master/pyannote_audio/1.2.0/src/assets/test_data.wav differ diff --git a/functions/master/pyannote_audio/1.2.0/src/function.yaml b/functions/master/pyannote_audio/1.2.0/src/function.yaml new file mode 100644 index 00000000..30870afa --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/src/function.yaml @@ -0,0 +1,151 @@ +kind: job +metadata: + name: pyannote-audio + tag: '' + hash: aed670a0534ebf30690dd2af7acad35595c7d5b1 + project: '' + labels: + author: guyl + categories: + - deep-learning + - huggingface + - audio +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import heapq
import logging
import operator
import os
import pathlib
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import pyannote.audio
import pyannote.core
import torch
import torchaudio
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    # Check if MLRun is available, set the global logger to MLRun's:
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, str):
                    input_argument = _get_audio_files(
                        data_path=pathlib.Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)
            if rank == 0:
                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                diarization_dictionary = reduce(
                    operator.ior, [dia for dia, _ in output], {}
                )
                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
                return diarization_dictionary, errors_dictionary
            return None

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def diarize(
    data_path: Union[str, List[str]],
    model_name: str = "pyannote/speaker-diarization-3.0",
    access_token: str = None,
    device: str = None,
    speakers_labels: List[str] = None,
    speaker_prefix: str = "speaker_",
    separate_by_channels: bool = False,
    minimum_speakers: int = None,
    maximum_speakers: int = None,
    verbose: bool = False,
) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]:
    """
    Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).
    The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list
    of tuples: (start, end, speaker_label).

    To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
    token can be passed in one of the following options:

    * Use the parameter `access_token`.
    * Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
    * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".

    To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set
    in this function ("pyannote/speaker-diarization-3.0"), you need access for these two models:

    * https://huggingface.co/pyannote/segmentation-3.0
    * https://huggingface.co/pyannote/speaker-diarization-3.0

    Note: To control the recognized speakers in the diarization output you can choose one of the following methods:

    * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in
      the order of speaking in the audio (the first person speaking gets the first label in the list). In addition, you can do
      diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a
      specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will
      increase runtime.
    * For unknown speakers amount, you can set the `speaker_prefix` parameter to add a prefix for each speaker number.
      You can also help the diarization by setting the speakers range via the `minimum_speakers` and `maximum_speakers` parameters.

    :param data_path:            A directory of the audio files, a single file or a list of files to transcribe.
    :param model_name:           One of the official diarization model names (referred as diarization pipelines) of
                                 `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".
    :param access_token:         An access token to pass for using the `pyannote.audio` models. If not provided, it
                                 will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is
                                 available, it will look for a secret "HUGGING_FACE_HUB_TOKEN".
    :param device:               Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if
                                 available.
    :param speakers_labels:      Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...).
    :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and
                                 combine the result into a single diarization. Each label set in the `speakers_labels`
                                 parameter will be assigned to a specific channel by order.
    :param speaker_prefix:       A prefix to add for the speakers labels. This parameter is ignored if
                                 `speakers_labels` is not None. Default: "speaker".
    :param minimum_speakers:     Set the minimum expected amount of speakers to be in the audio files. This parameter is
                                 ignored if `speakers_labels` is not None.
    :param maximum_speakers:     Set the maximum expected amount of speakers to be in the audio files. This parameter is
                                 ignored if `speakers_labels` is not None.
    :param verbose:              Whether to present logs of a progress bar and errors. Default: True.

    :returns: A tuple of:

              * Speech diarization dictionary.
              * A dictionary of errored files that were not transcribed.
    """
    global _LOGGER

    # Get the input audio files to diarize:
    if isinstance(data_path, str):
        data_path = pathlib.Path(data_path).absolute()
        audio_files = _get_audio_files(data_path=data_path)
    else:  # Should be a list of files.
        audio_files = data_path

    # Get the Huggingface access token:
    access_token = _get_access_token(parameter=access_token)
    if access_token is None:
        raise ValueError(
            "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed "
            "via one of the following options:\n"
            "* Use the parameter `access_token`.\n"
            "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n"
            "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'."
        )

    # Load the diarization pipeline:
    pipeline = pyannote.audio.Pipeline.from_pretrained(
        checkpoint_path=model_name, use_auth_token=access_token
    )

    # Set the device:
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    if device != "cpu":
        pipeline.to(torch.device(device))

    # Prepare the successes dataframe and errors dictionary to be returned:
    diarizations = {}
    errors = {}

    # Prepare the diarization keyword arguments:
    diarize_kwargs = {}
    if speakers_labels:
        diarize_kwargs["num_speakers"] = len(speakers_labels)
    else:
        if minimum_speakers:
            diarize_kwargs["min_speakers"] = minimum_speakers
        if maximum_speakers:
            diarize_kwargs["max_speakers"] = maximum_speakers

    # Go over the audio files and diarize:
    for audio_file in tqdm(
        audio_files, desc="Diarizing", unit="file", disable=not verbose
    ):
        try:
            # Load audio file:
            audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True)
            # Get the diarization (if provided):
            diarizations[audio_file.name] = _diarize(
                audio=audio,
                sample_rate=sample_rate,
                pipeline=pipeline,
                speakers_labels=speakers_labels,
                separate_by_channels=separate_by_channels,
                speaker_prefix=speaker_prefix,
                diarize_kwargs=diarize_kwargs,
            )
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{audio_file.name}'")
            errors[str(audio_file.name)] = str(exception)
            continue

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n")
    return diarizations, errors


def _get_audio_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        audio_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        audio_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return audio_files


def _get_access_token(parameter: str) -> str:
    # If given as a parameter, return it:
    if parameter:
        return parameter

    # Otherwise, look at the environment variable:
    environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN")
    if environment_variable:
        return environment_variable

    # Lastly, try looking in the secrets set in MLRun:
    secret = None
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN")
    except ModuleNotFoundError:
        pass

    return secret


def _diarize(
    audio: torch.Tensor,
    sample_rate: int,
    pipeline: pyannote.audio.Pipeline,
    speakers_labels: List[str],
    separate_by_channels: bool,
    speaker_prefix: str,
    diarize_kwargs: dict,
) -> List[Tuple[float, float, str]]:
    # If there is no need for separation by channels, we diarize and return:
    if not separate_by_channels:
        # Diarize:
        diarization: pyannote.core.Annotation = pipeline(
            file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs
        )
        # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring
        # through the pipeline):
        if speakers_labels:
            given_speakers = len(speakers_labels)
            found_speakers = len(set(diarization.labels()))
            if given_speakers < found_speakers:
                raise ValueError(
                    f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization "
                    f"recognized {found_speakers} speakers."
                )
        # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label
        # returned is "SPEAKER_i" so we take only the index out of it):
        return [
            (
                segment.start,
                segment.end,
                speakers_labels[int(label.split("_")[1])]
                if speakers_labels
                else f"{speaker_prefix}{int(label.split('_')[1])}",
            )
            for segment, track, label in diarization.itertracks(yield_label=True)
        ]

    # Separate to channels and diarize (we expect only one speaker per channel):
    channel_diarizations = [
        _diarize(
            audio=audio[channel].unsqueeze(
                0
            ),  # Take channel and add a channel dimension to it.
            sample_rate=sample_rate,
            pipeline=pipeline,
            speakers_labels=[
                speakers_labels[channel]
            ],  # Take the channel's label only.
            separate_by_channels=False,
            speaker_prefix=speaker_prefix,
            diarize_kwargs={"num_speakers": 1},  # Set to one speaker.
        )
        for channel in range(audio.shape[0])
    ]

    # Merge the channel diarizations into a single sorted list:
    return list(heapq.merge(*channel_diarizations))
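# --- Illustrative note (not part of the original module): `heapq.merge` interleaves the
# already-sorted per-channel lists into a single list ordered by segment start time, e.g.
# merging [(0.0, 1.2, "Agent"), (3.0, 4.0, "Agent")] with [(1.3, 2.8, "Client")] yields
# [(0.0, 1.2, "Agent"), (1.3, 2.8, "Client"), (3.0, 4.0, "Agent")].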
 + base_image: mlrun/mlrun-gpu + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - pyannote.audio + - pyannote.core + - torchaudio + - tqdm + entry_points: + open_mpi_handler: + name: open_mpi_handler + doc: '' + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + outputs: [] + lineno: 61 + has_varargs: false + has_kwargs: false + decorator: + name: decorator + doc: '' + parameters: + - name: handler + outputs: [] + lineno: 73 + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + parameters: [] + outputs: [] + lineno: 78 + has_varargs: false + has_kwargs: true + diarize: + name: diarize + doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\ + The end result is a dictionary with the file names as keys and their diarization\ + \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\ + \nTo use the `pyannote.audio` models you must pass a Huggingface token and\ + \ get access to the required models. The\ntoken can be passed in one of the\ + \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\ + \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\ + \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\ + \ models on Huggingface, visit their page. For example, to use the default\ + \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\ + ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\ + * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\ + \ the recognized speakers in the diarization output you can choose one of\ + \ the following methods:\n\n* For a known speakers amount, you may set speaker\ + \ labels via the `speakers_labels` parameter that will be used in\n the order\ + \ of speaking in the audio (first person speaking be the first label in the\ + \ list). In addition, you can do\n diarization per channel (setting the parameter\ + \ `separate_by_channels` to True). Each label will be assigned to a\n specific\ + \ channel by order (first label to channel 0, second label to channel 1 and\ + \ so on). Notice, this will\n increase runtime.\n* For unknown speakers amount,\ + \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\ + \ number.\n You can also help the diarization by setting the speakers range\ + \ via the `speakers_amount_range` parameter." + parameters: + - name: data_path + type: Union[str, List[str]] + doc: A directory of the audio files, a single file or a list of files to transcribe. + - name: model_name + type: str + doc: 'One of the official diarization model names (referred as diarization + pipelines) of `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".' + default: pyannote/speaker-diarization-3.0 + - name: access_token + type: str + doc: An access token to pass for using the `pyannote.audio` models. If not + provided, it will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". + If MLRun is available, it will look for a secret "HUGGING_FACE_HUB_TOKEN". + default: null + - name: device + type: str + doc: Device to load the model. Can be one of {"cuda", "cpu"}. Default will + prefer "cuda" if available. + default: null + - name: speakers_labels + type: List[str] + doc: 'Labels to use for the recognized speakers. 
Default: numeric labels (0, + 1, ...).' + default: null + - name: speaker_prefix + type: str + doc: 'A prefix to add for the speakers labels. This parameter is ignored if + `speakers_labels` is not None. Default: "speaker".' + default: speaker_ + - name: separate_by_channels + type: bool + doc: If each speaker is speaking in a separate channel, you can diarize each + channel and combine the result into a single diarization. Each label set + in the `speakers_labels` parameter will be assigned to a specific channel + by order. + default: false + - name: minimum_speakers + type: int + doc: Set the minimum expected amount of speakers to be in the audio files. + This parameter is ignored if `speakers_labels` is not None. + default: null + - name: maximum_speakers + type: int + doc: Set the maximum expected amount of speakers to be in the audio files. + This parameter is ignored if `speakers_labels` is not None. + default: null + - name: verbose + type: bool + doc: 'Whether to present logs of a progress bar and errors. Default: True.' + default: false + outputs: + - doc: 'A tuple of:' + type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]] + lineno: 139 + has_varargs: false + has_kwargs: false + description: pyannote's speech diarization of audio files + default_handler: diarize + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/pyannote_audio/1.2.0/src/item.yaml b/functions/master/pyannote_audio/1.2.0/src/item.yaml new file mode 100644 index 00000000..b69add9e --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/src/item.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +categories: +- deep-learning +- huggingface +- audio +description: pyannote's speech diarization of audio files +doc: '' +example: pyannote_audio.ipynb +generationDate: 2023-12-03:14-30 +hidden: false +icon: '' +labels: + author: guyl +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.5.2 +name: pyannote-audio +platformVersion: 3.5.3 +spec: + filename: pyannote_audio.py + handler: diarize + image: mlrun/mlrun-gpu + kind: job + requirements: + - pyannote.audio + - pyannote.core + - torchaudio + - tqdm +url: '' +version: 1.2.0 diff --git a/functions/master/pyannote_audio/1.2.0/src/pyannote_audio.ipynb b/functions/master/pyannote_audio/1.2.0/src/pyannote_audio.ipynb new file mode 100644 index 00000000..9901cc4f --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/src/pyannote_audio.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4f17e477-db37-41b6-a76e-c69dbeea53db", + "metadata": {}, + "source": [ + "# Speech diarization example notebook" + ] + }, + { + "cell_type": "markdown", + "id": "46e7131b-42fe-4f3c-a268-08d6d4ff9cdf", + "metadata": {}, + "source": [ + "In this notebook we will utilize a call diarization capability to get per-speaker speech durations from a call recording.
    \n", + "This can be useful for quantifying participation rates in calls for things like customer service analysis.
    \n", + "\n", + "We will demonstrate this by:
    \n", + "\n", + "1. Loading in a sample call recording between multiple participants\n", + "2. Using a diarize() function to automatically detect speakers and estimate per-speaker talk time\n", + "3. Return a dictionary of described results, and a df of errors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "53d25661-15eb-40c0-8ec8-4af9838c1d04", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "68b84d16-d0aa-4e86-a716-5d92e38c9236", + "metadata": {}, + "outputs": [], + "source": [ + "# To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The\n", + "# token can be passed in one of the following options:\n", + "#\n", + "# * Use the parameter `access_token`.\n", + "# * Set an environment variable named \"HUGGING_FACE_HUB_TOKEN\".\n", + "# * If using MLRun, you can pass it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n", + "os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = <\"add your token here\">\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2a0b1f97-6fba-400f-aacf-fe1da28e35d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-05 15:28:51,758 [info] Project loaded successfully: {'project_name': 'diarization-test'}\n" + ] + } + ], + "source": [ + "# Create an mlrun project\n", + "project = mlrun.get_or_create_project(\"diarization-test\")\n", + "\n", + "# Import the function from the yaml file, once it's in the the we can import from there \n", + "speech_diarization = project.set_function(func=\"hub://speech_diarization\", name=\"speech_diarization\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "50d9a797-a3f2-4824-b6e2-8245f6e30b17", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the desired run params and files\n", + "audio_files = os.path.join(\"test_data.wav\")\n", + "device = \"cpu\"\n", + "speakers_labels = [\"Agent\", \"Client\"]\n", + "separate_by_channels = True" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "110080e5-3f54-4117-a61b-0e09f1422b1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-05 15:28:52,229 [info] Storing function: {'name': 'speech-diarization-diarize', 'uid': 'ec6cd014e4674966b30303ea14048acf', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    diarization-test0Dec 05 15:28:52completedspeech-diarization-diarize
    v3io_user=zeevr
    kind=local
    owner=zeevr
    host=jupyter-zeev-gpu-5995df47dc-rtpvr
    data_path
    device=cpu
    speakers_labels=['Agent', 'Client']
    separate_by_channels=True
    speech-diarization
    diarize-errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-05 15:28:53,350 [info] Run execution finished: {'status': 'completed', 'name': 'speech-diarization-diarize'}\n" + ] + } + ], + "source": [ + "# Run the imported function with desired file/s and params\n", + "diarize_run = speech_diarization.run(\n", + " handler=\"diarize\",\n", + " inputs={\"data_path\": audio_files},\n", + " params={\n", + " \"device\": device,\n", + " \"speakers_labels\": speakers_labels,\n", + " \"separate_by_channels\": separate_by_channels,\n", + " },\n", + " returns=[\"speech-diarization: file\", \"diarize-errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ede77975-8843-424f-b521-b9dd56ddad28", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/pyannote_audio/1.2.0/src/pyannote_audio.py b/functions/master/pyannote_audio/1.2.0/src/pyannote_audio.py new file mode 100644 index 00000000..6271da6a --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/src/pyannote_audio.py @@ -0,0 +1,376 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import heapq +import logging +import operator +import os +import pathlib +from functools import reduce, wraps +from typing import Any, Dict, List, Tuple, Union + +import pandas as pd +import pyannote.audio +import pyannote.core +import torch +import torchaudio +from tqdm import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." 
+ ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." + ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + diarization_dictionary = reduce( + operator.ior, [dia for dia, _ in output], {} + ) + errors_dictionary = reduce(operator.ior, [err for _, err in output], {}) + return diarization_dictionary, errors_dictionary + return None + + return wrapper + + return decorator + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def diarize( + data_path: Union[str, List[str]], + model_name: str = "pyannote/speaker-diarization-3.0", + access_token: str = None, + device: str = None, + speakers_labels: List[str] = None, + speaker_prefix: str = "speaker_", + separate_by_channels: bool = False, + minimum_speakers: int = None, + maximum_speakers: int = None, + verbose: bool = False, +) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]: + """ + Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio). + The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list + of tuples: (start, end, speaker_label). + + To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The + token can be passed in one of the following options: + + * Use the parameter `access_token`. + * Set an environment variable named "HUGGING_FACE_HUB_TOKEN". 
+ * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN". + + To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set + in this function ("pyannote/speaker-diarization-3.0"), you need access for these two models: + + * https://huggingface.co/pyannote/segmentation-3.0 + * https://huggingface.co/pyannote/speaker-diarization-3.0 + + Note: To control the recognized speakers in the diarization output you can choose one of the following methods: + + * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in + the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do + diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a + specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will + increase runtime. + * For unknown speakers amount, you can set the `speaker_prefix` parameter to add a prefix for each speaker number. + You can also help the diarization by setting the speakers range via the `speakers_amount_range` parameter. + + :param data_path: A directory of the audio files, a single file or a list of files to transcribe. + :param model_name: One of the official diarization model names (referred as diarization pipelines) of + `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0". + :param access_token: An access token to pass for using the `pyannote.audio` models. If not provided, it + will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is + available, it will look for a secret "HUGGING_FACE_HUB_TOKEN". + :param device: Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if + available. + :param speakers_labels: Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...). + :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and + combine the result into a single diarization. Each label set in the `speakers_labels` + parameter will be assigned to a specific channel by order. + :param speaker_prefix: A prefix to add for the speakers labels. This parameter is ignored if + `speakers_labels` is not None. Default: "speaker". + :param minimum_speakers: Set the minimum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param maximum_speakers: Set the maximum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + :returns: A tuple of: + + * Speech diarization dictionary. + * A dictionary of errored files that were not transcribed. + """ + global _LOGGER + + # Get the input audio files to diarize: + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + audio_files = _get_audio_files(data_path=data_path) + else: # Should be a list of files. + audio_files = data_path + + # Get the Huggingface access token: + access_token = _get_access_token(parameter=access_token) + if access_token is None: + raise ValueError( + "A Huggingface access token must be provided to use `pyannote.audio` models. 
Access token can be passed " + "via one of the following options:\n" + "* Use the parameter `access_token`.\n" + "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n" + "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'." + ) + + # Load the diarization pipeline: + pipeline = pyannote.audio.Pipeline.from_pretrained( + checkpoint_path=model_name, use_auth_token=access_token + ) + + # Set the device: + device = device or ("cuda" if torch.cuda.is_available() else "cpu") + if device != "cpu": + pipeline.to(torch.device(device)) + + # Prepare the successes dataframe and errors dictionary to be returned: + diarizations = {} + errors = {} + + # Prepare the diarization keyword arguments: + diarize_kwargs = {} + if speakers_labels: + diarize_kwargs["num_speakers"] = len(speakers_labels) + else: + if minimum_speakers: + diarize_kwargs["min_speakers"] = minimum_speakers + if maximum_speakers: + diarize_kwargs["max_speakers"] = maximum_speakers + + # Go over the audio files and diarize: + for audio_file in tqdm( + audio_files, desc="Diarizing", unit="file", disable=not verbose + ): + try: + # Load audio file: + audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True) + # Get the diarization (if provided): + diarizations[audio_file.name] = _diarize( + audio=audio, + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=speakers_labels, + separate_by_channels=separate_by_channels, + speaker_prefix=speaker_prefix, + diarize_kwargs=diarize_kwargs, + ) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{audio_file.name}'") + errors[str(audio_file.name)] = str(exception) + continue + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n") + return diarizations, errors + + +def _get_audio_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. 
" + f"Given: {str(data_path)} " + ) + + return audio_files + + +def _get_access_token(parameter: str) -> str: + # If given as a parameter, return it: + if parameter: + return parameter + + # Otherwise, look at the environment variable: + environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN") + if environment_variable: + return environment_variable + + # Lastly, try look in the set secrets in MLRun: + secret = None + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN") + except ModuleNotFoundError: + pass + + return secret + + +def _diarize( + audio: torch.Tensor, + sample_rate: int, + pipeline: pyannote.audio.Pipeline, + speakers_labels: List[str], + separate_by_channels: bool, + speaker_prefix: str, + diarize_kwargs: dict, +) -> List[Tuple[float, float, str]]: + # If there is no need for separation by channels, we diarize and return: + if not separate_by_channels: + # Diarize: + diarization: pyannote.core.Annotation = pipeline( + file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs + ) + # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring + # through the pipeline): + if speakers_labels: + given_speakers = len(speakers_labels) + found_speakers = len(set(diarization.labels())) + if given_speakers < found_speakers: + raise ValueError( + f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization " + f"recognized {found_speakers} speakers." + ) + # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label + # returned is "SPEAKER_i" so we take only the index out of it): + return [ + ( + segment.start, + segment.end, + speakers_labels[int(label.split("_")[1])] + if speakers_labels + else f"{speaker_prefix}{int(label.split('_')[1])}", + ) + for segment, track, label in diarization.itertracks(yield_label=True) + ] + + # Separate to channels and diarize (we expect only one speaker per channel): + channel_diarizations = [ + _diarize( + audio=audio[channel].unsqueeze( + 0 + ), # Take channel and add a channel dimension to it. + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=[ + speakers_labels[channel] + ], # Take the channel's label only. + separate_by_channels=False, + speaker_prefix=speaker_prefix, + diarize_kwargs={"num_speakers": 1}, # Set to one speaker. 
+ ) + for channel in range(audio.shape[0]) + ] + + # Merge the channel diarizations into a single sorted list: + return list(heapq.merge(*channel_diarizations)) diff --git a/functions/master/pyannote_audio/1.2.0/src/test_pyannote_audio.py b/functions/master/pyannote_audio/1.2.0/src/test_pyannote_audio.py new file mode 100644 index 00000000..93da5083 --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/src/test_pyannote_audio.py @@ -0,0 +1,25 @@ +import os + +import mlrun +import pytest + + +@pytest.mark.skipif("HUGGING_FACE_HUB_TOKEN" not in os.environ, reason="no token") +def test_speech_diarization(): + project = mlrun.new_project("diarization-test2") + speech_diarization = project.set_function( + func="./function.yaml", name="speech_diarization", image="mlrun/mlrun" + ) + + diarize_run = speech_diarization.run( + handler="diarize", + inputs={"data_path": os.path.join("assets", "test_data.wav")}, + params={ + "device": "cpu", + "speakers_labels": ["Agent", "Client"], + "separate_by_channels": True, + }, + returns=["speech_diarization: file", "diarize_errors: file"], + local=True, + ) + assert diarize_run.outputs["speech_diarization"] diff --git a/functions/master/pyannote_audio/1.2.0/static/documentation.html b/functions/master/pyannote_audio/1.2.0/static/documentation.html new file mode 100644 index 00000000..1fdd1cf3 --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/static/documentation.html @@ -0,0 +1,289 @@ + + + + + + + +pyannote_audio package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

pyannote_audio package#

Submodules#

pyannote_audio.pyannote_audio module#

pyannote_audio.pyannote_audio.diarize(data_path: Union[str, List[str]], model_name: str = 'pyannote/speaker-diarization-3.0', access_token: Optional[str] = None, device: Optional[str] = None, speakers_labels: Optional[List[str]] = None, speaker_prefix: str = 'speaker_', separate_by_channels: bool = False, minimum_speakers: Optional[int] = None, maximum_speakers: Optional[int] = None, verbose: bool = False) → Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]

Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).
The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list
of tuples: (start, end, speaker_label).

To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
token can be passed in one of the following options:

* Use the parameter `access_token`.
* Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
* If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".

To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set
in this function ("pyannote/speaker-diarization-3.0"), you need access to these two models:

* https://huggingface.co/pyannote/segmentation-3.0
* https://huggingface.co/pyannote/speaker-diarization-3.0

Note: To control the recognized speakers in the diarization output you can choose one of the following methods:

* For a known number of speakers, you may set speaker labels via the `speakers_labels` parameter; they will be used in
  the order of speaking in the audio (the first person speaking gets the first label in the list). In addition, you can
  do diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a
  specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will
  increase runtime.
* For an unknown number of speakers, you can set the `speaker_prefix` parameter to add a prefix for each speaker
  number. You can also help the diarization by bounding the number of speakers via the `minimum_speakers` and
  `maximum_speakers` parameters.

Parameters

* data_path – A directory of the audio files, a single file or a list of files to transcribe.
* model_name – One of the official diarization model names (referred as diarization pipelines) of the
  `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".
* access_token – An access token to pass for using the `pyannote.audio` models. If not provided, it will be looking
  for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is available, it will look for a secret
  "HUGGING_FACE_HUB_TOKEN".
* device – Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if available.
* speakers_labels – Labels to use for the recognized speakers. Default: numeric labels (0, 1, …).
* separate_by_channels – If each speaker is speaking in a separate channel, you can diarize each channel and combine
  the result into a single diarization. Each label set in the `speakers_labels` parameter will be assigned to a
  specific channel by order.
* speaker_prefix – A prefix to add to the speaker labels. This parameter is ignored if `speakers_labels` is not None.
  Default: "speaker_".
* minimum_speakers – Set the minimum expected amount of speakers to be in the audio files. This parameter is ignored
  if `speakers_labels` is not None.
* maximum_speakers – Set the maximum expected amount of speakers to be in the audio files. This parameter is ignored
  if `speakers_labels` is not None.
* verbose – Whether to present logs of a progress bar and errors. Default: False.

Returns

A tuple of:

* Speech diarization dictionary.
* A dictionary of errored files that were not transcribed.

pyannote_audio.pyannote_audio.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Optional[Dict[str, Any]] = None)

Module contents#
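
For orientation, here is a minimal sketch of calling `diarize` directly. It assumes the module is importable from the installed package, that access to the two models above has been granted, and that a valid Huggingface token is available; the file path, labels and printed output are illustrative only.

```python
import os

# Second token option from the list above: provide the token via the environment variable
os.environ["HUGGING_FACE_HUB_TOKEN"] = "<your HF token>"  # placeholder

from pyannote_audio.pyannote_audio import diarize

# Diarize a directory of stereo call recordings where channel 0 is the agent and
# channel 1 is the client (illustrative path and labels):
diarizations, errors = diarize(
    data_path="calls/",
    speakers_labels=["Agent", "Client"],
    separate_by_channels=True,
    device="cpu",
    verbose=True,
)

# Each value is a list of (start_seconds, end_seconds, speaker_label) tuples, e.g.:
# {"call_1.wav": [(0.2, 3.7, "Agent"), (4.1, 9.8, "Client"), ...]}
print(diarizations)
print(errors)  # files that failed, mapped to their error messages
```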
    + + + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.2.0/static/example.html b/functions/master/pyannote_audio/1.2.0/static/example.html new file mode 100644 index 00000000..4a3c9044 --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/static/example.html @@ -0,0 +1,449 @@ + + + + + + + +Speech diarization example notebook + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Speech diarization example notebook#

    +

    In this notebook we will utilize a call diarization capability to get per-speaker speech durations from a call recording.
    +This can be useful for quantifying participation rates in calls for things like customer service analysis.

    +

    We will demonstrate this by:

    +
      +
    1. Loading in a sample call recording between multiple participants

    2. +
    3. Using a diarize() function to automatically detect speakers and estimate per-speaker talk time

    4. +
    5. Return a dictionary of described results, and a df of errors

    6. +
    +
    +
    +
import os
import mlrun

# To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
#    token can be passed in one of the following options:
#
#    * Use the parameter `access_token`.
#    * Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
#    * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".
os.environ["HUGGING_FACE_HUB_TOKEN"] = "<add your token here>"

# Create an MLRun project
project = mlrun.get_or_create_project("diarization-test")

# Import the function from the hub (once the function is in the hub we can import it from there)
speech_diarization = project.set_function(func="hub://speech_diarization", name="speech_diarization")

    > 2023-12-05 15:28:51,758 [info] Project loaded successfully: {'project_name': 'diarization-test'}

# Set the desired run params and files
audio_files = os.path.join("test_data.wav")
device = "cpu"
speakers_labels = ["Agent", "Client"]
separate_by_channels = True

# Run the imported function with the desired file/s and params
diarize_run = speech_diarization.run(
    handler="diarize",
    inputs={"data_path": audio_files},
    params={
        "device": device,
        "speakers_labels": speakers_labels,
        "separate_by_channels": separate_by_channels,
    },
    returns=["speech-diarization: file", "diarize-errors: file"],
    local=True,
)

    > 2023-12-05 15:28:52,229 [info] Storing function: {'name': 'speech-diarization-diarize', 'uid': 'ec6cd014e4674966b30303ea14048acf', 'db': 'http://mlrun-api:8080'}

project: diarization-test    uid: ec6cd014e4674966b30303ea14048acf    iter: 0    start: Dec 05 15:28:52    state: completed
name: speech-diarization-diarize
labels: v3io_user=zeevr, kind=local, owner=zeevr, host=jupyter-zeev-gpu-5995df47dc-rtpvr
inputs: data_path
parameters: device=cpu, speakers_labels=['Agent', 'Client'], separate_by_channels=True
artifacts: speech-diarization, diarize-errors

    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-12-05 15:28:53,350 [info] Run execution finished: {'status': 'completed', 'name': 'speech-diarization-diarize'}
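
Since the notebook's goal is per-speaker speech duration, the logged diarization segments can be reduced to talk time per speaker. A small sketch follows; the segment values are illustrative, and loading the `speech-diarization` artifact itself depends on how MLRun serialized the returned dictionary.

```python
# The run object lists the logged artifacts under the keys given in `returns` above:
print(diarize_run.outputs)  # e.g. {'speech-diarization': <store path>, 'diarize-errors': <store path>}


def talk_time(segments):
    """Sum (end - start) per speaker over a list of (start, end, speaker) tuples."""
    totals = {}
    for start, end, speaker in segments:
        totals[speaker] = totals.get(speaker, 0.0) + (end - start)
    return totals


# Illustrative segments for one file:
print(talk_time([(0.2, 3.7, "Agent"), (4.1, 9.8, "Client"), (10.0, 12.5, "Agent")]))
# {'Agent': 6.0, 'Client': 5.7}
```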
    + + + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.2.0/static/function.html b/functions/master/pyannote_audio/1.2.0/static/function.html new file mode 100644 index 00000000..6b99972d --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/static/function.html @@ -0,0 +1,173 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  name: pyannote-audio
    +  tag: ''
    +  hash: aed670a0534ebf30690dd2af7acad35595c7d5b1
    +  project: ''
    +  labels:
    +    author: guyl
    +  categories:
    +  - deep-learning
    +  - huggingface
    +  - audio
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import heapq
import logging
import operator
import os
import pathlib
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import pyannote.audio
import pyannote.core
import torch
import torchaudio
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    # Check if MLRun is available, set the global logger to MLRun's:
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, str):
                    input_argument = _get_audio_files(
                        data_path=pathlib.Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)
            if rank == 0:
                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                diarization_dictionary = reduce(
                    operator.ior, [dia for dia, _ in output], {}
                )
                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
                return diarization_dictionary, errors_dictionary
            return None

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def diarize(
    data_path: Union[str, List[str]],
    model_name: str = "pyannote/speaker-diarization-3.0",
    access_token: str = None,
    device: str = None,
    speakers_labels: List[str] = None,
    speaker_prefix: str = "speaker_",
    separate_by_channels: bool = False,
    minimum_speakers: int = None,
    maximum_speakers: int = None,
    verbose: bool = False,
) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]:
    """
    Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).
    The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list
    of tuples: (start, end, speaker_label).

    To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
    token can be passed in one of the following options:

    * Use the parameter `access_token`.
    * Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
    * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".

    To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set
    in this function ("pyannote/speaker-diarization-3.0"), you need access to these two models:

    * https://huggingface.co/pyannote/segmentation-3.0
    * https://huggingface.co/pyannote/speaker-diarization-3.0

    Note: To control the recognized speakers in the diarization output you can choose one of the following methods:

    * For a known number of speakers, you may set speaker labels via the `speakers_labels` parameter; they will be used
      in the order of speaking in the audio (the first person speaking gets the first label in the list). In addition,
      you can do diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be
      assigned to a specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice,
      this will increase runtime.
    * For an unknown number of speakers, you can set the `speaker_prefix` parameter to add a prefix for each speaker
      number. You can also help the diarization by bounding the number of speakers via the `minimum_speakers` and
      `maximum_speakers` parameters.

    :param data_path:            A directory of the audio files, a single file or a list of files to transcribe.
    :param model_name:           One of the official diarization model names (referred as diarization pipelines) of
                                 `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".
    :param access_token:         An access token to pass for using the `pyannote.audio` models. If not provided, it
                                 will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is
                                 available, it will look for a secret "HUGGING_FACE_HUB_TOKEN".
    :param device:               Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if
                                 available.
    :param speakers_labels:      Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...).
    :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and
                                 combine the result into a single diarization. Each label set in the `speakers_labels`
                                 parameter will be assigned to a specific channel by order.
    :param speaker_prefix:       A prefix to add to the speaker labels. This parameter is ignored if
                                 `speakers_labels` is not None. Default: "speaker_".
    :param minimum_speakers:     Set the minimum expected amount of speakers to be in the audio files. This parameter is
                                 ignored if `speakers_labels` is not None.
    :param maximum_speakers:     Set the maximum expected amount of speakers to be in the audio files. This parameter is
                                 ignored if `speakers_labels` is not None.
    :param verbose:              Whether to present logs of a progress bar and errors. Default: False.

    :returns: A tuple of:

              * Speech diarization dictionary.
              * A dictionary of errored files that were not transcribed.
    """
    global _LOGGER

    # Get the input audio files to diarize:
    if isinstance(data_path, str):
        data_path = pathlib.Path(data_path).absolute()
        audio_files = _get_audio_files(data_path=data_path)
    else:  # Should be a list of files.
        audio_files = data_path

    # Get the Huggingface access token:
    access_token = _get_access_token(parameter=access_token)
    if access_token is None:
        raise ValueError(
            "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed "
            "via one of the following options:\n"
            "* Use the parameter `access_token`.\n"
            "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n"
            "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'."
        )

    # Load the diarization pipeline:
    pipeline = pyannote.audio.Pipeline.from_pretrained(
        checkpoint_path=model_name, use_auth_token=access_token
    )

    # Set the device:
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    if device != "cpu":
        pipeline.to(torch.device(device))

    # Prepare the successes dataframe and errors dictionary to be returned:
    diarizations = {}
    errors = {}

    # Prepare the diarization keyword arguments:
    diarize_kwargs = {}
    if speakers_labels:
        diarize_kwargs["num_speakers"] = len(speakers_labels)
    else:
        if minimum_speakers:
            diarize_kwargs["min_speakers"] = minimum_speakers
        if maximum_speakers:
            diarize_kwargs["max_speakers"] = maximum_speakers

    # Go over the audio files and diarize:
    for audio_file in tqdm(
        audio_files, desc="Diarizing", unit="file", disable=not verbose
    ):
        try:
            # Load audio file:
            audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True)
            # Get the diarization (if provided):
            diarizations[audio_file.name] = _diarize(
                audio=audio,
                sample_rate=sample_rate,
                pipeline=pipeline,
                speakers_labels=speakers_labels,
                separate_by_channels=separate_by_channels,
                speaker_prefix=speaker_prefix,
                diarize_kwargs=diarize_kwargs,
            )
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{audio_file.name}'")
            errors[str(audio_file.name)] = str(exception)
            continue

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n")
    return diarizations, errors


def _get_audio_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        audio_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        audio_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return audio_files


def _get_access_token(parameter: str) -> str:
    # If given as a parameter, return it:
    if parameter:
        return parameter

    # Otherwise, look at the environment variable:
    environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN")
    if environment_variable:
        return environment_variable

    # Lastly, try to look in the secrets set in MLRun:
    secret = None
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN")
    except ModuleNotFoundError:
        pass

    return secret


def _diarize(
    audio: torch.Tensor,
    sample_rate: int,
    pipeline: pyannote.audio.Pipeline,
    speakers_labels: List[str],
    separate_by_channels: bool,
    speaker_prefix: str,
    diarize_kwargs: dict,
) -> List[Tuple[float, float, str]]:
    # If there is no need for separation by channels, we diarize and return:
    if not separate_by_channels:
        # Diarize:
        diarization: pyannote.core.Annotation = pipeline(
            file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs
        )
        # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring
        # through the pipeline):
        if speakers_labels:
            given_speakers = len(speakers_labels)
            found_speakers = len(set(diarization.labels()))
            if given_speakers < found_speakers:
                raise ValueError(
                    f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization "
                    f"recognized {found_speakers} speakers."
                )
        # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label
        # returned is "SPEAKER_i" so we take only the index out of it):
        return [
            (
                segment.start,
                segment.end,
                speakers_labels[int(label.split("_")[1])]
                if speakers_labels
                else f"{speaker_prefix}{int(label.split('_')[1])}",
            )
            for segment, track, label in diarization.itertracks(yield_label=True)
        ]

    # Separate to channels and diarize (we expect only one speaker per channel):
    channel_diarizations = [
        _diarize(
            audio=audio[channel].unsqueeze(
                0
            ),  # Take channel and add a channel dimension to it.
            sample_rate=sample_rate,
            pipeline=pipeline,
            speakers_labels=[
                speakers_labels[channel]
            ],  # Take the channel's label only.
            separate_by_channels=False,
            speaker_prefix=speaker_prefix,
            diarize_kwargs={"num_speakers": 1},  # Set to one speaker.
        )
        for channel in range(audio.shape[0])
    ]

    # Merge the channel diarizations into a single sorted list:
    return list(heapq.merge(*channel_diarizations))

    +    base_image: mlrun/mlrun-gpu
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - pyannote.audio
    +    - pyannote.core
    +    - torchaudio
    +    - tqdm
    +  entry_points:
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      doc: ''
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      outputs: []
    +      lineno: 61
    +      has_varargs: false
    +      has_kwargs: false
    +    decorator:
    +      name: decorator
    +      doc: ''
    +      parameters:
    +      - name: handler
    +      outputs: []
    +      lineno: 73
    +      has_varargs: false
    +      has_kwargs: false
    +    wrapper:
    +      name: wrapper
    +      doc: ''
    +      parameters: []
    +      outputs: []
    +      lineno: 78
    +      has_varargs: false
    +      has_kwargs: true
    +    diarize:
    +      name: diarize
    +      doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\
    +        The end result is a dictionary with the file names as keys and their diarization\
    +        \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\
    +        \nTo use the `pyannote.audio` models you must pass a Huggingface token and\
    +        \ get access to the required models. The\ntoken can be passed in one of the\
    +        \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\
    +        \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\
    +        \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\
    +        \ models on Huggingface, visit their page. For example, to use the default\
    +        \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\
    +        ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\
    +        * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\
    +        \ the recognized speakers in the diarization output you can choose one of\
    +        \ the following methods:\n\n* For a known speakers amount, you may set speaker\
    +        \ labels via the `speakers_labels` parameter that will be used in\n  the order\
    +        \ of speaking in the audio (first person speaking will be the first label in the\
    +        \ list). In addition, you can do\n  diarization per channel (setting the parameter\
    +        \ `separate_by_channels` to True). Each label will be assigned to a\n  specific\
    +        \ channel by order (first label to channel 0, second label to channel 1 and\
    +        \ so on). Notice, this will\n  increase runtime.\n* For unknown speakers amount,\
    +        \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\
    +        \ number.\n  You can also help the diarization by bounding the number of speakers\
    +        \ via the `minimum_speakers` and `maximum_speakers` parameters."
    +      parameters:
    +      - name: data_path
    +        type: Union[str, List[str]]
    +        doc: A directory of the audio files, a single file or a list of files to transcribe.
    +      - name: model_name
    +        type: str
    +        doc: 'One of the official diarization model names (referred as diarization
    +          pipelines) of `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".'
    +        default: pyannote/speaker-diarization-3.0
    +      - name: access_token
    +        type: str
    +        doc: An access token to pass for using the `pyannote.audio` models. If not
    +          provided, it will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN".
    +          If MLRun is available, it will look for a secret "HUGGING_FACE_HUB_TOKEN".
    +        default: null
    +      - name: device
    +        type: str
    +        doc: Device to load the model. Can be one of {"cuda", "cpu"}. Default will
    +          prefer "cuda" if available.
    +        default: null
    +      - name: speakers_labels
    +        type: List[str]
    +        doc: 'Labels to use for the recognized speakers. Default: numeric labels (0,
    +          1, ...).'
    +        default: null
    +      - name: speaker_prefix
    +        type: str
    +        doc: 'A prefix to add to the speaker labels. This parameter is ignored if
    +          `speakers_labels` is not None. Default: "speaker_".'
    +        default: speaker_
    +      - name: separate_by_channels
    +        type: bool
    +        doc: If each speaker is speaking in a separate channel, you can diarize each
    +          channel and combine the result into a single diarization. Each label set
    +          in the `speakers_labels` parameter will be assigned to a specific channel
    +          by order.
    +        default: false
    +      - name: minimum_speakers
    +        type: int
    +        doc: Set the minimum expected amount of speakers to be in the audio files.
    +          This parameter is ignored if `speakers_labels` is not None.
    +        default: null
    +      - name: maximum_speakers
    +        type: int
    +        doc: Set the maximum expected amount of speakers to be in the audio files.
    +          This parameter is ignored if `speakers_labels` is not None.
    +        default: null
    +      - name: verbose
    +        type: bool
    +        doc: 'Whether to present logs of a progress bar and errors. Default: False.'
    +        default: false
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]
    +      lineno: 139
    +      has_varargs: false
    +      has_kwargs: false
    +  description: pyannote's speech diarization of audio files
    +  default_handler: diarize
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
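
For reference, a minimal sketch of loading this generated spec with MLRun and running the default handler (`diarize`) locally; the input path and parameters are illustrative, and a Huggingface token must be available via one of the options described in the docstring above.

```python
import mlrun

# Load the function object from the spec file shown above (path is illustrative)
fn = mlrun.import_function("function.yaml")

run = fn.run(
    handler="diarize",
    inputs={"data_path": "assets/test_data.wav"},  # illustrative audio file
    params={"device": "cpu", "speakers_labels": ["Agent", "Client"]},
    returns=["speech_diarization: file", "diarize_errors: file"],
    local=True,
)
print(run.outputs)
```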
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.2.0/static/item.html b/functions/master/pyannote_audio/1.2.0/static/item.html new file mode 100644 index 00000000..4756b62f --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/static/item.html @@ -0,0 +1,52 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- deep-learning
    +- huggingface
    +- audio
    +description: pyannote's speech diarization of audio files
    +doc: ''
    +example: pyannote_audio.ipynb
    +generationDate: 2023-12-03:14-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: guyl
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.5.2
    +name: pyannote-audio
    +platformVersion: 3.5.3
    +spec:
    +  filename: pyannote_audio.py
    +  handler: diarize
    +  image: mlrun/mlrun-gpu
    +  kind: job
    +  requirements:
    +  - pyannote.audio
    +  - pyannote.core
    +  - torchaudio
    +  - tqdm
    +url: ''
    +version: 1.2.0
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.2.0/static/pyannote_audio.html b/functions/master/pyannote_audio/1.2.0/static/pyannote_audio.html new file mode 100644 index 00000000..c58303c2 --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/static/pyannote_audio.html @@ -0,0 +1,516 @@ + + + + + + + +pyannote_audio.pyannote_audio + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Source code for pyannote_audio.pyannote_audio

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import heapq
    +import logging
    +import operator
    +import os
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import pyannote.audio
    +import pyannote.core
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +
    [docs]def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." + ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + diarization_dictionary = reduce( + operator.ior, [dia for dia, _ in output], {} + ) + errors_dictionary = reduce(operator.ior, [err for _, err in output], {}) + return diarization_dictionary, errors_dictionary + return None + + return wrapper + + return decorator
    + + +
    [docs]@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def diarize( + data_path: Union[str, List[str]], + model_name: str = "pyannote/speaker-diarization-3.0", + access_token: str = None, + device: str = None, + speakers_labels: List[str] = None, + speaker_prefix: str = "speaker_", + separate_by_channels: bool = False, + minimum_speakers: int = None, + maximum_speakers: int = None, + verbose: bool = False, +) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]: + """ + Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio). + The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list + of tuples: (start, end, speaker_label). + + To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The + token can be passed in one of the following options: + + * Use the parameter `access_token`. + * Set an environment variable named "HUGGING_FACE_HUB_TOKEN". + * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN". + + To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set + in this function ("pyannote/speaker-diarization-3.0"), you need access for these two models: + + * https://huggingface.co/pyannote/segmentation-3.0 + * https://huggingface.co/pyannote/speaker-diarization-3.0 + + Note: To control the recognized speakers in the diarization output you can choose one of the following methods: + + * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in + the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do + diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a + specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will + increase runtime. + * For unknown speakers amount, you can set the `speaker_prefix` parameter to add a prefix for each speaker number. + You can also help the diarization by setting the speakers range via the `speakers_amount_range` parameter. + + :param data_path: A directory of the audio files, a single file or a list of files to transcribe. + :param model_name: One of the official diarization model names (referred as diarization pipelines) of + `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0". + :param access_token: An access token to pass for using the `pyannote.audio` models. If not provided, it + will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is + available, it will look for a secret "HUGGING_FACE_HUB_TOKEN". + :param device: Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if + available. + :param speakers_labels: Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...). + :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and + combine the result into a single diarization. Each label set in the `speakers_labels` + parameter will be assigned to a specific channel by order. + :param speaker_prefix: A prefix to add for the speakers labels. This parameter is ignored if + `speakers_labels` is not None. Default: "speaker". 
+ :param minimum_speakers: Set the minimum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param maximum_speakers: Set the maximum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + :returns: A tuple of: + + * Speech diarization dictionary. + * A dictionary of errored files that were not transcribed. + """ + global _LOGGER + + # Get the input audio files to diarize: + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + audio_files = _get_audio_files(data_path=data_path) + else: # Should be a list of files. + audio_files = data_path + + # Get the Huggingface access token: + access_token = _get_access_token(parameter=access_token) + if access_token is None: + raise ValueError( + "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed " + "via one of the following options:\n" + "* Use the parameter `access_token`.\n" + "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n" + "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'." + ) + + # Load the diarization pipeline: + pipeline = pyannote.audio.Pipeline.from_pretrained( + checkpoint_path=model_name, use_auth_token=access_token + ) + + # Set the device: + device = device or ("cuda" if torch.cuda.is_available() else "cpu") + if device != "cpu": + pipeline.to(torch.device(device)) + + # Prepare the successes dataframe and errors dictionary to be returned: + diarizations = {} + errors = {} + + # Prepare the diarization keyword arguments: + diarize_kwargs = {} + if speakers_labels: + diarize_kwargs["num_speakers"] = len(speakers_labels) + else: + if minimum_speakers: + diarize_kwargs["min_speakers"] = minimum_speakers + if maximum_speakers: + diarize_kwargs["max_speakers"] = maximum_speakers + + # Go over the audio files and diarize: + for audio_file in tqdm( + audio_files, desc="Diarizing", unit="file", disable=not verbose + ): + try: + # Load audio file: + audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True) + # Get the diarization (if provided): + diarizations[audio_file.name] = _diarize( + audio=audio, + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=speakers_labels, + separate_by_channels=separate_by_channels, + speaker_prefix=speaker_prefix, + diarize_kwargs=diarize_kwargs, + ) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{audio_file.name}'") + errors[str(audio_file.name)] = str(exception) + continue + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n") + return diarizations, errors
    + + +def _get_audio_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return audio_files + + +def _get_access_token(parameter: str) -> str: + # If given as a parameter, return it: + if parameter: + return parameter + + # Otherwise, look at the environment variable: + environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN") + if environment_variable: + return environment_variable + + # Lastly, try look in the set secrets in MLRun: + secret = None + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN") + except ModuleNotFoundError: + pass + + return secret + + +def _diarize( + audio: torch.Tensor, + sample_rate: int, + pipeline: pyannote.audio.Pipeline, + speakers_labels: List[str], + separate_by_channels: bool, + speaker_prefix: str, + diarize_kwargs: dict, +) -> List[Tuple[float, float, str]]: + # If there is no need for separation by channels, we diarize and return: + if not separate_by_channels: + # Diarize: + diarization: pyannote.core.Annotation = pipeline( + file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs + ) + # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring + # through the pipeline): + if speakers_labels: + given_speakers = len(speakers_labels) + found_speakers = len(set(diarization.labels())) + if given_speakers < found_speakers: + raise ValueError( + f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization " + f"recognized {found_speakers} speakers." + ) + # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label + # returned is "SPEAKER_i" so we take only the index out of it): + return [ + ( + segment.start, + segment.end, + speakers_labels[int(label.split("_")[1])] + if speakers_labels + else f"{speaker_prefix}{int(label.split('_')[1])}", + ) + for segment, track, label in diarization.itertracks(yield_label=True) + ] + + # Separate to channels and diarize (we expect only one speaker per channel): + channel_diarizations = [ + _diarize( + audio=audio[channel].unsqueeze( + 0 + ), # Take channel and add a channel dimension to it. + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=[ + speakers_labels[channel] + ], # Take the channel's label only. + separate_by_channels=False, + speaker_prefix=speaker_prefix, + diarize_kwargs={"num_speakers": 1}, # Set to one speaker. + ) + for channel in range(audio.shape[0]) + ] + + # Merge the channel diarizations into a single sorted list: + return list(heapq.merge(*channel_diarizations)) +
    + + + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.2.0/static/source.html b/functions/master/pyannote_audio/1.2.0/static/source.html new file mode 100644 index 00000000..fe7d54cd --- /dev/null +++ b/functions/master/pyannote_audio/1.2.0/static/source.html @@ -0,0 +1,398 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import heapq
    +import logging
    +import operator
    +import os
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import pyannote.audio
    +import pyannote.core
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    # Check if MLRun is available, set the global logger to MLRun's:
    +    if context:
    +        _LOGGER = context.logger
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, str):
    +                    input_argument = _get_audio_files(
    +                        data_path=pathlib.Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
    +                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +            if rank == 0:
    +                # Join the outputs:
    +                context.logger.info("Collecting data from workers to root worker.")
    +                diarization_dictionary = reduce(
    +                    operator.ior, [dia for dia, _ in output], {}
    +                )
    +                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
    +                return diarization_dictionary, errors_dictionary
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def diarize(
    +    data_path: Union[str, List[str]],
    +    model_name: str = "pyannote/speaker-diarization-3.0",
    +    access_token: str = None,
    +    device: str = None,
    +    speakers_labels: List[str] = None,
    +    speaker_prefix: str = "speaker_",
    +    separate_by_channels: bool = False,
    +    minimum_speakers: int = None,
    +    maximum_speakers: int = None,
    +    verbose: bool = False,
    +) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]:
    +    """
    +    Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).
    +    The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list
    +    of tuples: (start, end, speaker_label).
    +
    +    To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
    +    token can be passed in one of the following options:
    +
    +    * Use the parameter `access_token`.
    +    * Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
    +    * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".
    +
    +    To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set
    +    in this function ("pyannote/speaker-diarization-3.0"), you need access to these two models:
    +
    +    * https://huggingface.co/pyannote/segmentation-3.0
    +    * https://huggingface.co/pyannote/speaker-diarization-3.0
    +
    +    Note: To control the recognized speakers in the diarization output you can choose one of the following methods:
    +
    +    * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in
    +      the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do
    +      diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a
    +      specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will
    +      increase runtime.
+    * For an unknown amount of speakers, you can set the `speaker_prefix` parameter to add a prefix for each speaker number.
+      You can also help the diarization by setting the expected speakers range via the `minimum_speakers` and
+      `maximum_speakers` parameters.
    +
+    :param data_path:            A directory of audio files, a single file or a list of files to diarize.
+    :param model_name:           One of the official diarization model names (referred to as diarization pipelines) on
+                                 the `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".
    +    :param access_token:         An access token to pass for using the `pyannote.audio` models. If not provided, it
+                                 will look for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is
    +                                 available, it will look for a secret "HUGGING_FACE_HUB_TOKEN".
    +    :param device:               Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if
    +                                 available.
    +    :param speakers_labels:      Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...).
    +    :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and
    +                                 combine the result into a single diarization. Each label set in the `speakers_labels`
    +                                 parameter will be assigned to a specific channel by order.
    +    :param speaker_prefix:       A prefix to add for the speakers labels. This parameter is ignored if
+                                 `speakers_labels` is not None. Default: "speaker_".
    +    :param minimum_speakers:     Set the minimum expected amount of speakers to be in the audio files. This parameter is
    +                                 ignored if `speakers_labels` is not None.
    +    :param maximum_speakers:     Set the maximum expected amount of speakers to be in the audio files. This parameter is
    +                                 ignored if `speakers_labels` is not None.
+    :param verbose:              Whether to present logs of a progress bar and errors. Default: False.
    +
    +    :returns: A tuple of:
    +
    +              * Speech diarization dictionary.
+              * A dictionary of errored files that were not diarized.
    +    """
    +    global _LOGGER
    +
    +    # Get the input audio files to diarize:
    +    if isinstance(data_path, str):
    +        data_path = pathlib.Path(data_path).absolute()
    +        audio_files = _get_audio_files(data_path=data_path)
    +    else:  # Should be a list of files.
    +        audio_files = data_path
    +
    +    # Get the Huggingface access token:
    +    access_token = _get_access_token(parameter=access_token)
    +    if access_token is None:
    +        raise ValueError(
    +            "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed "
    +            "via one of the following options:\n"
    +            "* Use the parameter `access_token`.\n"
    +            "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n"
    +            "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'."
    +        )
    +
    +    # Load the diarization pipeline:
    +    pipeline = pyannote.audio.Pipeline.from_pretrained(
    +        checkpoint_path=model_name, use_auth_token=access_token
    +    )
    +
    +    # Set the device:
    +    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    +    if device != "cpu":
    +        pipeline.to(torch.device(device))
    +
+    # Prepare the diarizations and errors dictionaries to be returned:
    +    diarizations = {}
    +    errors = {}
    +
    +    # Prepare the diarization keyword arguments:
    +    diarize_kwargs = {}
    +    if speakers_labels:
    +        diarize_kwargs["num_speakers"] = len(speakers_labels)
    +    else:
    +        if minimum_speakers:
    +            diarize_kwargs["min_speakers"] = minimum_speakers
    +        if maximum_speakers:
    +            diarize_kwargs["max_speakers"] = maximum_speakers
    +
    +    # Go over the audio files and diarize:
    +    for audio_file in tqdm(
    +        audio_files, desc="Diarizing", unit="file", disable=not verbose
    +    ):
    +        try:
    +            # Load audio file:
    +            audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True)
+            # Get the diarization:
    +            diarizations[audio_file.name] = _diarize(
    +                audio=audio,
    +                sample_rate=sample_rate,
    +                pipeline=pipeline,
    +                speakers_labels=speakers_labels,
    +                separate_by_channels=separate_by_channels,
    +                speaker_prefix=speaker_prefix,
    +                diarize_kwargs=diarize_kwargs,
    +            )
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            if verbose:
    +                _LOGGER.warning(f"Error in file: '{audio_file.name}'")
    +            errors[str(audio_file.name)] = str(exception)
    +            continue
    +
+    # Log the amount of successfully diarized files and return:
    +    if verbose:
    +        _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n")
    +    return diarizations, errors
    +
    +
    +def _get_audio_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        audio_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        audio_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return audio_files
    +
    +
    +def _get_access_token(parameter: str) -> str:
    +    # If given as a parameter, return it:
    +    if parameter:
    +        return parameter
    +
    +    # Otherwise, look at the environment variable:
    +    environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN")
    +    if environment_variable:
    +        return environment_variable
    +
+    # Lastly, try to look in the secrets set in MLRun:
    +    secret = None
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN")
    +    except ModuleNotFoundError:
    +        pass
    +
    +    return secret
    +
    +
    +def _diarize(
    +    audio: torch.Tensor,
    +    sample_rate: int,
    +    pipeline: pyannote.audio.Pipeline,
    +    speakers_labels: List[str],
    +    separate_by_channels: bool,
    +    speaker_prefix: str,
    +    diarize_kwargs: dict,
    +) -> List[Tuple[float, float, str]]:
    +    # If there is no need for separation by channels, we diarize and return:
    +    if not separate_by_channels:
    +        # Diarize:
    +        diarization: pyannote.core.Annotation = pipeline(
    +            file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs
    +        )
    +        # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring
    +        # through the pipeline):
    +        if speakers_labels:
    +            given_speakers = len(speakers_labels)
    +            found_speakers = len(set(diarization.labels()))
    +            if given_speakers < found_speakers:
    +                raise ValueError(
    +                    f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization "
    +                    f"recognized {found_speakers} speakers."
    +                )
    +        # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label
    +        # returned is "SPEAKER_i" so we take only the index out of it):
    +        return [
    +            (
    +                segment.start,
    +                segment.end,
    +                speakers_labels[int(label.split("_")[1])]
    +                if speakers_labels
    +                else f"{speaker_prefix}{int(label.split('_')[1])}",
    +            )
    +            for segment, track, label in diarization.itertracks(yield_label=True)
    +        ]
    +
    +    # Separate to channels and diarize (we expect only one speaker per channel):
    +    channel_diarizations = [
    +        _diarize(
    +            audio=audio[channel].unsqueeze(
    +                0
    +            ),  # Take channel and add a channel dimension to it.
    +            sample_rate=sample_rate,
    +            pipeline=pipeline,
    +            speakers_labels=[
    +                speakers_labels[channel]
    +            ],  # Take the channel's label only.
    +            separate_by_channels=False,
    +            speaker_prefix=speaker_prefix,
    +            diarize_kwargs={"num_speakers": 1},  # Set to one speaker.
    +        )
    +        for channel in range(audio.shape[0])
    +    ]
    +
    +    # Merge the channel diarizations into a single sorted list:
    +    return list(heapq.merge(*channel_diarizations))
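+    # Note: each per-channel diarization is already sorted by start time (as yielded by
+    # `itertracks`), so `heapq.merge` produces a single timeline ordered by
+    # (start, end, label) tuples across all channels.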
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/latest/src/function.yaml b/functions/master/pyannote_audio/latest/src/function.yaml index 65c87f1a..30870afa 100644 --- a/functions/master/pyannote_audio/latest/src/function.yaml +++ b/functions/master/pyannote_audio/latest/src/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: pyannote-audio tag: '' - hash: c45be8d7f51f0b2203155b08c307814a2cb0ac78 + hash: aed670a0534ebf30690dd2af7acad35595c7d5b1 project: '' labels: author: guyl diff --git a/functions/master/pyannote_audio/latest/src/item.yaml b/functions/master/pyannote_audio/latest/src/item.yaml index 61277b39..b69add9e 100644 --- a/functions/master/pyannote_audio/latest/src/item.yaml +++ b/functions/master/pyannote_audio/latest/src/item.yaml @@ -27,4 +27,4 @@ spec: - torchaudio - tqdm url: '' -version: 1.1.0 +version: 1.2.0 diff --git a/functions/master/pyannote_audio/latest/static/function.html b/functions/master/pyannote_audio/latest/static/function.html index 1d34bd5c..6b99972d 100644 --- a/functions/master/pyannote_audio/latest/static/function.html +++ b/functions/master/pyannote_audio/latest/static/function.html @@ -19,14 +19,14 @@ metadata: name: pyannote-audio tag: '' - hash: c45be8d7f51f0b2203155b08c307814a2cb0ac78 + hash: aed670a0534ebf30690dd2af7acad35595c7d5b1 project: '' labels: author: guyl categories: - deep-learning - - Huggingface - - Audio + - huggingface + - audio spec: command: '' args: [] diff --git a/functions/master/pyannote_audio/latest/static/item.html b/functions/master/pyannote_audio/latest/static/item.html index 2b5611bb..4756b62f 100644 --- a/functions/master/pyannote_audio/latest/static/item.html +++ b/functions/master/pyannote_audio/latest/static/item.html @@ -18,8 +18,8 @@ apiVersion: v1 categories: - deep-learning -- Huggingface -- Audio +- huggingface +- audio description: pyannote's speech diarization of audio files doc: '' example: pyannote_audio.ipynb @@ -44,7 +44,7 @@ - torchaudio - tqdm url: '' -version: 1.1.0 +version: 1.2.0 diff --git a/functions/master/question_answering/0.4.0/src/data/test-data.txt b/functions/master/question_answering/0.4.0/src/data/test-data.txt new file mode 100644 index 00000000..efe6b646 --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/data/test-data.txt @@ -0,0 +1 @@ +The apple color is red. \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/src/function.yaml b/functions/master/question_answering/0.4.0/src/function.yaml new file mode 100644 index 00000000..7491b17e --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/function.yaml @@ -0,0 +1,219 @@ +kind: job +metadata: + name: question-answering + tag: '' + hash: aed62db95f17576c69b457767e3595c2de1d5465 + project: '' + labels: + author: yonish + categories: + - genai + - huggingface + - machine-learning +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import logging
import operator
import pathlib
from collections import Counter
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import transformers
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    global _LOGGER

    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        _LOGGER = context.logger
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, str):
                    input_argument = _get_text_files(
                        data_path=pathlib.Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
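                # For example, 10 inputs split across 4 workers gives chunks [0:2], [2:4]
                # and [4:6] for ranks 0-2, while the last rank takes the remainder [6:10].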
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)
            if rank == 0:
                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                dataframe = pd.concat(objs=[df for df, _ in output], axis=0)
                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
                return dataframe, errors_dictionary
            return None

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def answer_questions(
    data_path: Union[str, List[str]],
    model_name: str,
    questions: Union[List[str], List[List[str]]],
    device_map: Union[str, dict] = None,
    model_kwargs: dict = None,
    auto_gptq_exllama_max_input_length: int = None,
    tokenizer_name: str = None,
    tokenizer_kwargs: dict = None,
    text_wrapper: Union[str, List[str]] = "",
    questions_wrapper: Union[str, List[str]] = "",
    generation_config: Union[Dict, List[Dict]] = None,
    questions_config: Union[Dict, List[Dict]] = None,
    batch_size: int = 1,
    questions_columns: List[str] = None,
    verbose: bool = False,
) -> Tuple[pd.DataFrame, dict]:
    """
    Answer questions with context from the given text files' contents using a pretrained LLM. Each text file will have
    the following prompt built:

    start of `text_wrapper`
    <text file content>
    end of `text_wrapper`

    start of `questions_wrapper`
    1. <questions[0]>
    2. <questions[1]>
    ...
    n. <questions[n-1]>
    end of `questions_wrapper`

    :param data_path:                          A path to a directory of text files or a path to a text file to ask
                                               questions about.
    :param model_name:                         The pre-trained model name from the huggingface hub to use for asking
                                               questions.
    :param questions:                          The questions to ask.
                                               A list of lists of questions to ask per text file, divided
                                               into question groups. The groups can be determined by size (in order to
                                               avoid large inputs to the LLM) or by questioning method
                                               (regular or poll-like questioning).
    :param device_map:                         A map to use for loading the model on multiple devices.
    :param model_kwargs:                       Keyword arguments to pass for loading the model using HuggingFace's
                                               `transformers.AutoModelForCausalLM.from_pretrained` function.
    :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size.
    :param tokenizer_name:                     The tokenizer name from the huggingface hub to use. If not given, the
                                               model name will be used.
    :param tokenizer_kwargs:                   Keyword arguments to pass for loading the tokenizer using HuggingFace's
                                               `transformers.AutoTokenizer.from_pretrained` function.
    :param text_wrapper:                       A wrapper for the file's text. Will be added at the start of the prompt.
                                               Must have a placeholder ('{}') for the text of the file.
    :param questions_wrapper:                  A wrapper for the questions received. Will be added after the text
                                               wrapper in the prompt template. Must have a placeholder ('{}') for the
                                               questions.
    :param generation_config:                  HuggingFace's `GenerationConfig` keyword arguments to pass to the
                                               `generate` method.
    :param questions_config:                   A dictionary or list of dictionaries containing specific ways to answer
                                               questions (using a poll for example), each dictionary in the list is for
                                               corresponding question group and determines the question asking method
                                               for said group.
    :param batch_size:                         Batch size for inference.
    :param questions_columns:                  Columns to use for the dataframe returned.
    :param verbose:                            Whether to present logs of a progress bar and errors. Default: False.


    :returns: A tuple of:

              * A dataframe dataset of the questions answers.
              * A dictionary of errored files that were not inferred or were not answered properly.
    """
    global _LOGGER

    # Set configs to empty dict if not given:
    if generation_config is None:
        generation_config = {}
    if questions_config is None:
        questions_config = {}

    # Get the input text files to question:
    if verbose:
        _LOGGER.info("Collecting text files.")
    if isinstance(data_path, str):
        data_path = pathlib.Path(data_path).absolute()
        text_files = _get_text_files(data_path=data_path)
    else:
        text_files = data_path
    if verbose:
        _LOGGER.info(f"Collected {len(text_files)} text files.")

    # Get the prompt template:
    if verbose:
        _LOGGER.info("Creating prompt template.")

    # Organize questions as a list of lists, and count the number of sub-lists for later use
    number_of_question_groups = 1 if isinstance(questions[0], str) else len(questions)
    questions = _to_group_list(
        argument_value=questions,
        argument_name="questions",
        length=number_of_question_groups,
    )

    # Organize prompt parts at proper length
    text_wrapper = _to_group_list(
        argument_value=text_wrapper,
        argument_name="text_wrapper",
        length=number_of_question_groups,
    )
    questions_wrapper = _to_group_list(
        argument_value=questions_wrapper,
        argument_name="questions_wrapper",
        length=number_of_question_groups,
    )

    # Create a list of prompts according to the given parts and questions
    prompt_template = []
    questions = questions if isinstance(questions[0], list) else [questions]

    # Build all prompts
    for i in range(number_of_question_groups):
        prompt_template.append(
            _get_prompt_template(
                text_wrapper=text_wrapper[i],
                questions_wrapper=questions_wrapper[i],
                questions=questions[i],
            )
        )
    if verbose:
        _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n")

    # Get the total amount of questions:
    questions_amount = sum([len(sublist) for sublist in questions])

    # Get the questions columns:
    questions_columns = questions_columns or [
        f"q{i}" for i in range(1, questions_amount + 1)
    ]

    # Check if we have the correct amount of questions columns:
    if len(questions_columns) != questions_amount:
        raise ValueError(
            f"The provided questions columns length ({len(questions_columns)}) "
            f"does not match the questions amount ({questions_amount})"
        )

    # Load the generation config:
    if verbose:
        _LOGGER.info("Loading generation configuration.")
    generation_config = [
        transformers.GenerationConfig(**(cfg or {}))
        for cfg in _to_group_list(
            argument_value=generation_config,
            argument_name="generation_config",
            length=number_of_question_groups,
        )
    ]
    if verbose:
        _LOGGER.info(f"Generation configuration loaded: {generation_config}")

    # Load the model and tokenizer into a pipeline object:
    if verbose:
        _LOGGER.info(f"Loading model '{model_name}'.")
    generation_pipeline = _get_generation_pipeline(
        model_name=model_name,
        device_map=device_map,
        tokenizer_name=tokenizer_name or model_name,
        model_kwargs=model_kwargs or {},
        tokenizer_kwargs=tokenizer_kwargs or {},
        auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length,
        batch_size=batch_size,
    )
    if verbose:
        _LOGGER.info("Model loaded.")

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Split the files into batches:
    file_batches = [
        text_files[i : i + batch_size]
        if i + batch_size < len(text_files)
        else text_files[i:]
        for i in range(0, len(text_files), batch_size)
    ]
    questions_config = _to_group_list(
        argument_value=questions_config,
        argument_name="questions_config",
        length=number_of_question_groups,
    )

    # Create a list of question handlers according to given configs
    handlers = []
    for cfg in questions_config:
        question_type = cfg.pop("type", "default")
        handlers.append(QUESTION_MAPPING.get(question_type)(**cfg))

    # Go over the batches of text files and question them:
    for file_batch in tqdm(
        file_batches,
        desc="Generating answers",
        unit=f"file (batch of {batch_size})",
        disable=not verbose,
    ):
        try:
            total_answers = [[] for _ in range(batch_size)]

            # Go over all question group per batch of documents
            for question_group in range(number_of_question_groups):
                current_questions_amount = len(questions[question_group])

                # Read batch (read the text from the text files):
                batched_input = _read_file_batch(
                    file_batch=file_batch,
                    prompt_template=prompt_template[question_group],
                )

                # Answer the questions with each question handler:
                batched_answers = handlers[question_group].answer(
                    questions_amount=current_questions_amount,
                    batched_input=batched_input,
                    generation_pipeline=generation_pipeline,
                    generation_config=generation_config[question_group],
                )

                # Put the answers in the correct place in the total answers list according to the place in the batch
                # (iterate over the actual batch length, as the last batch may be smaller than `batch_size`):
                for i in range(len(file_batch)):
                    total_answers[i].extend(batched_answers[i])

            # Collect the answers and attach the file name:
            successes.extend(
                [
                    [file.name, *answers]
                    for file, answers in zip(file_batch, total_answers)
                ]
            )
        except Exception as exception:
            # Note the exception as error in the dictionary:
            batch_file_names = ", ".join([file.name for file in file_batch])
            if verbose:
                _LOGGER.warning(
                    f"Error in batch '{batch_file_names}': {str(exception)}"
                )
            errors[batch_file_names] = str(exception)
            continue

    # Construct the answers dataframe:
    columns = [
        "text_file",
        *questions_columns,
    ]

    # Create a data frame of answers by files
    successes = pd.DataFrame(
        successes,
        columns=columns,
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Answers summary:\n"
            f"{successes.head()}"
        )
    return successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:

    # Check if the path is of a directory or a file:
    if data_path.is_dir():

        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _get_prompt_template(
    text_wrapper: str,
    questions_wrapper: str,
    questions: List[str],
) -> str:

    # Validate and build the text wrapper:
    text_wrapper = text_wrapper or (
        "Given the following text:\n" "-----\n" "{}\n" "-----"
    )
    if text_wrapper.count("{}") != 1:
        raise ValueError(
            "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about."
        )

    # Validate and build the question wrapper:
    questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}"
    if questions_wrapper.count("{}") != 1:
        raise ValueError(
            "The `questions_wrapper` must include one placeholder '{}' for the list of questions."
        )

    # Validate and parse the questions:
    if len(questions) == 0:
        raise ValueError("Please include at least one question.")
    questions = "\n".join(
        [f"{i}. {question}" for i, question in enumerate(questions, 1)]
    )

    # Construct the template:
    return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n"


def _get_generation_pipeline(
    model_name: str,
    device_map: Union[str, dict],
    tokenizer_name: str,
    model_kwargs: dict,
    tokenizer_kwargs: dict,
    auto_gptq_exllama_max_input_length: int = None,
    batch_size: int = 1,
):
    # Load the model:
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device_map, **model_kwargs
    )

    # Set exllama max input length if provided:
    # This changes the model's context size.
    if auto_gptq_exllama_max_input_length:
        from auto_gptq import exllama_set_max_input_length

        model = exllama_set_max_input_length(
            model=model, max_input_length=auto_gptq_exllama_max_input_length
        )

    # Load the tokenizer:
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_name, **tokenizer_kwargs
    )

    # Initialize a generation pipeline and return:
    pipe = transformers.pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
    )
    pipe.tokenizer.pad_token_id = model.config.eos_token_id
    return pipe
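    # An illustrative call (the model name here is an example, not taken from this file):
    #
    #     pipeline = _get_generation_pipeline(
    #         model_name="distilgpt2",
    #         device_map="auto",
    #         tokenizer_name="distilgpt2",
    #         model_kwargs={},
    #         tokenizer_kwargs={},
    #         batch_size=2,
    #     )
    #
    # The returned object is a batched `text-generation` pipeline whose pad token is
    # aligned with the model's EOS token.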


def _read_file_batch(
    file_batch: List[pathlib.Path],
    prompt_template: str,
) -> List[str]:
    batch = []

    # Go over all files and read in usable format
    for file in file_batch:
        with open(file, "r", encoding="utf-8") as fp:
            batch.append(prompt_template.format(fp.read()))
    return batch


def _to_group_list(argument_value: list, argument_name: str, length: int):

    # Check if the value is a list; turn it into a list if not
    argument_value = (
        argument_value if isinstance(argument_value, list) else [argument_value]
    )
    list_len = len(argument_value)

    # If the value is a single item or a list of length 1, duplicate it to the correct length.
    # If the list has a different length, raise an error.
    if list_len != length:
        if list_len == 1:
            return argument_value * length
        raise ValueError(
            f"The argument value of '{argument_name}' is not equal to the length of the given questions - {length}"
        )
    return argument_value
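    # Illustrative behavior: _to_group_list("", "text_wrapper", 3) returns ["", "", ""],
    # a list that already has length 3 is returned unchanged, and any other length
    # raises a ValueError.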


class QuestionHandler:
    """
    A class for handling question answering for a given question type.
    This class is used as a base class for all question types, and for default question type (regular question
    answering without any special handling).
    """

    class ConfigKeys:
        pass

    def __init__(self):
        pass

    @staticmethod
    def _get_answers(generated_text: str, questions_amount: int) -> List[str]:

        # Clear answer start (part before numbers):
        # TODO find better way to verify, for list of questions this is redundant for example
        if "1." not in generated_text:
            raise ValueError(
                f"Answer 1. is missing from the generated text: '{generated_text}'"
            )
        text = generated_text.split("1.", 1)[1]

        # Start extracting the answers:
        answers = []
        for i in range(1, questions_amount + 1):
            # If it's the last answer to look for, take the rest of the text:
            if i == questions_amount:
                answer_i = text
            # Verify there is a question number in the text:
            elif f"{i + 1}." not in text:
                raise ValueError(
                    f"Answer {i + 1}. is missing from the generated text: '{generated_text}'"
                )
            # Take i's answer:
            else:
                answer_i, text = text.split(f"{i + 1}.", 1)
            # Collect the answer removing redundant spaces:
            answers.append(answer_i.strip())

        return answers
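        # Illustrative parsing example: with questions_amount=2, the generated text
        # "1. The apple is red. 2. Yes." is split into ["The apple is red.", "Yes."].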

    def _infer_questions(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:

        # Infer through the llm:
        batched_output = generation_pipeline(
            batched_input,
            generation_config=generation_config,
            eos_token_id=generation_pipeline.tokenizer.eos_token_id,
            return_full_text=False,
            num_return_sequences=1,
        )

        # Process the outputs to get the answers:
        batched_answers = []
        for output in batched_output:
            # Get the generated answers:
            answers = self._get_answers(
                generated_text=output[0]["generated_text"],
                questions_amount=questions_amount,
            )
            # Collect the processed answers:
            batched_answers.append(answers)
        return batched_answers

    def answer(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:
        """
        Answer questions with context from the given text files' contents using a pretrained LLM via the given pipeline.
        """
        return self._infer_questions(
            questions_amount=questions_amount,
            batched_input=batched_input,
            generation_pipeline=generation_pipeline,
            generation_config=generation_config,
        )


class PollQuestionHandler(QuestionHandler):
    """
    A class for handling question answering for poll-type questions.
    These questions are answered by asking the same question multiple times
    and choosing the most common answer or the average answer.
    """

    class ConfigKeys:
        """
        A static class holding all the possible poll question configuration option keys.
        """

        #: The number of times to ask the same question.
        POLL_COUNT = "poll_count"

        #: The strategy to use for choosing the answer from the poll.
        POLL_STRATEGY = "poll_strategy"

    class Strategy(enum.Enum):
        #: The most common answer strategy.
        MOST_COMMON = "most_common"

        #: The average answer strategy.
        AVERAGE = "average"

        @staticmethod
        def most_common(answers):
            """
            Calculate the most common answer for a given list of answers.
            """
            count = Counter(answers)
            most_common = count.most_common(1)
            return most_common[0][0]

        @staticmethod
        def average(answers):
            """
            Calculate the average answer for a given list of answers.
            """
            if isinstance(answers[0], str):
                raise ValueError(
                    "Cannot perform poll with average answer strategy of non numeric values,"
                    " please change the question to give numeric data, or choose 'most_common' as strategy."
                )
            else:
                numeric_values = answers
            avg = sum(numeric_values) / len(numeric_values)

            # Round to the closest integer and return corresponding value
            return round(avg)

        def do(self, answers):
            """
            Perform the strategy.
            """
            return getattr(self, self.value)(answers)
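            # Illustrative: Strategy.MOST_COMMON.do(["Yes", "No", "Yes"]) returns "Yes",
            # while Strategy.AVERAGE.do([3, 4, 5]) returns 4.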

    def __init__(self, poll_count: int = 5, poll_strategy: str = "most_common"):
        super().__init__()
        self.poll_count = poll_count
        self.poll_strategy = self.Strategy(poll_strategy)

    def answer(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:
        """
        Answer questions with context from the given text files' contents using a pretrained LLM via the given pipeline.
        """
        return self._answer_poll_questions(
            questions_amount=questions_amount,
            batched_input=batched_input,
            generation_pipeline=generation_pipeline,
            generation_config=generation_config,
        )

    def _answer_poll_questions(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:
        votes = []

        # Run the poll for each question
        for _ in range(self.poll_count):
            batched_answers = self._infer_questions(
                questions_amount=questions_amount,
                batched_input=batched_input,
                generation_pipeline=generation_pipeline,
                generation_config=generation_config,
            )
            votes.append(batched_answers)
        answers = []

        # Collect the answers according to the poll strategy
        # Average strategy works for numeric values only
        for batch in range(len(votes[0])):
            batched_answers = []
            for question in range(questions_amount):
                # Create a list of all answers to relevant question
                answer = [
                    votes[voter][batch][question] for voter in range(self.poll_count)
                ]
                answer = self.poll_strategy.do(answer)
                batched_answers.append(answer)
            answers.append(batched_answers)
        return answers


# Holds the names of the available question types (handled by the QuestionHandler classes above)
class QuestionTypes:
    DEFAULT = "default"
    POLL = "poll"


# Maps question types to their handlers
QUESTION_MAPPING = {
    QuestionTypes.DEFAULT: QuestionHandler,
    QuestionTypes.POLL: PollQuestionHandler,
}
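# An illustrative configuration (not taken from this file): a poll question group can be set
# with
#     questions_config = {"type": "poll", "poll_count": 5, "poll_strategy": "most_common"}
# which `answer_questions` resolves to
#     PollQuestionHandler(poll_count=5, poll_strategy="most_common")
# while an empty config (the default) resolves to the plain QuestionHandler.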
 + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - transformers + - torch + - tqdm + entry_points: + open_mpi_handler: + name: open_mpi_handler + doc: '' + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + outputs: [] + lineno: 58 + has_varargs: false + has_kwargs: false + decorator: + name: decorator + doc: '' + parameters: + - name: handler + outputs: [] + lineno: 66 + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + parameters: [] + outputs: [] + lineno: 71 + has_varargs: false + has_kwargs: true + answer_questions: + name: answer_questions + doc: 'Answer questions with a context to the given text files contents by a + pretrained LLM model. Each text file will have + + the following prompt built: + + + start of `text_wrapper` + + + + end of `text_wrapper` + + + start of `questions_wrapper` + + 1. + + 2. + + ... + + n. + + end of `questions_wrapper`' + parameters: + - name: data_path + type: Union[str, List[str]] + doc: A path to a directory of text files or a path to a text file to ask questions + about. + - name: model_name + type: str + doc: The pre-trained model name from the huggingface hub to use for asking + questions. + - name: questions + type: Union[List[str], List[List[str]]] + doc: The questions to ask. A list of lists of questions to ask per text file, + and devided by question groups, the groups can be dtermained by size (in + order to avoid large inputs to the llm) or by questioning method (regular + or poll like questioning). + - name: device_map + type: Union[str, dict] + doc: A map to use for loading the model on multiple devices. + default: null + - name: model_kwargs + type: dict + doc: Keyword arguments to pass for loading the model using HuggingFace's `transformers.AutoModelForCausalLM.from_pretrained` + function. + default: null + - name: auto_gptq_exllama_max_input_length + type: int + doc: For AutoGPTQ models to set and extend the model's input buffer size. + default: null + - name: tokenizer_name + type: str + doc: The tokenizer name from the huggingface hub to use. If not given, the + model name will be used. + default: null + - name: tokenizer_kwargs + type: dict + doc: Keyword arguments to pass for loading the tokenizer using HuggingFace's + `transformers.AutoTokenizer.from_pretrained` function. + default: null + - name: text_wrapper + type: Union[str, List[str]] + doc: A wrapper for the file's text. Will be added at the start of the prompt. + Must have a placeholder ('{}') for the text of the file. + default: '' + - name: questions_wrapper + type: Union[str, List[str]] + doc: A wrapper for the questions received. Will be added after the text wrapper + in the prompt template. Must have a placeholder ('{}') for the questions. + default: '' + - name: generation_config + type: Union[Dict, List[Dict]] + doc: HuggingFace's `GenerationConfig` keyword arguments to pass to the `generate` + method. + default: null + - name: questions_config + type: Union[Dict, List[Dict]] + doc: A dictionary or list of dictionaries containing specific ways to answer + questions (using a poll for example), each dictionary in the list is for + corresponding question group and determines the question asking method for + said group. + default: null + - name: batch_size + type: int + doc: Batch size for inference. + default: 1 + - name: questions_columns + type: List[str] + doc: Columns to use for the dataframe returned. 
+ default: null + - name: verbose + type: bool + doc: 'Whether to present logs of a progress bar and errors. Default: True.' + default: false + outputs: + - doc: 'A tuple of:' + type: Tuple[pd.DataFrame, dict] + lineno: 130 + has_varargs: false + has_kwargs: false + answer: + name: answer + doc: Answer questions with a context to the given text files contents by a pretrained + LLM model in given pipeline. + parameters: + - name: self + - name: questions_amount + type: int + - name: batched_input + type: List[str] + - name: generation_pipeline + type: Pipeline + - name: generation_config + type: GenerationConfig + outputs: + - type: List[List[str]] + lineno: 674 + has_varargs: false + has_kwargs: false + most_common: + name: most_common + doc: Calculate the most common answer for a given list of answers. + parameters: + - name: answers + outputs: [] + lineno: 637 + has_varargs: false + has_kwargs: false + average: + name: average + doc: Calculate the average answer for a given list of answers. + parameters: + - name: answers + outputs: [] + lineno: 646 + has_varargs: false + has_kwargs: false + do: + name: do + doc: Perform the strategy. + parameters: + - name: self + - name: answers + outputs: [] + lineno: 662 + has_varargs: false + has_kwargs: false + description: GenAI approach of question answering on a given data + default_handler: answer_questions + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/question_answering/0.4.0/src/item.yaml b/functions/master/question_answering/0.4.0/src/item.yaml new file mode 100755 index 00000000..56fc5a5e --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/item.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +categories: +- genai +- huggingface +- machine-learning +description: GenAI approach of question answering on a given data +doc: '' +example: question_answering.ipynb +generationDate: 2023-08-07:11-30 +hidden: false +icon: '' +labels: + author: yonish +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.5.2 +name: question_answering +platformVersion: 3.5.0 +spec: + filename: question_answering.py + handler: answer_questions + image: mlrun/mlrun + kind: job + requirements: + - transformers + - torch + - tqdm +url: '' +version: 0.4.0 diff --git a/functions/master/question_answering/0.4.0/src/question_answering.ipynb b/functions/master/question_answering/0.4.0/src/question_answering.ipynb new file mode 100644 index 00000000..7c506688 --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/question_answering.ipynb @@ -0,0 +1,903 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "75860292-80d3-4dfb-89e4-66579321c78b", + "metadata": {}, + "source": [ + "# Question Answering" + ] + }, + { + "cell_type": "markdown", + "id": "4593a39d-6e91-4f92-9e7e-09dcd7dbcab7", + "metadata": {}, + "source": [ + "## Short description and explenation" + ] + }, + { + "cell_type": "markdown", + "id": "14dc0595-8b8a-4a13-b6a7-2a1bc43d8d50", + "metadata": {}, + "source": [ + "This function enables ad-hoc question answering over documents by ingesting text into a language model and returning formatted responses.
    \n", + "It accepts:
    \n", + "\n", + "* A language model
    \n", + "* Text files with content
    \n", + "* Questions to answer
    \n", + "* More inputs can be given for configuration
    \n", + "\n", + "The model processes the files to build understanding. Questions posed are then answered in one of two modes:\n", + "\n", + "Default mode:
    \n", + "The model directly answers each question using its own capabilities.\n", + "\n", + "Poll mode:
    \n", + "Additional models are included to separately answer each question. An aggregation algorithm determines the best response through consensus between models.
    \n", + "Two options exist for consensus methodology:
    \n", + "\n", + "Average Answer:
    \n", + "Each model's answer is scored. The response with the average highest score amongst models is selected. Useful for numeric or ranked responses.\n", + "\n", + "Most Common Answer:
    The answer that occurs most frequently across models is selected. Useful for textual responses to avoid outliers.\n", + "\n", + "Using multiple models via the poll mode provides accuracy improvements for questions lacking definitive answers, as it refines responses through an ensemble process.
    " + ] + }, + { + "cell_type": "markdown", + "id": "ae957ac3-2c26-4a0b-8e44-8315caeb2953", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "id": "3a351565-6f2c-4fa3-a024-4b5d658db311", + "metadata": {}, + "source": [ + "At the core, advanced natural language processing (NLP) models called foundation models are being leveraged to read and comprehend the input text files.
    \n", + "Specifically, models such as GPT-3 or Codex from Anthropic are used as the base language model.\n", + "\n", + "When documents are fed into the function, the background process invokes these models to ingest and digest the information.
    \n", + "\n", + "This provides the knowledge base for the models to then offer informed answers tailored to any queries about the documents.
    \n", + "The parameters controlling model size and computation time provide tradeoffs between cost, speed, and sophistication of comprehension.\n", + "\n", + "Additionally, the poll option expands on a single model by sampling responses from a number of models as mentioned above.
    " + ] + }, + { + "cell_type": "markdown", + "id": "a6fc4aaa-530a-4e9e-8447-737a0cfd6ed5", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "markdown", + "id": "685d9000-37e1-462b-93c8-1bbfcdf6aaa1", + "metadata": {}, + "source": [ + "`transformers`
    \n", + "`torch`
    \n", + "`tqdm`
    " + ] + }, + { + "cell_type": "markdown", + "id": "73d9b369-1c36-42e8-b106-491ad911f281", + "metadata": {}, + "source": [ + "## Documentation" + ] + }, + { + "cell_type": "markdown", + "id": "68e3a54d-0cd9-4845-ae14-f24068052bf3", + "metadata": {}, + "source": [ + "`data_path`: A path to a directory of text files or a path to a text file to ask questions about.
    \n", + "\n", + "`model_name`: The pre-trained model name from the huggingface hub to use for answering questions.
    \n", + "\n", + "`questions`: The questions to ask. A list of lists of questions to ask per text file, and devided
    \n", + " by question groups, the groups can be determained by size (in order to
    \n", + " avoid large inputs to the llm) or by questioning method (regular or poll like questioning).
    \n", + " \n", + "`device_map`: A map to use for loading the model on multiple devices.
    \n", + "\n", + "`model_kwargs`: Keyword arguments to pass for loading the model using HuggingFace's
    \n", + " _transformers.AutoModelForCausalLM.from_pretrained_ function.
    \n", + " \n", + "`auto_gptq_exllama_max_input_length`: For AutoGPTQ models to set and extend the model's input buffer size.
    \n", + "\n", + "`tokenizer_name`: The tokenizer name from the huggingface hub to use. If not given, the given model name will be used.
    \n", + " \n", + "`tokenizer_kwargs`: Keyword arguments to pass for loading the tokenizer using HuggingFace's
    \n", + " _transformers.AutoTokenizer.from_pretrained_ function.
    \n", + " \n", + "`text_wrapper`: Must have a placeholder ('{}') for the text of the file.
    \n", + "\n", + "`questions_wrapper`: A wrapper for the questions received. Will be added after the text wrapper in the prompt template.
    \n", + " Must have a placeholder ('{}') for the questions.
    \n", + " \n", + "`generation_config`: HuggingFace's _GenerationConfig_ keyword arguments to pass to the _generate_ method.
    \n", + " \n", + "`questions_config`: A dictionary or list of dictionaries containing specific ways to answer questions (using a poll for example),
    \n", + " each dictionary in the list is for corresponding question group and determines the question asking method
    \n", + " for said group.
    \n", + " \n", + "`batch_size`: Batch size for inference.
    \n", + "\n", + "`questions_columns`: Columns to use for the dataframe returned.
    \n", + "\n", + "`verbose`: Whether to present logs of a progress bar and errors. Default: True.
    \n" + ] + }, + { + "cell_type": "markdown", + "id": "716e5fac-3def-4cdd-8ca5-d1c93ee64f2e", + "metadata": {}, + "source": [ + "## Demo 1" + ] + }, + { + "cell_type": "markdown", + "id": "3bf4bc9b-fc5e-4155-8563-0575c22cef05", + "metadata": {}, + "source": [ + "This is a short and simple example to show the basic use of the function." + ] + }, + { + "cell_type": "markdown", + "id": "c95dcfdb-22e1-4b82-b0a3-9c89487a216f", + "metadata": {}, + "source": [ + "### (1.) Import the function (import mlrun, set project and import function)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60161e5f-468c-47c9-be98-e6554b899c9c", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "import transformers\n", + "import tempfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1267b60b-35d1-48bf-8ea0-dfe7a5f366e7", + "metadata": {}, + "outputs": [], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"call-center-demo-1\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " parameters={\n", + " \"default_image\": \"mlrun/mlrun\",\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c8b39b-433c-40b8-9260-94923c9cbb6c", + "metadata": {}, + "outputs": [], + "source": [ + "func = project.set_function(\n", + " \"question-answering.py\",\n", + " name=\"question-answering\",\n", + " kind=\"job\",\n", + " handler=\"answer_questions\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "b9744a13-6530-4aa0-a30c-a88db94ce853", + "metadata": {}, + "source": [ + "We create a text file that the model can be asked about" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "503b874a-0c64-4a66-9b30-fe99191b5fd3", + "metadata": {}, + "outputs": [], + "source": [ + "def _make_data_dir_for_test():\n", + " data_dir = tempfile.mkdtemp()\n", + " # The information the model will need in order to answer our question\n", + " content = \"The apple is red.\"\n", + " with open(data_dir + \"/test_data.txt\", \"w\") as f:\n", + " f.write(content)\n", + " return data_dir" + ] + }, + { + "cell_type": "markdown", + "id": "7fadd06e-210b-45aa-b7ea-686058b6e7f4", + "metadata": {}, + "source": [ + "### (2.) Usage\n", + "Then we set where to take the path to the text file we want to ask about, the questions, and column name for the answer table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a634b19-d809-4436-bbdd-469fc1d61c6e", + "metadata": {}, + "outputs": [], + "source": [ + "input_path = _make_data_dir_for_test()\n", + "# The question for the model to answer\n", + "question = [\"What is the color of the apple?\"]\n", + "# The column of the answer in the data frame returned by the function\n", + "column_name = [\"color\"]" + ] + }, + { + "cell_type": "markdown", + "id": "0364ce68-079e-4769-89b6-661fcdc1d475", + "metadata": {}, + "source": [ + "Now we run the function with all the parameters we prepered earlier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "448bada9-8b52-4175-9839-ecb409ab3e35", + "metadata": {}, + "outputs": [], + "source": [ + "demo1_run = func.run(\n", + " handler=\"answer_questions\",\n", + " params={\n", + " \"model\": \"distilgpt2\",\n", + " \"input_path\": input_path,\n", + " \"questions\": question,\n", + " \"questions_columns\": column_name,\n", + " \"generation_config\": {\n", + " \"do_sample\": True,\n", + " \"temperature\": 0.8,\n", + " \"top_p\": 0.9,\n", + " \"early_stopping\": True,\n", + " \"max_new_tokens\": 20,\n", + " },\n", + " },\n", + " returns=[\n", + " \"question_answering_df: dataset\",\n", + " \"question_answering_errors: result\",\n", + " ],\n", + " local=True,\n", + " artifact_path=\"./\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "474505db-2fc8-48fd-a634-2bada802a449", + "metadata": {}, + "source": [ + "### (3.) Review results\n", + "and after the run is finished we can take a look and see our answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4560b51d-5f96-465d-9826-e88c7d4d46aa", + "metadata": {}, + "outputs": [], + "source": [ + "demo1_run.outputs" + ] + }, + { + "cell_type": "markdown", + "id": "31a401a5-2f8a-427f-bf62-2f31f94f5ee7", + "metadata": {}, + "source": [ + "## Demo 2" + ] + }, + { + "cell_type": "markdown", + "id": "503b8a40-ad61-445f-900b-4fdaa036e417", + "metadata": {}, + "source": [ + "This is a much larger example, we will show how we use this function to analyze a number of calls between agents and customer of a internet company (all the data is generated by Iguazio).
    \n", + "For something like this, we recomend using a strong model, and putting some time into making the prompts." + ] + }, + { + "cell_type": "markdown", + "id": "759c521b-df3d-498f-8642-863182107618", + "metadata": {}, + "source": [ + "### (1.) Import the function (import mlrun, set project and import function)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bde6a480-a3d9-4b8c-a9c0-daa235f0f0c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}\n" + ] + } + ], + "source": [ + "import os\n", + "import mlrun\n", + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "187a4643-53e9-40bb-a337-5096df7946d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"call-center-demo-2\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " parameters={\n", + " \"default_image\": \"mlrun/mlrun\",\n", + " })\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "17eb7783-9ced-482b-9bdf-c41e55995faf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "func = project.set_function(\n", + " \"question-answering.py\",\n", + " name=\"question-answering\",\n", + " kind=\"job\",\n", + " handler=\"answer_questions\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "91d3ebb2-7d4a-4e52-89ed-45287c06eb76", + "metadata": {}, + "source": [ + "### (2.) Usage\n", + "\n", + "This example is a bit more complicated as we mentioned, we give the model a list of questions, for some of them we give the model a list of answers to choose from." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2bc065e4-2dbf-4d7a-9772-6b7039f428bc", + "metadata": {}, + "outputs": [], + "source": [ + "QUESTIONS = [\n", + " \"1. Write a long summary of the text, focus on the topic (max 50 words).\",\n", + " \"2. Was the Client's concern addressed, (choose only one) [Yes, No]?\",\n", + " ]\n", + "\n", + "qa_questions_columns = [\n", + " \"Summary\",\n", + " \"is_fixed\",\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "id": "aa89f316-0d1b-4ada-9990-d2293546eee3", + "metadata": {}, + "source": [ + "Another thing we give the model this time is answer examples (one/few shot answering), this can be done to show the model how you want the answer to be structured or caculated.
    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbc093ad-dab4-46a1-b36a-2a7551cef018", + "metadata": {}, + "outputs": [], + "source": [ + "# For every file we ask about, the model will be presented with this example of a call and how we want the answers.\n", + "DEMO_CALL = (\n", + " \"Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist \"\n", + " \"you today?\\n\"\n", + " \"Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\\n\"\n", + " \"Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears \"\n", + " \"there was an error in the charges. I apologize for the inconvenience.\\n\"\n", + " \"Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\\n\"\n", + " \"Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department \"\n", + " \"for investigation and correction. You should see the adjustments on your next statement.\\n\"\n", + " \"Customer: That sounds good, Megan. I appreciate your help.\\n\"\n", + " \"Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\\n\"\n", + ")\n", + "\n", + "DEMO_ANSWERS = (\n", + " \"1. The customer, contacted the call center regarding billing discrepancies on her statement. The agent, \"\n", + " \"acknowledged the issue, assured The customer it would be resolved, and escalated it to the billing department for \"\n", + " \"correction.\\n\"\n", + " \"2. Yes.\\n\"" + ] + }, + { + "cell_type": "markdown", + "id": "8b44ded3-fee3-4911-a02a-6a51a62a7020", + "metadata": {}, + "source": [ + "Then we need to wrap it all nicely to be given to the model as a single prompt, this is done with a text wrapper, and a question wrapper.
    \n", + "both of them will be concatenated inside the function with the questions and passed to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2108f5aa-75a6-402d-83a6-bf45f0d7223a", + "metadata": {}, + "outputs": [], + "source": [ + "# The wrappers are built according to the model's convensions to improve result\n", + "TEXT_WRAPPER = (\n", + " f\"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " f\"{DEMO_CALL}\\n\"\n", + " f\"answer the questions as accurately as you can:\\n\"\n", + " f\"{QUESTIONS}<|im_end|>\\n\"\n", + " f\"<|im_start|>assistant:\\n\"\n", + " f\"{DEMO_ANSWERS}<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " \"{}\"\n", + ") \n", + "QUESTIONS_WRAPPER = (\n", + " \" answer the given questions as accurately as you can, do not write more answers the questions:\\n\"\n", + " \"{}<|im_end|>\\n\"\n", + " \"<|im_start|>assistant:\\n\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1a44b391-87d2-447d-aafa-66ed45f06ba5", + "metadata": {}, + "source": [ + "The last few parameters we need to set are the model we will use, the input lenth (no available for all models) and the batch size.
    \n", + "The batch size determains how many files we want procced at each epoch, and the larger we go the faster the proccess will be, as long as our memory is sufficient. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "528cae4c-541b-49a3-b24d-deb94f7130fb", + "metadata": {}, + "outputs": [], + "source": [ + "# We like this version of mistral's model, which is small and fast but also gives great results\n", + "qa_model = \"TheBloke/Mistral-7B-OpenOrca-GPTQ\"" + ] + }, + { + "cell_type": "markdown", + "id": "47fa4eaa-f3b0-457f-b98a-18a8ee5ba4d8", + "metadata": {}, + "source": [ + "Finnaly, we run the function with all the parameters we prepared. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d200706-e852-4ce9-9b9a-61686b30e5b7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Question answering:\n", + "demo2_run = func.run(\n", + " function=\"question-answering\",\n", + " local=True,\n", + " handler=\"answer_questions\",\n", + " inputs={\"data_path\": os.path.abspath(\"./calls\")},\n", + " params={\n", + " \"model_name\": qa_model,\n", + " \"device_map\": \"auto\",\n", + " \"text_wrapper\":TEXT_WRAPPER,\n", + " \"questions\": QUESTIONS,\n", + " \"questions_wrapper\": QUESTIONS_WRAPPER,\n", + " \"questions_columns\": qa_questions_columns,\n", + " },\n", + " returns=[\n", + " \"question_answering_df: dataset\",\n", + " \"question_answering_errors: result\",\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "0d505915-49b5-47fb-9f50-ce15fe6dc392", + "metadata": {}, + "source": [ + "### (3.) Review results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa39c5bf-c959-4ff5-ad60-4ad68d00f22c", + "metadata": {}, + "outputs": [], + "source": [ + "demo2_run.outputs" + ] + }, + { + "cell_type": "markdown", + "id": "947d6ce8-b330-44ab-b13f-b6eec20e839e", + "metadata": {}, + "source": [ + "## Demo 3" + ] + }, + { + "cell_type": "markdown", + "id": "66b916d2-96b0-448d-8e51-b51fb5a5a1a7", + "metadata": {}, + "source": [ + "This is also a large example, in this case we use another option of the function to ask questions in the form of a poll." + ] + }, + { + "cell_type": "markdown", + "id": "9ec66fc7-f50b-4417-a7cc-3c42848b1f01", + "metadata": {}, + "source": [ + "### (1.) 
Import the function (import mlrun, set project and import function)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dfcab8d0-5022-40e5-92ff-14b02cfa2eaa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}\n" + ] + } + ], + "source": [ + "import os\n", + "import mlrun\n", + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "49bc523b-9bca-46c5-917d-320d5641506a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"call-center-demo-3\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " parameters={\n", + " \"default_image\": \"mlrun/mlrun\",\n", + " })\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "370f3780-0dfc-4b9c-87aa-1dd124e62249", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "func = project.set_function(\n", + " \"question-answering.py\",\n", + " name=\"question-answering\",\n", + " kind=\"job\",\n", + " handler=\"answer_questions\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "88dbe941-b9af-40bb-a038-7fcc812d506c", + "metadata": {}, + "source": [ + "### (2.) Usage\n", + "\n", + "Like in the second demo, we make a list of questions for the function to answer." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f9b02aaa-2a31-4ade-ba26-a2d73c5d03ab", + "metadata": {}, + "outputs": [], + "source": [ + "# These questions are harder to answer, as there is no right answer.\n", + "# So we want it to be at least consistent, for that we use the poll option.\n", + "QUESTIONS = [\n", + " \"1. Rate the agent's level of empathy (The ability to understand and share the feelings of others) on a scale of 1-5.\",\n", + " \"2. Rate the agent's level of professionalism (Conducting oneself in a way that is appropriate for the workplace) on a scale of 1-5.\",\n", + "]\n", + "\n", + "qa_questions_columns = [\n", + " \"empathy\",\n", + " \"professionalism\",\n", + "\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "id": "6ed8a0e3-9c5d-4524-bbe1-b345b981694a", + "metadata": {}, + "source": [ + "Another thing we give the model this time is answer examples (one/few shot answering), this can be done to show the model how you want the answer to be structured or caculated.
    \n", + "So for every file we ask about, the model will be presented with this example of a call and how we want the answers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d14e79d6-687c-4424-a01f-68376ad3dd30", + "metadata": {}, + "outputs": [], + "source": [ + "# For every file we ask about, the model will be presented with this example of a call and how we want the answers.\n", + "DEMO_CALL = (\n", + " \"Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist \"\n", + " \"you today?\\n\"\n", + " \"Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\\n\"\n", + " \"Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears \"\n", + " \"there was an error in the charges. I apologize for the inconvenience.\\n\"\n", + " \"Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\\n\"\n", + " \"Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department \"\n", + " \"for investigation and correction. You should see the adjustments on your next statement.\\n\"\n", + " \"Customer: That sounds good, Megan. I appreciate your help.\\n\"\n", + " \"Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\\n\"\n", + ")\n", + "\n", + "\n", + "DEMO_ANSWERS = (\n", + " \"1. 4\\n\"\n", + " \"2. 5\\n\"\n", + "\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "86099fb8-895c-4e2c-979d-6bda9782ccd3", + "metadata": {}, + "source": [ + "Then we need to wrap it all nicely to be given to the model as a single prompt, this is done with a text wrapper, and a question wrapper.
    \n", + "both of them will be concatenated inside the function with the questions and passed to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5efac70-cd2c-4fc7-bc9c-4c04d18077a1", + "metadata": {}, + "outputs": [], + "source": [ + "TEXT_WRAPPER = (\n", + " f\"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " f\"{DEMO_CALL}\\n\"\n", + " f\"answer the questions as accurately as you can:\\n\"\n", + " f\"{QUESTIONS}<|im_end|>\\n\"\n", + " f\"<|im_start|>assistant:\\n\"\n", + " f\"{DEMO_ANSWERS}<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " \"{}\"\n", + ") \n", + "\n", + "QUESTIONS_WRAPPER = (\n", + " \" answer the given questions as accurately as you can, do not write more answers the questions:\\n\"\n", + " \"{}<|im_end|>\\n\"\n", + " \"<|im_start|>assistant:\\n\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9339816e-d436-4add-b8f3-b48e577f4bfe", + "metadata": {}, + "source": [ + "The config is for the second questioning method, we cal \"poll\", and in which we need to choose how many voting models we want participating,
    \n", + "and in what way we want do decide the result, we currentlly support `average` and `most_common` as show here.
    \n", + "\n", + "\n", + "*An explenation about both questioning methods can be found in the begginig of this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6330db65-9806-44a6-8046-0b156d2a3228", + "metadata": {}, + "outputs": [], + "source": [ + "questions_config = \n", + " {\n", + " \"type\": \"poll\",\n", + " \"poll_count\": 3, # How many 'voters'\n", + " \"poll_strategy\": \"most_common\"\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaa0ae3d-9302-4b73-92f1-8c43ec92e9cd", + "metadata": {}, + "outputs": [], + "source": [ + "qa_model = \"TheBloke/Mistral-7B-OpenOrca-GPTQ\"" + ] + }, + { + "cell_type": "markdown", + "id": "20c0e1eb-49cf-426e-b125-eb133d440fbd", + "metadata": {}, + "source": [ + "Finnaly, we run the function with all the parameters we prepared. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03d6d619-618a-49d6-b0be-43c300902927", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Question answering:\n", + "demo3_run = func.run(\n", + " function=\"question-answering\",\n", + " local=True,\n", + " handler=\"answer_questions\",\n", + " inputs={\"data_path\": os.path.abspath(\"./calls\")},\n", + " params={\n", + " \"model_name\": qa_model,\n", + " \"device_map\": \"auto\",\n", + " \"text_wrapper\":TEXT_WRAPPER,\n", + " \"questions\": QUESTIONS,\n", + " \"questions_wrapper\": QUESTIONS_WRAPPER,\n", + " \"questions_columns\": qa_questions_columns,\n", + " \"questions_config\": questions_config, # This time we add 'questions_config'\n", + " },\n", + " returns=[\n", + " \"question_answering_df: dataset\",\n", + " \"question_answering_errors: result\",\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "534edd4e-1e5b-4663-a2bb-bc6da7b603ca", + "metadata": {}, + "source": [ + "### (3.) Review results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a61f06ad-ee28-45c9-b7da-d93c5a296810", + "metadata": {}, + "outputs": [], + "source": [ + "demo3_run.outputs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/question_answering/0.4.0/src/question_answering.py b/functions/master/question_answering/0.4.0/src/question_answering.py new file mode 100644 index 00000000..2e4e96d0 --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/question_answering.py @@ -0,0 +1,736 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import enum +import logging +import operator +import pathlib +from collections import Counter +from functools import reduce, wraps +from typing import Any, Dict, List, Tuple, Union + +import pandas as pd +import transformers +from tqdm import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + global _LOGGER + + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + _LOGGER = context.logger + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." + ) + raise mpi4py_not_found + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_text_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." 
+ ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + dataframe = pd.concat(objs=[df for df, _ in output], axis=0) + errors_dictionary = reduce(operator.ior, [err for _, err in output], {}) + return dataframe, errors_dictionary + return None + + return wrapper + + return decorator + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def answer_questions( + data_path: Union[str, List[str]], + model_name: str, + questions: Union[List[str], List[List[str]]], + device_map: Union[str, dict] = None, + model_kwargs: dict = None, + auto_gptq_exllama_max_input_length: int = None, + tokenizer_name: str = None, + tokenizer_kwargs: dict = None, + text_wrapper: Union[str, List[str]] = "", + questions_wrapper: Union[str, List[str]] = "", + generation_config: Union[Dict, List[Dict]] = None, + questions_config: Union[Dict, List[Dict]] = None, + batch_size: int = 1, + questions_columns: List[str] = None, + verbose: bool = False, +) -> Tuple[pd.DataFrame, dict]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model. Each text file will have + the following prompt built: + + start of `text_wrapper` + + end of `text_wrapper` + + start of `questions_wrapper` + 1. + 2. + ... + n. + end of `questions_wrapper` + + :param data_path: A path to a directory of text files or a path to a text file to ask + questions about. + :param model_name: The pre-trained model name from the huggingface hub to use for asking + questions. + :param questions: The questions to ask. + A list of lists of questions to ask per text file, and devided + by question groups, the groups can be dtermained by size (in order to + avoid large inputs to the llm) or by questioning method + (regular or poll like questioning). + :param device_map: A map to use for loading the model on multiple devices. + :param model_kwargs: Keyword arguments to pass for loading the model using HuggingFace's + `transformers.AutoModelForCausalLM.from_pretrained` function. + :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size. + :param tokenizer_name: The tokenizer name from the huggingface hub to use. If not given, the + model name will be used. + :param tokenizer_kwargs: Keyword arguments to pass for loading the tokenizer using HuggingFace's + `transformers.AutoTokenizer.from_pretrained` function. + :param text_wrapper: A wrapper for the file's text. Will be added at the start of the prompt. + Must have a placeholder ('{}') for the text of the file. + :param questions_wrapper: A wrapper for the questions received. Will be added after the text + wrapper in the prompt template. Must have a placeholder ('{}') for the + questions. + :param generation_config: HuggingFace's `GenerationConfig` keyword arguments to pass to the + `generate` method. 
+ :param questions_config: A dictionary or list of dictionaries containing specific ways to answer + questions (using a poll for example), each dictionary in the list is for + corresponding question group and determines the question asking method + for said group. + :param batch_size: Batch size for inference. + :param questions_columns: Columns to use for the dataframe returned. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + + :returns: A tuple of: + + * A dataframe dataset of the questions answers. + * A dictionary of errored files that were not inferred or were not answered properly. + """ + global _LOGGER + + # Set configs to empty dict if not given: + if generation_config is None: + generation_config = {} + if questions_config is None: + questions_config = {} + + # Get the input text files to question: + if verbose: + _LOGGER.info("Collecting text files.") + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + text_files = _get_text_files(data_path=data_path) + else: + text_files = data_path + if verbose: + _LOGGER.info(f"Collected {len(text_files)} text files.") + + # Get the prompt template: + if verbose: + _LOGGER.info("Creating prompt template.") + + # Organize questions as a list of list, and count number of sub-lists for future use + number_of_question_groups = 1 if isinstance(questions[0], str) else len(questions) + questions = _to_group_list( + argument_value=questions, + argument_name="questions", + length=number_of_question_groups, + ) + + # Organize prompt parts at proper length + text_wrapper = _to_group_list( + argument_value=text_wrapper, + argument_name="text_wrapper", + length=number_of_question_groups, + ) + questions_wrapper = _to_group_list( + argument_value=questions_wrapper, + argument_name="questions_wrapper", + length=number_of_question_groups, + ) + + # Create a list of prompt according to given parts and questions + prompt_template = [] + questions = questions if isinstance(questions[0], list) else [questions] + + # Build all prompts + for i in range(number_of_question_groups): + prompt_template.append( + _get_prompt_template( + text_wrapper=text_wrapper[i], + questions_wrapper=questions_wrapper[i], + questions=questions[i], + ) + ) + if verbose: + _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n") + + # Get the total amount of questions: + questions_amount = sum([len(sublist) for sublist in questions]) + + # Get the questions columns: + questions_columns = questions_columns or [ + f"q{i}" for i in range(1, questions_amount + 1) + ] + + # Check if we have the correct amount of questions columns: + if len(questions_columns) != questions_amount: + raise ValueError( + f"The provided questions columns length ({len(questions_columns)}) " + f"does not match the questions amount ({questions_amount})" + ) + + # Load the generation config: + if verbose: + _LOGGER.info("Loading generation configuration.") + generation_config = [ + transformers.GenerationConfig(**(cfg or {})) + for cfg in _to_group_list( + argument_value=generation_config, + argument_name="generation_config", + length=number_of_question_groups, + ) + ] + if verbose: + _LOGGER.info(f"Generation configuration loaded: {generation_config}") + + # Load the model and tokenizer into a pipeline object: + if verbose: + _LOGGER.info(f"Loading model '{model_name}'.") + generation_pipeline = _get_generation_pipeline( + model_name=model_name, + device_map=device_map, + tokenizer_name=tokenizer_name or model_name, + 
model_kwargs=model_kwargs or {}, + tokenizer_kwargs=tokenizer_kwargs or {}, + auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length, + batch_size=batch_size, + ) + if verbose: + _LOGGER.info("Model loaded.") + + # Prepare the successes dataframe and errors dictionary to be returned: + successes = [] + errors = {} + + # Split the files into batches: + file_batches = [ + text_files[i : i + batch_size] + if i + batch_size < len(text_files) + else text_files[i:] + for i in range(0, len(text_files), batch_size) + ] + questions_config = _to_group_list( + argument_value=questions_config, + argument_name="questions_config", + length=number_of_question_groups, + ) + + # Create a list of question handlers according to given configs + handlers = [] + for cfg in questions_config: + question_type = cfg.pop("type", "default") + handlers.append(QUESTION_MAPPING.get(question_type)(**cfg)) + + # Go over the batches of text files and question them: + for file_batch in tqdm( + file_batches, + desc="Generating answers", + unit=f"file (batch of {batch_size})", + disable=not verbose, + ): + try: + total_answers = [[] for _ in range(batch_size)] + + # Go over all question group per batch of documents + for question_group in range(number_of_question_groups): + current_questions_amount = len(questions[question_group]) + + # Read batch (read the text from the text files): + batched_input = _read_file_batch( + file_batch=file_batch, + prompt_template=prompt_template[question_group], + ) + + # Answer the questions with each question handler: + batched_answers = handlers[question_group].answer( + questions_amount=current_questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config[question_group], + ) + + # Put the answers in the correct place in the total answers list according to the place in the batch: + for i in range(batch_size): + total_answers[i].extend(batched_answers[i]) + + # Collect the answers and attach the file name: + successes.extend( + [ + [file.name, *answers] + for file, answers in zip(file_batch, total_answers) + ] + ) + except Exception as exception: + # Note the exception as error in the dictionary: + batch_file_names = ", ".join([file.name for file in file_batch]) + if verbose: + _LOGGER.warning( + f"Error in batch '{batch_file_names}': {str(exception)}" + ) + errors[batch_file_names] = str(exception) + continue + + # Construct the answers dataframe: + columns = [ + "text_file", + *questions_columns, + ] + + # Create a data frame of answers by files + successes = pd.DataFrame( + successes, + columns=columns, + ) + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(text_files)})\n" + f"Answers summary:\n" + f"{successes.head()}" + ) + return successes, errors + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. 
" + f"Given: {str(data_path)} " + ) + + return text_files + + +def _get_prompt_template( + text_wrapper: str, + questions_wrapper: str, + questions: List[str], +) -> str: + + # Validate and build the text wrapper: + text_wrapper = text_wrapper or ( + "Given the following text:\n" "-----\n" "{}\n" "-----" + ) + if text_wrapper.count("{}") != 1: + raise ValueError( + "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about." + ) + + # Validate and build the question wrapper: + questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}" + if questions_wrapper.count("{}") != 1: + raise ValueError( + "The `questions_wrapper` must include one placeholder '{}' for the list of questions." + ) + + # Validate and parse the questions: + if len(questions) == 0: + raise ValueError("Please include at least one question.") + questions = "\n".join( + [f"{i}. {question}" for i, question in enumerate(questions, 1)] + ) + + # Construct the template: + return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n" + + +def _get_generation_pipeline( + model_name: str, + device_map: Union[str, dict], + tokenizer_name: str, + model_kwargs: dict, + tokenizer_kwargs: dict, + auto_gptq_exllama_max_input_length: int = None, + batch_size: int = 1, +): + # Load the model: + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name, device_map=device_map, **model_kwargs + ) + + # Set exllama max input length if provided: + # This changes the model's context size. + if auto_gptq_exllama_max_input_length: + from auto_gptq import exllama_set_max_input_length + + model = exllama_set_max_input_length( + model=model, max_input_length=auto_gptq_exllama_max_input_length + ) + + # Load the tokenizer: + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, **tokenizer_kwargs + ) + + # Initialize a generation pipline and return: + pipe = transformers.pipeline( + task="text-generation", + model=model, + tokenizer=tokenizer, + batch_size=batch_size, + ) + pipe.tokenizer.pad_token_id = model.config.eos_token_id + return pipe + + +def _read_file_batch( + file_batch: List[pathlib.Path], + prompt_template: str, +) -> List[str]: + batch = [] + + # Go over all files and read in usable format + for file in file_batch: + with open(file, "r", encoding="utf-8") as fp: + batch.append(prompt_template.format(fp.read())) + return batch + + +def _to_group_list(argument_value: list, argument_name: str, length: int): + + # Check if is list, turn to list if not + argument_value = ( + argument_value if isinstance(argument_value, list) else [argument_value] + ) + list_len = len(argument_value) + + # If not a list, or is a list of len 1 we duplicate for correct length + # If list in wrong length throw an error + if list_len != length: + if list_len == 1: + return argument_value * length + raise ValueError( + f"The argument value of '{argument_name}' is not equal to the length of the given questions - {length}" + ) + return argument_value + + +class QuestionHandler: + """ + A class for handling questions answering for a given question type. + This class is used as a base class for all question types, and for default question type (regular question + answering without any special handling). 
+ """ + + class ConfigKeys: + pass + + def __init__(self): + pass + + @staticmethod + def _get_answers(generated_text: str, questions_amount: int) -> List[str]: + + # Clear answer start (part before numbers): + # TODO find better way to verify, for list of questions this is redundant for example + if "1." not in generated_text: + raise ValueError( + f"Answer 1. is missing from the generated text: '{generated_text}'" + ) + text = generated_text.split("1.", 1)[1] + + # Start extracting the answers: + answers = [] + for i in range(1, questions_amount + 1): + # If it's the last answer to look for, take the rest of the text: + if i == questions_amount: + answer_i = text + # Verify there is a question number in the text: + elif f"{i + 1}." not in text: + raise ValueError( + f"Answer {i + 1}. is missing from the generated text: '{generated_text}'" + ) + # Take i's answer: + else: + answer_i, text = text.split(f"{i + 1}.", 1) + # Collect the answer removing redundant spaces: + answers.append(answer_i.strip()) + + return answers + + def _infer_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + + # Infer through the llm: + batched_output = generation_pipeline( + batched_input, + generation_config=generation_config, + eos_token_id=generation_pipeline.tokenizer.eos_token_id, + return_full_text=False, + num_return_sequences=1, + ) + + # Process the outputs to get the answers: + batched_answers = [] + for output in batched_output: + # Get the generated answers: + answers = self._get_answers( + generated_text=output[0]["generated_text"], + questions_amount=questions_amount, + ) + # Collect the processed answers: + batched_answers.append(answers) + return batched_answers + + def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + + +class PollQuestionHandler(QuestionHandler): + """ + Static class to hold all the possible poll question configurations options keys + """ + + class ConfigKeys: + """ + A class for handling questions answering for poll type questions. + These type of question are answered by asking the same question multiple times + and choosing the most common answer or the average answer. + """ + + #: The number of times to ask the same question. + POLL_COUNT = "poll_count" + + #: The strategy to use for choosing the answer from the poll. + POLL_STRATEGY = "poll_strategy" + + class Strategy(enum.Enum): + #: The most common answer strategy. + MOST_COMMON = "most_common" + + #: The average answer strategy. + AVERAGE = "average" + + @staticmethod + def most_common(answers): + """ + Calculate the most common answer for a given list of answers. + """ + count = Counter(answers) + most_common = count.most_common(1) + return most_common[0][0] + + @staticmethod + def average(answers): + """ + Calculate the average answer for a given list of answers. 
+ """ + if isinstance(answers[0], str): + raise ValueError( + "Cannot perform poll with average answer strategy of non numeric values," + " please change the question to give numeric data, or choose 'most_common' as strategy." + ) + else: + numeric_values = answers + avg = sum(numeric_values) / len(numeric_values) + + # Round to the closest integer and return corresponding value + return round(avg) + + def do(self, answers): + """ + Perform the strategy. + """ + return getattr(self, self.value)(answers) + + def __init__( + self, poll_count: int = 5, poll_strategy: str = "most_common"): + super().__init__() + self.poll_count = poll_count + self.poll_strategy = self.Strategy(poll_strategy) + + def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._answer_poll_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + + def _answer_poll_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + votes = [] + + # Run the poll for each question + for _ in range(self.poll_count): + batched_answers = self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + votes.append(batched_answers) + answers = [] + + # Collect the answers according to the poll strategy + # Average strategy works for numeric values only + for batch in range(len(votes[0])): + batched_answers = [] + for question in range(questions_amount): + # Create a list of all answers to relevant question + answer = [ + votes[voter][batch][question] for voter in range(self.poll_count) + ] + answer = self.poll_strategy.do(answer) + batched_answers.append(answer) + answers.append(batched_answers) + return answers + + +# Holds names of QuestionHandles +class QuestionTypes: + DEFAULT = "default" + POLL = "poll" + + +# Maps question types to their handlers +QUESTION_MAPPING = { + QuestionTypes.DEFAULT: QuestionHandler, + QuestionTypes.POLL: PollQuestionHandler, +} diff --git a/functions/master/question_answering/0.4.0/src/requirements.txt b/functions/master/question_answering/0.4.0/src/requirements.txt new file mode 100644 index 00000000..d05cb777 --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/requirements.txt @@ -0,0 +1,4 @@ +transformers +tqdm +torch +einops \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/src/test_question_answering.py b/functions/master/question_answering/0.4.0/src/test_question_answering.py new file mode 100644 index 00000000..f35b4364 --- /dev/null +++ b/functions/master/question_answering/0.4.0/src/test_question_answering.py @@ -0,0 +1,76 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import mlrun +import transformers +import tempfile + +APPLE_COLOR = "red" + + +def mock_pipeline_call(*args, **kwargs): + return [[{"generated_text": "1. " + APPLE_COLOR}]] + + +def _make_data_dir_for_test(): + data_dir = tempfile.mkdtemp() + content = "The apple color is red." + with open(data_dir + "/test_data.txt", "w") as f: + f.write(content) + return data_dir + + +def test_question_answering(monkeypatch): + monkeypatch.setattr(transformers.Pipeline, "__call__", mock_pipeline_call) + input_path = "./data" + artifact_path = tempfile.mkdtemp() + project = mlrun.new_project("qa", context="./") + fn = project.set_function("question_answering.py", "answer_questions", kind="job", image="mlrun/mlrun") + qa_run = fn.run( + handler="answer_questions", + params={ + "model_name": "distilgpt2", + "data_path": input_path, + "text_wrapper": ( + "Given the following sentence:\n" + "-----\n" + "{}\n" + "-----" + ), + "questions": [ + "What is the color of the apple?", + ], + "questions_columns": [ + "color", + ], + "generation_config": { + "do_sample": True, + "temperature": 0.8, + "top_p": 0.9, + "early_stopping": True, + "max_new_tokens": 20, + }, + }, + returns=[ + "question_answering_df: dataset", + "question_answering_errors: result", + ], + local=True, + artifact_path=artifact_path + ) + qa_df = mlrun.get_dataitem( + qa_run.status.artifacts[0]["spec"]["target_path"] + ).as_df() + assert qa_df["color"][0] == APPLE_COLOR + assert qa_run.outputs["question_answering_errors"] == {} diff --git a/functions/master/question_answering/0.4.0/static/documentation.html b/functions/master/question_answering/0.4.0/static/documentation.html new file mode 100644 index 00000000..3ec31796 --- /dev/null +++ b/functions/master/question_answering/0.4.0/static/documentation.html @@ -0,0 +1,377 @@ + + + + + + + +question_answering package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    question_answering package

    + +
    + +
    +
    +
    +
    +
    +

    question_answering package#

    +
    +

    Submodules#

    +
    +
    +

    question_answering.question_answering module#

    +
    +
    +class question_answering.question_answering.PollQuestionHandler(poll_count: int = 5, poll_strategy: str = 'most_common')[source]#
    +

    Bases: question_answering.question_answering.QuestionHandler

    +

    Static class to hold all the possible poll question configurations options keys

    +
    +
    +class ConfigKeys[source]#
    +

    Bases: object

    +

A class for handling question answering for poll-type questions. +These types of questions are answered by asking the same question multiple times +and choosing the most common answer or the average answer.

    +
    +
    +POLL_COUNT = 'poll_count'#
    +

    The number of times to ask the same question.

    +
    +
    +
    +POLL_STRATEGY = 'poll_strategy'#
    +

    The strategy to use for choosing the answer from the poll.

    +
    +
    +
    +
    +class Strategy(value)[source]#
    +

    Bases: enum.Enum

    +

    An enumeration.

    +
    +
    +AVERAGE = 'average'#
    +

    The average answer strategy.

    +
    +
    +
    +MOST_COMMON = 'most_common'#
    +

    The most common answer strategy.

    +
    +
    +
    +static average(answers)[source]#
    +

    Calculate the average answer for a given list of answers.

    +
    +
    +
    +do(answers)[source]#
    +

    Perform the strategy.

    +
    +
    +
    +static most_common(answers)[source]#
    +

    Calculate the most common answer for a given list of answers.

    +
    +
    +
    +
+answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) → List[List[str]][source]#
    +

Answer questions about the given text files' contents with a pretrained LLM through the given pipeline.
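In practice this handler is usually selected through the questions_config parameter of answer_questions rather than constructed directly. For illustration, a poll configuration matching the keys documented above (values are illustrative, taken from the example notebook):

```python
# Illustrative poll configuration; "type" selects the handler and the
# remaining keys are passed to PollQuestionHandler(...).
questions_config = {
    "type": "poll",
    "poll_count": 3,                 # how many times each question is asked ("voters")
    "poll_strategy": "most_common",  # or "average" for numeric answers
}
```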

    +
    +
    +
    +
    +class question_answering.question_answering.QuestionHandler[source]#
    +

    Bases: object

    +

A class for handling question answering for a given question type. +This class is used as a base class for all question types, and for the default question type (regular question +answering without any special handling).

    +
    +
    +class ConfigKeys[source]#
    +

    Bases: object

    +
    +
    +
+answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) → List[List[str]][source]#
    +

Answer questions about the given text files' contents with a pretrained LLM through the given pipeline.

    +
    +
    +
    +
    +class question_answering.question_answering.QuestionTypes[source]#
    +

    Bases: object

    +
    +
    +DEFAULT = 'default'#
    +
    +
    +
    +POLL = 'poll'#
    +
    +
    +
    +
+question_answering.question_answering.answer_questions(data_path: Union[str, List[str]], model_name: str, questions: Union[List[str], List[List[str]]], device_map: Optional[Union[str, dict]] = None, model_kwargs: Optional[dict] = None, auto_gptq_exllama_max_input_length: Optional[int] = None, tokenizer_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = None, text_wrapper: Union[str, List[str]] = '', questions_wrapper: Union[str, List[str]] = '', generation_config: Optional[Union[Dict, List[Dict]]] = None, questions_config: Optional[Union[Dict, List[Dict]]] = None, batch_size: int = 1, questions_columns: Optional[List[str]] = None, verbose: bool = False) → Tuple[pandas.core.frame.DataFrame, dict][source]#
    +

Answer questions about the given text files' contents with a pretrained LLM. Each text file will have +the following prompt built:

    +

    start of text_wrapper +<text file content> +end of text_wrapper

    +

    start of questions_wrapper +1. <questions[0]> +2. <questions[1]> +… +n. <questions[n-1]> +end of questions_wrapper
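For illustration, a simplified sketch of how such a prompt is assembled from the wrappers and the questions (mirroring the default wrappers shown in the module source; not the exact internal code):

```python
# Simplified sketch of the prompt assembly described above.
text_wrapper = "Given the following text:\n-----\n{}\n-----"   # default-style text wrapper
questions_wrapper = "Answer the questions:\n{}"                 # default-style questions wrapper
questions = ["What is the color of the apple?", "Is the apple ripe?"]

# Questions are numbered and joined, then placed inside the questions wrapper:
numbered_questions = "\n".join(f"{i}. {q}" for i, q in enumerate(questions, 1))
prompt_template = f"{text_wrapper}\n{questions_wrapper.format(numbered_questions)}\n"

# The remaining '{}' placeholder is later filled with each file's content:
prompt = prompt_template.format("The apple is red.")
print(prompt)
```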

    +
    +
    Parameters
    +
      +
    • data_path – A path to a directory of text files or a path to a text file to ask +questions about.

    • +
    • model_name – The pre-trained model name from the huggingface hub to use for asking +questions.

    • +
• questions – The questions to ask. +A list of lists of questions to ask per text file, divided +by question groups; the groups can be determined by size (in order to +avoid large inputs to the llm) or by questioning method +(regular or poll-like questioning).

    • +
    • device_map – A map to use for loading the model on multiple devices.

    • +
    • model_kwargs – Keyword arguments to pass for loading the model using HuggingFace’s +transformers.AutoModelForCausalLM.from_pretrained function.

    • +
    • auto_gptq_exllama_max_input_length – For AutoGPTQ models to set and extend the model’s input buffer size.

    • +
    • tokenizer_name – The tokenizer name from the huggingface hub to use. If not given, the +model name will be used.

    • +
    • tokenizer_kwargs – Keyword arguments to pass for loading the tokenizer using HuggingFace’s +transformers.AutoTokenizer.from_pretrained function.

    • +
    • text_wrapper – A wrapper for the file’s text. Will be added at the start of the prompt. +Must have a placeholder (‘{}’) for the text of the file.

    • +
    • questions_wrapper – A wrapper for the questions received. Will be added after the text +wrapper in the prompt template. Must have a placeholder (‘{}’) for the +questions.

    • +
    • generation_config – HuggingFace’s GenerationConfig keyword arguments to pass to the +generate method.

    • +
    • questions_config – A dictionary or list of dictionaries containing specific ways to answer +questions (using a poll for example), each dictionary in the list is for +corresponding question group and determines the question asking method +for said group.

    • +
    • batch_size – Batch size for inference.

    • +
    • questions_columns – Columns to use for the dataframe returned.

    • +
    • verbose – Whether to present logs of a progress bar and errors. Default: True.

    • +
    +
    +
    Returns
    +

    A tuple of:

    +
      +
    • A dataframe dataset of the questions answers.

    • +
    • A dictionary of errored files that were not inferred or were not answered properly.

    • +
    +

    +
    +
    +
    +
    +
    +question_answering.question_answering.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Optional[Dict[str, Any]] = None)[source]#
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/static/example.html b/functions/master/question_answering/0.4.0/static/example.html new file mode 100644 index 00000000..f2423f96 --- /dev/null +++ b/functions/master/question_answering/0.4.0/static/example.html @@ -0,0 +1,904 @@ + + + + + + + +Question Answering + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Question Answering#

    +
    +

Short description and explanation#

    +

    This function enables ad-hoc question answering over documents by ingesting text into a language model and returning formatted responses.
    +It accepts:

    +
      +
    • A language model

    • +
    • Text files with content

    • +
    • Questions to answer

    • +
    • More inputs can be given for configuration

    • +
    +

    The model processes the files to build understanding. Questions posed are then answered in one of two modes:

    +

    Default mode:
    +The model directly answers each question using its own capabilities.

    +

    Poll mode:
    +Additional models are included to separately answer each question. An aggregation algorithm determines the best response through consensus between models.
    +Two options exist for consensus methodology:

    +

    Average Answer:
+The numeric answers collected in the poll are averaged (and rounded), and the result is used as the final answer. Useful for numeric or ranked responses.

    +

    Most Common Answer:
    The answer that occurs most frequently across models is selected. Useful for textual responses to avoid outliers.

    +

    Using multiple models via the poll mode provides accuracy improvements for questions lacking definitive answers, as it refines responses through an ensemble process.
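As a rough sketch, the two consensus strategies reduce a list of poll answers to a single value along these lines (simplified from PollQuestionHandler in question_answering.py):

```python
from collections import Counter

def most_common(answers):
    # Pick the answer that appears most often across the poll.
    return Counter(answers).most_common(1)[0][0]

def average(answers):
    # Average numeric answers and round to the nearest integer.
    return round(sum(answers) / len(answers))

print(most_common(["Yes", "No", "Yes"]))  # -> Yes
print(average([4, 5, 4]))                 # -> 4
```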

    +
    +
    +

    Background#

    +

    At the core, advanced natural language processing (NLP) models called foundation models are being leveraged to read and comprehend the input text files.
+Specifically, generative language models pulled from the Hugging Face hub (for example, the Mistral-based model used in the demos below) are used as the base language model.

    +

    When documents are fed into the function, the background process invokes these models to ingest and digest the information.

    +

    This provides the knowledge base for the models to then offer informed answers tailored to any queries about the documents.
    +The parameters controlling model size and computation time provide tradeoffs between cost, speed, and sophistication of comprehension.

    +

    Additionally, the poll option expands on a single model by sampling responses from a number of models as mentioned above.

    +
    +
    +

    Requirements#

    +

    transformers
    +torch
    +tqdm

    +
    +
    +

    Documentation#

    +

    data_path: A path to a directory of text files or a path to a text file to ask questions about.

    +

    model_name: The pre-trained model name from the huggingface hub to use for answering questions.

    +

questions: The questions to ask. A list of lists of questions to ask per text file, divided
+by question groups; the groups can be determined by size (in order to
+avoid large inputs to the llm) or by questioning method (regular or poll-like questioning). A grouping sketch appears after this parameter list.

    +

    device_map: A map to use for loading the model on multiple devices.

    +

    model_kwargs: Keyword arguments to pass for loading the model using HuggingFace’s
    +transformers.AutoModelForCausalLM.from_pretrained function.

    +

    auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model’s input buffer size.

    +

tokenizer_name: The tokenizer name from the huggingface hub to use. If not given, the model name will be used.

    +

    tokenizer_kwargs: Keyword arguments to pass for loading the tokenizer using HuggingFace’s
    +transformers.AutoTokenizer.from_pretrained function.

    +

text_wrapper: A wrapper for the file’s text, added at the start of the prompt. Must have a placeholder (‘{}’) for the text of the file.

    +

    questions_wrapper: A wrapper for the questions received. Will be added after the text wrapper in the prompt template.
    +Must have a placeholder (‘{}’) for the questions.

    +

    generation_config: HuggingFace’s GenerationConfig keyword arguments to pass to the generate method.

    +

    questions_config: A dictionary or list of dictionaries containing specific ways to answer questions (using a poll for example),
    +each dictionary in the list is for corresponding question group and determines the question asking method
    +for said group.

    +

    batch_size: Batch size for inference.

    +

    questions_columns: Columns to use for the dataframe returned.

    +

    verbose: Whether to present logs of a progress bar and errors. Default: True.
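To make the grouping-related parameters concrete, here is an illustrative combination of questions, questions_columns, and questions_config for two question groups (values are examples only, not part of the demos below):

```python
# Two question groups: the first answered with the default handler, the second via a poll.
questions = [
    ["1. Write a short summary of the text.", "2. Was the client's concern addressed [Yes, No]?"],
    ["1. Rate the agent's empathy on a scale of 1-5."],
]
# One column per question, across all groups (3 questions -> 3 columns).
questions_columns = ["Summary", "is_fixed", "empathy"]
# One config per group; an empty dict means the default question handler.
questions_config = [
    {},
    {"type": "poll", "poll_count": 3, "poll_strategy": "most_common"},
]
```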

    +
    +
    +

    Demo 1#

    +

    This is a short and simple example to show the basic use of the function.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import mlrun
    +import transformers
    +import tempfile
    +
    +
    +
    +
    +
    +
    +
    project = mlrun.get_or_create_project(
    +    name="call-center-demo-1",
    +    context="./",
    +    user_project=True,
    +    parameters={
    +        "default_image": "mlrun/mlrun",
    +    })
    +
    +
    +
    +
    +
    +
    +
    func = project.set_function(
    +    "question-answering.py",
    +    name="question-answering",
    +    kind="job",
    +    handler="answer_questions",
    +)
    +project.save()
    +
    +
    +
    +
    +

    We create a text file that the model can be asked about

    +
    +
    +
    def _make_data_dir_for_test():
    +    data_dir = tempfile.mkdtemp()
    +    # The information the model will need in order to answer our question
    +    content = "The apple is red."
    +    with open(data_dir + "/test_data.txt", "w") as f:
    +        f.write(content)
    +    return data_dir
    +
    +
    +
    +
    +
    +
    +

    (2.) Usage#

    +

Then we set the path to the text file we want to ask about, the question, and the column name for the answer table.

    +
    +
    +
    input_path = _make_data_dir_for_test()
    +# The question for the model to answer
    +question = ["What is the color of the apple?"]
    +# The column of the answer in the data frame returned by the function
    +column_name = ["color"]
    +
    +
    +
    +
    +

Now we run the function with all the parameters we prepared earlier.

    +
    +
    +
    demo1_run = func.run(
    +    handler="answer_questions",
    +    params={
    +        "model": "distilgpt2",
    +        "input_path": input_path,
    +        "questions": question,
    +        "questions_columns": column_name,
    +        "generation_config": {
    +            "do_sample": True,
    +            "temperature": 0.8,
    +            "top_p": 0.9,
    +            "early_stopping": True,
    +            "max_new_tokens": 20,
    +        },
    +    },
    +    returns=[
    +        "question_answering_df: dataset",
    +        "question_answering_errors: result",
    +    ],
    +    local=True,
    +    artifact_path="./"
    +)
    +
    +
    +
    +
    +
    +
    +

    (3.) Review results#

    +

After the run is finished, we can take a look and see our answer.
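Beyond the outputs dictionary, the answers dataframe itself can be loaded back, following the same pattern the function's test uses (a sketch; it assumes the dataset is the first artifact logged by the run):

```python
# Load the answers dataframe produced by the run
# (pattern borrowed from test_question_answering.py in this change).
qa_df = mlrun.get_dataitem(
    demo1_run.status.artifacts[0]["spec"]["target_path"]
).as_df()
print(qa_df["color"][0])
```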

    +
    +
    +
    demo1_run.outputs
    +
    +
    +
    +
    +
    +
    +
    +

    Demo 2#

    +

This is a much larger example, in which we show how to use this function to analyze a number of calls between agents and customers of an internet company (all the data is generated by Iguazio).
+For something like this, we recommend using a strong model and putting some time into crafting the prompts.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import os
    +import mlrun
    +import torch
    +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}
    +
    +
    +
    +
    +
    +
    +
    project = mlrun.get_or_create_project(
    +    name="call-center-demo-2",
    +    context="./",
    +    user_project=True,
    +    parameters={
    +        "default_image": "mlrun/mlrun",
    +    })
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}
    +
    +
    +
    +
    +
    +
    +
    func = project.set_function(
    +    "question-answering.py",
    +    name="question-answering",
    +    kind="job",
    +    handler="answer_questions",
    +)
    +project.save()
    +
    +
    +
    +
    +
    <mlrun.projects.project.MlrunProject at 0x7f8bc5b0a370>
    +
    +
    +
    +
    +
    +
    +

    (2.) Usage#

    +

As mentioned, this example is a bit more complicated: we give the model a list of questions, and for some of them we also give it a list of answers to choose from.

    +
    +
    +
    QUESTIONS = [
    +    "1. Write a long summary of the text, focus on the topic (max 50 words).",
    +    "2. Was the Client's concern addressed, (choose only one) [Yes, No]?",
    +    ]
    +
    +qa_questions_columns = [
    +                        "Summary",
    +                        "is_fixed",
    +                        ]
    +
    +
    +
    +
    +

Another thing we give the model this time is example answers (one/few-shot answering); this can be done to show the model how you want the answer to be structured or calculated.

    +
    +
    +
    # For every file we ask about, the model will be presented with this example of a call and how we want the answers.
    +DEMO_CALL = (
    +    "Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist "
    +    "you today?\n"
    +    "Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\n"
    +    "Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears "
    +    "there was an error in the charges. I apologize for the inconvenience.\n"
    +    "Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\n"
    +    "Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department "
    +    "for investigation and correction. You should see the adjustments on your next statement.\n"
    +    "Customer: That sounds good, Megan. I appreciate your help.\n"
    +    "Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\n"
    +)
    +
    +DEMO_ANSWERS = (
    +    "1. The customer, contacted the call center regarding billing discrepancies on her statement. The agent, "
    +    "acknowledged the issue, assured The customer it would be resolved, and escalated it to the billing department for "
    +    "correction.\n"
    +    "2. Yes.\n"
    +
    +
    +
    +
    +

    Then we need to wrap it all nicely so it can be given to the model as a single prompt. This is done with a text wrapper and a questions wrapper;
    +both of them are concatenated inside the function with the questions and passed to the model. A short sketch of the assembled prompt follows the wrapper definitions below.

    +
    +
    +
    # The wrappers are built according to the model's conventions to improve results
    +TEXT_WRAPPER = (
    +    f"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    f"{DEMO_CALL}\n"
    +    f"answer the questions as accurately as you can:\n"
    +    f"{QUESTIONS}<|im_end|>\n"
    +    f"<|im_start|>assistant:\n"
    +    f"{DEMO_ANSWERS}<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    "{}"
    +) 
    +QUESTIONS_WRAPPER = (
    +    " answer the given questions as accurately as you can, do not write more answers the questions:\n"
    +    "{}<|im_end|>\n"
    +    "<|im_start|>assistant:\n"
    +)
    +
    +
    +
    +
    +
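    For intuition, here is a minimal sketch (not part of the demo) of how the function assembles its prompt from these wrappers; `file_text` below is a hypothetical stand-in for the contents of one call transcript:

    # Sketch only: number the questions, fill the questions wrapper, and append it to the
    # text wrapper, mirroring the function's internal prompt-template logic.
    questions_text = "\n".join(f"{i}. {q}" for i, q in enumerate(QUESTIONS, 1))
    prompt_template = f"{TEXT_WRAPPER}\n{QUESTIONS_WRAPPER.format(questions_text)}\n"

    # The remaining '{}' placeholder (at the end of TEXT_WRAPPER) is filled per file:
    file_text = "Agent: ...\nCustomer: ..."  # hypothetical call transcript
    print(prompt_template.format(file_text))
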

    The last few parameters we need to set are the model we will use, the input length (not available for all models) and the batch size.
    +The batch size determines how many files are processed in each iteration: the larger it is, the faster the process will be, as long as our memory is sufficient. A short sketch of passing these parameters appears after the model cell below.

    +
    +
    +
    # We like this version of mistral's model, which is small and fast but also gives great results
    +qa_model = "TheBloke/Mistral-7B-OpenOrca-GPTQ"
    +
    +
    +
    +
    +
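    The input length and batch size mentioned above are regular run parameters. A minimal sketch with hypothetical values (the parameter names come from the function's signature) that could be merged into the `params` dict of `func.run` below:

    # Hypothetical values: extend the GPTQ model's input buffer and process two files per batch.
    extra_params = {
        "auto_gptq_exllama_max_input_length": 8192,
        "batch_size": 2,
    }
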

    Finally, we run the function with all the parameters we prepared.

    +
    +
    +
    # Question answering:
    +demo2_run = func.run(
    +    function="question-answering",
    +    local=True,
    +    handler="answer_questions",
    +    inputs={"data_path": os.path.abspath("./calls")},
    +    params={
    +        "model_name": qa_model,
    +        "device_map": "auto",
    +        "text_wrapper":TEXT_WRAPPER,
    +        "questions": QUESTIONS,
    +        "questions_wrapper": QUESTIONS_WRAPPER,
    +        "questions_columns": qa_questions_columns,
    +    },
    +    returns=[
    +        "question_answering_df: dataset",
    +        "question_answering_errors: result",
    +    ],
    +)
    +
    +
    +
    +
    +
    +
    +

    (3.) Review results#

    +
    +
    +
    demo2_run.outputs
    +
    +
    +
    +
    +
    +
    +
    +
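    The outputs also include the errors result declared in `returns`, which lists any files that failed inference. A short sketch, assuming the run finished:

    # Sketch: files that could not be inferred (if any) and the reason for each.
    print(demo2_run.outputs.get("question_answering_errors", {}))
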

    Demo 3#

    +

    This is also a large example; in this case we use another option of the function and ask the questions in the form of a poll.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import os
    +import mlrun
    +import torch
    +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}
    +
    +
    +
    +
    +
    +
    +
    project = mlrun.get_or_create_project(
    +    name="call-center-demo-3",
    +    context="./",
    +    user_project=True,
    +    parameters={
    +        "default_image": "mlrun/mlrun",
    +    })
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}
    +
    +
    +
    +
    +
    +
    +
    func = project.set_function(
    +    "question-answering.py",
    +    name="question-answering",
    +    kind="job",
    +    handler="answer_questions",
    +)
    +project.save()
    +
    +
    +
    +
    +
    <mlrun.projects.project.MlrunProject at 0x7f8bc5b0a370>
    +
    +
    +
    +
    +
    +
    +

    (2.) Usage#

    +

    Like in the second demo, we make a list of questions for the function to answer.

    +
    +
    +
    # These questions are harder to answer, as there is no right answer.
    +# So we want it to be at least consistent, for that we use the poll option.
    +QUESTIONS = [
    +    "1. Rate the agent's level of empathy (The ability to understand and share the feelings of others) on a scale of 1-5.",
    +    "2. Rate the agent's level of professionalism (Conducting oneself in a way that is appropriate for the workplace) on a scale of 1-5.",
    +]
    +
    +qa_questions_columns = [
    +                        "empathy",
    +                        "professionalism",
    +
    +                        ]
    +
    +
    +
    +
    +

    Another thing we give the model this time is answer examples (one/few-shot answering). This can be done to show the model how you want the answer to be structured or calculated.
    +So for every file we ask about, the model will be presented with this example of a call and the answers we expect.

    +
    +
    +
    # For every file we ask about, the model will be presented with this example of a call and how we want the answers.
    +DEMO_CALL = (
    +    "Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist "
    +    "you today?\n"
    +    "Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\n"
    +    "Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears "
    +    "there was an error in the charges. I apologize for the inconvenience.\n"
    +    "Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\n"
    +    "Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department "
    +    "for investigation and correction. You should see the adjustments on your next statement.\n"
    +    "Customer: That sounds good, Megan. I appreciate your help.\n"
    +    "Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\n"
    +)
    +
    +
    +DEMO_ANSWERS = (
    +    "1. 4\n"
    +    "2. 5\n"
    +
    +)
    +
    +
    +
    +
    +

    Then we need to wrap it all nicely so it can be given to the model as a single prompt. This is done with a text wrapper and a questions wrapper;
    +both of them are concatenated inside the function with the questions and passed to the model.

    +
    +
    +
    TEXT_WRAPPER = (
    +    f"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    f"{DEMO_CALL}\n"
    +    f"answer the questions as accurately as you can:\n"
    +    f"{QUESTIONS}<|im_end|>\n"
    +    f"<|im_start|>assistant:\n"
    +    f"{DEMO_ANSWERS}<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    "{}"
    +) 
    +
    +QUESTIONS_WRAPPER = (
    +    " answer the given questions as accurately as you can, do not write more answers the questions:\n"
    +    "{}<|im_end|>\n"
    +    "<|im_start|>assistant:\n"
    +)
    +
    +
    +
    +
    +

    The config is for the second questioning method, which we call “poll”. In it we need to choose how many voters we want participating,
    +and in what way we want to decide the result; we currently support the average and most_common strategies, as shown here. A small sketch of the most_common strategy follows the config cell below.

    +

    *An explanation of both questioning methods can be found at the beginning of this notebook.

    +
    +
    +
    questions_config = {
    +    "type": "poll",
    +    "poll_count": 3,  # How many 'voters'
    +    "poll_strategy": "most_common",
    +}
    +
    +
    +
    +
    +
    +
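    For intuition, a minimal sketch (with hypothetical votes) of what the most_common strategy does with the answers collected from the three voters:

    from collections import Counter

    # Three hypothetical voters rated the agent's empathy; the poll keeps the majority answer.
    votes = ["4", "4", "5"]
    print(Counter(votes).most_common(1)[0][0])  # -> "4"
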
    +
    qa_model = "TheBloke/Mistral-7B-OpenOrca-GPTQ"
    +
    +
    +
    +
    +

    Finally, we run the function with all the parameters we prepared.

    +
    +
    +
    # Question answering:
    +demo3_run = func.run(
    +    function="question-answering",
    +    local=True,
    +    handler="answer_questions",
    +    inputs={"data_path": os.path.abspath("./calls")},
    +    params={
    +        "model_name": qa_model,
    +        "device_map": "auto",
    +        "text_wrapper":TEXT_WRAPPER,
    +        "questions": QUESTIONS,
    +        "questions_wrapper": QUESTIONS_WRAPPER,
    +        "questions_columns": qa_questions_columns,
    +        "questions_config": questions_config, # This time we add 'questions_config'
    +    },
    +    returns=[
    +        "question_answering_df: dataset",
    +        "question_answering_errors: result",
    +    ],
    +)
    +
    +
    +
    +
    +
    +
    +

    (3.) Review results#

    +
    +
    +
    demo3_run.outputs
    +
    +
    +
    +
    +
    +
    +
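    Because the poll returns its ratings as strings, here is a small sketch of aggregating them (column names taken from qa_questions_columns above, assuming the standard MLRun run-object API):

    # Sketch: load the poll answers and compute an average rating per question column.
    ratings_df = demo3_run.artifact("question_answering_df").as_df()
    print(ratings_df[["empathy", "professionalism"]].astype(int).mean())
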
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/static/function.html b/functions/master/question_answering/0.4.0/static/function.html new file mode 100644 index 00000000..dbaf6ff6 --- /dev/null +++ b/functions/master/question_answering/0.4.0/static/function.html @@ -0,0 +1,241 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  name: question-answering
    +  tag: ''
    +  hash: aed62db95f17576c69b457767e3595c2de1d5465
    +  project: ''
    +  labels:
    +    author: yonish
    +  categories:
    +  - genai
    +  - huggingface
    +  - machine-learning
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import logging
import operator
import pathlib
from collections import Counter
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import transformers
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    global _LOGGER

    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        _LOGGER = context.logger
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, str):
                    input_argument = _get_text_files(
                        data_path=pathlib.Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)
            if rank == 0:
                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                dataframe = pd.concat(objs=[df for df, _ in output], axis=0)
                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
                return dataframe, errors_dictionary
            return None

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def answer_questions(
    data_path: Union[str, List[str]],
    model_name: str,
    questions: Union[List[str], List[List[str]]],
    device_map: Union[str, dict] = None,
    model_kwargs: dict = None,
    auto_gptq_exllama_max_input_length: int = None,
    tokenizer_name: str = None,
    tokenizer_kwargs: dict = None,
    text_wrapper: Union[str, List[str]] = "",
    questions_wrapper: Union[str, List[str]] = "",
    generation_config: Union[Dict, List[Dict]] = None,
    questions_config: Union[Dict, List[Dict]] = None,
    batch_size: int = 1,
    questions_columns: List[str] = None,
    verbose: bool = False,
) -> Tuple[pd.DataFrame, dict]:
    """
    Answer questions with a context to the given text files contents by a pretrained LLM model. Each text file will have
    the following prompt built:

    start of `text_wrapper`
    <text file content>
    end of `text_wrapper`

    start of `questions_wrapper`
    1. <questions[0]>
    2. <questions[1]>
    ...
    n. <questions[n-1]>
    end of `questions_wrapper`

    :param data_path:                          A path to a directory of text files or a path to a text file to ask
                                               questions about.
    :param model_name:                         The pre-trained model name from the huggingface hub to use for asking
                                               questions.
    :param questions:                          The questions to ask.
                                               A list of lists of questions to ask per text file, divided
                                               into question groups. The groups can be determined by size (in order to
                                               avoid large inputs to the llm) or by questioning method
                                               (regular or poll-like questioning).
    :param device_map:                         A map to use for loading the model on multiple devices.
    :param model_kwargs:                       Keyword arguments to pass for loading the model using HuggingFace's
                                               `transformers.AutoModelForCausalLM.from_pretrained` function.
    :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size.
    :param tokenizer_name:                     The tokenizer name from the huggingface hub to use. If not given, the
                                               model name will be used.
    :param tokenizer_kwargs:                   Keyword arguments to pass for loading the tokenizer using HuggingFace's
                                               `transformers.AutoTokenizer.from_pretrained` function.
    :param text_wrapper:                       A wrapper for the file's text. Will be added at the start of the prompt.
                                               Must have a placeholder ('{}') for the text of the file.
    :param questions_wrapper:                  A wrapper for the questions received. Will be added after the text
                                               wrapper in the prompt template. Must have a placeholder ('{}') for the
                                               questions.
    :param generation_config:                  HuggingFace's `GenerationConfig` keyword arguments to pass to the
                                               `generate` method.
    :param questions_config:                   A dictionary or list of dictionaries containing specific ways to answer
                                               questions (using a poll for example), each dictionary in the list is for
                                               corresponding question group and determines the question asking method
                                               for said group.
    :param batch_size:                         Batch size for inference.
    :param questions_columns:                  Columns to use for the dataframe returned.
    :param verbose:                            Whether to present logs of a progress bar and errors. Default: True.


    :returns: A tuple of:

              * A dataframe dataset of the questions answers.
              * A dictionary of errored files that were not inferred or were not answered properly.
    """
    global _LOGGER

    # Set configs to empty dict if not given:
    if generation_config is None:
        generation_config = {}
    if questions_config is None:
        questions_config = {}

    # Get the input text files to question:
    if verbose:
        _LOGGER.info("Collecting text files.")
    if isinstance(data_path, str):
        data_path = pathlib.Path(data_path).absolute()
        text_files = _get_text_files(data_path=data_path)
    else:
        text_files = data_path
    if verbose:
        _LOGGER.info(f"Collected {len(text_files)} text files.")

    # Get the prompt template:
    if verbose:
        _LOGGER.info("Creating prompt template.")

    # Organize questions as a list of list, and count number of sub-lists for future use
    number_of_question_groups = 1 if isinstance(questions[0], str) else len(questions)
    questions = _to_group_list(
        argument_value=questions,
        argument_name="questions",
        length=number_of_question_groups,
    )

    # Organize prompt parts at proper length
    text_wrapper = _to_group_list(
        argument_value=text_wrapper,
        argument_name="text_wrapper",
        length=number_of_question_groups,
    )
    questions_wrapper = _to_group_list(
        argument_value=questions_wrapper,
        argument_name="questions_wrapper",
        length=number_of_question_groups,
    )

    # Create a list of prompt according to given parts and questions
    prompt_template = []
    questions = questions if isinstance(questions[0], list) else [questions]

    # Build all prompts
    for i in range(number_of_question_groups):
        prompt_template.append(
            _get_prompt_template(
                text_wrapper=text_wrapper[i],
                questions_wrapper=questions_wrapper[i],
                questions=questions[i],
            )
        )
    if verbose:
        _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n")

    # Get the total amount of questions:
    questions_amount = sum([len(sublist) for sublist in questions])

    # Get the questions columns:
    questions_columns = questions_columns or [
        f"q{i}" for i in range(1, questions_amount + 1)
    ]

    # Check if we have the correct amount of questions columns:
    if len(questions_columns) != questions_amount:
        raise ValueError(
            f"The provided questions columns length ({len(questions_columns)}) "
            f"does not match the questions amount ({questions_amount})"
        )

    # Load the generation config:
    if verbose:
        _LOGGER.info("Loading generation configuration.")
    generation_config = [
        transformers.GenerationConfig(**(cfg or {}))
        for cfg in _to_group_list(
            argument_value=generation_config,
            argument_name="generation_config",
            length=number_of_question_groups,
        )
    ]
    if verbose:
        _LOGGER.info(f"Generation configuration loaded: {generation_config}")

    # Load the model and tokenizer into a pipeline object:
    if verbose:
        _LOGGER.info(f"Loading model '{model_name}'.")
    generation_pipeline = _get_generation_pipeline(
        model_name=model_name,
        device_map=device_map,
        tokenizer_name=tokenizer_name or model_name,
        model_kwargs=model_kwargs or {},
        tokenizer_kwargs=tokenizer_kwargs or {},
        auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length,
        batch_size=batch_size,
    )
    if verbose:
        _LOGGER.info("Model loaded.")

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Split the files into batches:
    file_batches = [
        text_files[i : i + batch_size]
        if i + batch_size < len(text_files)
        else text_files[i:]
        for i in range(0, len(text_files), batch_size)
    ]
    questions_config = _to_group_list(
        argument_value=questions_config,
        argument_name="questions_config",
        length=number_of_question_groups,
    )

    # Create a list of question handlers according to given configs
    handlers = []
    for cfg in questions_config:
        question_type = cfg.pop("type", "default")
        handlers.append(QUESTION_MAPPING.get(question_type)(**cfg))

    # Go over the batches of text files and question them:
    for file_batch in tqdm(
        file_batches,
        desc="Generating answers",
        unit=f"file (batch of {batch_size})",
        disable=not verbose,
    ):
        try:
            total_answers = [[] for _ in range(batch_size)]

            # Go over all question group per batch of documents
            for question_group in range(number_of_question_groups):
                current_questions_amount = len(questions[question_group])

                # Read batch (read the text from the text files):
                batched_input = _read_file_batch(
                    file_batch=file_batch,
                    prompt_template=prompt_template[question_group],
                )

                # Answer the questions with each question handler:
                batched_answers = handlers[question_group].answer(
                    questions_amount=current_questions_amount,
                    batched_input=batched_input,
                    generation_pipeline=generation_pipeline,
                    generation_config=generation_config[question_group],
                )

                # Put the answers in the correct place in the total answers list according to the place in the batch:
                for i in range(batch_size):
                    total_answers[i].extend(batched_answers[i])

            # Collect the answers and attach the file name:
            successes.extend(
                [
                    [file.name, *answers]
                    for file, answers in zip(file_batch, total_answers)
                ]
            )
        except Exception as exception:
            # Note the exception as error in the dictionary:
            batch_file_names = ", ".join([file.name for file in file_batch])
            if verbose:
                _LOGGER.warning(
                    f"Error in batch '{batch_file_names}': {str(exception)}"
                )
            errors[batch_file_names] = str(exception)
            continue

    # Construct the answers dataframe:
    columns = [
        "text_file",
        *questions_columns,
    ]

    # Create a data frame of answers by files
    successes = pd.DataFrame(
        successes,
        columns=columns,
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Answers summary:\n"
            f"{successes.head()}"
        )
    return successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:

    # Check if the path is of a directory or a file:
    if data_path.is_dir():

        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _get_prompt_template(
    text_wrapper: str,
    questions_wrapper: str,
    questions: List[str],
) -> str:

    # Validate and build the text wrapper:
    text_wrapper = text_wrapper or (
        "Given the following text:\n" "-----\n" "{}\n" "-----"
    )
    if text_wrapper.count("{}") != 1:
        raise ValueError(
            "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about."
        )

    # Validate and build the question wrapper:
    questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}"
    if questions_wrapper.count("{}") != 1:
        raise ValueError(
            "The `questions_wrapper` must include one placeholder '{}' for the list of questions."
        )

    # Validate and parse the questions:
    if len(questions) == 0:
        raise ValueError("Please include at least one question.")
    questions = "\n".join(
        [f"{i}. {question}" for i, question in enumerate(questions, 1)]
    )

    # Construct the template:
    return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n"


def _get_generation_pipeline(
    model_name: str,
    device_map: Union[str, dict],
    tokenizer_name: str,
    model_kwargs: dict,
    tokenizer_kwargs: dict,
    auto_gptq_exllama_max_input_length: int = None,
    batch_size: int = 1,
):
    # Load the model:
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device_map, **model_kwargs
    )

    # Set exllama max input length if provided:
    # This changes the model's context size.
    if auto_gptq_exllama_max_input_length:
        from auto_gptq import exllama_set_max_input_length

        model = exllama_set_max_input_length(
            model=model, max_input_length=auto_gptq_exllama_max_input_length
        )

    # Load the tokenizer:
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_name, **tokenizer_kwargs
    )

    # Initialize a generation pipeline and return:
    pipe = transformers.pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
    )
    pipe.tokenizer.pad_token_id = model.config.eos_token_id
    return pipe


def _read_file_batch(
    file_batch: List[pathlib.Path],
    prompt_template: str,
) -> List[str]:
    batch = []

    # Go over all files and read in usable format
    for file in file_batch:
        with open(file, "r", encoding="utf-8") as fp:
            batch.append(prompt_template.format(fp.read()))
    return batch


def _to_group_list(argument_value: list, argument_name: str, length: int):

    # Check if is list, turn to list if not
    argument_value = (
        argument_value if isinstance(argument_value, list) else [argument_value]
    )
    list_len = len(argument_value)

    # If not a list, or is a list of len 1 we duplicate for correct length
    # If list in wrong length throw an error
    if list_len != length:
        if list_len == 1:
            return argument_value * length
        raise ValueError(
            f"The argument value of '{argument_name}' is not equal to the length of the given questions - {length}"
        )
    return argument_value


class QuestionHandler:
    """
    A class for handling questions answering for a given question type.
    This class is used as a base class for all question types, and for default question type (regular question
    answering without any special handling).
    """

    class ConfigKeys:
        pass

    def __init__(self):
        pass

    @staticmethod
    def _get_answers(generated_text: str, questions_amount: int) -> List[str]:

        # Clear answer start (part before numbers):
        # TODO find better way to verify, for list of questions this is redundant for example
        if "1." not in generated_text:
            raise ValueError(
                f"Answer 1. is missing from the generated text: '{generated_text}'"
            )
        text = generated_text.split("1.", 1)[1]

        # Start extracting the answers:
        answers = []
        for i in range(1, questions_amount + 1):
            # If it's the last answer to look for, take the rest of the text:
            if i == questions_amount:
                answer_i = text
            # Verify there is a question number in the text:
            elif f"{i + 1}." not in text:
                raise ValueError(
                    f"Answer {i + 1}. is missing from the generated text: '{generated_text}'"
                )
            # Take i's answer:
            else:
                answer_i, text = text.split(f"{i + 1}.", 1)
            # Collect the answer removing redundant spaces:
            answers.append(answer_i.strip())

        return answers

    def _infer_questions(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:

        # Infer through the llm:
        batched_output = generation_pipeline(
            batched_input,
            generation_config=generation_config,
            eos_token_id=generation_pipeline.tokenizer.eos_token_id,
            return_full_text=False,
            num_return_sequences=1,
        )

        # Process the outputs to get the answers:
        batched_answers = []
        for output in batched_output:
            # Get the generated answers:
            answers = self._get_answers(
                generated_text=output[0]["generated_text"],
                questions_amount=questions_amount,
            )
            # Collect the processed answers:
            batched_answers.append(answers)
        return batched_answers

    def answer(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:
        """
        Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline.
        """
        return self._infer_questions(
            questions_amount=questions_amount,
            batched_input=batched_input,
            generation_pipeline=generation_pipeline,
            generation_config=generation_config,
        )


class PollQuestionHandler(QuestionHandler):
    """
    A class for handling question answering for poll type questions.
    These types of questions are answered by asking the same question multiple times
    and choosing the most common answer or the average answer.
    """

    class ConfigKeys:
        """
        Static class to hold all the possible poll question configuration option keys.
        """

        #: The number of times to ask the same question.
        POLL_COUNT = "poll_count"

        #: The strategy to use for choosing the answer from the poll.
        POLL_STRATEGY = "poll_strategy"

    class Strategy(enum.Enum):
        #: The most common answer strategy.
        MOST_COMMON = "most_common"

        #: The average answer strategy.
        AVERAGE = "average"

        @staticmethod
        def most_common(answers):
            """
            Calculate the most common answer for a given list of answers.
            """
            count = Counter(answers)
            most_common = count.most_common(1)
            return most_common[0][0]

        @staticmethod
        def average(answers):
            """
            Calculate the average answer for a given list of answers.
            """
            if isinstance(answers[0], str):
                raise ValueError(
                    "Cannot perform poll with average answer strategy of non numeric values,"
                    " please change the question to give numeric data, or choose 'most_common' as strategy."
                )
            else:
                numeric_values = answers
            avg = sum(numeric_values) / len(numeric_values)

            # Round to the closest integer and return corresponding value
            return round(avg)

        def do(self, answers):
            """
            Perform the strategy.
            """
            return getattr(self, self.value)(answers)

    def __init__(
        self, poll_count: int = 5, poll_strategy: str = "most_common"):
        super().__init__()
        self.poll_count = poll_count
        self.poll_strategy = self.Strategy(poll_strategy)

    def answer(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:
        """
        Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline.
        """
        return self._answer_poll_questions(
            questions_amount=questions_amount,
            batched_input=batched_input,
            generation_pipeline=generation_pipeline,
            generation_config=generation_config,
        )

    def _answer_poll_questions(
        self,
        questions_amount: int,
        batched_input: List[str],
        generation_pipeline: transformers.Pipeline,
        generation_config: transformers.GenerationConfig,
    ) -> List[List[str]]:
        votes = []

        # Run the poll for each question
        for _ in range(self.poll_count):
            batched_answers = self._infer_questions(
                questions_amount=questions_amount,
                batched_input=batched_input,
                generation_pipeline=generation_pipeline,
                generation_config=generation_config,
            )
            votes.append(batched_answers)
        answers = []

        # Collect the answers according to the poll strategy
        # Average strategy works for numeric values only
        for batch in range(len(votes[0])):
            batched_answers = []
            for question in range(questions_amount):
                # Create a list of all answers to relevant question
                answer = [
                    votes[voter][batch][question] for voter in range(self.poll_count)
                ]
                answer = self.poll_strategy.do(answer)
                batched_answers.append(answer)
            answers.append(batched_answers)
        return answers


# Holds names of QuestionHandles
class QuestionTypes:
    DEFAULT = "default"
    POLL = "poll"


# Maps question types to their handlers
QUESTION_MAPPING = {
    QuestionTypes.DEFAULT: QuestionHandler,
    QuestionTypes.POLL: PollQuestionHandler,
}

    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - transformers
    +    - torch
    +    - tqdm
    +  entry_points:
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      doc: ''
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      outputs: []
    +      lineno: 58
    +      has_varargs: false
    +      has_kwargs: false
    +    decorator:
    +      name: decorator
    +      doc: ''
    +      parameters:
    +      - name: handler
    +      outputs: []
    +      lineno: 66
    +      has_varargs: false
    +      has_kwargs: false
    +    wrapper:
    +      name: wrapper
    +      doc: ''
    +      parameters: []
    +      outputs: []
    +      lineno: 71
    +      has_varargs: false
    +      has_kwargs: true
    +    answer_questions:
    +      name: answer_questions
    +      doc: 'Answer questions with a context to the given text files contents by a
    +        pretrained LLM model. Each text file will have
    +
    +        the following prompt built:
    +
    +
    +        start of `text_wrapper`
    +
    +        
    +
    +        end of `text_wrapper`
    +
    +
    +        start of `questions_wrapper`
    +
    +        1. 
    +
    +        2. 
    +
    +        ...
    +
    +        n. 
    +
    +        end of `questions_wrapper`'
    +      parameters:
    +      - name: data_path
    +        type: Union[str, List[str]]
    +        doc: A path to a directory of text files or a path to a text file to ask questions
    +          about.
    +      - name: model_name
    +        type: str
    +        doc: The pre-trained model name from the huggingface hub to use for asking
    +          questions.
    +      - name: questions
    +        type: Union[List[str], List[List[str]]]
    +        doc: The questions to ask. A list of lists of questions to ask per text file,
    +          divided into question groups. The groups can be determined by size (in
    +          order to avoid large inputs to the llm) or by questioning method (regular
    +          or poll-like questioning).
    +      - name: device_map
    +        type: Union[str, dict]
    +        doc: A map to use for loading the model on multiple devices.
    +        default: null
    +      - name: model_kwargs
    +        type: dict
    +        doc: Keyword arguments to pass for loading the model using HuggingFace's `transformers.AutoModelForCausalLM.from_pretrained`
    +          function.
    +        default: null
    +      - name: auto_gptq_exllama_max_input_length
    +        type: int
    +        doc: For AutoGPTQ models to set and extend the model's input buffer size.
    +        default: null
    +      - name: tokenizer_name
    +        type: str
    +        doc: The tokenizer name from the huggingface hub to use. If not given, the
    +          model name will be used.
    +        default: null
    +      - name: tokenizer_kwargs
    +        type: dict
    +        doc: Keyword arguments to pass for loading the tokenizer using HuggingFace's
    +          `transformers.AutoTokenizer.from_pretrained` function.
    +        default: null
    +      - name: text_wrapper
    +        type: Union[str, List[str]]
    +        doc: A wrapper for the file's text. Will be added at the start of the prompt.
    +          Must have a placeholder ('{}') for the text of the file.
    +        default: ''
    +      - name: questions_wrapper
    +        type: Union[str, List[str]]
    +        doc: A wrapper for the questions received. Will be added after the text wrapper
    +          in the prompt template. Must have a placeholder ('{}') for the questions.
    +        default: ''
    +      - name: generation_config
    +        type: Union[Dict, List[Dict]]
    +        doc: HuggingFace's `GenerationConfig` keyword arguments to pass to the `generate`
    +          method.
    +        default: null
    +      - name: questions_config
    +        type: Union[Dict, List[Dict]]
    +        doc: A dictionary or list of dictionaries containing specific ways to answer
    +          questions (using a poll for example), each dictionary in the list is for
    +          corresponding question group and determines the question asking method for
    +          said group.
    +        default: null
    +      - name: batch_size
    +        type: int
    +        doc: Batch size for inference.
    +        default: 1
    +      - name: questions_columns
    +        type: List[str]
    +        doc: Columns to use for the dataframe returned.
    +        default: null
    +      - name: verbose
    +        type: bool
    +        doc: 'Whether to present logs of a progress bar and errors. Default: True.'
    +        default: false
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[pd.DataFrame, dict]
    +      lineno: 130
    +      has_varargs: false
    +      has_kwargs: false
    +    answer:
    +      name: answer
    +      doc: Answer questions with a context to the given text files contents by a pretrained
    +        LLM model in given pipeline.
    +      parameters:
    +      - name: self
    +      - name: questions_amount
    +        type: int
    +      - name: batched_input
    +        type: List[str]
    +      - name: generation_pipeline
    +        type: Pipeline
    +      - name: generation_config
    +        type: GenerationConfig
    +      outputs:
    +      - type: List[List[str]]
    +      lineno: 674
    +      has_varargs: false
    +      has_kwargs: false
    +    most_common:
    +      name: most_common
    +      doc: Calculate the most common answer for a given list of answers.
    +      parameters:
    +      - name: answers
    +      outputs: []
    +      lineno: 637
    +      has_varargs: false
    +      has_kwargs: false
    +    average:
    +      name: average
    +      doc: Calculate the average answer for a given list of answers.
    +      parameters:
    +      - name: answers
    +      outputs: []
    +      lineno: 646
    +      has_varargs: false
    +      has_kwargs: false
    +    do:
    +      name: do
    +      doc: Perform the strategy.
    +      parameters:
    +      - name: self
    +      - name: answers
    +      outputs: []
    +      lineno: 662
    +      has_varargs: false
    +      has_kwargs: false
    +  description: GenAI approach of question answering on a given data
    +  default_handler: answer_questions
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/static/item.html b/functions/master/question_answering/0.4.0/static/item.html new file mode 100644 index 00000000..d7a7898a --- /dev/null +++ b/functions/master/question_answering/0.4.0/static/item.html @@ -0,0 +1,51 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- genai
    +- huggingface
    +- machine-learning
    +description: GenAI approach of question answering on a given data
    +doc: ''
    +example: question_answering.ipynb
    +generationDate: 2023-08-07:11-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: yonish
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.5.2
    +name: question_answering
    +platformVersion: 3.5.0
    +spec:
    +  filename: question_answering.py
    +  handler: answer_questions
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - transformers
    +    - torch
    +    - tqdm
    +url: ''
    +version: 0.4.0
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/static/question_answering.html new file mode 100644 index 00000000..d0cffc0e --- /dev/null +++ b/functions/master/question_answering/0.4.0/static/question_answering.html @@ -0,0 +1,876 @@
    [docs]class QuestionHandler: + """ + A class for handling questions answering for a given question type. + This class is used as a base class for all question types, and for default question type (regular question + answering without any special handling). + """ + +
    [docs] class ConfigKeys: + pass
    + + def __init__(self): + pass + + @staticmethod + def _get_answers(generated_text: str, questions_amount: int) -> List[str]: + + # Clear answer start (part before numbers): + # TODO find better way to verify, for list of questions this is redundant for example + if "1." not in generated_text: + raise ValueError( + f"Answer 1. is missing from the generated text: '{generated_text}'" + ) + text = generated_text.split("1.", 1)[1] + + # Start extracting the answers: + answers = [] + for i in range(1, questions_amount + 1): + # If it's the last answer to look for, take the rest of the text: + if i == questions_amount: + answer_i = text + # Verify there is a question number in the text: + elif f"{i + 1}." not in text: + raise ValueError( + f"Answer {i + 1}. is missing from the generated text: '{generated_text}'" + ) + # Take i's answer: + else: + answer_i, text = text.split(f"{i + 1}.", 1) + # Collect the answer removing redundant spaces: + answers.append(answer_i.strip()) + + return answers + + def _infer_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + + # Infer through the llm: + batched_output = generation_pipeline( + batched_input, + generation_config=generation_config, + eos_token_id=generation_pipeline.tokenizer.eos_token_id, + return_full_text=False, + num_return_sequences=1, + ) + + # Process the outputs to get the answers: + batched_answers = [] + for output in batched_output: + # Get the generated answers: + answers = self._get_answers( + generated_text=output[0]["generated_text"], + questions_amount=questions_amount, + ) + # Collect the processed answers: + batched_answers.append(answers) + return batched_answers + +
    [docs] def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + )
    + + +
    [docs]class PollQuestionHandler(QuestionHandler): + """ + Static class to hold all the possible poll question configurations options keys + """ + +
    [docs] class ConfigKeys: + """ + A class for handling questions answering for poll type questions. + These type of question are answered by asking the same question multiple times + and choosing the most common answer or the average answer. + """ + + #: The number of times to ask the same question. + POLL_COUNT = "poll_count" + + #: The strategy to use for choosing the answer from the poll. + POLL_STRATEGY = "poll_strategy"
    + +
    [docs] class Strategy(enum.Enum): + #: The most common answer strategy. + MOST_COMMON = "most_common" + + #: The average answer strategy. + AVERAGE = "average" + +
    [docs] @staticmethod + def most_common(answers): + """ + Calculate the most common answer for a given list of answers. + """ + count = Counter(answers) + most_common = count.most_common(1) + return most_common[0][0]
    + +
    [docs] @staticmethod + def average(answers): + """ + Calculate the average answer for a given list of answers. + """ + if isinstance(answers[0], str): + raise ValueError( + "Cannot perform poll with average answer strategy of non numeric values," + " please change the question to give numeric data, or choose 'most_common' as strategy." + ) + else: + numeric_values = answers + avg = sum(numeric_values) / len(numeric_values) + + # Round to the closest integer and return corresponding value + return round(avg)
    + +
    [docs] def do(self, answers): + """ + Perform the strategy. + """ + return getattr(self, self.value)(answers)
    + + def __init__( + self, poll_count: int = 5, poll_strategy: str = "most_common"): + super().__init__() + self.poll_count = poll_count + self.poll_strategy = self.Strategy(poll_strategy) + +
    [docs] def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._answer_poll_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + )
    + + def _answer_poll_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + votes = [] + + # Run the poll for each question + for _ in range(self.poll_count): + batched_answers = self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + votes.append(batched_answers) + answers = [] + + # Collect the answers according to the poll strategy + # Average strategy works for numeric values only + for batch in range(len(votes[0])): + batched_answers = [] + for question in range(questions_amount): + # Create a list of all answers to relevant question + answer = [ + votes[voter][batch][question] for voter in range(self.poll_count) + ] + answer = self.poll_strategy.do(answer) + batched_answers.append(answer) + answers.append(batched_answers) + return answers
    + + +# Holds names of QuestionHandles +
    [docs]class QuestionTypes: + DEFAULT = "default" + POLL = "poll"
    + + +# Maps question types to their handlers +QUESTION_MAPPING = { + QuestionTypes.DEFAULT: QuestionHandler, + QuestionTypes.POLL: PollQuestionHandler, +} +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/question_answering/0.4.0/static/source.html b/functions/master/question_answering/0.4.0/static/source.html new file mode 100644 index 00000000..843435d9 --- /dev/null +++ b/functions/master/question_answering/0.4.0/static/source.html @@ -0,0 +1,758 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import enum
    +import logging
    +import operator
    +import pathlib
    +from collections import Counter
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import transformers
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    global _LOGGER
    +
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        _LOGGER = context.logger
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
+            # Give each worker the correct chunk of the worker inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, str):
    +                    input_argument = _get_text_files(
    +                        data_path=pathlib.Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
+                    input_argument = input_argument.iloc[chunk_start:chunk_end, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +            if rank == 0:
    +                # Join the outputs:
    +                context.logger.info("Collecting data from workers to root worker.")
    +                dataframe = pd.concat(objs=[df for df, _ in output], axis=0)
    +                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
    +                return dataframe, errors_dictionary
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
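+
+# Illustrative sketch of the chunking logic used by the wrapper above (hypothetical values,
+# for explanation only): with 10 files and 3 workers, `even_chunk_size = 10 // 3 = 3`, so
+# ranks 0 and 1 process indices [0:3] and [3:6], and the last rank takes the remainder [6:10].
+#
+#   files = [f"file_{i}.txt" for i in range(10)]
+#   size = 3
+#   even_chunk_size = len(files) // size
+#   for rank in range(size):
+#       start = rank * even_chunk_size
+#       end = (rank + 1) * even_chunk_size if rank + 1 < size else len(files)
+#       print(rank, files[start:end])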
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def answer_questions(
    +    data_path: Union[str, List[str]],
    +    model_name: str,
    +    questions: Union[List[str], List[List[str]]],
    +    device_map: Union[str, dict] = None,
    +    model_kwargs: dict = None,
    +    auto_gptq_exllama_max_input_length: int = None,
    +    tokenizer_name: str = None,
    +    tokenizer_kwargs: dict = None,
    +    text_wrapper: Union[str, List[str]] = "",
    +    questions_wrapper: Union[str, List[str]] = "",
    +    generation_config: Union[Dict, List[Dict]] = None,
    +    questions_config: Union[Dict, List[Dict]] = None,
    +    batch_size: int = 1,
    +    questions_columns: List[str] = None,
    +    verbose: bool = False,
    +) -> Tuple[pd.DataFrame, dict]:
    +    """
+    Answer questions about the contents of the given text files using a pretrained LLM, where each file's content
+    serves as the context. For each text file, the following prompt is built:
    +
    +    start of `text_wrapper`
+    <text file content>
    +    end of `text_wrapper`
    +
    +    start of `questions_wrapper`
+    1. <questions[0]>
+    2. <questions[1]>
+    ...
+    n. <questions[n-1]>
    +    end of `questions_wrapper`
    +
    +    :param data_path:                          A path to a directory of text files or a path to a text file to ask
    +                                               questions about.
    +    :param model_name:                         The pre-trained model name from the huggingface hub to use for asking
    +                                               questions.
+    :param questions:                          The questions to ask. May also be a list of lists of questions,
+                                               divided into question groups. The groups can be determined by size
+                                               (in order to avoid overly large inputs to the LLM) or by questioning
+                                               method (regular or poll-like questioning).
    +    :param device_map:                         A map to use for loading the model on multiple devices.
    +    :param model_kwargs:                       Keyword arguments to pass for loading the model using HuggingFace's
    +                                               `transformers.AutoModelForCausalLM.from_pretrained` function.
    +    :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size.
    +    :param tokenizer_name:                     The tokenizer name from the huggingface hub to use. If not given, the
    +                                               model name will be used.
    +    :param tokenizer_kwargs:                   Keyword arguments to pass for loading the tokenizer using HuggingFace's
    +                                               `transformers.AutoTokenizer.from_pretrained` function.
    +    :param text_wrapper:                       A wrapper for the file's text. Will be added at the start of the prompt.
    +                                               Must have a placeholder ('{}') for the text of the file.
    +    :param questions_wrapper:                  A wrapper for the questions received. Will be added after the text
    +                                               wrapper in the prompt template. Must have a placeholder ('{}') for the
    +                                               questions.
    +    :param generation_config:                  HuggingFace's `GenerationConfig` keyword arguments to pass to the
    +                                               `generate` method.
    +    :param questions_config:                   A dictionary or list of dictionaries containing specific ways to answer
    +                                               questions (using a poll for example), each dictionary in the list is for
    +                                               corresponding question group and determines the question asking method
    +                                               for said group.
    +    :param batch_size:                         Batch size for inference.
    +    :param questions_columns:                  Columns to use for the dataframe returned.
+    :param verbose:                            Whether to present logs of a progress bar and errors. Default: False.
    +
    +
    +    :returns: A tuple of:
    +
    +              * A dataframe dataset of the questions answers.
    +              * A dictionary of errored files that were not inferred or were not answered properly.
    +    """
    +    global _LOGGER
    +
    +    # Set configs to empty dict if not given:
    +    if generation_config is None:
    +        generation_config = {}
    +    if questions_config is None:
    +        questions_config = {}
    +
    +    # Get the input text files to question:
    +    if verbose:
    +        _LOGGER.info("Collecting text files.")
    +    if isinstance(data_path, str):
    +        data_path = pathlib.Path(data_path).absolute()
    +        text_files = _get_text_files(data_path=data_path)
    +    else:
    +        text_files = data_path
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(text_files)} text files.")
    +
    +    # Get the prompt template:
    +    if verbose:
    +        _LOGGER.info("Creating prompt template.")
    +
+    # Organize the questions as a list of lists and count the number of sub-lists (question groups) for later use
    +    number_of_question_groups = 1 if isinstance(questions[0], str) else len(questions)
    +    questions = _to_group_list(
    +        argument_value=questions,
    +        argument_name="questions",
    +        length=number_of_question_groups,
    +    )
    +
+    # Expand the prompt parts to match the number of question groups
    +    text_wrapper = _to_group_list(
    +        argument_value=text_wrapper,
    +        argument_name="text_wrapper",
    +        length=number_of_question_groups,
    +    )
    +    questions_wrapper = _to_group_list(
    +        argument_value=questions_wrapper,
    +        argument_name="questions_wrapper",
    +        length=number_of_question_groups,
    +    )
    +
+    # Create a list of prompts according to the given parts and questions
    +    prompt_template = []
    +    questions = questions if isinstance(questions[0], list) else [questions]
    +
    +    # Build all prompts
    +    for i in range(number_of_question_groups):
    +        prompt_template.append(
    +            _get_prompt_template(
    +                text_wrapper=text_wrapper[i],
    +                questions_wrapper=questions_wrapper[i],
    +                questions=questions[i],
    +            )
    +        )
    +    if verbose:
    +        _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n")
    +
    +    # Get the total amount of questions:
    +    questions_amount = sum([len(sublist) for sublist in questions])
    +
    +    # Get the questions columns:
    +    questions_columns = questions_columns or [
    +        f"q{i}" for i in range(1, questions_amount + 1)
    +    ]
    +
    +    # Check if we have the correct amount of questions columns:
    +    if len(questions_columns) != questions_amount:
    +        raise ValueError(
    +            f"The provided questions columns length ({len(questions_columns)}) "
    +            f"does not match the questions amount ({questions_amount})"
    +        )
    +
    +    # Load the generation config:
    +    if verbose:
    +        _LOGGER.info("Loading generation configuration.")
    +    generation_config = [
    +        transformers.GenerationConfig(**(cfg or {}))
    +        for cfg in _to_group_list(
    +            argument_value=generation_config,
    +            argument_name="generation_config",
    +            length=number_of_question_groups,
    +        )
    +    ]
    +    if verbose:
    +        _LOGGER.info(f"Generation configuration loaded: {generation_config}")
    +
    +    # Load the model and tokenizer into a pipeline object:
    +    if verbose:
    +        _LOGGER.info(f"Loading model '{model_name}'.")
    +    generation_pipeline = _get_generation_pipeline(
    +        model_name=model_name,
    +        device_map=device_map,
    +        tokenizer_name=tokenizer_name or model_name,
    +        model_kwargs=model_kwargs or {},
    +        tokenizer_kwargs=tokenizer_kwargs or {},
    +        auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length,
    +        batch_size=batch_size,
    +    )
    +    if verbose:
    +        _LOGGER.info("Model loaded.")
    +
    +    # Prepare the successes dataframe and errors dictionary to be returned:
    +    successes = []
    +    errors = {}
    +
    +    # Split the files into batches:
    +    file_batches = [
    +        text_files[i : i + batch_size]
    +        if i + batch_size < len(text_files)
    +        else text_files[i:]
    +        for i in range(0, len(text_files), batch_size)
    +    ]
    +    questions_config = _to_group_list(
    +        argument_value=questions_config,
    +        argument_name="questions_config",
    +        length=number_of_question_groups,
    +    )
    +
    +    # Create a list of question handlers according to given configs
    +    handlers = []
    +    for cfg in questions_config:
    +        question_type = cfg.pop("type", "default")
    +        handlers.append(QUESTION_MAPPING.get(question_type)(**cfg))
    +
    +    # Go over the batches of text files and question them:
    +    for file_batch in tqdm(
    +        file_batches,
    +        desc="Generating answers",
    +        unit=f"file (batch of {batch_size})",
    +        disable=not verbose,
    +    ):
    +        try:
    +            total_answers = [[] for _ in range(batch_size)]
    +
+            # Go over all question groups for this batch of documents
    +            for question_group in range(number_of_question_groups):
    +                current_questions_amount = len(questions[question_group])
    +
    +                # Read batch (read the text from the text files):
    +                batched_input = _read_file_batch(
    +                    file_batch=file_batch,
    +                    prompt_template=prompt_template[question_group],
    +                )
    +
    +                # Answer the questions with each question handler:
    +                batched_answers = handlers[question_group].answer(
    +                    questions_amount=current_questions_amount,
    +                    batched_input=batched_input,
    +                    generation_pipeline=generation_pipeline,
    +                    generation_config=generation_config[question_group],
    +                )
    +
    +                # Put the answers in the correct place in the total answers list according to the place in the batch:
    +                for i in range(batch_size):
    +                    total_answers[i].extend(batched_answers[i])
    +
    +            # Collect the answers and attach the file name:
    +            successes.extend(
    +                [
    +                    [file.name, *answers]
    +                    for file, answers in zip(file_batch, total_answers)
    +                ]
    +            )
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            batch_file_names = ", ".join([file.name for file in file_batch])
    +            if verbose:
    +                _LOGGER.warning(
    +                    f"Error in batch '{batch_file_names}': {str(exception)}"
    +                )
    +            errors[batch_file_names] = str(exception)
    +            continue
    +
    +    # Construct the answers dataframe:
    +    columns = [
    +        "text_file",
    +        *questions_columns,
    +    ]
    +
    +    # Create a data frame of answers by files
    +    successes = pd.DataFrame(
    +        successes,
    +        columns=columns,
    +    )
    +
+    # Log the head of the produced dataframe and return:
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(text_files)})\n"
    +            f"Answers summary:\n"
    +            f"{successes.head()}"
    +        )
    +    return successes, errors
    +
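+
+# A minimal usage sketch (hypothetical paths, model and questions; not part of the original
+# module):
+#
+#   answers_df, errored_files = answer_questions(
+#       data_path="./transcripts",                      # directory of text files
+#       model_name="some-org/some-causal-llm",          # any causal LM from the Hugging Face hub
+#       questions=["What is the issue?", "Was the issue resolved?"],
+#       questions_columns=["issue", "resolved"],
+#       generation_config={"max_new_tokens": 250, "do_sample": False},
+#       batch_size=2,
+#       verbose=True,
+#   )
+#   print(answers_df.head())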
    +
    +def _get_text_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +
    +        # Get all files inside the directory:
    +        text_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        text_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return text_files
    +
    +
    +def _get_prompt_template(
    +    text_wrapper: str,
    +    questions_wrapper: str,
    +    questions: List[str],
    +) -> str:
    +
    +    # Validate and build the text wrapper:
    +    text_wrapper = text_wrapper or (
    +        "Given the following text:\n" "-----\n" "{}\n" "-----"
    +    )
    +    if text_wrapper.count("{}") != 1:
    +        raise ValueError(
    +            "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about."
    +        )
    +
    +    # Validate and build the question wrapper:
    +    questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}"
    +    if questions_wrapper.count("{}") != 1:
    +        raise ValueError(
    +            "The `questions_wrapper` must include one placeholder '{}' for the list of questions."
    +        )
    +
    +    # Validate and parse the questions:
    +    if len(questions) == 0:
    +        raise ValueError("Please include at least one question.")
    +    questions = "\n".join(
    +        [f"{i}. {question}" for i, question in enumerate(questions, 1)]
    +    )
    +
    +    # Construct the template:
    +    return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n"
    +
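+
+# For illustration, with the default wrappers and two example questions the template above
+# renders as follows (the remaining '{}' is later filled with the file's text):
+#
+#   >>> _get_prompt_template(
+#   ...     text_wrapper="",
+#   ...     questions_wrapper="",
+#   ...     questions=["Who is the author?", "What year was it written?"],
+#   ... )
+#   'Given the following text:\n-----\n{}\n-----\nAnswer the questions:\n1. Who is the author?\n2. What year was it written?\n'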
    +
    +def _get_generation_pipeline(
    +    model_name: str,
    +    device_map: Union[str, dict],
    +    tokenizer_name: str,
    +    model_kwargs: dict,
    +    tokenizer_kwargs: dict,
    +    auto_gptq_exllama_max_input_length: int = None,
    +    batch_size: int = 1,
    +):
    +    # Load the model:
    +    model = transformers.AutoModelForCausalLM.from_pretrained(
    +        model_name, device_map=device_map, **model_kwargs
    +    )
    +
    +    # Set exllama max input length if provided:
    +    # This changes the model's context size.
    +    if auto_gptq_exllama_max_input_length:
    +        from auto_gptq import exllama_set_max_input_length
    +
    +        model = exllama_set_max_input_length(
    +            model=model, max_input_length=auto_gptq_exllama_max_input_length
    +        )
    +
    +    # Load the tokenizer:
    +    tokenizer = transformers.AutoTokenizer.from_pretrained(
    +        tokenizer_name, **tokenizer_kwargs
    +    )
    +
+    # Initialize a generation pipeline and return:
    +    pipe = transformers.pipeline(
    +        task="text-generation",
    +        model=model,
    +        tokenizer=tokenizer,
    +        batch_size=batch_size,
    +    )
    +    pipe.tokenizer.pad_token_id = model.config.eos_token_id
    +    return pipe
    +
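+
+# One possible set of arguments forwarded to `_get_generation_pipeline` through
+# `answer_questions` (illustrative assumptions, not defaults of this module):
+#
+#   device_map="auto",                     # let HF accelerate place the model
+#   model_kwargs={"torch_dtype": "auto"},  # passed to `from_pretrained`
+#   tokenizer_kwargs={"use_fast": True},   # passed to `AutoTokenizer.from_pretrained`
+#   batch_size=4,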
    +
    +def _read_file_batch(
    +    file_batch: List[pathlib.Path],
    +    prompt_template: str,
    +) -> List[str]:
    +    batch = []
    +
+    # Go over all files and read their contents into the prompt template
    +    for file in file_batch:
    +        with open(file, "r", encoding="utf-8") as fp:
    +            batch.append(prompt_template.format(fp.read()))
    +    return batch
    +
    +
    +def _to_group_list(argument_value: list, argument_name: str, length: int):
    +
+    # Ensure the value is a list (wrap it if not)
    +    argument_value = (
    +        argument_value if isinstance(argument_value, list) else [argument_value]
    +    )
    +    list_len = len(argument_value)
    +
+    # If the list is of length 1, duplicate it to the required length;
+    # if its length does not match, raise an error
    +    if list_len != length:
    +        if list_len == 1:
    +            return argument_value * length
    +        raise ValueError(
+            f"The length of the argument '{argument_name}' does not match the number of question groups ({length})"
    +        )
    +    return argument_value
    +
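+
+# Illustrative behaviour of `_to_group_list` (examples only, not part of the original module):
+#
+#   >>> _to_group_list(argument_value="{}", argument_name="text_wrapper", length=3)
+#   ['{}', '{}', '{}']
+#   >>> _to_group_list(argument_value=[{"type": "poll"}], argument_name="questions_config", length=2)
+#   [{'type': 'poll'}, {'type': 'poll'}]
+#   >>> _to_group_list(argument_value=[1, 2], argument_name="generation_config", length=3)  # raises ValueError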
    +
    +class QuestionHandler:
    +    """
+    A class for handling question answering for a given question type.
+    This class is used as a base class for all question types, and as the default question type (regular question
+    answering without any special handling).
    +    """
    +
    +    class ConfigKeys:
    +        pass
    +
    +    def __init__(self):
    +        pass
    +
    +    @staticmethod
    +    def _get_answers(generated_text: str, questions_amount: int) -> List[str]:
    +
+        # Clear the answer start (the part before the numbering):
+        # TODO: find a better way to verify this; for a list of questions, for example, this check may be redundant
    +        if "1." not in generated_text:
    +            raise ValueError(
    +                f"Answer 1. is missing from the generated text: '{generated_text}'"
    +            )
    +        text = generated_text.split("1.", 1)[1]
    +
    +        # Start extracting the answers:
    +        answers = []
    +        for i in range(1, questions_amount + 1):
    +            # If it's the last answer to look for, take the rest of the text:
    +            if i == questions_amount:
    +                answer_i = text
    +            # Verify there is a question number in the text:
    +            elif f"{i + 1}." not in text:
    +                raise ValueError(
    +                    f"Answer {i + 1}. is missing from the generated text: '{generated_text}'"
    +                )
    +            # Take i's answer:
    +            else:
    +                answer_i, text = text.split(f"{i + 1}.", 1)
    +            # Collect the answer removing redundant spaces:
    +            answers.append(answer_i.strip())
    +
    +        return answers
    +
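+    # For illustration, `_get_answers` splits a numbered completion into separate answers
+    # (hypothetical generated text):
+    #
+    #   >>> QuestionHandler._get_answers(generated_text="1. Jane Doe 2. 1999", questions_amount=2)
+    #   ['Jane Doe', '1999']
+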
    +    def _infer_questions(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +
    +        # Infer through the llm:
    +        batched_output = generation_pipeline(
    +            batched_input,
    +            generation_config=generation_config,
    +            eos_token_id=generation_pipeline.tokenizer.eos_token_id,
    +            return_full_text=False,
    +            num_return_sequences=1,
    +        )
    +
    +        # Process the outputs to get the answers:
    +        batched_answers = []
    +        for output in batched_output:
    +            # Get the generated answers:
    +            answers = self._get_answers(
    +                generated_text=output[0]["generated_text"],
    +                questions_amount=questions_amount,
    +            )
    +            # Collect the processed answers:
    +            batched_answers.append(answers)
    +        return batched_answers
    +
    +    def answer(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +        """
+        Answer the questions for the given batched inputs using the given generation pipeline and configuration.
    +        """
    +        return self._infer_questions(
    +            questions_amount=questions_amount,
    +            batched_input=batched_input,
    +            generation_pipeline=generation_pipeline,
    +            generation_config=generation_config,
    +        )
    +
    +
    +class PollQuestionHandler(QuestionHandler):
    +    """
    +    Static class to hold all the possible poll question configurations options keys
    +    """
    +
    +    class ConfigKeys:
    +        """
    +        A class for handling questions answering for poll type questions.
    +        These type of question are answered by asking the same question multiple times
    +        and choosing the most common answer or the average answer.
    +        """
    +
    +        #: The number of times to ask the same question.
    +        POLL_COUNT = "poll_count"
    +
    +        #: The strategy to use for choosing the answer from the poll.
    +        POLL_STRATEGY = "poll_strategy"
    +
    +    class Strategy(enum.Enum):
    +        #: The most common answer strategy.
    +        MOST_COMMON = "most_common"
    +
    +        #: The average answer strategy.
    +        AVERAGE = "average"
    +
    +        @staticmethod
    +        def most_common(answers):
    +            """
    +            Calculate the most common answer for a given list of answers.
    +            """
    +            count = Counter(answers)
    +            most_common = count.most_common(1)
    +            return most_common[0][0]
    +
    +        @staticmethod
    +        def average(answers):
    +            """
    +            Calculate the average answer for a given list of answers.
    +            """
    +            if isinstance(answers[0], str):
    +                raise ValueError(
+                    "Cannot use the 'average' poll strategy with non-numeric answers,"
+                    " please change the question to produce numeric data, or choose 'most_common' as the strategy."
    +                )
    +            else:
    +                numeric_values = answers
    +            avg = sum(numeric_values) / len(numeric_values)
    +
    +            # Round to the closest integer and return corresponding value
    +            return round(avg)
    +
    +        def do(self, answers):
    +            """
    +            Perform the strategy.
    +            """
    +            return getattr(self, self.value)(answers)
    +
    +    def __init__(
    +        self, poll_count: int = 5, poll_strategy: str = "most_common"):
    +        super().__init__()
    +        self.poll_count = poll_count
    +        self.poll_strategy = self.Strategy(poll_strategy)
    +
    +    def answer(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +        """
+        Answer the questions for the given batched inputs by polling the LLM through the given generation pipeline.
    +        """
    +        return self._answer_poll_questions(
    +            questions_amount=questions_amount,
    +            batched_input=batched_input,
    +            generation_pipeline=generation_pipeline,
    +            generation_config=generation_config,
    +        )
    +
    +    def _answer_poll_questions(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +        votes = []
    +
    +        # Run the poll for each question
    +        for _ in range(self.poll_count):
    +            batched_answers = self._infer_questions(
    +                questions_amount=questions_amount,
    +                batched_input=batched_input,
    +                generation_pipeline=generation_pipeline,
    +                generation_config=generation_config,
    +            )
    +            votes.append(batched_answers)
    +        answers = []
    +
    +        # Collect the answers according to the poll strategy
    +        # Average strategy works for numeric values only
    +        for batch in range(len(votes[0])):
    +            batched_answers = []
    +            for question in range(questions_amount):
    +                # Create a list of all answers to relevant question
    +                answer = [
    +                    votes[voter][batch][question] for voter in range(self.poll_count)
    +                ]
    +                answer = self.poll_strategy.do(answer)
    +                batched_answers.append(answer)
    +            answers.append(batched_answers)
    +        return answers
    +
    +
+# Holds the names of the available question handler types
    +class QuestionTypes:
    +    DEFAULT = "default"
    +    POLL = "poll"
    +
    +
    +# Maps question types to their handlers
    +QUESTION_MAPPING = {
    +    QuestionTypes.DEFAULT: QuestionHandler,
    +    QuestionTypes.POLL: PollQuestionHandler,
    +}
    +
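+# An illustrative sketch of wiring a poll handler through `questions_config` (hypothetical
+# paths, model and questions; not part of the original module):
+#
+#   answers_df, errors = answer_questions(
+#       data_path="./reviews",
+#       model_name="some-org/some-causal-lm",
+#       questions=[["Summarize the review."],
+#                  ["Is the review positive? Answer yes or no."]],
+#       questions_config=[{},  # default handler for the first group
+#                         {"type": "poll", "poll_count": 5, "poll_strategy": "most_common"}],
+#       questions_columns=["summary", "positive"],
+#   )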
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/question_answering/latest/src/function.yaml b/functions/master/question_answering/latest/src/function.yaml index a3361415..7491b17e 100644 --- a/functions/master/question_answering/latest/src/function.yaml +++ b/functions/master/question_answering/latest/src/function.yaml @@ -2,11 +2,13 @@ kind: job metadata: name: question-answering tag: '' - hash: 90e67d116b256a98da7d5819724e43df01d8b4eb + hash: aed62db95f17576c69b457767e3595c2de1d5465 project: '' labels: author: yonish categories: + - genai + - huggingface - machine-learning spec: command: '' diff --git a/functions/master/question_answering/latest/src/item.yaml b/functions/master/question_answering/latest/src/item.yaml index 58ab5cc3..56fc5a5e 100755 --- a/functions/master/question_answering/latest/src/item.yaml +++ b/functions/master/question_answering/latest/src/item.yaml @@ -1,5 +1,7 @@ apiVersion: v1 categories: +- genai +- huggingface - machine-learning description: GenAI approach of question answering on a given data doc: '' @@ -24,4 +26,4 @@ spec: - torch - tqdm url: '' -version: 0.3.1 +version: 0.4.0 diff --git a/functions/master/question_answering/latest/static/function.html b/functions/master/question_answering/latest/static/function.html index 0f9f7a27..dbaf6ff6 100644 --- a/functions/master/question_answering/latest/static/function.html +++ b/functions/master/question_answering/latest/static/function.html @@ -19,11 +19,13 @@ metadata: name: question-answering tag: '' - hash: 90e67d116b256a98da7d5819724e43df01d8b4eb + hash: aed62db95f17576c69b457767e3595c2de1d5465 project: '' labels: author: yonish categories: + - genai + - huggingface - machine-learning spec: command: '' diff --git a/functions/master/question_answering/latest/static/item.html b/functions/master/question_answering/latest/static/item.html index b7ae1749..d7a7898a 100644 --- a/functions/master/question_answering/latest/static/item.html +++ b/functions/master/question_answering/latest/static/item.html @@ -17,6 +17,8 @@ apiVersion: v1 categories: +- genai +- huggingface - machine-learning description: GenAI approach of question answering on a given data doc: '' @@ -41,7 +43,7 @@ - torch - tqdm url: '' -version: 0.3.1 +version: 0.4.0 diff --git a/functions/master/silero_vad/1.3.0/src/assets/test_data.wav b/functions/master/silero_vad/1.3.0/src/assets/test_data.wav new file mode 100644 index 00000000..a3a993c2 Binary files /dev/null and b/functions/master/silero_vad/1.3.0/src/assets/test_data.wav differ diff --git a/functions/master/silero_vad/1.3.0/src/function.yaml b/functions/master/silero_vad/1.3.0/src/function.yaml new file mode 100644 index 00000000..8ec121a6 --- /dev/null +++ b/functions/master/silero_vad/1.3.0/src/function.yaml @@ -0,0 +1,291 @@ +kind: job +metadata: + name: silero-vad + tag: '' + hash: 59336f808643a74f3a2c5d506977387010427208 + project: '' + labels: + author: guyl + categories: + - deep-learning + - pytorch + - audio +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from multiprocessing import Process, Queue
from pathlib import Path
from types import FunctionType
from typing import Dict, List, Tuple, Type, Union

import torch
import torchaudio
from tqdm import tqdm


class BaseTask:
    """
    A base class for a task to complete after VAD.
    """

    def __init__(self, audio_file: Path):
        """
        Initialize the base task.

        :param audio_file: The audio file assigned to the task.
        """
        # Store the audio file:
        self._audio_file = audio_file

        # Prepare the result:
        self._result = None

    @property
    def audio_file(self) -> Path:
        """
        Get the audio file of the task.

        :returns: The audio file of the task.
        """
        return self._audio_file

    def do_task(
        self, speech_timestamps: Union[List[Dict[str, int]], List[List[Dict[str, int]]]]
    ):
        """
        Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.

        :param speech_timestamps: The speech timestamps to do the task on as outputted from the VAD.
        """
        self._result = speech_timestamps

    def get_result(self) -> Tuple[str, list]:
        """
        Get the result of the task. A tuple of the audio file name and the result.

        :returns: The result of the task.
        """
        return self._audio_file.name, self._result

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        return self.__class__.__name__, {"audio_file": self._audio_file}


class SpeechDiarizationTask(BaseTask):
    """
    A speech diarization task. The task will diarize the VAD speech timestamps into speakers.
    """

    def __init__(self, audio_file: Path, speaker_labels: List[str]):
        """
        Initialize the speech diarization task.

        :param audio_file:     The audio file assigned to the task.
        :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be named
                               "speaker_0", "speaker_1", etc.
        """
        super().__init__(audio_file=audio_file)
        self._speaker_labels = speaker_labels

    def do_task(self, speech_timestamps: List[List[Dict[str, int]]]):
        """
        Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.

        :param speech_timestamps: The speech timestamps per channel to do the task on as outputted from the VAD.
        """
        # Get the speaker labels (set default if not given):
        speaker_labels = self._speaker_labels or [
            f"speaker_{i}" for i in range(len(speech_timestamps))
        ]

        # Diarize - organize the speech timestamps into a single list of speakers and sort it by start time:
        speech_diarization = [
            (speech_timestamp["start"], speech_timestamp["end"], speaker_label)
            for speaker_label, channel_speech_timestamps in zip(
                speaker_labels, speech_timestamps
            )
            for speech_timestamp in channel_speech_timestamps
        ]
        speech_diarization.sort()
        self._result = speech_diarization

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        task_class, task_kwargs = super().to_tuple()
        return task_class, {**task_kwargs, "speaker_labels": self._speaker_labels}


class TaskCreator:
    """
    A task creator to create different tasks to run after the VAD.
    """

    #: A map from task class name to task class to use in `from_tuple`:
    _MAP = {
        BaseTask.__name__: BaseTask,
        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
    }

    def __init__(self, task_type: Type[BaseTask], task_kwargs: dict = None):
        """
        Initialize the task creator.
        :param task_type: The task type - a `BaseTask` subclass.
        :param task_kwargs: Additional keyword arguments to pass to the to be created tasks.
        """
        self._task_type = task_type
        self._task_kwargs = task_kwargs or {}

    def create_task(self, audio_file: Path) -> BaseTask:
        """
        Create a task with the given audio file.

        :param audio_file: The audio file to assign to the task.

        :returns: The created task.
        """
        return self._task_type(audio_file=audio_file, **self._task_kwargs)

    @classmethod
    def from_tuple(cls, task_tuple: Tuple[str, dict]) -> BaseTask:
        """
        Create a task from a tuple of the audio file name and the task kwargs.

        :param task_tuple: The task tuple to create the task from.

        :returns: The created task.
        """
        task_class, task_kwargs = task_tuple
        return cls._MAP[task_class](**task_kwargs)
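
# An illustrative round trip of the task serialization used for multiprocessing (hypothetical
# file name and speaker labels): a task is converted with `to_tuple` so it can be passed
# through a queue, then rebuilt on the worker side with `TaskCreator.from_tuple`:
#
#   creator = TaskCreator(
#       task_type=SpeechDiarizationTask,
#       task_kwargs={"speaker_labels": ["agent", "client"]},
#   )
#   task = creator.create_task(audio_file=Path("call_0.wav"))
#   task_tuple = task.to_tuple()                  # ("SpeechDiarizationTask", {...})
#   rebuilt = TaskCreator.from_tuple(task_tuple=task_tuple)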


class VoiceActivityDetector:
    """
    A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad.
    """

    def __init__(
        self,
        # Model loading kwargs:
        use_onnx: bool = True,
        force_onnx_cpu: bool = True,
        # Detection kwargs:
        threshold: float = 0.5,
        sampling_rate: int = 16_000,
        min_speech_duration_ms: int = 250,
        max_speech_duration_s: float = float("inf"),
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
        return_seconds: bool = False,
        per_channel: bool = False,
    ):
        """
        Initialize the voice activity detector.

        :param use_onnx:                Whether to use ONNX for inference. Default is True.
        :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
        :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                        probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                        this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                        most datasets.
        :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
        :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
        :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                        `max_speech_duration_s` will be split at the timestamp of the last silence that
                                        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise,
                                        they will be split aggressively just before max_speech_duration_s.
        :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before
                                        separating it.
        :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
                                        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                        sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                        these may affect model performance!
        :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
        :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
                                        samples (default - False).
        :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD
                                        on each channel separately and return a list of timestamps per channel.
        """
        # Store configurations:
        self._use_onnx = use_onnx
        self._force_onnx_cpu = force_onnx_cpu
        self._threshold = threshold
        self._sampling_rate = sampling_rate
        self._min_speech_duration_ms = min_speech_duration_ms
        self._max_speech_duration_s = max_speech_duration_s
        self._min_silence_duration_ms = min_silence_duration_ms
        self._window_size_samples = window_size_samples
        self._speech_pad_ms = speech_pad_ms
        self._return_seconds = return_seconds
        self._per_channel = per_channel

        # Prepare the model variables
        self._model: torch.nn.Module = None
        self._get_speech_timestamps: FunctionType = None

    def load(self, force_reload: bool = True):
        """
        Load the VAD model.

        :param force_reload: Whether to force reload the model even if it was already loaded. Default is True.
        """
        model, utils = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            force_reload=force_reload,
            onnx=self._use_onnx,
            force_onnx_cpu=self._force_onnx_cpu,
        )
        self._model = model
        (
            self._get_speech_timestamps,
            _,  # save_audio,
            _,  # read_audio,
            _,  # VADIterator,
            _,  # collect_chunks
        ) = utils

    def detect_voice(
        self,
        audio_file: Path,
    ) -> Union[List[Dict[str, int]], List[List[Dict[str, int]]]]:
        """
        Infer the audio through the VAD model and return the speech timestamps.

        :param audio_file: The audio file to infer.

        :returns: The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the
                 following keys:

                 * "start": The start sample index of the speech in the audio.
                 * "end":   The end sample index of the speech in the audio.

                 If `per_channel` is True, a list of timestamps per channel will be returned.
        """
        # Read the audio file into a tensor:
        audio = self._read_audio(audio_file)

        # Detect speech:
        if not self._per_channel:
            return self._get_speech_timestamps(
                audio,
                self._model,
                threshold=self._threshold,
                min_speech_duration_ms=self._min_speech_duration_ms,
                max_speech_duration_s=self._max_speech_duration_s,
                min_silence_duration_ms=self._min_silence_duration_ms,
                speech_pad_ms=self._speech_pad_ms,
                sampling_rate=self._sampling_rate,
                window_size_samples=self._window_size_samples,
                return_seconds=self._return_seconds,
            )

        # Per channel:
        speech_timestamps = []
        for channel in audio:
            speech_timestamps.append(
                self._get_speech_timestamps(
                    channel,
                    self._model,
                    threshold=self._threshold,
                    min_speech_duration_ms=self._min_speech_duration_ms,
                    max_speech_duration_s=self._max_speech_duration_s,
                    min_silence_duration_ms=self._min_silence_duration_ms,
                    speech_pad_ms=self._speech_pad_ms,
                    sampling_rate=self._sampling_rate,
                    window_size_samples=self._window_size_samples,
                    return_seconds=self._return_seconds,
                )
            )

        return speech_timestamps

    def _read_audio(
        self,
        path: Path,
    ) -> torch.Tensor:
        """
        Read the audio from the given path and return it as a tensor.

        :param path: The path to the audio file.

        :returns: The audio as a tensor.
        """
        # Read the audio:
        audio, sampling_rate = torchaudio.load(str(path))

        # Check if the audio is stereo and if so, convert it to mono (only if not per channel):
        if audio.size(0) > 1 and not self._per_channel:
            audio = audio.mean(dim=0, keepdim=True)

        # Resample the audio if needed:
        if sampling_rate != self._sampling_rate:
            transform = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, new_freq=self._sampling_rate
            )
            audio = transform(audio)

        # Return the audio (squeeze if not per channel):
        return audio if self._per_channel else audio.squeeze(0)
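

# Illustrative usage sketch: shows how the `VoiceActivityDetector` wrapper above is
# typically driven end to end. The audio path "example.wav" is a hypothetical
# placeholder, not a file shipped with this function.
def _example_voice_activity_detector_usage():
    """A minimal, hypothetical example of using `VoiceActivityDetector` directly."""
    vad = VoiceActivityDetector(
        use_onnx=True,
        force_onnx_cpu=True,
        threshold=0.5,
        sampling_rate=16_000,
        return_seconds=False,  # Timestamps will be sample indices.
    )
    # Download / load the silero-vad model via `torch.hub`:
    vad.load(force_reload=False)
    # Run the detection and print the speech segments:
    timestamps = vad.detect_voice(audio_file=Path("example.wav"))
    for timestamp in timestamps:
        print(f"Speech from sample {timestamp['start']} to {timestamp['end']}")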


#: The value to send into multiprocessing queues to stop the process:
_MULTIPROCESSING_STOP_MARK = "STOP"


def _multiprocessing_complete_tasks(
    vad_init_kwargs: dict, tasks_queue: Queue, results_queue: Queue
):
    """
    Complete the tasks in the given queue and put the results in the given results queue. The function stops once the
    stop mark is received from the tasks queue. It is meant to be run as a multiprocessing process.

    :param vad_init_kwargs: The VAD initialization kwargs.
    :param tasks_queue:     A queue to get the tasks from.
    :param results_queue:   A queue to put the results in.
    """
    # Initialize and load the VAD:
    vad = VoiceActivityDetector(**vad_init_kwargs)
    vad.load(force_reload=False)

    # Start listening to the tasks queue:
    while True:
        # Get the task:
        task: Tuple[str, dict] = tasks_queue.get()
        if task == _MULTIPROCESSING_STOP_MARK:
            break
        try:
            # Create the task:
            task = TaskCreator.from_tuple(task_tuple=task)
            # Run the file through the VAD:
            speech_timestamps = vad.detect_voice(audio_file=task.audio_file)
            # Complete the task:
            task.do_task(speech_timestamps=speech_timestamps)
            # Build the result:
            result = (False, task.get_result())
        except Exception as exception:
            # Build the error:
            result = (True, (task.audio_file.name, str(exception)))
        # Collect the result / error:
        results_queue.put(result)

    # Mark the end of the tasks:
    results_queue.put(_MULTIPROCESSING_STOP_MARK)
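

# Illustrative multiprocessing sketch: shows how a single worker process could be wired
# to the queue protocol implemented above. The audio path "example.wav" and the VAD
# kwargs are hypothetical placeholders.
def _example_single_worker_run():
    """A minimal, hypothetical example of driving `_multiprocessing_complete_tasks`."""
    tasks_queue, results_queue = Queue(), Queue()
    worker = Process(
        target=_multiprocessing_complete_tasks,
        kwargs={
            "vad_init_kwargs": {"sampling_rate": 16_000},
            "tasks_queue": tasks_queue,
            "results_queue": results_queue,
        },
    )
    worker.start()
    # Send a single task tuple followed by the stop mark:
    task = TaskCreator(task_type=BaseTask).create_task(audio_file=Path("example.wav"))
    tasks_queue.put(task.to_tuple())
    tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
    # Collect results until the worker echoes the stop mark back:
    results = []
    while True:
        result = results_queue.get()
        if result == _MULTIPROCESSING_STOP_MARK:
            break
        results.append(result)
    worker.join()
    return results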


# Get the global logger:
try:
    import mlrun

    _LOGGER = mlrun.get_or_create_ctx("silero_vad").logger
except ModuleNotFoundError:
    _LOGGER = logging.getLogger()


def detect_voice(
    # Input kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    # Model loading kwargs:
    use_onnx: bool = True,
    force_onnx_cpu: bool = True,
    # Detection kwargs:
    threshold: float = 0.5,
    sampling_rate: int = 16_000,
    min_speech_duration_ms: int = 250,
    max_speech_duration_s: float = float("inf"),
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 512,
    speech_pad_ms: int = 30,
    return_seconds: bool = False,
    per_channel: bool = False,
    # Other kwargs:
    use_multiprocessing: int = 0,
    verbose: bool = False,
):
    """
    Perform voice activity detection on given audio files using the silero VAD model -
    https://github.com/snakers4/silero-vad. The end result is a dictionary with the file names as keys and their
    VAD timestamps dictionaries as value.

    For example::

        {
            "file_1.wav": [
                {"start": 0, "end": 16000},
                {"start": 16000, "end": 32000},
                {"start": 32000, "end": 48000},
                ...
            ],
            "file_2.wav": [
                {"start": 0, "end": 16000},
                {"start": 16000, "end": 32000},
                {"start": 32000, "end": 48000},
                ...
            ],
            ...
        }


    :param data_path:               The path to the audio files to detect voice in. Can be a path to a single file,
                                    a path to a directory or a list of paths to files.
    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                    most datasets.
    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
                                    be split aggressively just before max_speech_duration_s.
    :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before
                                    separating it.
    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.

                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                    these may affect model performance!
    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms on each side.
    :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
                                    samples (default - False).
    :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD on
                                    each channel separately and return a list of timestamps per channel.
    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
                                    be used. Default is 0.
    :param verbose:                 Verbosity.
    """
    global _LOGGER

    # Collect the input audio files to detect voice in:
    if verbose:
        _LOGGER.info("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    if verbose:
        _LOGGER.info(f"Collected {len(audio_files)} audio files.")

    # Prepare the VAD initialization keyword arguments:
    vad_init_kwargs = {
        "use_onnx": use_onnx,
        "force_onnx_cpu": force_onnx_cpu,
        "threshold": threshold,
        "sampling_rate": sampling_rate,
        "min_speech_duration_ms": min_speech_duration_ms,
        "max_speech_duration_s": max_speech_duration_s,
        "min_silence_duration_ms": min_silence_duration_ms,
        "window_size_samples": window_size_samples,
        "speech_pad_ms": speech_pad_ms,
        "return_seconds": return_seconds,
        "per_channel": per_channel,
    }

    # Create the task creator:
    task_creator = TaskCreator(task_type=BaseTask)

    # Run the voice detection:
    if use_multiprocessing:
        results = _parallel_run(
            n_workers=use_multiprocessing,
            audio_files=audio_files,
            description="Detecting voice",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )
    else:
        results = _run(
            audio_files=audio_files,
            description="Detecting voice",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )

    # Process the results:
    return _process_results(results=results, verbose=verbose)
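

# Illustrative usage sketch: runs `detect_voice` over a hypothetical "./audio" directory
# and unpacks the (successes, errors) pair returned by `_process_results`.
def _example_detect_voice_usage():
    """A minimal, hypothetical example of running `detect_voice` on a directory."""
    successes, errors = detect_voice(
        data_path="./audio",    # Hypothetical directory of audio files.
        return_seconds=True,    # Timestamps in seconds instead of sample indices.
        use_multiprocessing=0,  # Run in the current process.
        verbose=True,
    )
    for file_name, timestamps in successes.items():
        print(file_name, timestamps)
    for file_name, error in errors.items():
        print(f"Failed on {file_name}: {error}")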


def diarize(
    # Input / Output kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    # Model loading kwargs:
    use_onnx: bool = True,
    force_onnx_cpu: bool = True,
    # Detection kwargs:
    threshold: float = 0.5,
    sampling_rate: int = 16_000,
    min_speech_duration_ms: int = 250,
    max_speech_duration_s: float = float("inf"),
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 512,
    speech_pad_ms: int = 30,
    # Diarization kwargs:
    speaker_labels: List[str] = None,
    # Other kwargs:
    use_multiprocessing: int = 0,
    verbose: bool = False,
):
    """
    Perform speech diarization on given audio files using the silero VAD model - https://github.com/snakers4/silero-vad.
    The speech diarization is performed per channel so that each channel in the audio belongs to a different speaker. The
    end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list
    of tuples: (start, end, speaker_label).

    For example::

        {
            "file_1.wav": [
                (0.0, 1.0, "speaker_0"),
                (1.0, 2.0, "speaker_1"),
                (2.0, 3.0, "speaker_0"),
                ...
            ],
            "file_2.wav": [
                (0.0, 1.0, "speaker_0"),
                (1.0, 2.0, "speaker_1"),
                (2.0, 3.0, "speaker_0"),
                ...
            ],
            ...
        }


    :param data_path:               The path to the audio files to diarize. Can be a path to a single file, a path to a
                                    directory or a list of paths to files.
    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                    most datasets.
    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
                                    be split aggressively just before max_speech_duration_s.
    :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before
                                    separating it.
    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.

                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                    these may affect model performance!
    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms on each side.
    :param speaker_labels:          The speaker labels to use for the diarization. If not given, the speakers will be
                                    named "speaker_0", "speaker_1", etc.
    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
                                    be used. Default is 0.
    :param verbose:                 Verbosity.
    """
    global _LOGGER

    # Collect the input audio files to diarize:
    if verbose:
        _LOGGER.info("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    if verbose:
        _LOGGER.info(f"Collected {len(audio_files)} audio files.")

    # Prepare the VAD initialization keyword arguments:
    vad_init_kwargs = {
        "use_onnx": use_onnx,
        "force_onnx_cpu": force_onnx_cpu,
        "threshold": threshold,
        "sampling_rate": sampling_rate,
        "min_speech_duration_ms": min_speech_duration_ms,
        "max_speech_duration_s": max_speech_duration_s,
        "min_silence_duration_ms": min_silence_duration_ms,
        "window_size_samples": window_size_samples,
        "speech_pad_ms": speech_pad_ms,
        "return_seconds": True,
        "per_channel": True,
    }

    # Create the task creator:
    task_creator = TaskCreator(
        task_type=SpeechDiarizationTask, task_kwargs={"speaker_labels": speaker_labels}
    )

    # Run the diarization:
    if use_multiprocessing:
        results = _parallel_run(
            n_workers=use_multiprocessing,
            audio_files=audio_files,
            description="Diarizing",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )
    else:
        results = _run(
            audio_files=audio_files,
            description="Diarizing",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )

    # Process the results:
    return _process_results(results=results, verbose=verbose)
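

# Illustrative usage sketch: runs `diarize` on a hypothetical "./calls" directory of
# stereo recordings, where each channel carries a different speaker.
def _example_diarize_usage():
    """A minimal, hypothetical example of running `diarize` with named speakers."""
    speech_diarization, errors = diarize(
        data_path="./calls",
        speaker_labels=["agent", "client"],  # Channel 0 -> agent, channel 1 -> client.
        use_multiprocessing=0,
        verbose=True,
    )
    for file_name, segments in speech_diarization.items():
        for start, end, speaker in segments:
            print(f"{file_name}: {speaker} spoke from {start:.2f}s to {end:.2f}s")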


def _get_audio_files(
    data_path: Union[Path, str, list],
) -> List[Path]:
    """
    Get the audio files from the data path. If a path to a directory is given, all files in the directory will be
    collected.

    :param data_path: The data path to collect the audio files from.

    :returns: The audio files list.
    """
    # Check if given a list of paths:
    if isinstance(data_path, list):
        audio_files = []
        for path in data_path:
            audio_files.extend(_get_audio_files(data_path=path))
        return audio_files

    # Check if given a single string path to cast it to a `pathlib.Path`:
    if isinstance(data_path, str):
        data_path = Path(data_path).absolute()

    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        audio_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        audio_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory or a "
            f"file. Given: {str(data_path)}"
        )

    return audio_files
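

# Illustrative path-resolution sketch: the paths below are hypothetical placeholders for
# existing files and directories.
def _example_get_audio_files():
    """A minimal, hypothetical example of how `_get_audio_files` resolves its input."""
    # A single file path resolves to a one-element list of `pathlib.Path` objects:
    single = _get_audio_files(data_path="example.wav")
    # A directory is globbed for all of its files, and lists are flattened recursively:
    mixed = _get_audio_files(data_path=["example.wav", "./audio"])
    return single, mixed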


def _run(
    audio_files: List[Path],
    description: str,
    vad_init_kwargs: dict,
    task_creator: TaskCreator,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, list]]]:
    """
    Load a VAD and use it to complete the tasks created from the provided files by the given task creator.

    :param audio_files:     The audio files to use.
    :param description:     The description to use for the progress bar.
    :param vad_init_kwargs: The VAD initialization keyword arguments.
    :param task_creator:    The task creator to use to create the tasks.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Load the VAD:
    vad = VoiceActivityDetector(**vad_init_kwargs)
    if verbose:
        _LOGGER.info("Loading the VAD model.")
    vad.load()
    if verbose:
        _LOGGER.info("VAD model loaded.")

    # Run the VAD on the audio files and collect the results:
    results = []
    for audio_file in tqdm(
        audio_files,
        desc=description,
        unit="file",
        total=len(audio_files),
        disable=not verbose,
    ):
        try:
            # Create the task:
            task = task_creator.create_task(audio_file=audio_file)
            # Run the file through the VAD:
            speech_timestamps = vad.detect_voice(audio_file=audio_file)
            # Complete the task:
            task.do_task(speech_timestamps=speech_timestamps)
            # Collect the result:
            results.append((False, task.get_result()))
        except Exception as exception:
            # Collect the error:
            results.append((True, (audio_file.name, str(exception))))

    return results


def _parallel_run(
    n_workers: int,
    audio_files: List[Path],
    description: str,
    vad_init_kwargs: dict,
    task_creator: TaskCreator,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, list]]]:
    """
    Run multiple VAD workers with multiprocessing to complete the tasks created from the provided files by the given
    task creator.

    :param n_workers:       The number of workers to use.
    :param audio_files:     The audio files to use.
    :param description:     The description to use for the progress bar.
    :param vad_init_kwargs: The VAD initialization keyword arguments.
    :param task_creator:    The task creator to use to create the tasks.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Load the VAD (download the model once here; each worker process will load it again later):
    if verbose:
        _LOGGER.info("Loading the VAD model.")
    vad = VoiceActivityDetector(**vad_init_kwargs)
    vad.load()
    if verbose:
        _LOGGER.info("VAD model loaded.")

    # Check the number of workers:
    if n_workers > len(audio_files):
        _LOGGER.warning(
            f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). "
            f"Setting the number of workers to {len(audio_files)}."
        )
        n_workers = len(audio_files)

    # Initialize the multiprocessing queues:
    tasks_queue = Queue()
    results_queue = Queue()

    # Initialize the multiprocessing processes:
    task_completion_processes = [
        Process(
            target=_multiprocessing_complete_tasks,
            kwargs={
                "vad_init_kwargs": vad_init_kwargs,
                "tasks_queue": tasks_queue,
                "results_queue": results_queue,
            },
        )
        for _ in range(n_workers)
    ]

    # Start the multiprocessing processes:
    for p in task_completion_processes:
        p.start()

    # Put the tasks in the queue:
    for audio_file in audio_files:
        tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple())

    # Put the stop marks in the queue:
    for _ in range(n_workers):
        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)

    # Collect the results:
    results = []
    stop_marks_counter = 0
    with tqdm(
        desc=description,
        unit="file",
        total=len(audio_files),
        disable=not verbose,
    ) as progressbar:
        while True:
            # Get a result from the queue:
            result: Tuple[bool, Tuple[str, list]] = results_queue.get()
            if result == _MULTIPROCESSING_STOP_MARK:
                stop_marks_counter += 1
                if stop_marks_counter == n_workers:
                    break
            else:
                # Collect the result:
                results.append(result)
                progressbar.update(1)

    # Wait for the processes to finish:
    for p in task_completion_processes:
        p.join()

    return results


def _process_results(
    results: List[Tuple[bool, Tuple[str, list]]], verbose: bool
) -> Tuple[dict, dict]:
    """
    Process the results of the tasks.

    :param results: The results to process.
    :param verbose: Verbosity.

    :returns: The processed results as a tuple of successes and errors.
    """
    if verbose:
        _LOGGER.info("Summarizing the results.")
    successes = {}
    errors = {}
    for is_error, result in results:
        if is_error:
            errors[result[0]] = result[1]
        else:
            successes[result[0]] = result[1]
    if verbose:
        _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n")

    return successes, errors
+ base_image: mlrun/mlrun
+ commands: []
+ code_origin: ''
+ origin_filename: ''
+ requirements:
+ - torch
+ - torchaudio
+ - tqdm
+ - onnxruntime
+ description: Silero VAD (Voice Activity Detection) functions.
+ default_handler: detect_voice
+ disable_auto_mount: false
+ clone_target_dir: ''
+ env: []
+ priority_class_name: ''
+ preemption_mode: prevent
+ affinity: null
+ tolerations: null
+ security_context: {}
+verbose: false
diff --git a/functions/master/silero_vad/1.3.0/src/item.yaml b/functions/master/silero_vad/1.3.0/src/item.yaml
new file mode 100644
index 00000000..9ce9a5d2
--- /dev/null
+++ b/functions/master/silero_vad/1.3.0/src/item.yaml
@@ -0,0 +1,30 @@
+apiVersion: v1
+categories:
+- deep-learning
+- pytorch
+- audio
+description: Silero VAD (Voice Activity Detection) functions.
+doc: ''
+example: silero_vad.ipynb
+generationDate: 2023-12-03:14-30
+hidden: false
+icon: ''
+labels:
+  author: guyl
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.5.2
+name: silero_vad
+platformVersion: 3.5.3
+spec:
+  filename: silero_vad.py
+  handler: detect_voice
+  image: mlrun/mlrun
+  kind: job
+  requirements:
+  - torch
+  - torchaudio
+  - tqdm
+  - onnxruntime
+url: ''
+version: 1.3.0
diff --git a/functions/master/silero_vad/1.3.0/src/silero_vad.ipynb b/functions/master/silero_vad/1.3.0/src/silero_vad.ipynb
new file mode 100644
index 00000000..29cd7437
diff --git a/functions/master/silero_vad/1.3.0/src/silero_vad.py b/functions/master/silero_vad/1.3.0/src/silero_vad.py
new file mode 100644
index 00000000..a477d4ec
diff --git a/functions/master/silero_vad/1.3.0/src/test_silero_vad.py b/functions/master/silero_vad/1.3.0/src/test_silero_vad.py
new file mode 100644
index 00000000..d46471a5
--- /dev/null
+++ b/functions/master/silero_vad/1.3.0/src/test_silero_vad.py
@@ -0,0 +1,44 @@
+import os
+import tempfile
+
+import mlrun
+import pytest
+
+
+@pytest.fixture()
+def setup_test():
+    with tempfile.TemporaryDirectory() as artifact_path:
+        project = mlrun.get_or_create_project(name="default", context=artifact_path)
+        func = project.set_function(
+            func=os.path.abspath("./function.yaml"),
+            name="silero-vad",
+            image="mlrun/mlrun",
+        )
+        yield func, artifact_path
+
+
+def test_detect_voice(setup_test):
+    silero_vad_function, artifact_path = setup_test
+    run = silero_vad_function.run(
+        handler="detect_voice",
+        inputs={"data_path": "./assets"},
+        returns=["vad_outputs: file", "errors: file"],
+        artifact_path=artifact_path,
+        local=True,
+    )
+    assert run.outputs["vad_outputs"]
+
+
+def test_diarize(setup_test):
+    silero_vad_function, artifact_path = setup_test
+    run = silero_vad_function.run(
+        handler="diarize",
+        inputs={"data_path": "./assets"},
+        params={
+            "speakers_labels": ["Agent", "Client"],
+        },
+        returns=["speech_diarization: file", "errors: file"],
+        artifact_path=artifact_path,
+        local=True,
+    )
+    assert run.outputs["speech_diarization"]
diff --git a/functions/master/silero_vad/1.3.0/static/documentation.html b/functions/master/silero_vad/1.3.0/static/documentation.html
new file mode 100644
index 00000000..d9cd1445
--- /dev/null
+++ b/functions/master/silero_vad/1.3.0/static/documentation.html
@@ -0,0 +1,481 @@
+silero_vad package
+
+Submodules
+
+silero_vad.silero_vad module
+
+class silero_vad.silero_vad.BaseTask(audio_file: pathlib.Path)
+    Bases: object. A base class for a task to complete after VAD.
+    - property audio_file: pathlib.Path - Get the audio file of the task.
+    - do_task(speech_timestamps) - Do the task on the given speech timestamps. The base task simply saves the speech timestamps as the result.
+    - get_result() -> Tuple[str, list] - Get the result of the task: a tuple of the audio file name and the result.
+    - to_tuple() -> Tuple[str, dict] - Convert the task to a tuple so it can be reconstructed later (used to pass tasks through multiprocessing queues).
+
+class silero_vad.silero_vad.SpeechDiarizationTask(audio_file: pathlib.Path, speaker_labels: List[str])
+    Bases: BaseTask. A speech diarization task that turns the VAD speech timestamps into speakers.
+    - do_task(speech_timestamps) - Diarize the per-channel VAD speech timestamps into speakers.
+    - to_tuple() -> Tuple[str, dict] - Convert the task to a tuple so it can be reconstructed later.
+
+class silero_vad.silero_vad.TaskCreator(task_type: Type[BaseTask], task_kwargs: Optional[dict] = None)
+    Bases: object. A task creator to create different tasks to run after the VAD.
+    - create_task(audio_file: pathlib.Path) -> BaseTask - Create a task with the given audio file.
+    - classmethod from_tuple(task_tuple: Tuple[str, dict]) -> BaseTask - Create a task from a tuple of the audio file name and the task kwargs.
+
+class silero_vad.silero_vad.VoiceActivityDetector(use_onnx=True, force_onnx_cpu=True, threshold=0.5, sampling_rate=16000, min_speech_duration_ms=250, max_speech_duration_s=inf, min_silence_duration_ms=100, window_size_samples=512, speech_pad_ms=30, return_seconds=False, per_channel=False)
+    Bases: object. A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad.
+    - detect_voice(audio_file: pathlib.Path) - Infer the audio through the VAD model and return the speech timestamps: a list of dictionaries with "start" and "end" sample indices (a list per channel when per_channel is True).
+    - load(force_reload: bool = True) - Load the VAD model.
+
+silero_vad.silero_vad.detect_voice(data_path, use_onnx=True, force_onnx_cpu=True, threshold=0.5, sampling_rate=16000, min_speech_duration_ms=250, max_speech_duration_s=inf, min_silence_duration_ms=100, window_size_samples=512, speech_pad_ms=30, return_seconds=False, per_channel=False, use_multiprocessing=0, verbose=False)
+    Perform voice activity detection on the given audio files using the silero VAD model. The result is a dictionary mapping each file name to its VAD timestamps, for example {"file_1.wav": [{"start": 0, "end": 16000}, ...], ...}. The full parameter documentation appears in the module source below.
+
+silero_vad.silero_vad.diarize(data_path, use_onnx=True, force_onnx_cpu=True, threshold=0.5, sampling_rate=16000, min_speech_duration_ms=250, max_speech_duration_s=inf, min_silence_duration_ms=100, window_size_samples=512, speech_pad_ms=30, speaker_labels=None, use_multiprocessing=0, verbose=False)
+    Perform speech diarization on the given audio files using the silero VAD model. Diarization is performed per channel, so each channel in the audio belongs to a different speaker. The result is a dictionary mapping each file name to its diarization - a list of (start, end, speaker_label) tuples, for example {"file_1.wav": [(0.0, 1.0, "speaker_0"), (1.0, 2.0, "speaker_1"), ...], ...}. The full parameter documentation appears in the module source below.
+
+Module contents
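For orientation, the following is a minimal, illustrative sketch of invoking the two handlers above through MLRun, mirroring the accompanying test file; the `hub://silero_vad` URL, the project name, the `./assets` folder and the speaker names are assumptions rather than part of the hub source::

    import mlrun

    # Set up a project and import the hub function (assumed hub URL):
    project = mlrun.get_or_create_project("silero-vad-demo", context="./")
    fn = mlrun.import_function("hub://silero_vad")

    # Run the diarize handler locally on a folder of stereo call recordings:
    run = fn.run(
        handler="diarize",
        inputs={"data_path": "./assets"},
        params={"speaker_labels": ["Agent", "Client"]},  # channel 0 -> Agent, channel 1 -> Client
        returns=["speech_diarization: file", "errors: file"],
        local=True,
    )
    print(run.outputs["speech_diarization"])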
\ No newline at end of file
diff --git a/functions/master/silero_vad/1.3.0/static/example.html b/functions/master/silero_vad/1.3.0/static/example.html
new file mode 100644
index 00000000..56d0efa9
--- /dev/null
+++ b/functions/master/silero_vad/1.3.0/static/example.html
@@ -0,0 +1,190 @@
+<no title>
\ No newline at end of file
diff --git a/functions/master/silero_vad/1.3.0/static/function.html b/functions/master/silero_vad/1.3.0/static/function.html
new file mode 100644
index 00000000..3fd4119d
--- /dev/null
+++ b/functions/master/silero_vad/1.3.0/static/function.html
@@ -0,0 +1,313 @@
+Source
    +        
    +kind: job
    +metadata:
    +  name: silero-vad
    +  tag: ''
    +  hash: 59336f808643a74f3a2c5d506977387010427208
    +  project: ''
    +  labels:
    +    author: guyl
    +  categories:
    +  - deep-learning
    +  - pytorch
    +  - audio
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from multiprocessing import Process, Queue
from pathlib import Path
from types import FunctionType
from typing import Dict, List, Tuple, Type, Union

import torch
import torchaudio
from tqdm import tqdm


class BaseTask:
    """
    A base class for a task to complete after VAD.
    """

    def __init__(self, audio_file: Path):
        """
        Initialize the base task.

        :param audio_file: The audio file assigned to the task.
        """
        # Store the audio file:
        self._audio_file = audio_file

        # Prepare the result:
        self._result = None

    @property
    def audio_file(self) -> Path:
        """
        Get the audio file of the task.

        :returns: The audio file of the task.
        """
        return self._audio_file

    def do_task(
        self, speech_timestamps: Union[List[Dict[str, int]], List[List[Dict[str, int]]]]
    ):
        """
        Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.

        :param speech_timestamps: The speech timestamps to do the task on as outputted from the VAD.
        """
        self._result = speech_timestamps

    def get_result(self) -> Tuple[str, list]:
        """
        Get the result of the task. A tuple of the audio file name and the result.

        :returns: The result of the task.
        """
        return self._audio_file.name, self._result

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        return self.__class__.__name__, {"audio_file": self._audio_file}


class SpeechDiarizationTask(BaseTask):
    """
    A speech diarization task. The task will diarize the VAD speech timestamps into speakers.
    """

    def __init__(self, audio_file: Path, speaker_labels: List[str]):
        """
        Initialize the speech diarization task.

        :param audio_file:     The audio file assigned to the task.
        :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be named
                               "speaker_0", "speaker_1", etc.
        """
        super().__init__(audio_file=audio_file)
        self._speaker_labels = speaker_labels

    def do_task(self, speech_timestamps: List[List[Dict[str, int]]]):
        """
        Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.

        :param speech_timestamps: The speech timestamps per channel to do the task on as outputted from the VAD.
        """
        # Get the speaker labels (set default if not given):
        speaker_labels = self._speaker_labels or [
            f"speaker_{i}" for i in range(len(speech_timestamps))
        ]

        # Diarize - organize the speech timestamps into a single list of speakers and sort it by start time:
        speech_diarization = [
            (speech_timestamp["start"], speech_timestamp["end"], speaker_label)
            for speaker_label, channel_speech_timestamps in zip(
                speaker_labels, speech_timestamps
            )
            for speech_timestamp in channel_speech_timestamps
        ]
        speech_diarization.sort()
        self._result = speech_diarization

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        task_class, task_kwargs = super().to_tuple()
        return task_class, {**task_kwargs, "speaker_labels": self._speaker_labels}


class TaskCreator:
    """
    A task creator to create different tasks to run after the VAD.
    """

    #: A map from task class name to task class to use in `from_tuple`:
    _MAP = {
        BaseTask.__name__: BaseTask,
        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
    }

    def __init__(self, task_type: Type[BaseTask], task_kwargs: dict = None):
        """
        Initialize the task creator.

        :param task_type:   The task type - a `BaseTask` subclass.
        :param task_kwargs: Additional keyword arguments to pass to the tasks that will be created.
        """
        self._task_type = task_type
        self._task_kwargs = task_kwargs or {}

    def create_task(self, audio_file: Path) -> BaseTask:
        """
        Create a task with the given audio file.

        :param audio_file: The audio file to assign to the task.

        :returns: The created task.
        """
        return self._task_type(audio_file=audio_file, **self._task_kwargs)

    @classmethod
    def from_tuple(cls, task_tuple: Tuple[str, dict]) -> BaseTask:
        """
        Create a task from a tuple of the audio file name and the task kwargs.

        :param task_tuple: The task tuple to create the task from.

        :returns: The created task.
        """
        task_class, task_kwargs = task_tuple
        return cls._MAP[task_class](**task_kwargs)


class VoiceActivityDetector:
    """
    A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad.
    """

    def __init__(
        self,
        # Model loading kwargs:
        use_onnx: bool = True,
        force_onnx_cpu: bool = True,
        # Detection kwargs:
        threshold: float = 0.5,
        sampling_rate: int = 16_000,
        min_speech_duration_ms: int = 250,
        max_speech_duration_s: float = float("inf"),
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
        return_seconds: bool = False,
        per_channel: bool = False,
    ):
        """
        Initialize the voice activity detector.

        :param use_onnx:                Whether to use ONNX for inference. Default is True.
        :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
        :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                        probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                        this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                        most datasets.
        :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
        :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
        :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                        `max_speech_duration_s` will be split at the timestamp of the last silence that
                                        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise,
                                        they will be split aggressively just before max_speech_duration_s.
        :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before
                                        separating it.
        :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
                                        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                        sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                        these may affect model performance!
        :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
        :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
                                        samples (default - False).
        :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD
                                        on each channel separately and return a list of timestamps per channel.
        """
        # Store configurations:
        self._use_onnx = use_onnx
        self._force_onnx_cpu = force_onnx_cpu
        self._threshold = threshold
        self._sampling_rate = sampling_rate
        self._min_speech_duration_ms = min_speech_duration_ms
        self._max_speech_duration_s = max_speech_duration_s
        self._min_silence_duration_ms = min_silence_duration_ms
        self._window_size_samples = window_size_samples
        self._speech_pad_ms = speech_pad_ms
        self._return_seconds = return_seconds
        self._per_channel = per_channel

        # Prepare the model variables
        self._model: torch.nn.Module = None
        self._get_speech_timestamps: FunctionType = None

    def load(self, force_reload: bool = True):
        """
        Load the VAD model.

        :param force_reload: Whether to force reload the model even if it was already loaded. Default is True.
        """
        model, utils = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            force_reload=force_reload,
            onnx=self._use_onnx,
            force_onnx_cpu=self._force_onnx_cpu,
        )
        self._model = model
        (
            self._get_speech_timestamps,
            _,  # save_audio,
            _,  # read_audio,
            _,  # VADIterator,
            _,  # collect_chunks
        ) = utils

    def detect_voice(
        self,
        audio_file: Path,
    ) -> Union[List[Dict[str, int]], List[List[Dict[str, int]]]]:
        """
        Infer the audio through the VAD model and return the speech timestamps.

        :param audio_file: The audio file to infer.

        :returns: The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the
                 following keys:

                 * "start": The start sample index of the speech in the audio.
                 * "end":   The end sample index of the speech in the audio.

                 If `per_channel` is True, a list of timestamps per channel will be returned.
        """
        # Cast to a numpy array:
        audio = self._read_audio(audio_file)

        # Detect speech:
        if not self._per_channel:
            return self._get_speech_timestamps(
                audio,
                self._model,
                threshold=self._threshold,
                min_speech_duration_ms=self._min_speech_duration_ms,
                max_speech_duration_s=self._max_speech_duration_s,
                min_silence_duration_ms=self._min_silence_duration_ms,
                speech_pad_ms=self._speech_pad_ms,
                sampling_rate=self._sampling_rate,
                window_size_samples=self._window_size_samples,
                return_seconds=self._return_seconds,
            )

        # Per channel:
        speech_timestamps = []
        for channel in audio:
            speech_timestamps.append(
                self._get_speech_timestamps(
                    channel,
                    self._model,
                    threshold=self._threshold,
                    min_speech_duration_ms=self._min_speech_duration_ms,
                    max_speech_duration_s=self._max_speech_duration_s,
                    min_silence_duration_ms=self._min_silence_duration_ms,
                    speech_pad_ms=self._speech_pad_ms,
                    sampling_rate=self._sampling_rate,
                    window_size_samples=self._window_size_samples,
                    return_seconds=self._return_seconds,
                )
            )

        return speech_timestamps

    def _read_audio(
        self,
        path: Path,
    ) -> torch.Tensor:
        """
        Read the audio from the given path and return it as a tensor.

        :param path: The path to the audio file.

        :returns: The audio as a tensor.
        """
        # Read the audio:
        audio, sampling_rate = torchaudio.load(str(path))

        # Check if the audio is stereo and if so, convert it to mono (only if not per channel):
        if audio.size(0) > 1 and not self._per_channel:
            audio = audio.mean(dim=0, keepdim=True)

        # Resample the audio if needed:
        if sampling_rate != self._sampling_rate:
            transform = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, new_freq=self._sampling_rate
            )
            audio = transform(audio)

        # Return the audio (squeeze if not per channel):
        return audio if self._per_channel else audio.squeeze(0)
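# Illustrative sketch (not from the hub source): the `VoiceActivityDetector` wrapper can
# also be used directly on a single file; "call.wav" is an assumed local file.
#
#     vad = VoiceActivityDetector(return_seconds=True)
#     vad.load()
#     speech_timestamps = vad.detect_voice(audio_file=Path("call.wav"))
#     # -> e.g. [{"start": 0.5, "end": 3.2}, ...] (seconds, because return_seconds=True)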


#: The value to send into multiprocessing queues to stop the process:
_MULTIPROCESSING_STOP_MARK = "STOP"


def _multiprocessing_complete_tasks(
    vad_init_kwargs: dict, tasks_queue: Queue, results_queue: Queue
):
    """
    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
    the given tasks queue receives the stop mark. It is intended to be run as a multiprocessing process.

    :param vad_init_kwargs: The VAD initialization kwargs.
    :param tasks_queue:     A queue to get the tasks from.
    :param results_queue:   A queue to put the results in.
    """
    # Initialize and load the VAD:
    vad = VoiceActivityDetector(**vad_init_kwargs)
    vad.load(force_reload=False)

    # Start listening to the tasks queue:
    while True:
        # Get the task:
        task: Tuple[str, dict] = tasks_queue.get()
        if task == _MULTIPROCESSING_STOP_MARK:
            break
        try:
            # Create the task:
            task = TaskCreator.from_tuple(task_tuple=task)
            # Run the file through the VAD:
            speech_timestamps = vad.detect_voice(audio_file=task.audio_file)
            # Complete the task:
            task.do_task(speech_timestamps=speech_timestamps)
            # Build the result:
            result = (False, task.get_result())
        except Exception as exception:
            # Build the error:
            result = (True, (task.audio_file.name, str(exception)))
        # Collect the result / error:
        results_queue.put(result)

    # Mark the end of the tasks:
    results_queue.put(_MULTIPROCESSING_STOP_MARK)


# Get the global logger:
try:
    import mlrun

    _LOGGER = mlrun.get_or_create_ctx("silero_vad").logger
except ModuleNotFoundError:
    _LOGGER = logging.getLogger()


def detect_voice(
    # Input kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    # Model loading kwargs:
    use_onnx: bool = True,
    force_onnx_cpu: bool = True,
    # Detection kwargs:
    threshold: float = 0.5,
    sampling_rate: int = 16_000,
    min_speech_duration_ms: int = 250,
    max_speech_duration_s: float = float("inf"),
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 512,
    speech_pad_ms: int = 30,
    return_seconds: bool = False,
    per_channel: bool = False,
    # Other kwargs:
    use_multiprocessing: int = 0,
    verbose: bool = False,
):
    """
    Perform voice activity detection on given audio files using the silero VAD model -
    https://github.com/snakers4/silero-vad. The end result is a dictionary with the file names as keys and their
    VAD timestamp dictionaries as values.

    For example::

        {
            "file_1.wav": [
                {"start": 0, "end": 16000},
                {"start": 16000, "end": 32000},
                {"start": 32000, "end": 48000},
                ...
            ],
            "file_2.wav": [
                {"start": 0, "end": 16000},
                {"start": 16000, "end": 32000},
                {"start": 32000, "end": 48000},
                ...
            ],
            ...
        }


    :param data_path:               The path to the audio files to detect voice in. Can be a path to a single file, a
                                    path to a directory or a list of paths to files.
    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                    most datasets.
    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
                                    be split aggressively just before max_speech_duration_s.
    :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before separating
                                    it.
    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.

                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                    these may affect model performance!
    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
                                    samples (default - False).
    :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD on
                                    each channel separately and return a list of timestamps per channel.
    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
                                    be used. Default is 0.
    :param verbose:                 Verbosity.
    """
    global _LOGGER

    # Get the input audio files to transcribe:
    if verbose:
        _LOGGER.info("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    if verbose:
        _LOGGER.info(f"Collected {len(audio_files)} audio files.")

    # Initialize the transcription pipeline:
    vad_init_kwargs = {
        "use_onnx": use_onnx,
        "force_onnx_cpu": force_onnx_cpu,
        "threshold": threshold,
        "sampling_rate": sampling_rate,
        "min_speech_duration_ms": min_speech_duration_ms,
        "max_speech_duration_s": max_speech_duration_s,
        "min_silence_duration_ms": min_silence_duration_ms,
        "window_size_samples": window_size_samples,
        "speech_pad_ms": speech_pad_ms,
        "return_seconds": return_seconds,
        "per_channel": per_channel,
    }

    # Create the task creator:
    task_creator = TaskCreator(task_type=BaseTask)

    # Run the transcription:
    if use_multiprocessing:
        results = _parallel_run(
            n_workers=use_multiprocessing,
            audio_files=audio_files,
            description="Detecting voice",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )
    else:
        results = _run(
            audio_files=audio_files,
            description="Detecting voice",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )

    # Process the results:
    return _process_results(results=results, verbose=verbose)
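# Illustrative sketch (not from the hub source): calling `detect_voice` directly on a
# local folder of audio files; "./assets" is an assumed path.
#
#     successes, errors = detect_voice(data_path="./assets", return_seconds=True, verbose=True)
#     for file_name, segments in successes.items():
#         print(file_name, segments)  # e.g. [{"start": 0.5, "end": 3.2}, ...]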


def diarize(
    # Input / Output kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    # Model loading kwargs:
    use_onnx: bool = True,
    force_onnx_cpu: bool = True,
    # Detection kwargs:
    threshold: float = 0.5,
    sampling_rate: int = 16_000,
    min_speech_duration_ms: int = 250,
    max_speech_duration_s: float = float("inf"),
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 512,
    speech_pad_ms: int = 30,
    # Diarization kwargs:
    speaker_labels: List[str] = None,
    # Other kwargs:
    use_multiprocessing: int = 0,
    verbose: bool = False,
):
    """
    Perform speech diarization on given audio files using the silero VAD model - https://github.com/snakers4/silero-vad.
    The speech diarization is performed per channel so that each channel in the audio belongs to a different speaker. The
    end result is a dictionary with the file names as keys and their diarization as values. A diarization is a list
    of tuples: (start, end, speaker_label).

    For example::

        {
            "file_1.wav": [
                (0.0, 1.0, "speaker_0"),
                (1.0, 2.0, "speaker_1"),
                (2.0, 3.0, "speaker_0"),
                ...
            ],
            "file_2.wav": [
                (0.0, 1.0, "speaker_0"),
                (1.0, 2.0, "speaker_1"),
                (2.0, 3.0, "speaker_0"),
                ...
            ],
            ...
        }


    :param data_path:               The path to the audio files to diarize. Can be a path to a single file, a path to a
                                    directory or a list of paths to files.
    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                    most datasets.
    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
                                    be split aggressively just before max_speech_duration_s.
    :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before separating
                                    it.
    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.

                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                    these may affect model performance!
    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    :param speaker_labels:          The speaker labels to use for the diarization. If not given, the speakers will be
                                    named "speaker_0", "speaker_1", etc.
    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
                                    be used. Default is 0.
    :param verbose:                 Verbosity.
    """
    global _LOGGER

    # Get the input audio files to transcribe:
    if verbose:
        _LOGGER.info("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    if verbose:
        _LOGGER.info(f"Collected {len(audio_files)} audio files.")

    # Initialize the transcription pipeline:
    vad_init_kwargs = {
        "use_onnx": use_onnx,
        "force_onnx_cpu": force_onnx_cpu,
        "threshold": threshold,
        "sampling_rate": sampling_rate,
        "min_speech_duration_ms": min_speech_duration_ms,
        "max_speech_duration_s": max_speech_duration_s,
        "min_silence_duration_ms": min_silence_duration_ms,
        "window_size_samples": window_size_samples,
        "speech_pad_ms": speech_pad_ms,
        "return_seconds": True,
        "per_channel": True,
    }

    # Create the task creator:
    task_creator = TaskCreator(
        task_type=SpeechDiarizationTask, task_kwargs={"speaker_labels": speaker_labels}
    )

    # Run the transcription:
    if use_multiprocessing:
        results = _parallel_run(
            n_workers=use_multiprocessing,
            audio_files=audio_files,
            description="Diarizing",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )
    else:
        results = _run(
            audio_files=audio_files,
            description="Diarizing",
            vad_init_kwargs=vad_init_kwargs,
            task_creator=task_creator,
            verbose=verbose,
        )

    # Process the results:
    return _process_results(results=results, verbose=verbose)
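# Illustrative sketch (not from the hub source): calling `diarize` directly on a folder of
# stereo call recordings, mapping channel 0 to "Agent" and channel 1 to "Client";
# "./calls" is an assumed path.
#
#     successes, errors = diarize(data_path="./calls", speaker_labels=["Agent", "Client"])
#     for file_name, diarization in successes.items():
#         for start, end, speaker in diarization:
#             print(f"{file_name}: {speaker} spoke from {start:.2f}s to {end:.2f}s")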


def _get_audio_files(
    data_path: Union[Path, str, list],
) -> List[Path]:
    """
    Get the audio files from the data path. If a path to a directory is given, all files in the directory will be
    collected.

    :param data_path: The data path to collect the audio files from.

    :returns: The audio files list.
    """
    # Check if given a list of paths:
    if isinstance(data_path, list):
        audio_files = []
        for path in data_path:
            audio_files.extend(_get_audio_files(data_path=path))
        return audio_files

    # Check if given a single string path to cast it to a `pathlib.Path`:
    if isinstance(data_path, str):
        data_path = Path(data_path).absolute()

    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        audio_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        audio_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
            f"file. Given: {str(data_path)} "
        )

    return audio_files


def _run(
    audio_files: List[Path],
    description: str,
    vad_init_kwargs: dict,
    task_creator: TaskCreator,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, list]]]:
    """
    Load a VAD and use it to complete the tasks that will be created on the provided files using the given task creator.

    :param audio_files:     The audio files to use.
    :param description:     The description to use for the progress bar.
    :param vad_init_kwargs: The VAD initialization keyword arguments.
    :param task_creator:    The task creator to use to create the tasks.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Load the VAD:
    vad = VoiceActivityDetector(**vad_init_kwargs)
    if verbose:
        _LOGGER.info(f"Loading the VAD model.")
    vad.load()
    if verbose:
        _LOGGER.info("VAD model loaded.")

    # Run the VAD on the audio files and collect the results:
    results = []
    for audio_file in tqdm(
        audio_files,
        desc=description,
        unit="file",
        total=len(audio_files),
        disable=not verbose,
    ):
        try:
            # Create the task:
            task = task_creator.create_task(audio_file=audio_file)
            # Run the file through the VAD:
            speech_timestamps = vad.detect_voice(audio_file=audio_file)
            # Complete the task:
            task.do_task(speech_timestamps=speech_timestamps)
            # Collect the result:
            results.append((False, task.get_result()))
        except Exception as exception:
            # Collect the error:
            results.append((True, (audio_file.name, str(exception))))

    return results


def _parallel_run(
    n_workers: int,
    audio_files: List[Path],
    description: str,
    vad_init_kwargs: dict,
    task_creator: TaskCreator,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, list]]]:
    """
    Run multiple VAD workers with multiprocessing to complete the tasks that will be created on the provided files using
    the given task creator.

    :param n_workers:       The number of workers to use.
    :param audio_files:     The audio files to use.
    :param description:     The description to use for the progress bar.
    :param vad_init_kwargs: The VAD initialization keyword arguments.
    :param task_creator:    The task creator to use to create the tasks.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Load the VAD (download the model once here; each worker process will load it again later):
    if verbose:
        _LOGGER.info(f"Loading the VAD model.")
    vad = VoiceActivityDetector(**vad_init_kwargs)
    vad.load()
    if verbose:
        _LOGGER.info("VAD model loaded.")

    # Check the number of workers:
    if n_workers > len(audio_files):
        _LOGGER.warning(
            f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). "
            f"Setting the number of workers to {len(audio_files)}."
        )
        n_workers = len(audio_files)

    # Initialize the multiprocessing queues:
    tasks_queue = Queue()
    results_queue = Queue()

    # Initialize the multiprocessing processes:
    task_completion_processes = [
        Process(
            target=_multiprocessing_complete_tasks,
            kwargs={
                "vad_init_kwargs": vad_init_kwargs,
                "tasks_queue": tasks_queue,
                "results_queue": results_queue,
            },
        )
        for _ in range(n_workers)
    ]

    # Start the multiprocessing processes:
    for p in task_completion_processes:
        p.start()

    # Put the tasks in the queue:
    for audio_file in audio_files:
        tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple())

    # Put the stop marks in the queue:
    for _ in range(n_workers):
        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)

    # Collect the results:
    results = []
    stop_marks_counter = 0
    with tqdm(
        desc=description,
        unit="file",
        total=len(audio_files),
        disable=not verbose,
    ) as progressbar:
        while True:
            # Get a result from the queue:
            result: Tuple[bool, Tuple[str, list]] = results_queue.get()
            if result == _MULTIPROCESSING_STOP_MARK:
                stop_marks_counter += 1
                if stop_marks_counter == n_workers:
                    break
            else:
                # Collect the result:
                results.append(result)
                progressbar.update(1)

    # Wait for the processes to finish:
    for p in task_completion_processes:
        p.join()

    return results


def _process_results(
    results: List[Tuple[bool, Tuple[str, list]]], verbose: bool
) -> Tuple[dict, dict]:
    """
    Process the results of the tasks.

    :param results: The results to process.
    :param verbose: Verbosity.

    :returns: The processed results as a tuple of successes and errors.
    """
    if verbose:
        _LOGGER.info("Summarizing the results.")
    successes = {}
    errors = {}
    for is_error, result in results:
        if is_error:
            errors[result[0]] = result[1]
        else:
            successes[result[0]] = result[1]
    if verbose:
        _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n")

    return successes, errors

    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - torch
    +    - torchaudio
    +    - tqdm
    +    - onnxruntime
    +  entry_points:
    +    audio_file:
    +      name: audio_file
    +      doc: Get the audio file of the task.
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The audio file of the task.
    +        type: Path
    +      lineno: 43
    +      has_varargs: false
    +      has_kwargs: false
    +    do_task:
    +      name: do_task
    +      doc: Do the task on the given speech timestamps. The task will diarize the VAD
    +        speech timestamps into speakers.
    +      parameters:
    +      - name: self
    +      - name: speech_timestamps
    +        type: List[List[Dict[str, int]]]
    +        doc: The speech timestamps per channel to do the task on as outputted from
    +          the VAD.
    +      outputs: []
    +      lineno: 94
    +      has_varargs: false
    +      has_kwargs: false
    +    get_result:
    +      name: get_result
    +      doc: Get the result of the task. A tuple of the audio file name and the result.
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The result of the task.
    +        type: Tuple[str, list]
    +      lineno: 61
    +      has_varargs: false
    +      has_kwargs: false
    +    to_tuple:
    +      name: to_tuple
    +      doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing
    +        to pass in queue).
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The converted task.
    +        type: Tuple[str, dict]
    +      lineno: 116
    +      has_varargs: false
    +      has_kwargs: false
    +    create_task:
    +      name: create_task
    +      doc: Create a task with the given audio file.
    +      parameters:
    +      - name: self
    +      - name: audio_file
    +        type: Path
    +        doc: The audio file to assign to the task.
    +      outputs:
    +      - doc: The created task.
    +        type: BaseTask
    +      lineno: 146
    +      has_varargs: false
    +      has_kwargs: false
    +    from_tuple:
    +      name: from_tuple
    +      doc: Create a task from a tuple of the audio file name and the task kwargs.
    +      parameters:
    +      - name: cls
    +      - name: task_tuple
    +        type: Tuple[str, dict]
    +        doc: The task tuple to create the task from.
    +      outputs:
    +      - doc: The created task.
    +        type: BaseTask
    +      lineno: 157
    +      has_varargs: false
    +      has_kwargs: false
    +    load:
    +      name: load
    +      doc: Load the VAD model.
    +      parameters:
    +      - name: self
    +      - name: force_reload
    +        type: bool
    +        doc: Whether to force reload the model even if it was already loaded. Default
    +          is True.
    +        default: true
    +      outputs: []
    +      lineno: 234
    +      has_varargs: false
    +      has_kwargs: false
    +    detect_voice:
    +      name: detect_voice
    +      doc: "Perform voice activity detection on given audio files using the silero\
    +        \ VAD model -\nhttps://github.com/snakers4/silero-vad. The end result is a\
    +        \ dictionary with the file names as keys and their\nVAD timestamps dictionaries\
    +        \ as value.\n\nFor example::\n\n    {\n        \"file_1.wav\": [\n       \
    +        \     {\"start\": 0, \"end\": 16000},\n            {\"start\": 16000, \"end\"\
    +        : 32000},\n            {\"start\": 32000, \"end\": 48000},\n            ...\n\
    +        \        ],\n        \"file_2.wav\": [\n            {\"start\": 0, \"end\"\
    +        : 16000},\n            {\"start\": 16000, \"end\": 32000},\n            {\"\
    +        start\": 32000, \"end\": 48000},\n            ...\n        ],\n        ...\n\
    +        \    }"
    +      parameters:
    +      - name: data_path
    +        type: Union[str, Path, List[Union[str, Path]]]
    +        doc: The path to the audio files to detect voice in. Can be a path to a single
    +          file, a path to a directory or a list of paths to files.
    +      - name: use_onnx
    +        type: bool
    +        doc: Whether to use ONNX for inference. Default is True.
    +        default: true
    +      - name: force_onnx_cpu
    +        type: bool
    +        doc: Whether to force ONNX to use CPU for inference. Default is True.
    +        default: true
    +      - name: threshold
    +        type: float
    +        doc: Speech threshold. Silero VAD outputs speech probabilities for each audio
    +          chunk, probabilities ABOVE this value are considered as SPEECH. It is better
    +          to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty
    +          good for most datasets.
    +        default: 0.5
    +      - name: sampling_rate
    +        type: int
    +        doc: Currently, silero VAD models support 8000 and 16000 sample rates.
    +        default: 16000
    +      - name: min_speech_duration_ms
    +        type: int
    +        doc: Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +        default: 250
    +      - name: max_speech_duration_s
    +        type: float
    +        doc: Maximum duration of speech chunks in seconds. Chunks longer than `max_speech_duration_s`
    +          will be split at the timestamp of the last silence that lasts more than
    +          100ms (if any), to prevent aggressive cutting. Otherwise, they will be split
    +          aggressively just before max_speech_duration_s.
    +        default: float('inf')
    +      - name: min_silence_duration_ms
    +        type: int
    +        doc: At the end of each speech chunk, wait for min_silence_duration_ms before
    +          separating it.
    +        default: 100
    +      - name: window_size_samples
    +        type: int
    +        doc: Audio chunks of window_size_samples size are fed to the silero VAD model.
    +        default: 512
    +      - name: speech_pad_ms
    +        type: int
    +        doc: Final speech chunks are padded by speech_pad_ms each side.
    +        default: 30
    +      - name: return_seconds
    +        type: bool
    +        doc: Whether to return timestamps in seconds. False means to return timestamps
    +          in samples (default - False).
    +        default: false
    +      - name: per_channel
    +        type: bool
    +        doc: Whether to return timestamps per channel (default - False). This will
    +          run VAD on each channel separately and return a list of timestamps per channel.
    +        default: false
    +      - name: use_multiprocessing
    +        type: int
    +        doc: The number of workers to use for multiprocessing. If 0, no multiprocessing
    +          will be used. Default is 0.
    +        default: 0
    +      - name: verbose
    +        type: bool
    +        doc: Verbosity.
    +        default: false
    +      outputs: []
    +      lineno: 393
    +      has_varargs: false
    +      has_kwargs: false
    +    diarize:
    +      name: diarize
    +      doc: "Perform speech diarization on given audio files using the silero VAD model\
    +        \ - https://github.com/snakers4/silero-vad.\nThe speech diarization is performed\
    +        \ per channel so that each channel in the audio belongs to a different speaker.\
    +        \ The\nend result is a dictionary with the file names as keys and their diarization\
    +        \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\
    +        \nFor example::\n\n    {\n        \"file_1.wav\": [\n            (0.0, 1.0,\
    +        \ \"speaker_0\"),\n            (1.0, 2.0, \"speaker_1\"),\n            (2.0,\
    +        \ 3.0, \"speaker_0\"),\n            ...\n        ],\n        \"file_2.wav\"\
    +        : [\n            (0.0, 1.0, \"speaker_0\"),\n            (1.0, 2.0, \"speaker_1\"\
    +        ),\n            (2.0, 3.0, \"speaker_0\"),\n            ...\n        ],\n\
    +        \        ...\n    }"
    +      parameters:
    +      - name: data_path
    +        type: Union[str, Path, List[Union[str, Path]]]
    +        doc: The path to the audio files to diarize. Can be a path to a single file,
    +          a path to a directory or a list of paths to files.
    +      - name: use_onnx
    +        type: bool
    +        doc: Whether to use ONNX for inference. Default is True.
    +        default: true
    +      - name: force_onnx_cpu
    +        type: bool
    +        doc: Whether to force ONNX to use CPU for inference. Default is True.
    +        default: true
    +      - name: threshold
    +        type: float
    +        doc: Speech threshold. Silero VAD outputs speech probabilities for each audio
    +          chunk, probabilities ABOVE this value are considered as SPEECH. It is better
    +          to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty
    +          good for most datasets.
    +        default: 0.5
    +      - name: sampling_rate
    +        type: int
    +        doc: Currently, silero VAD models support 8000 and 16000 sample rates.
    +        default: 16000
    +      - name: min_speech_duration_ms
    +        type: int
    +        doc: Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +        default: 250
    +      - name: max_speech_duration_s
    +        type: float
    +        doc: Maximum duration of speech chunks in seconds. Chunks longer than `max_speech_duration_s`
    +          will be split at the timestamp of the last silence that lasts more than
    +          100ms (if any), to prevent aggressive cutting. Otherwise, they will be split
    +          aggressively just before max_speech_duration_s.
    +        default: float('inf')
    +      - name: min_silence_duration_ms
    +        type: int
    +        doc: At the end of each speech chunk, wait for min_silence_duration_ms before
    +          separating it.
    +        default: 100
    +      - name: window_size_samples
    +        type: int
    +        doc: Audio chunks of window_size_samples size are fed to the silero VAD model.
    +        default: 512
    +      - name: speech_pad_ms
    +        type: int
    +        doc: Final speech chunks are padded by speech_pad_ms on each side.
    +        default: 30
    +      - name: speaker_labels
    +        type: List[str]
    +        doc: The speaker labels to use for the diarization. If not given, the speakers
    +          will be named "speaker_0", "speaker_1", etc.
    +        default: null
    +      - name: use_multiprocessing
    +        type: int
    +        doc: The number of workers to use for multiprocessing. If 0, no multiprocessing
    +          will be used. Default is 0.
    +        default: 0
    +      - name: verbose
    +        type: bool
    +        doc: Verbosity.
    +        default: false
    +      outputs: []
    +      lineno: 517
    +      has_varargs: false
    +      has_kwargs: false
    +  description: Silero VAD (Voice Activity Detection) functions.
    +  default_handler: detect_voice
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
    +
    +        
    +    
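For orientation, here is a minimal local-usage sketch of the two handlers specified above (`detect_voice` and `diarize`). It is illustrative only: it assumes `silero_vad.py` and its requirements (torch, torchaudio, tqdm, onnxruntime) are installed locally, and `./audio` and the speaker labels are placeholders.

```python
from silero_vad import detect_voice, diarize

# Voice activity detection: returns (successes, errors) dictionaries keyed by file name.
successes, errors = detect_voice(
    data_path="./audio",   # a single file, a directory, or a list of paths
    threshold=0.5,         # probabilities above this value are treated as speech
    return_seconds=True,   # timestamps in seconds instead of sample indices
    verbose=True,
)

# Per-channel speaker diarization: each audio channel is treated as a separate speaker.
diarized, failed = diarize(data_path="./audio", speaker_labels=["speaker_0", "speaker_1"])
```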
    \ No newline at end of file
    diff --git a/functions/master/silero_vad/1.3.0/static/item.html b/functions/master/silero_vad/1.3.0/static/item.html
    new file mode 100644
    index 00000000..0188aec0
    --- /dev/null
    +++ b/functions/master/silero_vad/1.3.0/static/item.html
    @@ -0,0 +1,52 @@
    +        
    +apiVersion: v1
    +categories:
    +- deep-learning
    +- pytorch
    +- audio
    +description: Silero VAD (Voice Activity Detection) functions.
    +doc: ''
    +example: silero_vad.ipynb
    +generationDate: 2023-12-03:14-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: guyl
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.5.2
    +name: silero_vad
    +platformVersion: 3.5.3
    +spec:
    +  filename: silero_vad.py
    +  handler: detect_voice
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +  - torch
    +  - torchaudio
    +  - tqdm
    +  - onnxruntime
    +url: ''
    +version: 1.3.0
    +
    +        
    +    
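The same item can also be pulled from the MLRun function hub and run as a job, roughly as sketched below. This is a hedged sketch, assuming a configured MLRun environment; the path under `params` is a placeholder.

```python
import mlrun

# Import the hub item and run its default handler (detect_voice) locally.
fn = mlrun.import_function("hub://silero_vad")
run = fn.run(
    handler="detect_voice",
    params={"data_path": "./audio", "return_seconds": True, "verbose": True},
    local=True,
)
```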
    \ No newline at end of file
    diff --git a/functions/master/silero_vad/1.3.0/static/silero_vad.html b/functions/master/silero_vad/1.3.0/static/silero_vad.html
    new file mode 100644
    index 00000000..ae769dad
    --- /dev/null
    +++ b/functions/master/silero_vad/1.3.0/static/silero_vad.html
    @@ -0,0 +1,987 @@
    Source code for silero_vad.silero_vad

    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from types import FunctionType
    +from typing import Dict, List, Tuple, Type, Union
    +
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +
    +
    \ No newline at end of file
    diff --git a/functions/master/silero_vad/1.3.0/static/source.html b/functions/master/silero_vad/1.3.0/static/source.html
    new file mode 100644
    index 00000000..d4fba18f
    --- /dev/null
    +++ b/functions/master/silero_vad/1.3.0/static/source.html
    @@ -0,0 +1,869 @@
    +        
    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from types import FunctionType
    +from typing import Dict, List, Tuple, Type, Union
    +
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +
    +class BaseTask:
    +    """
    +    A base class for a task to complete after VAD.
    +    """
    +
    +    def __init__(self, audio_file: Path):
    +        """
    +        Initialize the base task.
    +
    +        :param audio_file: The audio file assigned to the task.
    +        """
    +        # Store the audio file:
    +        self._audio_file = audio_file
    +
    +        # Prepare the result:
    +        self._result = None
    +
    +    @property
    +    def audio_file(self) -> Path:
    +        """
    +        Get the audio file of the task.
    +
    +        :returns: The audio file of the task.
    +        """
    +        return self._audio_file
    +
    +    def do_task(
    +        self, speech_timestamps: Union[List[Dict[str, int]], List[List[Dict[str, int]]]]
    +    ):
    +        """
    +        Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.
    +
    +        :param speech_timestamps: The speech timestamps to do the task on as outputted from the VAD.
    +        """
    +        self._result = speech_timestamps
    +
    +    def get_result(self) -> Tuple[str, list]:
    +        """
    +        Get the result of the task. A tuple of the audio file name and the result.
    +
    +        :returns: The result of the task.
    +        """
    +        return self._audio_file.name, self._result
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        return self.__class__.__name__, {"audio_file": self._audio_file}
    +
    +
    +class SpeechDiarizationTask(BaseTask):
    +    """
    +    A speech diarization task. The task will diarize the VAD speech timestamps into speakers.
    +    """
    +
    +    def __init__(self, audio_file: Path, speaker_labels: List[str]):
    +        """
    +        Initialize the speech diarization task.
    +
    +        :param audio_file:     The audio file assigned to the task.
    +        :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be named
    +                               "speaker_0", "speaker_1", etc.
    +        """
    +        super().__init__(audio_file=audio_file)
    +        self._speaker_labels = speaker_labels
    +
    +    def do_task(self, speech_timestamps: List[List[Dict[str, int]]]):
    +        """
    +        Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.
    +
    +        :param speech_timestamps: The speech timestamps per channel to do the task on as outputted from the VAD.
    +        """
    +        # Get the speaker labels (set default if not given):
    +        speaker_labels = self._speaker_labels or [
    +            f"speaker_{i}" for i in range(len(speech_timestamps))
    +        ]
    +
    +        # Diarize - organize the speech timestamps into a single list of speakers and sort it by start time:
    +        speech_diarization = [
    +            (speech_timestamp["start"], speech_timestamp["end"], speaker_label)
    +            for speaker_label, channel_speech_timestamps in zip(
    +                speaker_labels, speech_timestamps
    +            )
    +            for speech_timestamp in channel_speech_timestamps
    +        ]
    +        speech_diarization.sort()
    +        self._result = speech_diarization
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        task_class, task_kwargs = super().to_tuple()
    +        return task_class, {**task_kwargs, "speaker_labels": self._speaker_labels}
    +
    +
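A small illustration of the merge performed by `SpeechDiarizationTask.do_task` above: per-channel VAD timestamps are flattened into `(start, end, speaker)` tuples sorted by start time. The snippet assumes the classes above are in scope; the file name and timestamps are made up for the example.

```python
from pathlib import Path

task = SpeechDiarizationTask(audio_file=Path("call.wav"), speaker_labels=["agent", "client"])
task.do_task(
    speech_timestamps=[
        [{"start": 0.0, "end": 1.0}, {"start": 2.0, "end": 3.0}],  # channel 0
        [{"start": 1.0, "end": 2.0}],                              # channel 1
    ]
)
print(task.get_result())
# ("call.wav", [(0.0, 1.0, "agent"), (1.0, 2.0, "client"), (2.0, 3.0, "agent")])
```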
    +class TaskCreator:
    +    """
    +    A task creator to create different tasks to run after the VAD.
    +    """
    +
    +    #: A map from task class name to task class to use in `from_tuple`:
    +    _MAP = {
    +        BaseTask.__name__: BaseTask,
    +        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
    +    }
    +
    +    def __init__(self, task_type: Type[BaseTask], task_kwargs: dict = None):
    +        """
    +        Initialize the task creator.
    +        :param task_type: The task type - a `BaseTask` subclass.
    +        :param task_kwargs: Additional keyword arguments to pass to the to be created tasks.
    +        """
    +        self._task_type = task_type
    +        self._task_kwargs = task_kwargs or {}
    +
    +    def create_task(self, audio_file: Path) -> BaseTask:
    +        """
    +        Create a task with the given audio file.
    +
    +        :param audio_file: The audio file to assign to the task.
    +
    +        :returns: The created task.
    +        """
    +        return self._task_type(audio_file=audio_file, **self._task_kwargs)
    +
    +    @classmethod
    +    def from_tuple(cls, task_tuple: Tuple[str, dict]) -> BaseTask:
    +        """
    +        Create a task from a tuple of the audio file name and the task kwargs.
    +
    +        :param task_tuple: The task tuple to create the task from.
    +
    +        :returns: The created task.
    +        """
    +        task_class, task_kwargs = task_tuple
    +        return cls._MAP[task_class](**task_kwargs)
    +
    +
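The tuple round-trip below is what the multiprocessing path relies on: a task is serialized with `to_tuple()` so it can travel through a `Queue`, then rebuilt on the worker side with `TaskCreator.from_tuple()`. A sketch with the classes above in scope; `audio.wav` is a placeholder path.

```python
from pathlib import Path

creator = TaskCreator(task_type=SpeechDiarizationTask, task_kwargs={"speaker_labels": None})
task_tuple = creator.create_task(audio_file=Path("audio.wav")).to_tuple()
# -> ("SpeechDiarizationTask", {"audio_file": Path("audio.wav"), "speaker_labels": None})
rebuilt = TaskCreator.from_tuple(task_tuple=task_tuple)
print(type(rebuilt).__name__, rebuilt.audio_file)
```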
    +class VoiceActivityDetector:
    +    """
    +    A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad.
    +    """
    +
    +    def __init__(
    +        self,
    +        # Model loading kwargs:
    +        use_onnx: bool = True,
    +        force_onnx_cpu: bool = True,
    +        # Detection kwargs:
    +        threshold: float = 0.5,
    +        sampling_rate: int = 16_000,
    +        min_speech_duration_ms: int = 250,
    +        max_speech_duration_s: float = float("inf"),
    +        min_silence_duration_ms: int = 100,
    +        window_size_samples: int = 512,
    +        speech_pad_ms: int = 30,
    +        return_seconds: bool = False,
    +        per_channel: bool = False,
    +    ):
    +        """
    +        Initialize the voice activity detector.
    +
    +        :param use_onnx:                Whether to use ONNX for inference. Default is True.
    +        :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    +        :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
    +                                        probabilities ABOVE this value are considered as SPEECH. It is better to tune
    +                                        this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
    +                                        most datasets.
    +        :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    +        :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +        :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
    +                                        `max_speech_duration_s` will be split at the timestamp of the last silence that
    +                                        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise,
    +                                        they will be split aggressively just before max_speech_duration_s.
    +        :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before
    +                                        separating it.
    +        :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
    +                                        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
    +                                        sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
    +                                        these may affect model performance!
    +        :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms on each side.
    +        :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
    +                                        samples (default - False).
    +        :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD
    +                                        on each channel separately and return a list of timestamps per channel.
    +        """
    +        # Store configurations:
    +        self._use_onnx = use_onnx
    +        self._force_onnx_cpu = force_onnx_cpu
    +        self._threshold = threshold
    +        self._sampling_rate = sampling_rate
    +        self._min_speech_duration_ms = min_speech_duration_ms
    +        self._max_speech_duration_s = max_speech_duration_s
    +        self._min_silence_duration_ms = min_silence_duration_ms
    +        self._window_size_samples = window_size_samples
    +        self._speech_pad_ms = speech_pad_ms
    +        self._return_seconds = return_seconds
    +        self._per_channel = per_channel
    +
    +        # Prepare the model variables
    +        self._model: torch.nn.Module = None
    +        self._get_speech_timestamps: FunctionType = None
    +
    +    def load(self, force_reload: bool = True):
    +        """
    +        Load the VAD model.
    +
    +        :param force_reload: Whether to force reload the model even if it was already loaded. Default is True.
    +        """
    +        model, utils = torch.hub.load(
    +            repo_or_dir="snakers4/silero-vad",
    +            model="silero_vad",
    +            force_reload=force_reload,
    +            onnx=self._use_onnx,
    +            force_onnx_cpu=self._force_onnx_cpu,
    +        )
    +        self._model = model
    +        (
    +            self._get_speech_timestamps,
    +            _,  # save_audio,
    +            _,  # read_audio,
    +            _,  # VADIterator,
    +            _,  # collect_chunks
    +        ) = utils
    +
    +    def detect_voice(
    +        self,
    +        audio_file: Path,
    +    ) -> Union[List[Dict[str, int]], List[List[Dict[str, int]]]]:
    +        """
    +        Infer the audio through the VAD model and return the speech timestamps.
    +
    +        :param audio_file: The audio file to infer.
    +
    +        :returns: The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the
    +                 following keys:
    +
    +                 * "start": The start sample index of the speech in the audio.
    +                 * "end":   The end sample index of the speech in the audio.
    +
    +                 If `per_channel` is True, a list of timestamps per channel will be returned.
    +        """
    +        # Cast to a numpy array:
    +        audio = self._read_audio(audio_file)
    +
    +        # Detect speech:
    +        if not self._per_channel:
    +            return self._get_speech_timestamps(
    +                audio,
    +                self._model,
    +                threshold=self._threshold,
    +                min_speech_duration_ms=self._min_speech_duration_ms,
    +                max_speech_duration_s=self._max_speech_duration_s,
    +                min_silence_duration_ms=self._min_silence_duration_ms,
    +                speech_pad_ms=self._speech_pad_ms,
    +                sampling_rate=self._sampling_rate,
    +                window_size_samples=self._window_size_samples,
    +                return_seconds=self._return_seconds,
    +            )
    +
    +        # Per channel:
    +        speech_timestamps = []
    +        for channel in audio:
    +            speech_timestamps.append(
    +                self._get_speech_timestamps(
    +                    channel,
    +                    self._model,
    +                    threshold=self._threshold,
    +                    min_speech_duration_ms=self._min_speech_duration_ms,
    +                    max_speech_duration_s=self._max_speech_duration_s,
    +                    min_silence_duration_ms=self._min_silence_duration_ms,
    +                    speech_pad_ms=self._speech_pad_ms,
    +                    sampling_rate=self._sampling_rate,
    +                    window_size_samples=self._window_size_samples,
    +                    return_seconds=self._return_seconds,
    +                )
    +            )
    +
    +        return speech_timestamps
    +
    +    def _read_audio(
    +        self,
    +        path: Path,
    +    ) -> torch.Tensor:
    +        """
    +        Read the audio from the given path and return it as a tensor.
    +
    +        :param path: The path to the audio file.
    +
    +        :returns: The audio as a tensor.
    +        """
    +        # Read the audio:
    +        audio, sampling_rate = torchaudio.load(str(path))
    +
    +        # Check if the audio is stereo and if so, convert it to mono (only if not per channel):
    +        if audio.size(0) > 1 and not self._per_channel:
    +            audio = audio.mean(dim=0, keepdim=True)
    +
    +        # Resample the audio if needed:
    +        if sampling_rate != self._sampling_rate:
    +            transform = torchaudio.transforms.Resample(
    +                orig_freq=sampling_rate, new_freq=self._sampling_rate
    +            )
    +            audio = transform(audio)
    +
    +        # Return the audio (squeeze if not per channel):
    +        return audio if self._per_channel else audio.squeeze(0)
    +
    +
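A direct-usage sketch of the wrapper class above, for illustration only: it downloads the silero-vad model from torch.hub on first use, and `speech.wav` is a placeholder file.

```python
from pathlib import Path

vad = VoiceActivityDetector(threshold=0.5, sampling_rate=16_000, return_seconds=True)
vad.load(force_reload=False)   # fetches snakers4/silero-vad via torch.hub
timestamps = vad.detect_voice(audio_file=Path("speech.wav"))
print(timestamps)              # e.g. [{"start": 0.5, "end": 2.1}, ...]
```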
    +#: The value to send into multiprocessing queues to stop the process:
    +_MULTIPROCESSING_STOP_MARK = "STOP"
    +
    +
    +def _multiprocessing_complete_tasks(
    +    vad_init_kwargs: dict, tasks_queue: Queue, results_queue: Queue
    +):
    +    """
    +    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
    +    the given tasks queue receives the stop mark. It is meant to be run as a multiprocessing worker process.
    +
    +    :param vad_init_kwargs: The VAD initialization kwargs.
    +    :param tasks_queue:     A queue to get the tasks from.
    +    :param results_queue:   A queue to put the results in.
    +    """
    +    # Initialize and load the VAD:
    +    vad = VoiceActivityDetector(**vad_init_kwargs)
    +    vad.load(force_reload=False)
    +
    +    # Start listening to the tasks queue:
    +    while True:
    +        # Get the task:
    +        task: Tuple[str, dict] = tasks_queue.get()
    +        if task == _MULTIPROCESSING_STOP_MARK:
    +            break
    +        try:
    +            # Create the task:
    +            task = TaskCreator.from_tuple(task_tuple=task)
    +            # Run the file through the VAD:
    +            speech_timestamps = vad.detect_voice(audio_file=task.audio_file)
    +            # Complete the task:
    +            task.do_task(speech_timestamps=speech_timestamps)
    +            # Build the result:
    +            result = (False, task.get_result())
    +        except Exception as exception:
    +            # Build the error:
    +            result = (True, (task.audio_file.name, str(exception)))
    +        # Collect the result / error:
    +        results_queue.put(result)
    +
    +    # Mark the end of the tasks:
    +    results_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +
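For illustration, the queue protocol this worker expects could be driven by hand roughly as follows: tasks are sent as tuples, a stop mark ends the worker, and the worker echoes the stop mark back when it is done. This is a hedged sketch with the module's names in scope; `file_1.wav` is a placeholder.

```python
from multiprocessing import Process, Queue
from pathlib import Path

tasks_queue, results_queue = Queue(), Queue()
worker = Process(
    target=_multiprocessing_complete_tasks,
    kwargs={
        "vad_init_kwargs": {},   # default VoiceActivityDetector configuration
        "tasks_queue": tasks_queue,
        "results_queue": results_queue,
    },
)
worker.start()
tasks_queue.put(TaskCreator(task_type=BaseTask).create_task(audio_file=Path("file_1.wav")).to_tuple())
tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
while True:
    result = results_queue.get()
    if result == _MULTIPROCESSING_STOP_MARK:
        break
    print(result)                # (is_error, (file_name, payload))
worker.join()
```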
    +# Get the global logger:
    +try:
    +    import mlrun
    +
    +    _LOGGER = mlrun.get_or_create_ctx("silero_vad").logger
    +except ModuleNotFoundError:
    +    _LOGGER = logging.getLogger()
    +
    +
    +def detect_voice(
    +    # Input kwargs:
    +    data_path: Union[str, Path, List[Union[str, Path]]],
    +    # Model loading kwargs:
    +    use_onnx: bool = True,
    +    force_onnx_cpu: bool = True,
    +    # Detection kwargs:
    +    threshold: float = 0.5,
    +    sampling_rate: int = 16_000,
    +    min_speech_duration_ms: int = 250,
    +    max_speech_duration_s: float = float("inf"),
    +    min_silence_duration_ms: int = 100,
    +    window_size_samples: int = 512,
    +    speech_pad_ms: int = 30,
    +    return_seconds: bool = False,
    +    per_channel: bool = False,
    +    # Other kwargs:
    +    use_multiprocessing: int = 0,
    +    verbose: bool = False,
    +):
    +    """
    +    Perform voice activity detection on given audio files using the silero VAD model -
    +    https://github.com/snakers4/silero-vad. The end result is a dictionary with the file names as keys and their
    +    VAD timestamps dictionaries as value.
    +
    +    For example::
    +
    +        {
    +            "file_1.wav": [
    +                {"start": 0, "end": 16000},
    +                {"start": 16000, "end": 32000},
    +                {"start": 32000, "end": 48000},
    +                ...
    +            ],
    +            "file_2.wav": [
    +                {"start": 0, "end": 16000},
    +                {"start": 16000, "end": 32000},
    +                {"start": 32000, "end": 48000},
    +                ...
    +            ],
    +            ...
    +        }
    +
    +
    +    :param data_path:               The path to the audio files to process. Can be a path to a single file, a path to a
    +                                    directory or a list of paths to files.
    +    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    +    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    +    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
    +                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
    +                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
    +                                    most datasets.
    +    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    +    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
    +                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
    +                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
    +                                    be split aggressively just before max_speech_duration_s.
    +    :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before separating
    +                                    it.
    +    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
    +
    +                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
    +                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
    +                                    these may affect model performance!
    +    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms on each side.
    +    :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in samples
    +                                    (default - False).
    +    :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD on
    +                                    each channel separately and return a list of timestamps per channel.
    +    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
    +                                    be used. Default is 0.
    +    :param verbose:                 Verbosity.
    +    """
    +    global _LOGGER
    +
    +    # Get the input audio files to transcribe:
    +    if verbose:
    +        _LOGGER.info("Collecting audio files.")
    +    audio_files = _get_audio_files(data_path=data_path)
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(audio_files)} audio files.")
    +
    +    # Initialize the transcription pipeline:
    +    vad_init_kwargs = {
    +        "use_onnx": use_onnx,
    +        "force_onnx_cpu": force_onnx_cpu,
    +        "threshold": threshold,
    +        "sampling_rate": sampling_rate,
    +        "min_speech_duration_ms": min_speech_duration_ms,
    +        "max_speech_duration_s": max_speech_duration_s,
    +        "min_silence_duration_ms": min_silence_duration_ms,
    +        "window_size_samples": window_size_samples,
    +        "speech_pad_ms": speech_pad_ms,
    +        "return_seconds": return_seconds,
    +        "per_channel": per_channel,
    +    }
    +
    +    # Create the task creator:
    +    task_creator = TaskCreator(task_type=BaseTask)
    +
    +    # Run the voice detection:
    +    if use_multiprocessing:
    +        results = _parallel_run(
    +            n_workers=use_multiprocessing,
    +            audio_files=audio_files,
    +            description="Detecting voice",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +    else:
    +        results = _run(
    +            audio_files=audio_files,
    +            description="Detecting voice",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +
    +    # Process the results:
    +    return _process_results(results=results, verbose=verbose)
    +
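    +# A minimal usage sketch for the voice-detection entry point above. The function name
    +# `detect_voice` and the "./audio" directory are assumed here for illustration; the call
    +# returns the `(successes, errors)` pair produced by `_process_results`:
    +#
    +#     successes, errors = detect_voice(data_path="./audio", verbose=True)
    +#     # successes -> {"file_1.wav": [{"start": ..., "end": ...}, ...], ...}
    +#     # errors    -> {"file_2.wav": "<error message>", ...}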
    +
    +def diarize(
    +    # Input / Output kwargs:
    +    data_path: Union[str, Path, List[Union[str, Path]]],
    +    # Model loading kwargs:
    +    use_onnx: bool = True,
    +    force_onnx_cpu: bool = True,
    +    # Detection kwargs:
    +    threshold: float = 0.5,
    +    sampling_rate: int = 16_000,
    +    min_speech_duration_ms: int = 250,
    +    max_speech_duration_s: float = float("inf"),
    +    min_silence_duration_ms: int = 100,
    +    window_size_samples: int = 512,
    +    speech_pad_ms: int = 30,
    +    # Diarization kwargs:
    +    speaker_labels: List[str] = None,
    +    # Other kwargs:
    +    use_multiprocessing: int = 0,
    +    verbose: bool = False,
    +):
    +    """
    +    Perform speech diarization on given audio files using the silero VAD model - https://github.com/snakers4/silero-vad.
    +    The speech diarization is performed per channel, so each channel in the audio is assumed to belong to a different
    +    speaker. The end result is a dictionary with the file names as keys and their diarizations as values. A
    +    diarization is a list of tuples: (start, end, speaker_label).
    +
    +    For example::
    +
    +        {
    +            "file_1.wav": [
    +                (0.0, 1.0, "speaker_0"),
    +                (1.0, 2.0, "speaker_1"),
    +                (2.0, 3.0, "speaker_0"),
    +                ...
    +            ],
    +            "file_2.wav": [
    +                (0.0, 1.0, "speaker_0"),
    +                (1.0, 2.0, "speaker_1"),
    +                (2.0, 3.0, "speaker_0"),
    +                ...
    +            ],
    +            ...
    +        }
    +
    +
    +    :param data_path:               The path to the audio files to diarize. Can be a path to a single file, a path to a
    +                                    directory or a list of paths to files.
    +    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    +    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    +    :param threshold:               Speech threshold. Silero VAD outputs a speech probability for each audio chunk;
    +                                    probabilities above this value are considered speech. It is best to tune this
    +                                    parameter per dataset, but the default of 0.5 works well for most datasets.
    +    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    +    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are discarded.
    +    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
    +                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
    +                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
    +                                    be split aggressively just before max_speech_duration_s.
    +    :param min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms before
    +                                    separating it from the next chunk.
    +    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
    +
    +                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
    +                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
    +                                    these may affect model performance!
    +    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms on each side.
    +    :param speaker_labels:          The speaker labels to use for the diarization. If not given, the speakers will be
    +                                    named "speaker_0", "speaker_1", etc.
    +    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
    +                                    be used. Default is 0.
    +    :param verbose:                 Verbosity.
    +    """
    +    global _LOGGER
    +
    +    # Get the input audio files to diarize:
    +    if verbose:
    +        _LOGGER.info("Collecting audio files.")
    +    audio_files = _get_audio_files(data_path=data_path)
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(audio_files)} audio files.")
    +
    +    # Collect the VAD initialization keyword arguments:
    +    vad_init_kwargs = {
    +        "use_onnx": use_onnx,
    +        "force_onnx_cpu": force_onnx_cpu,
    +        "threshold": threshold,
    +        "sampling_rate": sampling_rate,
    +        "min_speech_duration_ms": min_speech_duration_ms,
    +        "max_speech_duration_s": max_speech_duration_s,
    +        "min_silence_duration_ms": min_silence_duration_ms,
    +        "window_size_samples": window_size_samples,
    +        "speech_pad_ms": speech_pad_ms,
    +        "return_seconds": True,
    +        "per_channel": True,
    +    }
    +
    +    # Create the task creator:
    +    task_creator = TaskCreator(
    +        task_type=SpeechDiarizationTask, task_kwargs={"speaker_labels": speaker_labels}
    +    )
    +
    +    # Run the diarization:
    +    if use_multiprocessing:
    +        results = _parallel_run(
    +            n_workers=use_multiprocessing,
    +            audio_files=audio_files,
    +            description="Diarizing",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +    else:
    +        results = _run(
    +            audio_files=audio_files,
    +            description="Diarizing",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +
    +    # Process the results:
    +    return _process_results(results=results, verbose=verbose)
    +
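    +# A minimal usage sketch for `diarize` (the paths and speaker labels below are illustrative):
    +#
    +#     successes, errors = diarize(
    +#         data_path="./calls",
    +#         speaker_labels=["agent", "client"],
    +#         verbose=True,
    +#     )
    +#     # successes -> {"call_1.wav": [(0.0, 1.8, "agent"), (2.0, 3.5, "client"), ...], ...}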
    +
    +def _get_audio_files(
    +    data_path: Union[Path, str, list],
    +) -> List[Path]:
    +    """
    +    Get the audio files from the data path. If a path to a directory is given, all files in the directory will be
    +    collected.
    +
    +    :param data_path: The data path to collect the audio files from.
    +
    +    :returns: The audio files list.
    +    """
    +    # Check if given a list of paths:
    +    if isinstance(data_path, list):
    +        audio_files = []
    +        for path in data_path:
    +            audio_files.extend(_get_audio_files(data_path=path))
    +        return audio_files
    +
    +    # If given a single string path, cast it to a `pathlib.Path`:
    +    if isinstance(data_path, str):
    +        data_path = Path(data_path).absolute()
    +
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        audio_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        audio_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
    +            f"file. Given: {str(data_path)} "
    +        )
    +
    +    return audio_files
    +
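    +# For illustration, `_get_audio_files` accepts any mix of the supported inputs, e.g.
    +#     _get_audio_files(["recording.wav", "/data/meetings", Path("extra.mp3")])
    +# and returns a flat list of `Path` objects (directories are expanded to all files inside them).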
    +
    +def _run(
    +    audio_files: List[Path],
    +    description: str,
    +    vad_init_kwargs: dict,
    +    task_creator: TaskCreator,
    +    verbose: bool,
    +) -> List[Tuple[bool, Tuple[str, list]]]:
    +    """
    +    Load a VAD and use it to complete the tasks that will be created on the provided files using the given task creator.
    +
    +    :param audio_files:     The audio files to use.
    +    :param description:     The description to use for the progress bar.
    +    :param vad_init_kwargs: The VAD initialization keyword arguments.
    +    :param task_creator:    The task creator to use to create the tasks.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results.
    +    """
    +    # Load the VAD:
    +    vad = VoiceActivityDetector(**vad_init_kwargs)
    +    if verbose:
    +        _LOGGER.info(f"Loading the VAD model.")
    +    vad.load()
    +    if verbose:
    +        _LOGGER.info("VAD model loaded.")
    +
    +    # Run the VAD on the audio files and collect the results:
    +    results = []
    +    for audio_file in tqdm(
    +        audio_files,
    +        desc=description,
    +        unit="file",
    +        total=len(audio_files),
    +        disable=not verbose,
    +    ):
    +        try:
    +            # Create the task:
    +            task = task_creator.create_task(audio_file=audio_file)
    +            # Run the file through the VAD:
    +            speech_timestamps = vad.detect_voice(audio_file=audio_file)
    +            # Complete the task:
    +            task.do_task(speech_timestamps=speech_timestamps)
    +            # Collect the result:
    +            results.append((False, task.get_result()))
    +        except Exception as exception:
    +            # Collect the error:
    +            results.append((True, (audio_file.name, str(exception))))
    +
    +    return results
    +
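    +# For illustration, each element collected by `_run` above is either
    +#     (False, ("file_1.wav", <task result>))       on success, or
    +#     (True,  ("file_2.wav", "<error message>"))   on failure.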
    +
    +def _parallel_run(
    +    n_workers: int,
    +    audio_files: List[Path],
    +    description: str,
    +    vad_init_kwargs: dict,
    +    task_creator: TaskCreator,
    +    verbose: bool,
    +) -> List[Tuple[bool, Tuple[str, list]]]:
    +    """
    +    Run multiple VAD workers with multiprocessing to complete the tasks that will be created on the provided files using
    +    the given task creator.
    +
    +    :param n_workers:       The number of workers to use.
    +    :param audio_files:     The audio files to use.
    +    :param description:     The description to use for the progress bar.
    +    :param vad_init_kwargs: The VAD initialization keyword arguments.
    +    :param task_creator:    The task creator to use to create the tasks.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results.
    +    """
    +    # Load the VAD once here (so the model is downloaded); each worker process will load its own copy later:
    +    if verbose:
    +        _LOGGER.info(f"Loading the VAD model.")
    +    vad = VoiceActivityDetector(**vad_init_kwargs)
    +    vad.load()
    +    if verbose:
    +        _LOGGER.info("VAD model loaded.")
    +
    +    # Check the number of workers:
    +    if n_workers > len(audio_files):
    +        _LOGGER.warning(
    +            f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). "
    +            f"Setting the number of workers to {len(audio_files)}."
    +        )
    +        n_workers = len(audio_files)
    +
    +    # Initialize the multiprocessing queues:
    +    tasks_queue = Queue()
    +    results_queue = Queue()
    +
    +    # Initialize the multiprocessing processes:
    +    task_completion_processes = [
    +        Process(
    +            target=_multiprocessing_complete_tasks,
    +            kwargs={
    +                "vad_init_kwargs": vad_init_kwargs,
    +                "tasks_queue": tasks_queue,
    +                "results_queue": results_queue,
    +            },
    +        )
    +        for _ in range(n_workers)
    +    ]
    +
    +    # Start the multiprocessing processes:
    +    for p in task_completion_processes:
    +        p.start()
    +
    +    # Put the tasks in the queue:
    +    for audio_file in audio_files:
    +        tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple())
    +
    +    # Put the stop marks in the queue:
    +    for _ in range(n_workers):
    +        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +    # Collect the results:
    +    results = []
    +    stop_marks_counter = 0
    +    with tqdm(
    +        desc=description,
    +        unit="file",
    +        total=len(audio_files),
    +        disable=not verbose,
    +    ) as progressbar:
    +        while True:
    +            # Get a result from the queue:
    +            result: Tuple[bool, Tuple[str, list]] = results_queue.get()
    +            if result == _MULTIPROCESSING_STOP_MARK:
    +                stop_marks_counter += 1
    +                if stop_marks_counter == n_workers:
    +                    break
    +            else:
    +                # Collect the result:
    +                results.append(result)
    +                progressbar.update(1)
    +
    +    # Wait for the processes to finish:
    +    for p in task_completion_processes:
    +        p.join()
    +
    +    return results
    +
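    +# Sketch of the queue protocol used above (the worker target `_multiprocessing_complete_tasks`
    +# is defined elsewhere in this module; its exact body is assumed here): each worker builds its
    +# own VAD from `vad_init_kwargs`, pops task tuples from `tasks_queue` until it reads
    +# `_MULTIPROCESSING_STOP_MARK`, pushes `(is_error, payload)` tuples onto `results_queue`, and
    +# finally pushes one stop mark of its own, which is what the stop-mark counter above waits for.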
    +
    +def _process_results(
    +    results: List[Tuple[bool, Tuple[str, list]]], verbose: bool
    +) -> Tuple[dict, dict]:
    +    """
    +    Process the results of the tasks.
    +
    +    :param results: The results to process.
    +    :param verbose: Verbosity.
    +
    +    :returns: The processed results as a tuple of successes and errors.
    +    """
    +    if verbose:
    +        _LOGGER.info("Summarizing the results.")
    +    successes = {}
    +    errors = {}
    +    for is_error, result in results:
    +        if is_error:
    +            errors[result[0]] = result[1]
    +        else:
    +            successes[result[0]] = result[1]
    +    if verbose:
    +        _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n")
    +
    +    return successes, errors
    +
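    +# For illustration, a `results` list such as
    +#     [(False, ("a.wav", [...])), (True, ("b.wav", "file corrupted"))]
    +# is summarized by `_process_results` into
    +#     successes == {"a.wav": [...]}   and   errors == {"b.wav": "file corrupted"}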
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/latest/src/function.yaml b/functions/master/silero_vad/latest/src/function.yaml index 1e6a5a89..8ec121a6 100644 --- a/functions/master/silero_vad/latest/src/function.yaml +++ b/functions/master/silero_vad/latest/src/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: silero-vad tag: '' - hash: 61b7a70c167b7819481fdabf9350fc6fa344d2f5 + hash: 59336f808643a74f3a2c5d506977387010427208 project: '' labels: author: guyl diff --git a/functions/master/silero_vad/latest/src/item.yaml b/functions/master/silero_vad/latest/src/item.yaml index b9dac63c..9ce9a5d2 100644 --- a/functions/master/silero_vad/latest/src/item.yaml +++ b/functions/master/silero_vad/latest/src/item.yaml @@ -27,4 +27,4 @@ spec: - tqdm - onnxruntime url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/functions/master/silero_vad/latest/static/function.html b/functions/master/silero_vad/latest/static/function.html index 0f90fdfc..3fd4119d 100644 --- a/functions/master/silero_vad/latest/static/function.html +++ b/functions/master/silero_vad/latest/static/function.html @@ -19,14 +19,14 @@ metadata: name: silero-vad tag: '' - hash: 61b7a70c167b7819481fdabf9350fc6fa344d2f5 + hash: 59336f808643a74f3a2c5d506977387010427208 project: '' labels: author: guyl categories: - deep-learning - - PyTorch - - Audio + - pytorch + - audio spec: command: '' args: [] diff --git a/functions/master/silero_vad/latest/static/item.html b/functions/master/silero_vad/latest/static/item.html index 5c26352e..0188aec0 100644 --- a/functions/master/silero_vad/latest/static/item.html +++ b/functions/master/silero_vad/latest/static/item.html @@ -18,8 +18,8 @@ apiVersion: v1 categories: - deep-learning -- PyTorch -- Audio +- pytorch +- audio description: Silero VAD (Voice Activity Detection) functions. 
doc: '' example: silero_vad.ipynb @@ -44,7 +44,7 @@ - tqdm - onnxruntime url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/functions/master/structured_data_generator/1.5.0/src/function.yaml b/functions/master/structured_data_generator/1.5.0/src/function.yaml new file mode 100644 index 00000000..1093e178 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/src/function.yaml @@ -0,0 +1,71 @@ +kind: job +metadata: + name: structured-data-generator + tag: '' + hash: 44bb39f4bc55b38fc7ead1df24cb02bcf7f05bc9 + project: '' + labels: + author: zeevr + categories: + - machine-learning + - data-preparation + - data-generation + - genai +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIElmIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGl
zdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZmllbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cm
UuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo= + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - langchain + - tqdm + entry_points: + generate_data: + name: generate_data + doc: 'Structured data of elements according to the given parameters. + + The data can be later logged as a structured file with MLRun''s `returns` + parameter.' + parameters: + - name: fields + type: list + doc: A list of fields to randomly generate. + - name: amount + type: int + doc: The number of variants to generate. + default: 10 + - name: model_name + type: str + doc: 'The name of the model to use for conversation generation. You should + choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. + Default: ''gpt-3.5-turbo''.' + default: gpt-3.5-turbo + - name: language + type: str + doc: The language to use for the generated conversation text. + default: en + - name: chunk_size + type: int + doc: Number of samples generated at each GPT query. 
+ default: 50 + outputs: + - type: list + lineno: 59 + has_varargs: false + has_kwargs: false + description: GenAI approach of generating structured data according to a given schema + default_handler: generate_data + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/structured_data_generator/1.5.0/src/item.yaml b/functions/master/structured_data_generator/1.5.0/src/item.yaml new file mode 100755 index 00000000..be2a2a94 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/src/item.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +categories: +- machine-learning +- data-preparation +- data-generation +- genai +description: GenAI approach of generating structured data according to a given schema +doc: '' +example: structured_data_generator.ipynb +generationDate: 2023-12-14:10-50 +hidden: false +icon: '' +labels: + author: zeevr +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.6.1 +name: structured_data_generator +platformVersion: 3.5.5 +spec: + filename: structured_data_generator.py + handler: generate_data + image: mlrun/mlrun + kind: job + requirements: + - langchain + - tqdm +url: '' +version: 1.5.0 diff --git a/functions/master/structured_data_generator/1.5.0/src/structured_data_generator.ipynb b/functions/master/structured_data_generator/1.5.0/src/structured_data_generator.ipynb new file mode 100644 index 00000000..12f87cf0 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/src/structured_data_generator.ipynb @@ -0,0 +1,137 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9f7d79e7-8199-4680-919f-5039e8d7a0fe", + "metadata": {}, + "source": [ + "# structured_data_generator example" + ] + }, + { + "cell_type": "markdown", + "id": "4df1c846-2391-49a4-b65f-e7cff69dcdd9", + "metadata": {}, + "source": [ + "Introducing our innovative hub function, structured_data_generator, designed to streamline the process of creating structured files based on a list of fields.
    \n", + "This powerful function takes user-provided fields as input and dynamically generates relevant data, crafting a comprehensive structured file that aligns with the specified themes.
    \n", + "Whether you're working on content creation, testing scenarios, or simply need diverse data for development purposes, structured_data_generator is your go-to tool.
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3913a3b7-48c1-4b5a-8a28-8f2c93fc05d1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "010c16e7-9d0a-42b1-9f09-141b72048885", + "metadata": {}, + "outputs": [], + "source": [ + "# OpenAI tokens:\n", + "OPENAI_API_KEY = \"\"\n", + "OPENAI_API_BASE = \"\"\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n", + "os.environ[\"OPENAI_API_BASE\"] = OPENAI_API_BASE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "324f2120-bcd9-4b61-a418-9c810709b6cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Create mlrun project\n", + "project = mlrun.get_or_create_project(\"structured-data-generator-test\")\n", + "\n", + "# Import the function from the yaml file, once it's in the hub we can import from there \n", + "data_generation = project.set_function(func=\"./structured_data_generator.py\", name=\"structured_data_generator\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "999739d0-c8bf-48c3-8f57-b3c9ffec1a7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the imported function with desired file/s and params\n", + "data_generation_run = data_generation.run(\n", + " handler=\"generate_data\",\n", + " params={\n", + " \"amount\": 5,\n", + " \"model_name\": \"gpt-4\",\n", + " \"language\": \"en\",\n", + " \"fields\": [\"first name\", \"last_name\", \"phone_number: at least 9 digits long\", \"email\", \"client_id: at least 8 digits long, only numbers\"],\n", + " },\n", + " returns=[\n", + " \"clients: file\",\n", + " ],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dde97e2b-8570-4df4-84aa-04c341f455c9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d70ceee-d17b-4901-9e8c-c9eda72f4e57", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3ea3341-80cc-4c87-a914-f2f3ffa1d491", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24983bf4-9fb0-4ebd-97cb-20e87859c22a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/structured_data_generator/1.5.0/src/structured_data_generator.py b/functions/master/structured_data_generator/1.5.0/src/structured_data_generator.py new file mode 100644 index 00000000..34fa36d4 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/src/structured_data_generator.py @@ -0,0 +1,142 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import os + +import tqdm +from langchain.chat_models import ChatOpenAI + + +def _set_openai_secrets() -> bool: + key = "OPENAI_API_KEY" + base = "OPENAI_API_BASE" + # Check if the key is already in the environment variables: + if key in os.environ and base in os.environ: + return True + # Check if mlrun is installed: + try: + import mlrun + except ModuleNotFoundError: + raise EnvironmentError( + f"One or more of the OpenAI required environment variables ('{key}', '{base}') are missing." + f"Please set them as environment variables or install mlrun (`pip install mlrun`)" + f"and set them as project secrets using `projecy.set_secrets`." + ) + + # Check if the key is in the secrets: + context = mlrun.get_or_create_ctx(name="context") + openai_key = context.get_secret(key) + openai_base = context.get_secret(base) + + # If the key is not in the secrets, return False: + if not openai_key: + raise EnvironmentError( + f"Could not find OpenAI API key in the environment variables or secrets," + f" please set it as: {key}." + ) + if not openai_base: + raise EnvironmentError( + f"Could not find OpenAI API base in the environment variables or secrets," + f" please set it as: {base}." + ) + # If the key is in the secrets, set it in the environment variables and return True: + os.environ[key] = openai_key + os.environ[base] = openai_base + return True + + +def generate_data( + fields: list, + amount: int = 10, + model_name: str = "gpt-3.5-turbo", + language: str = "en", + chunk_size: int = 50, +) -> list: + """ + Structured data of elements according to the given parameters. + The data can be later logged as a structured file with MLRun's `returns` parameter. + + :param fields: A list of fields to randomly generate. + :param amount: The number of variants to generate. + :param model_name: The name of the model to use for conversation generation. + You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. + Default: 'gpt-3.5-turbo'. + :param language: The language to use for the generated conversation text. + :param chunk_size: Number of samples generated at each GPT query. + """ + instructions = "" + for field in fields: + # Split the field to key and instruction: + if ":" in field: + key, instruction = field.split(":", 1) + else: + key, instruction = field, "no special instruction" + # Replace spaces with underscores for the key to be used as a json key: + key = key.strip().replace(" ", "_") + instructions += f"* {key}: {instruction}\n" + + # Create the prompt structure: + prompt_structure = ( + f"generate the following values {amount} times randomly, in an order that creates a json table.\n" + f"Use the following keys and instructions (example: 'key: instruction or no special instruction'): " + f"{instructions}.\n" + f"Please generate the values in {language} language. 
\n" + f"Make sure the names of the keys are the same as the given field name.\n" + f"Please return only the json format without any introduction and ending" + ) + + # Set the OpenAI secrets: + _set_openai_secrets() + + # Load the OpenAI model using langchain: + llm = ChatOpenAI(model=model_name) + + # Start generating data: + data = [] + for _ in tqdm.tqdm(range((amount // chunk_size) + 1), desc="Generating"): + # We try to generate the data 3 times, if we fail we raise an error: + for tryout in range(3): + # If the amount wanted is bigger than the chunk size, we generate a chunk of data in the size of the chunk + # and decrease the amount by the chunk size. + # otherwise we generate a chunk of data in the size of the amount: + if amount > chunk_size: + current_chunk_size = chunk_size + amount -= chunk_size + else: + current_chunk_size = amount + + # Create the prompt: + prompt = prompt_structure.format( + amount=current_chunk_size, + ) + + # Generate a chunk of data: + chunk_data = llm.predict(text=prompt) + + # Validate the response for correct python `list` structure + chunk_data = chunk_data[chunk_data.find("[") : chunk_data.rfind("]") + 1] + if chunk_data.count("[") != chunk_data.count("]"): + print( + "Failed to get proper json format from model, number of '[' doesn't match number of ']'." + ) + continue + chunk_data = ast.literal_eval(chunk_data) + data += chunk_data + break + if tryout == 3: + raise RuntimeError( + f"Could not generate a proper json format for the given fields, using given model: {model_name}." + f" Hint: Gpt-4 works best for most scenarios." + ) + return data diff --git a/functions/master/structured_data_generator/1.5.0/src/test_structured_data_generator.py b/functions/master/structured_data_generator/1.5.0/src/test_structured_data_generator.py new file mode 100644 index 00000000..3a7a7aa5 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/src/test_structured_data_generator.py @@ -0,0 +1,37 @@ +import os +import mlrun +import pytest + + +@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="no token") +def test_structured_data_generator(): + # Create mlrun project + project = mlrun.get_or_create_project("structured-data-generator-test") + + #Set secrets + # project.set_secrets({"OPENAI_API_KEY": "", "OPENAI_API_BASE": ""}) + + # Import the function from the yaml file, once it's in the hub we can import from there + data_generation = project.set_function(func="structured_data_generator.py", name="structured_data_generator") + + # Run the imported function with desired file/s and params + data_generation_run = data_generation.run( + handler="generate_data", + params={ + "amount": 3, + "model_name": "gpt-4", + "language": "en", + "fields": [ + "first name", + "last_name", + "phone_number: at least 9 digits long", + "email", + "client_id: at least 8 digits long, only numbers" + ], + }, + returns=[ + "clients: file", + ], + local=True, + ) + assert data_generation_run.outputs["clients"] \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.5.0/static/documentation.html b/functions/master/structured_data_generator/1.5.0/static/documentation.html new file mode 100644 index 00000000..e037a5f7 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/static/documentation.html @@ -0,0 +1,241 @@ + + + + + + + +structured_data_generator package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +
    +
    + + + +
    + +
    +
    +
    +
    +
    +
    + + + + +
    +
    + + +
    +
    +
    + +
    +

    structured_data_generator package

    + +
    + +
    +
    +
    +
    +
    +

    structured_data_generator package#

    +
    +

    Submodules#

    +
    +
    +

    structured_data_generator.structured_data_generator module#

    +
    +
    +structured_data_generator.structured_data_generator.generate_data(fields: list, amount: int = 10, model_name: str = 'gpt-3.5-turbo', language: str = 'en', chunk_size: int = 50)list[source]#
    +

    Structured data of elements according to the given parameters. +The data can be later logged as a structured file with MLRun’s returns parameter.

    +
    +
    Parameters
    +
      +
    • fields – A list of fields to randomly generate.

    • +
    • amount – The number of variants to generate.

    • +
    • model_name – The name of the model to use for conversation generation. +You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. +Default: ‘gpt-3.5-turbo’.

    • +
    • language – The language to use for the generated conversation text.

    • +
    • chunk_size – Number of samples generated at each GPT query.

    • +
    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.5.0/static/example.html b/functions/master/structured_data_generator/1.5.0/static/example.html new file mode 100644 index 00000000..c7a44b9d --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/static/example.html @@ -0,0 +1,227 @@ + + + + + + + +structured_data_generator example + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +
    +
    + + + +
    + +
    +
    +
    +
    +
    +
    + + + +
    +
    + +
    +
    +
    +
    +
    + +
    +

    structured_data_generator example

    + +
    +
    +
    +
    +
    +
    +
    +
    +

    structured_data_generator example#

    +

    Introducing our innovative hub function, structured_data_generator, designed to streamline the process of creating structured files based on a list of fields.
    +This powerful function takes user-provided fields as input and dynamically generates relevant data, crafting a comprehensive structured file that aligns with the specified themes.
    +Whether you’re working on content creation, testing scenarios, or simply need diverse data for development purposes, structured_data_generator is your go-to tool.

    +
    +
    +
    import os
    +import mlrun
    +
    +
    +
    +
    +
    +
    +
    # OpenAI tokens:
    +OPENAI_API_KEY = ""
    +OPENAI_API_BASE = ""
    +os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    +os.environ["OPENAI_API_BASE"] = OPENAI_API_BASE
    +
    +
    +
    +
    +
    +
    +
    # Create mlrun project
    +project = mlrun.get_or_create_project("structured-data-generator-test")
    +
    +# Import the function from the yaml file, once it's in the hub we can import from there 
    +data_generation = project.set_function(func="./structured_data_generator.py", name="structured_data_generator")
    +
    +
    +
    +
    +
    +
    +
    # Run the imported function with desired file/s and params
    +data_generation_run = data_generation.run(
    +    handler="generate_data",
    +            params={
    +                "amount": 5,
    +                "model_name": "gpt-4",
    +                "language": "en",
    +                "fields": ["first name", "last_name", "phone_number: at least 9 digits long", "email", "client_id: at least 8 digits long, only numbers"],
    +            },
    +            returns=[
    +                "clients: file",
    +            ],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.5.0/static/function.html b/functions/master/structured_data_generator/1.5.0/static/function.html new file mode 100644 index 00000000..0594bffc --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/static/function.html @@ -0,0 +1,93 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  name: structured-data-generator
    +  tag: ''
    +  hash: 44bb39f4bc55b38fc7ead1df24cb02bcf7f05bc9
    +  project: ''
    +  labels:
    +    author: zeevr
    +  categories:
    +  - machine-learning
    +  - data-preparation
    +  - data-generation
    +  - genai
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIElmIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3
NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZmllbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgI
CAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo=
    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - langchain
    +    - tqdm
    +  entry_points:
    +    generate_data:
    +      name: generate_data
    +      doc: 'Structured data of elements according to the given parameters.
    +
    +        The data can be later logged as a structured file with MLRun''s `returns`
    +        parameter.'
    +      parameters:
    +      - name: fields
    +        type: list
    +        doc: A list of fields to randomly generate.
    +      - name: amount
    +        type: int
    +        doc: The number of variants to generate.
    +        default: 10
    +      - name: model_name
    +        type: str
    +        doc: 'The name of the model to use for conversation generation. You should
    +          choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models.
    +          Default: ''gpt-3.5-turbo''.'
    +        default: gpt-3.5-turbo
    +      - name: language
    +        type: str
    +        doc: The language to use for the generated conversation text.
    +        default: en
    +      - name: chunk_size
    +        type: int
    +        doc: Number of samples generated at each GPT query.
    +        default: 50
    +      outputs:
    +      - type: list
    +      lineno: 59
    +      has_varargs: false
    +      has_kwargs: false
    +  description: GenAI approach of generating structured data according to a given schema
    +  default_handler: generate_data
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.5.0/static/item.html b/functions/master/structured_data_generator/1.5.0/static/item.html new file mode 100644 index 00000000..b5ab71d0 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/static/item.html @@ -0,0 +1,51 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- machine-learning
    +- data-preparation
    +- data-generation
    +- genai
    +description: GenAI approach of generating structured data according to a given schema
    +doc: ''
    +example: structured_data_generator.ipynb
    +generationDate: 2023-12-14:10-50
    +hidden: false
    +icon: ''
    +labels:
    +  author: zeevr
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.6.1
    +name: structured_data_generator
    +platformVersion: 3.5.5
    +spec:
    +  filename: structured_data_generator.py
    +  handler: generate_data
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +  - langchain
    +  - tqdm
    +url: ''
    +version: 1.5.0
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.5.0/static/source.html b/functions/master/structured_data_generator/1.5.0/static/source.html new file mode 100644 index 00000000..646f8844 --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/static/source.html @@ -0,0 +1,164 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import ast
    +import os
    +
    +import tqdm
    +from langchain.chat_models import ChatOpenAI
    +
    +
    +def _set_openai_secrets() -> bool:
    +    key = "OPENAI_API_KEY"
    +    base = "OPENAI_API_BASE"
    +    # Check if the key is already in the environment variables:
    +    if key in os.environ and base in os.environ:
    +        return True
    +    # Check if mlrun is installed:
    +    try:
    +        import mlrun
    +    except ModuleNotFoundError:
    +        raise EnvironmentError(
    +            f"One or more of the OpenAI required environment variables ('{key}', '{base}') are missing."
    +            f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
    +            f"and set them as project secrets using `projecy.set_secrets`."
    +        )
    +
    +    # Check if the key is in the secrets:
    +    context = mlrun.get_or_create_ctx(name="context")
    +    openai_key = context.get_secret(key)
    +    openai_base = context.get_secret(base)
    +
    +    # If the key is not in the secrets, return False:
    +    if not openai_key:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API key in the environment variables or secrets,"
    +            f" please set it as: {key}."
    +        )
    +    if not openai_base:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API base in the environment variables or secrets,"
    +            f" please set it as: {base}."
    +        )
    +    # If the key is in the secrets, set it in the environment variables and return True:
    +    os.environ[key] = openai_key
    +    os.environ[base] = openai_base
    +    return True
    +
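    +# For illustration: if OPENAI_API_KEY and OPENAI_API_BASE are already exported as environment
    +# variables, `_set_openai_secrets` is a no-op; otherwise it looks them up as MLRun secrets via
    +# the run context (e.g. values set beforehand with
    +# `project.set_secrets({"OPENAI_API_KEY": "...", "OPENAI_API_BASE": "..."})`).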
    +
    +def generate_data(
    +    fields: list,
    +    amount: int = 10,
    +    model_name: str = "gpt-3.5-turbo",
    +    language: str = "en",
    +    chunk_size: int = 50,
    +) -> list:
    +    """
    +    Generate structured data elements according to the given parameters.
    +    The data can be later logged as a structured file with MLRun's `returns` parameter.
    +
    +    :param fields: A list of fields to randomly generate.
    +    :param amount: The number of variants to generate.
    +    :param model_name: The name of the model to use for conversation generation.
    +                       You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models.
    +                       Default: 'gpt-3.5-turbo'.
    +    :param language: The language to use for the generated conversation text.
    +    :param chunk_size: Number of samples generated at each GPT query.
    +    """
    +    instructions = ""
    +    for field in fields:
    +        # Split the field to key and instruction:
    +        if ":" in field:
    +            key, instruction = field.split(":", 1)
    +        else:
    +            key, instruction = field, "no special instruction"
    +        # Replace spaces with underscores for the key to be used as a json key:
    +        key = key.strip().replace(" ", "_")
    +        instructions += f"* {key}: {instruction}\n"
    +
    +    # Create the prompt structure:
    +    prompt_structure = (
    +        f"generate the following values {amount} times randomly, in an order that creates a json table.\n"
    +        f"Use the following keys and instructions (example: 'key: instruction or no special instruction'): "
    +        f"{instructions}.\n"
    +        f"Please generate the values in {language} language. \n"
    +        f"Make sure the names of the keys are the same as the given field name.\n"
    +        f"Please return only the json format without any introduction and ending"
    +    )
    +
    +    # Set the OpenAI secrets:
    +    _set_openai_secrets()
    +
    +    # Load the OpenAI model using langchain:
    +    llm = ChatOpenAI(model=model_name)
    +
    +    # Start generating data:
    +    data = []
    +    for _ in tqdm.tqdm(range((amount // chunk_size) + 1), desc="Generating"):
    +        # We try to generate the data 3 times, if we fail we raise an error:
    +        for tryout in range(3):
    +            # If the amount wanted is bigger than the chunk size, we generate a chunk of data in the size of the chunk
    +            # and decrease the amount by the chunk size.
    +            # otherwise we generate a chunk of data in the size of the amount:
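    +            # (For example, amount=120 with chunk_size=50 yields chunks of 50, 50 and 20.)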
    +            if amount > chunk_size:
    +                current_chunk_size = chunk_size
    +                amount -= chunk_size
    +            else:
    +                current_chunk_size = amount
    +
    +            # Create the prompt:
    +            prompt = prompt_structure.format(
    +                amount=current_chunk_size,
    +            )
    +
    +            # Generate a chunk of data:
    +            chunk_data = llm.predict(text=prompt)
    +
    +            # Validate the response for correct python `list` structure
    +            chunk_data = chunk_data[chunk_data.find("[") : chunk_data.rfind("]") + 1]
    +            if chunk_data.count("[") != chunk_data.count("]"):
    +                print(
    +                    "Failed to get proper json format from model, number of '[' doesn't match number of ']'."
    +                )
    +                continue
    +            chunk_data = ast.literal_eval(chunk_data)
    +            data += chunk_data
    +            break
    +        else:
    +            # Only reached when all three attempts failed to produce a parsable list:
    +            raise RuntimeError(
    +                f"Could not generate a proper json format for the given fields, using given model: {model_name}."
    +                f" Hint: GPT-4 works best for most scenarios."
    +            )
    +    return data
    +
    +        
    +    
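For orientation, the following is a minimal usage sketch of the `generate_data` handler listed above, assuming the function is consumed from the MLRun function hub. The credential values, field list, and amounts are illustrative placeholders, not values taken from this repository.

```python
import os
import mlrun

# generate_data reads these from the environment or from MLRun project secrets:
os.environ["OPENAI_API_KEY"] = "<your-openai-key>"           # illustrative placeholder
os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1"   # illustrative placeholder

# Import the function from the hub and run it locally:
fn = mlrun.import_function("hub://structured_data_generator")
run = fn.run(
    handler="generate_data",
    params={
        # Each field is either "key" or "key: instruction":
        "fields": ["first name", "last name", "age: an integer between 18 and 65"],
        "amount": 20,
        "model_name": "gpt-3.5-turbo",
        "language": "en",
    },
    # Log the returned list of dicts as a structured dataset artifact:
    returns=["generated_data: dataset"],
    local=True,
)
```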
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.5.0/static/structured_data_generator.html b/functions/master/structured_data_generator/1.5.0/static/structured_data_generator.html new file mode 100644 index 00000000..e2a89dce --- /dev/null +++ b/functions/master/structured_data_generator/1.5.0/static/structured_data_generator.html @@ -0,0 +1,282 @@ + + + + + + + +structured_data_generator.structured_data_generator + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Source code for structured_data_generator.structured_data_generator

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import ast
    +import os
    +
    +import tqdm
    +from langchain.chat_models import ChatOpenAI
    +
    +
    +def _set_openai_secrets() -> bool:
    +    key = "OPENAI_API_KEY"
    +    base = "OPENAI_API_BASE"
    +    # Check if the key is already in the environment variables:
    +    if key in os.environ and base in os.environ:
    +        return True
    +    # Check if mlrun is installed:
    +    try:
    +        import mlrun
    +    except ModuleNotFoundError:
    +        raise EnvironmentError(
    +            f"One or more of the OpenAI required environment variables ('{key}', '{base}') are missing. "
    +            f"Please set them as environment variables or install mlrun (`pip install mlrun`) "
    +            f"and set them as project secrets using `project.set_secrets`."
    +        )
    +
    +    # Check if the key is in the secrets:
    +    context = mlrun.get_or_create_ctx(name="context")
    +    openai_key = context.get_secret(key)
    +    openai_base = context.get_secret(base)
    +
    +    # If the key or base is missing from the secrets as well, raise an error:
    +    if not openai_key:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API key in the environment variables or secrets,"
    +            f" please set it as: {key}."
    +        )
    +    if not openai_base:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API base in the environment variables or secrets,"
    +            f" please set it as: {base}."
    +        )
    +    # If the key is in the secrets, set it in the environment variables and return True:
    +    os.environ[key] = openai_key
    +    os.environ[base] = openai_base
    +    return True
    +
    +
    +
    [docs]def generate_data( + fields: list, + amount: int = 10, + model_name: str = "gpt-3.5-turbo", + language: str = "en", + chunk_size: int = 50, +) -> list: + """ + Structured data of elements according to the given parameters. + The data can be later logged as a structured file with MLRun's `returns` parameter. + + :param fields: A list of fields to randomly generate. + :param amount: The number of variants to generate. + :param model_name: The name of the model to use for conversation generation. + You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. + Default: 'gpt-3.5-turbo'. + :param language: The language to use for the generated conversation text. + :param chunk_size: Number of samples generated at each GPT query. + """ + instructions = "" + for field in fields: + # Split the field to key and instruction: + if ":" in field: + key, instruction = field.split(":", 1) + else: + key, instruction = field, "no special instruction" + # Replace spaces with underscores for the key to be used as a json key: + key = key.strip().replace(" ", "_") + instructions += f"* {key}: {instruction}\n" + + # Create the prompt structure: + prompt_structure = ( + f"generate the following values {amount} times randomly, in an order that creates a json table.\n" + f"Use the following keys and instructions (example: 'key: instruction or no special instruction'): " + f"{instructions}.\n" + f"Please generate the values in {language} language. \n" + f"Make sure the names of the keys are the same as the given field name.\n" + f"Please return only the json format without any introduction and ending" + ) + + # Set the OpenAI secrets: + _set_openai_secrets() + + # Load the OpenAI model using langchain: + llm = ChatOpenAI(model=model_name) + + # Start generating data: + data = [] + for _ in tqdm.tqdm(range((amount // chunk_size) + 1), desc="Generating"): + # We try to generate the data 3 times, if we fail we raise an error: + for tryout in range(3): + # If the amount wanted is bigger than the chunk size, we generate a chunk of data in the size of the chunk + # and decrease the amount by the chunk size. + # otherwise we generate a chunk of data in the size of the amount: + if amount > chunk_size: + current_chunk_size = chunk_size + amount -= chunk_size + else: + current_chunk_size = amount + + # Create the prompt: + prompt = prompt_structure.format( + amount=current_chunk_size, + ) + + # Generate a chunk of data: + chunk_data = llm.predict(text=prompt) + + # Validate the response for correct python `list` structure + chunk_data = chunk_data[chunk_data.find("[") : chunk_data.rfind("]") + 1] + if chunk_data.count("[") != chunk_data.count("]"): + print( + "Failed to get proper json format from model, number of '[' doesn't match number of ']'." + ) + continue + chunk_data = ast.literal_eval(chunk_data) + data += chunk_data + break + if tryout == 3: + raise RuntimeError( + f"Could not generate a proper json format for the given fields, using given model: {model_name}." + f" Hint: Gpt-4 works best for most scenarios." + ) + return data
    + + + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/latest/src/function.yaml b/functions/master/structured_data_generator/latest/src/function.yaml index 0e046122..1093e178 100644 --- a/functions/master/structured_data_generator/latest/src/function.yaml +++ b/functions/master/structured_data_generator/latest/src/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: structured-data-generator tag: '' - hash: ac969f46aae91804024ea736856267c26578864b + hash: 44bb39f4bc55b38fc7ead1df24cb02bcf7f05bc9 project: '' labels: author: zeevr diff --git a/functions/master/structured_data_generator/latest/src/item.yaml b/functions/master/structured_data_generator/latest/src/item.yaml index 318d93d8..be2a2a94 100755 --- a/functions/master/structured_data_generator/latest/src/item.yaml +++ b/functions/master/structured_data_generator/latest/src/item.yaml @@ -26,4 +26,4 @@ spec: - langchain - tqdm url: '' -version: 1.4.0 +version: 1.5.0 diff --git a/functions/master/structured_data_generator/latest/static/function.html b/functions/master/structured_data_generator/latest/static/function.html index ef5d06b9..0594bffc 100644 --- a/functions/master/structured_data_generator/latest/static/function.html +++ b/functions/master/structured_data_generator/latest/static/function.html @@ -19,7 +19,7 @@ metadata: name: structured-data-generator tag: '' - hash: ac969f46aae91804024ea736856267c26578864b + hash: 44bb39f4bc55b38fc7ead1df24cb02bcf7f05bc9 project: '' labels: author: zeevr @@ -27,7 +27,7 @@ - machine-learning - data-preparation - data-generation - - GenAI + - genai spec: command: '' args: [] diff --git a/functions/master/structured_data_generator/latest/static/item.html b/functions/master/structured_data_generator/latest/static/item.html index c39576ca..b5ab71d0 100644 --- a/functions/master/structured_data_generator/latest/static/item.html +++ b/functions/master/structured_data_generator/latest/static/item.html @@ -20,7 +20,7 @@ - machine-learning - data-preparation - data-generation -- GenAI +- genai description: GenAI approach of generating structured data according to a given schema doc: '' example: structured_data_generator.ipynb @@ -43,7 +43,7 @@ - langchain - tqdm url: '' -version: 1.4.0 +version: 1.5.0 diff --git a/functions/master/tags.json b/functions/master/tags.json index b88850a7..00f59c75 100644 --- a/functions/master/tags.json +++ b/functions/master/tags.json @@ -1 +1 @@ -{"kind": ["nuclio", "job", "dask", "serving", "nuclio:serving"], "categories": ["data-generation", "genai", "pytorch", "feature-store", "data-analysis", "utils", "machine-learning", "model-testing", "NLP", "data-preparation", "model-serving", "etl", "model-training", "deep-learning", "audio", "data-validation", "monitoring", "huggingface"]} \ No newline at end of file +{"categories": ["feature-store", "deep-learning", "data-analysis", "monitoring", "model-training", "data-generation", "etl", "data-preparation", "genai", "audio", "data-validation", "huggingface", "model-serving", "machine-learning", "NLP", "model-testing", "utils", "pytorch"], "kind": ["serving", "nuclio", "nuclio:serving", "dask", "job"]} \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/src/data/test_data.txt b/functions/master/text_to_audio_generator/1.2.0/src/data/test_data.txt new file mode 100644 index 00000000..e60176e3 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/data/test_data.txt @@ -0,0 +1,2 @@ +Client: I love MLRun! +Agent: Me too! 
diff --git a/functions/master/text_to_audio_generator/1.2.0/src/function.yaml b/functions/master/text_to_audio_generator/1.2.0/src/function.yaml new file mode 100644 index 00000000..88ef9cb8 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/function.yaml @@ -0,0 +1,95 @@ +kind: job +metadata: + name: text-to-audio-generator + tag: '' + hash: 89fcaf3fab53e7b7fbba448a5e65c253d7fa66ed + project: '' + labels: + author: yonatans + categories: + - data-preparation + - machine-learning + - pytorch +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pathlib
import random
import tempfile
from typing import Dict, List, Optional, Tuple, Union

import bark
import numpy as np
import pandas as pd
import torch
import torchaudio
import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def generate_multi_speakers_audio(
    data_path: str,
    speakers: Union[List[str], Dict[str, int]],
    available_voices: List[str],
    output_directory: str = None,
    use_gpu: bool = True,
    use_small_models: bool = False,
    offload_cpu: bool = False,
    sample_rate: int = 16000,
    file_format: str = "wav",
    verbose: bool = True,
    bits_per_sample: Optional[int] = None,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Generate audio files from text files.

    :param data_path:           Path to the text file or directory containing the text files to generate audio from.
    :param speakers:            List / Dict of speakers to generate audio for.
                                If a list is given, the speakers will be assigned to channels in the order given.
                                If dictionary, the keys will be the speakers and the values will be the channels.
    :param available_voices:    List of available voices to use for the generation.
                        See here for the available voices:
                        https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
    :param output_directory:    Path to the directory to save the generated audio files to.
    :param use_gpu:             Whether to use the GPU for the generation.
    :param use_small_models:    Whether to use the small models for the generation.
    :param offload_cpu:         To reduce the memory footprint, the models can be offloaded to the CPU after loading.
    :param sample_rate:         The sampling rate of the generated audio.
    :param file_format:         The format of the generated audio files.
    :param verbose:             Whether to print the progress of the generation.
    :param bits_per_sample:     Changes the bit depth for the supported formats.
                                Supported only in "wav" or "flac" formats.

    :returns:                   A tuple of:
                                - The output directory path.
                                - The generated audio files dataframe.
                                - The errors dictionary.
    """

    global _LOGGER
    _LOGGER = _get_logger()
    # Get the input text files to turn to audio:
    data_path = pathlib.Path(data_path).absolute()
    text_files = _get_text_files(data_path=data_path)

    # Load the bark models according to the given configurations:
    bark.preload_models(
        text_use_gpu=use_gpu,
        text_use_small=use_small_models,
        coarse_use_gpu=use_gpu,
        coarse_use_small=use_small_models,
        fine_use_gpu=use_gpu,
        fine_use_small=use_small_models,
        codec_use_gpu=use_gpu,
        force_reload=offload_cpu,
    )

    # Check for per channel generation:
    if isinstance(speakers, dict):
        speaker_per_channel = True
        # Sort the given speakers by channels:
        speakers = {
            speaker: channel
            for speaker, channel in sorted(speakers.items(), key=lambda item: item[1])
        }
    else:
        speaker_per_channel = False

    # Prepare the resampling module:
    resampler = torchaudio.transforms.Resample(
        orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
    )

    # Prepare the gap between each speaker:
    gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE))

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    if output_directory is None:
        output_directory = tempfile.mkdtemp()
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(exist_ok=True, parents=True)

    # Start generating audio:
    # Go over the audio files and transcribe:
    for text_file in tqdm.tqdm(
        text_files, desc="Generating", unit="file", disable=not verbose
    ):

        try:
            # Randomize voices for each speaker:
            chosen_voices = {}
            available_voices_copy = available_voices.copy()
            for speaker in speakers:
                voice = random.choice(available_voices_copy)
                chosen_voices[speaker] = voice
                available_voices_copy.remove(voice)
            # Read text:
            with open(text_file, "r") as fp:
                text = fp.read()
            # Prepare a holder for all the generated pieces (if per channel each speaker will have its own):
            audio_pieces = (
                {speaker: [] for speaker in speakers}
                if speaker_per_channel
                else {"all": []}
            )

            # Generate audio per line:
            for line in text.splitlines():
                # Validate line is in correct speaker format:

                if ": " not in line:
                    if verbose:
                        _LOGGER.warning(f"Skipping line: {line}")
                    continue
                # Split line to speaker and his words:
                current_speaker, sentences = line.split(": ", 1)
                # Validate speaker is known:
                if current_speaker not in speakers:
                    raise ValueError(
                        f"Unknown speaker: {current_speaker}. Given speakers are: {speakers}"
                    )
                for sentence in _split_line(line=sentences):
                    # Generate words audio:
                    audio = bark.generate_audio(
                        sentence,
                        history_prompt=chosen_voices[current_speaker],
                        silent=True,
                    )
                    if speaker_per_channel:
                        silence = np.zeros_like(audio)
                        for speaker in audio_pieces.keys():
                            if speaker == current_speaker:
                                audio_pieces[speaker] += [audio, gap_between_speakers]
                            else:
                                audio_pieces[speaker] += [silence, gap_between_speakers]
                    else:
                        audio_pieces["all"] += [audio, gap_between_speakers]
            # Construct a single audio array from all the pieces and channels:

            audio = np.vstack(
                [np.concatenate(audio_pieces[speaker]) for speaker in speakers]
            ).astype(dtype=np.float32)
            # Resample:
            audio = torch.from_numpy(audio)
            audio = resampler(audio)
            # Save to audio file:
            audio_file = output_directory / f"{text_file.stem}.{file_format}"

            torchaudio.save(
                uri=str(audio_file),
                src=audio,
                sample_rate=sample_rate,
                format=file_format,
                bits_per_sample=bits_per_sample,
            )

            # Collect to the successes:
            successes.append([text_file.name, audio_file.name])
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            print(exception)
            errors[text_file.name] = str(exception)

    # Construct the dataframe of successfully generated audio files:
    successes = pd.DataFrame(
        successes,
        columns=["text_file", "audio_file"],
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _split_line(line: str, max_length: int = 250) -> List[str]:
    if len(line) < max_length:
        return [line]

    sentences = [
        f"{sentence.strip()}." for sentence in line.split(".") if sentence.strip()
    ]

    splits = []
    current_length = len(sentences[0])
    split = sentences[0]
    for sentence in sentences[1:]:
        if current_length + len(sentence) > max_length:
            splits.append(split)
            split = sentence
            current_length = len(sentence)
        else:
            current_length += len(sentence)
            split += " " + sentence
    if split:
        splits.append(split)

    return splits


def _get_logger():
    global _LOGGER
    try:
        import mlrun
        # Check if MLRun is available:
        context = mlrun.get_or_create_ctx(name="mlrun")
        return context.logger
    except ModuleNotFoundError:
        return _LOGGER
 + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - bark + - torchaudio + entry_points: + generate_multi_speakers_audio: + name: generate_multi_speakers_audio + doc: Generate audio files from text files. + parameters: + - name: data_path + type: str + doc: Path to the text file or directory containing the text files to generate + audio from. + - name: speakers + type: Union[List[str], Dict[str, int]] + doc: List / Dict of speakers to generate audio for. If a list is given, the + speakers will be assigned to channels in the order given. If dictionary, + the keys will be the speakers and the values will be the channels. + - name: available_voices + type: List[str] + doc: 'List of available voices to use for the generation. See here for the + available voices: https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c' + - name: output_directory + type: str + doc: Path to the directory to save the generated audio files to. + default: null + - name: use_gpu + type: bool + doc: Whether to use the GPU for the generation. + default: true + - name: use_small_models + type: bool + doc: Whether to use the small models for the generation. + default: false + - name: offload_cpu + type: bool + doc: To reduce the memory footprint, the models can be offloaded to the CPU + after loading. + default: false + - name: sample_rate + type: int + doc: The sampling rate of the generated audio. + default: 16000 + - name: file_format + type: str + doc: The format of the generated audio files. + default: wav + - name: verbose + type: bool + doc: Whether to print the progress of the generation. + default: true + - name: bits_per_sample + type: Optional[int] + doc: Changes the bit depth for the supported formats. Supported only in "wav" + or "flac" formats. + default: null + outputs: + - doc: 'A tuple of: - The output directory path. - The generated audio files + dataframe. - The errors dictionary.' 
+ type: Tuple[str, pd.DataFrame, dict] + lineno: 31 + has_varargs: false + has_kwargs: false + description: Generate audio file from text using different speakers + default_handler: generate_multi_speakers_audio + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/text_to_audio_generator/1.2.0/src/item.yaml b/functions/master/text_to_audio_generator/1.2.0/src/item.yaml new file mode 100644 index 00000000..efa8afc9 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/item.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +categories: +- data-preparation +- machine-learning +- pytorch +description: Generate audio file from text using different speakers +doc: '' +example: text_to_audio_generator.ipynb +generationDate: 2023-12-03:15-30 +hidden: false +icon: '' +labels: + author: yonatans +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.5.1 +name: text_to_audio_generator +platformVersion: 3.5.3 +spec: + filename: text_to_audio_generator.py + handler: generate_multi_speakers_audio + image: mlrun/mlrun + kind: job + requirements: + - bark + - torchaudio +url: '' +version: 1.2.0 +test_valid: True diff --git a/functions/master/text_to_audio_generator/1.2.0/src/requirements.txt b/functions/master/text_to_audio_generator/1.2.0/src/requirements.txt new file mode 100644 index 00000000..36f17cd6 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/requirements.txt @@ -0,0 +1,2 @@ +bark +torchaudio>=2.1.0 \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/src/test_text_to_audio_generator.py b/functions/master/text_to_audio_generator/1.2.0/src/test_text_to_audio_generator.py new file mode 100644 index 00000000..87ffe149 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/test_text_to_audio_generator.py @@ -0,0 +1,50 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import mlrun +import tempfile +import pytest + + +@pytest.mark.parametrize("file_format,bits_per_sample", [("wav", 8), ("mp3", None)]) +def test_generate_multi_speakers_audio(file_format, bits_per_sample): + text_to_audio_generator_function = mlrun.import_function("function.yaml") + with tempfile.TemporaryDirectory() as test_directory: + function_run = text_to_audio_generator_function.run( + handler="generate_multi_speakers_audio", + inputs={"data_path": "data/test_data.txt"}, + params={ + "output_directory": test_directory, + "speakers": {"Agent": 0, "Client": 1}, + "available_voices": [ + "v2/en_speaker_0", + "v2/en_speaker_1", + ], + "use_small_models": True, + "use_gpu": False, + "offload_cpu": True, + "file_format": file_format, + "bits_per_sample": bits_per_sample, + }, + local=True, + returns=[ + "audio_files: path", + "audio_files_dataframe: dataset", + "text_to_speech_errors: file", + ], + artifact_path=test_directory, + ) + assert function_run.error == "Run state (completed) is not in error state" + for key in ["audio_files", "audio_files_dataframe", "text_to_speech_errors"]: + assert key in function_run.outputs and function_run.outputs[key] is not None diff --git a/functions/master/text_to_audio_generator/1.2.0/src/text_to_audio_generator.ipynb b/functions/master/text_to_audio_generator/1.2.0/src/text_to_audio_generator.ipynb new file mode 100644 index 00000000..268fe2ef --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/text_to_audio_generator.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cbbc90d3-8179-42fa-8f67-0cfe90546759", + "metadata": {}, + "source": [ + "# Text to audio conversation generator" + ] + }, + { + "cell_type": "markdown", + "id": "cf22790e-9e00-46d6-9e02-13ae46b3baee", + "metadata": {}, + "source": [ + "This function converts the text from a specified text file into speech and saves this as an audio file using the Bark library.
    \n", + "It's designed to facilitate easy generation of speech from written transcripts.\n" + ] + }, + { + "cell_type": "markdown", + "id": "1ed4da8a-6d60-41a3-a4b1-6c7469356ea8", + "metadata": {}, + "source": [ + "## Example Usage:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bb20c4a6-f362-40e6-8f73-9145953959ec", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "import tempfile" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d0786055-34e9-4d15-a8b2-8a736244e9de", + "metadata": {}, + "outputs": [], + "source": [ + "# Import function \n", + "text_to_audio_generator_function = mlrun.import_function(\"hub://text_to_audio_generator\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8f9400bf-ac37-4a52-a0ed-1b0ae399ea82", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-04 14:08:48,769 [info] Storing function: {'name': 'text-to-audio-generator-generate-multi-speakers-audio', 'uid': 'ba017dfc11624de9afb5e148a6678a8b', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + "Generating: 100%|██████████| 1/1 [00:23<00:00, 23.74s/file]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-04 14:10:05,123 [info] Done (1/1)\n", + "Translations summary:\n", + " text_file audio_file\n", + "0 test_data.txt test_data.mp3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    default0Dec 04 14:08:48completedtext-to-audio-generator-generate-multi-speakers-audio
    v3io_user=yonis
    kind=local
    owner=yonis
    host=jupyter-yonis-7c9bdbfb4d-9g2p2
    data_path
    output_directory=./out
    speakers={'Agent': 0, 'Client': 1}
    available_voices=['v2/en_speaker_0', 'v2/en_speaker_1']
    use_small_models=True
    use_gpu=False
    offload_cpu=True
    file_format=mp3
    audio_files
    audio_files_dataframe
    text_to_speech_errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-04 14:10:05,486 [info] Run execution finished: {'status': 'completed', 'name': 'text-to-audio-generator-generate-multi-speakers-audio'}\n" + ] + } + ], + "source": [ + "# Run the function with desired text files\n", + "function_run = text_to_audio_generator_function.run(\n", + " handler=\"generate_multi_speakers_audio\",\n", + " inputs={\"data_path\": \"./test_data.txt\"},\n", + " params={\n", + " \"output_directory\": \"./out\",\n", + " \"speakers\": {\"Agent\": 0, \"Client\": 1},\n", + " \"available_voices\": [\n", + " \"v2/en_speaker_0\",\n", + " \"v2/en_speaker_1\",\n", + " ],\n", + " \"use_small_models\": True,\n", + " \"use_gpu\": False,\n", + " \"offload_cpu\": True,\n", + " \"file_format\": \"mp3\",\n", + " # \"bits_per_sample\": 8,\n", + " },\n", + " local=True,\n", + " returns=[\n", + " \"audio_files: path\",\n", + " \"audio_files_dataframe: dataset\",\n", + " \"text_to_speech_errors: file\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "014420d1-a03c-47de-ba96-d631d0b4ee10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    text_fileaudio_file
    0test_data.txttest_data.mp3
    \n", + "
    " + ], + "text/plain": [ + " text_file audio_file\n", + "0 test_data.txt test_data.mp3" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "function_run.artifact(\"audio_files_dataframe\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4d23e07e-788e-4b4e-a517-d921f6a2a24c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "\n", + "IPython.display.Audio(\"./out/test_data.mp3\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/text_to_audio_generator/1.2.0/src/text_to_audio_generator.py b/functions/master/text_to_audio_generator/1.2.0/src/text_to_audio_generator.py new file mode 100644 index 00000000..7602745e --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/src/text_to_audio_generator.py @@ -0,0 +1,268 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import pathlib +import random +import tempfile +from typing import Dict, List, Optional, Tuple, Union + +import bark +import numpy as np +import pandas as pd +import torch +import torchaudio +import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def generate_multi_speakers_audio( + data_path: str, + speakers: Union[List[str], Dict[str, int]], + available_voices: List[str], + output_directory: str = None, + use_gpu: bool = True, + use_small_models: bool = False, + offload_cpu: bool = False, + sample_rate: int = 16000, + file_format: str = "wav", + verbose: bool = True, + bits_per_sample: Optional[int] = None, +) -> Tuple[str, pd.DataFrame, dict]: + """ + Generate audio files from text files. + + :param data_path: Path to the text file or directory containing the text files to generate audio from. + :param speakers: List / Dict of speakers to generate audio for. + If a list is given, the speakers will be assigned to channels in the order given. + If dictionary, the keys will be the speakers and the values will be the channels. + :param available_voices: List of available voices to use for the generation. + See here for the available voices: + https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c + :param output_directory: Path to the directory to save the generated audio files to. + :param use_gpu: Whether to use the GPU for the generation. + :param use_small_models: Whether to use the small models for the generation. 
+ :param offload_cpu: To reduce the memory footprint, the models can be offloaded to the CPU after loading. + :param sample_rate: The sampling rate of the generated audio. + :param file_format: The format of the generated audio files. + :param verbose: Whether to print the progress of the generation. + :param bits_per_sample: Changes the bit depth for the supported formats. + Supported only in "wav" or "flac" formats. + + :returns: A tuple of: + - The output directory path. + - The generated audio files dataframe. + - The errors dictionary. + """ + + global _LOGGER + _LOGGER = _get_logger() + # Get the input text files to turn to audio: + data_path = pathlib.Path(data_path).absolute() + text_files = _get_text_files(data_path=data_path) + + # Load the bark models according to the given configurations: + bark.preload_models( + text_use_gpu=use_gpu, + text_use_small=use_small_models, + coarse_use_gpu=use_gpu, + coarse_use_small=use_small_models, + fine_use_gpu=use_gpu, + fine_use_small=use_small_models, + codec_use_gpu=use_gpu, + force_reload=offload_cpu, + ) + + # Check for per channel generation: + if isinstance(speakers, dict): + speaker_per_channel = True + # Sort the given speakers by channels: + speakers = { + speaker: channel + for speaker, channel in sorted(speakers.items(), key=lambda item: item[1]) + } + else: + speaker_per_channel = False + + # Prepare the resampling module: + resampler = torchaudio.transforms.Resample( + orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32 + ) + + # Prepare the gap between each speaker: + gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE)) + + # Prepare the successes dataframe and errors dictionary to be returned: + successes = [] + errors = {} + + # Create the output directory: + if output_directory is None: + output_directory = tempfile.mkdtemp() + output_directory = pathlib.Path(output_directory) + if not output_directory.exists(): + output_directory.mkdir(exist_ok=True, parents=True) + + # Start generating audio: + # Go over the audio files and transcribe: + for text_file in tqdm.tqdm( + text_files, desc="Generating", unit="file", disable=not verbose + ): + + try: + # Randomize voices for each speaker: + chosen_voices = {} + available_voices_copy = available_voices.copy() + for speaker in speakers: + voice = random.choice(available_voices_copy) + chosen_voices[speaker] = voice + available_voices_copy.remove(voice) + # Read text: + with open(text_file, "r") as fp: + text = fp.read() + # Prepare a holder for all the generated pieces (if per channel each speaker will have its own): + audio_pieces = ( + {speaker: [] for speaker in speakers} + if speaker_per_channel + else {"all": []} + ) + + # Generate audio per line: + for line in text.splitlines(): + # Validate line is in correct speaker format: + + if ": " not in line: + if verbose: + _LOGGER.warning(f"Skipping line: {line}") + continue + # Split line to speaker and his words: + current_speaker, sentences = line.split(": ", 1) + # Validate speaker is known: + if current_speaker not in speakers: + raise ValueError( + f"Unknown speaker: {current_speaker}. 
Given speakers are: {speakers}" + ) + for sentence in _split_line(line=sentences): + # Generate words audio: + audio = bark.generate_audio( + sentence, + history_prompt=chosen_voices[current_speaker], + silent=True, + ) + if speaker_per_channel: + silence = np.zeros_like(audio) + for speaker in audio_pieces.keys(): + if speaker == current_speaker: + audio_pieces[speaker] += [audio, gap_between_speakers] + else: + audio_pieces[speaker] += [silence, gap_between_speakers] + else: + audio_pieces["all"] += [audio, gap_between_speakers] + # Construct a single audio array from all the pieces and channels: + + audio = np.vstack( + [np.concatenate(audio_pieces[speaker]) for speaker in speakers] + ).astype(dtype=np.float32) + # Resample: + audio = torch.from_numpy(audio) + audio = resampler(audio) + # Save to audio file: + audio_file = output_directory / f"{text_file.stem}.{file_format}" + + torchaudio.save( + uri=str(audio_file), + src=audio, + sample_rate=sample_rate, + format=file_format, + bits_per_sample=bits_per_sample, + ) + + # Collect to the successes: + successes.append([text_file.name, audio_file.name]) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{text_file.name}'") + print(exception) + errors[text_file.name] = str(exception) + + # Construct the translations dataframe: + successes = pd.DataFrame( + successes, + columns=["text_file", "audio_file"], + ) + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(text_files)})\n" + f"Translations summary:\n" + f"{successes.head()}" + ) + return str(output_directory), successes, errors + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return text_files + + +def _split_line(line: str, max_length: int = 250) -> List[str]: + if len(line) < max_length: + return [line] + + sentences = [ + f"{sentence.strip()}." for sentence in line.split(".") if sentence.strip() + ] + + splits = [] + current_length = len(sentences[0]) + split = sentences[0] + for sentence in sentences[1:]: + if current_length + len(sentence) > max_length: + splits.append(split) + split = sentence + current_length = len(sentence) + else: + current_length += len(sentence) + split += " " + sentence + if split: + splits.append(split) + + return splits + + +def _get_logger(): + global _LOGGER + try: + import mlrun + # Check if MLRun is available: + context = mlrun.get_or_create_ctx(name="mlrun") + return context.logger + except ModuleNotFoundError: + return _LOGGER diff --git a/functions/master/text_to_audio_generator/1.2.0/static/documentation.html b/functions/master/text_to_audio_generator/1.2.0/static/documentation.html new file mode 100644 index 00000000..7cefacdc --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/static/documentation.html @@ -0,0 +1,255 @@ + + + + + + + +text_to_audio_generator package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    text_to_audio_generator package

    + +
    + +
    +
    +
    +
    +
    +

    text_to_audio_generator package#

    +
    +

    Submodules#

    +
    +
    +

    text_to_audio_generator.text_to_audio_generator module#

    text_to_audio_generator.text_to_audio_generator.generate_multi_speakers_audio(data_path: str, speakers: Union[List[str], Dict[str, int]], available_voices: List[str], output_directory: Optional[str] = None, use_gpu: bool = True, use_small_models: bool = False, offload_cpu: bool = False, sample_rate: int = 16000, file_format: str = 'wav', verbose: bool = True, bits_per_sample: Optional[int] = None) -> Tuple[str, pandas.core.frame.DataFrame, dict]

    Generate audio files from text files.

    Parameters
    • data_path – Path to the text file or directory containing the text files to generate audio from.
    • speakers – List / Dict of speakers to generate audio for. If a list is given, the speakers will be assigned to channels in the order given. If dictionary, the keys will be the speakers and the values will be the channels.
    • available_voices – List of available voices to use for the generation. See here for the available voices: https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
    • output_directory – Path to the directory to save the generated audio files to.
    • use_gpu – Whether to use the GPU for the generation.
    • use_small_models – Whether to use the small models for the generation.
    • offload_cpu – To reduce the memory footprint, the models can be offloaded to the CPU after loading.
    • sample_rate – The sampling rate of the generated audio.
    • file_format – The format of the generated audio files.
    • verbose – Whether to print the progress of the generation.
    • bits_per_sample – Changes the bit depth for the supported formats. Supported only in "wav" or "flac" formats.

    Returns
    A tuple of:
    - The output directory path.
    - The generated audio files dataframe.
    - The errors dictionary.
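To complement the hub-based example notebook shown further below, here is a rough sketch of invoking the handler directly in Python. The transcript path and output directory are illustrative assumptions; the speaker-to-channel mapping and Bark voice presets mirror values used elsewhere in this repository.

```python
from text_to_audio_generator.text_to_audio_generator import generate_multi_speakers_audio

# Each transcript line must look like "Speaker: text", e.g. "Client: I love MLRun!"
output_dir, generated_df, errors = generate_multi_speakers_audio(
    data_path="data/test_data.txt",              # illustrative transcript path
    speakers={"Agent": 0, "Client": 1},          # speaker name -> audio channel
    available_voices=["v2/en_speaker_0", "v2/en_speaker_1"],
    output_directory="./out",                    # illustrative output directory
    use_gpu=False,
    use_small_models=True,
    file_format="wav",
)
print(generated_df)  # one row per (text_file, audio_file) pair
```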

    Module contents#

    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/static/example.html b/functions/master/text_to_audio_generator/1.2.0/static/example.html new file mode 100644 index 00000000..17b2e2b4 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/static/example.html @@ -0,0 +1,523 @@ + + + + + + + +Text to audio conversation generator + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Text to audio conversation generator


    Text to audio conversation generator#

    +

    This function converts the text from a specified text file into speech and saves this as an audio file using the Bark library.
    +It’s designed to facilitate easy generation of speech from written transcripts.

    +
    +

    Example Usage:#

    +
    +
    +
    import mlrun
    +import tempfile
    +
    +
    +
    +
    +
    +
    +
    # Import function 
    +text_to_audio_generator_function = mlrun.import_function("hub://text_to_audio_generator")
    +
    +
    +
    +
    +
    +
    +
    # Run the function with desired text files
    +function_run = text_to_audio_generator_function.run(
    +    handler="generate_multi_speakers_audio",
    +    inputs={"data_path": "./test_data.txt"},
    +    params={
    +        "output_directory": "./out",
    +        "speakers": {"Agent": 0, "Client": 1},
    +        "available_voices": [
    +            "v2/en_speaker_0",
    +            "v2/en_speaker_1",
    +        ],
    +        "use_small_models": True,
    +        "use_gpu": False,
    +        "offload_cpu": True,
    +        "file_format": "mp3",
    +        # "bits_per_sample": 8,
    +    },
    +    local=True,
    +    returns=[
    +        "audio_files: path",
    +        "audio_files_dataframe: dataset",
    +        "text_to_speech_errors: file",
    +    ],
    +)
    +
    +
    +
    +
    +
    > 2023-12-04 14:08:48,769 [info] Storing function: {'name': 'text-to-audio-generator-generate-multi-speakers-audio', 'uid': 'ba017dfc11624de9afb5e148a6678a8b', 'db': 'http://mlrun-api:8080'}
    +
    +
    +
    torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.
    +Generating: 100%|██████████| 1/1 [00:23<00:00, 23.74s/file]
    +
    +
    +
    > 2023-12-04 14:10:05,123 [info] Done (1/1)
    +Translations summary:
    +       text_file     audio_file
    +0  test_data.txt  test_data.mp3
    +
    +
    +
    
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    default0Dec 04 14:08:48completedtext-to-audio-generator-generate-multi-speakers-audio
    v3io_user=yonis
    kind=local
    owner=yonis
    host=jupyter-yonis-7c9bdbfb4d-9g2p2
    data_path
    output_directory=./out
    speakers={'Agent': 0, 'Client': 1}
    available_voices=['v2/en_speaker_0', 'v2/en_speaker_1']
    use_small_models=True
    use_gpu=False
    offload_cpu=True
    file_format=mp3
    audio_files
    audio_files_dataframe
    text_to_speech_errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-12-04 14:10:05,486 [info] Run execution finished: {'status': 'completed', 'name': 'text-to-audio-generator-generate-multi-speakers-audio'}
    +
    +
    +
    +
    +
    +
    +
    function_run.artifact("audio_files_dataframe").show()
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + +
    text_fileaudio_file
    0test_data.txttest_data.mp3
    +
    +
    +
    +
    +
    import IPython
    +
    +IPython.display.Audio("./out/test_data.mp3")
    +
    +
    +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/static/function.html b/functions/master/text_to_audio_generator/1.2.0/static/function.html new file mode 100644 index 00000000..5ed19c22 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/static/function.html @@ -0,0 +1,117 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  name: text-to-audio-generator
    +  tag: ''
    +  hash: 89fcaf3fab53e7b7fbba448a5e65c253d7fa66ed
    +  project: ''
    +  labels:
    +    author: yonatans
    +  categories:
    +  - data-preparation
    +  - machine-learning
    +  - pytorch
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pathlib
import random
import tempfile
from typing import Dict, List, Optional, Tuple, Union

import bark
import numpy as np
import pandas as pd
import torch
import torchaudio
import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def generate_multi_speakers_audio(
    data_path: str,
    speakers: Union[List[str], Dict[str, int]],
    available_voices: List[str],
    output_directory: str = None,
    use_gpu: bool = True,
    use_small_models: bool = False,
    offload_cpu: bool = False,
    sample_rate: int = 16000,
    file_format: str = "wav",
    verbose: bool = True,
    bits_per_sample: Optional[int] = None,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Generate audio files from text files.

    :param data_path:           Path to the text file or directory containing the text files to generate audio from.
    :param speakers:            List / Dict of speakers to generate audio for.
                                If a list is given, the speakers will be assigned to channels in the order given.
                                If dictionary, the keys will be the speakers and the values will be the channels.
    :param available_voices:    List of available voices to use for the generation.
                        See here for the available voices:
                        https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
    :param output_directory:    Path to the directory to save the generated audio files to.
    :param use_gpu:             Whether to use the GPU for the generation.
    :param use_small_models:    Whether to use the small models for the generation.
    :param offload_cpu:         To reduce the memory footprint, the models can be offloaded to the CPU after loading.
    :param sample_rate:         The sampling rate of the generated audio.
    :param file_format:         The format of the generated audio files.
    :param verbose:             Whether to print the progress of the generation.
    :param bits_per_sample:     Changes the bit depth for the supported formats.
                                Supported only in "wav" or "flac" formats.

    :returns:                   A tuple of:
                                - The output directory path.
                                - The generated audio files dataframe.
                                - The errors dictionary.
    """

    global _LOGGER
    _LOGGER = _get_logger()
    # Get the input text files to turn to audio:
    data_path = pathlib.Path(data_path).absolute()
    text_files = _get_text_files(data_path=data_path)

    # Load the bark models according to the given configurations:
    bark.preload_models(
        text_use_gpu=use_gpu,
        text_use_small=use_small_models,
        coarse_use_gpu=use_gpu,
        coarse_use_small=use_small_models,
        fine_use_gpu=use_gpu,
        fine_use_small=use_small_models,
        codec_use_gpu=use_gpu,
        force_reload=offload_cpu,
    )

    # Check for per channel generation:
    if isinstance(speakers, dict):
        speaker_per_channel = True
        # Sort the given speakers by channels:
        speakers = {
            speaker: channel
            for speaker, channel in sorted(speakers.items(), key=lambda item: item[1])
        }
    else:
        speaker_per_channel = False

    # Prepare the resampling module:
    resampler = torchaudio.transforms.Resample(
        orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
    )
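    # Bark generates at its native rate (`bark.SAMPLE_RATE`, 24 kHz in current Bark releases), so the output is
    # resampled to the requested `sample_rate`.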

    # Prepare a half-second silence gap (at Bark's native sample rate) to insert between speakers:
    gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE))

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    if output_directory is None:
        output_directory = tempfile.mkdtemp()
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(exist_ok=True, parents=True)

    # Start generating audio - go over the text files and generate an audio file for each:
    for text_file in tqdm.tqdm(
        text_files, desc="Generating", unit="file", disable=not verbose
    ):

        try:
            # Randomize voices for each speaker:
            chosen_voices = {}
            available_voices_copy = available_voices.copy()
            for speaker in speakers:
                voice = random.choice(available_voices_copy)
                chosen_voices[speaker] = voice
                available_voices_copy.remove(voice)
            # Read text:
            with open(text_file, "r") as fp:
                text = fp.read()
            # Prepare a holder for all the generated pieces (if per channel each speaker will have its own):
            audio_pieces = (
                {speaker: [] for speaker in speakers}
                if speaker_per_channel
                else {"all": []}
            )

            # Generate audio per line:
            for line in text.splitlines():
                # Validate line is in correct speaker format:

                if ": " not in line:
                    if verbose:
                        _LOGGER.warning(f"Skipping line: {line}")
                    continue
                # Split the line into the speaker and their words:
                current_speaker, sentences = line.split(": ", 1)
                # Validate speaker is known:
                if current_speaker not in speakers:
                    raise ValueError(
                        f"Unknown speaker: {current_speaker}. Given speakers are: {speakers}"
                    )
                for sentence in _split_line(line=sentences):
                    # Generate audio for the sentence:
                    audio = bark.generate_audio(
                        sentence,
                        history_prompt=chosen_voices[current_speaker],
                        silent=True,
                    )
                    if speaker_per_channel:
                        silence = np.zeros_like(audio)
                        for speaker in audio_pieces.keys():
                            if speaker == current_speaker:
                                audio_pieces[speaker] += [audio, gap_between_speakers]
                            else:
                                audio_pieces[speaker] += [silence, gap_between_speakers]
                    else:
                        audio_pieces["all"] += [audio, gap_between_speakers]
            # Construct a single audio array from all the pieces and channels. Iterating over `audio_pieces` covers
            # both cases: the speakers sorted by channel (per-channel generation) or the single "all" key otherwise:
            audio = np.vstack(
                [np.concatenate(audio_pieces[speaker]) for speaker in audio_pieces]
            ).astype(dtype=np.float32)
            # Resample:
            audio = torch.from_numpy(audio)
            audio = resampler(audio)
            # Save to audio file:
            audio_file = output_directory / f"{text_file.stem}.{file_format}"

            torchaudio.save(
                uri=str(audio_file),
                src=audio,
                sample_rate=sample_rate,
                format=file_format,
                bits_per_sample=bits_per_sample,
            )

            # Collect to the successes:
            successes.append([text_file.name, audio_file.name])
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            print(exception)
            errors[text_file.name] = str(exception)

    # Construct the successes dataframe:
    successes = pd.DataFrame(
        successes,
        columns=["text_file", "audio_file"],
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _split_line(line: str, max_length: int = 250) -> List[str]:
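    # Split a long line into chunks of roughly `max_length` characters, cutting at sentence ('.') boundaries so that
    # each chunk can be generated separately (shorter prompts are generally easier for the model to handle).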
    if len(line) < max_length:
        return [line]

    sentences = [
        f"{sentence.strip()}." for sentence in line.split(".") if sentence.strip()
    ]

    splits = []
    current_length = len(sentences[0])
    split = sentences[0]
    for sentence in sentences[1:]:
        if current_length + len(sentence) > max_length:
            splits.append(split)
            split = sentence
            current_length = len(sentence)
        else:
            current_length += len(sentence)
            split += " " + sentence
    if split:
        splits.append(split)

    return splits


def _get_logger():
    global _LOGGER
    try:
        import mlrun
        # Check if MLRun is available:
        context = mlrun.get_or_create_ctx(name="mlrun")
        return context.logger
    except ModuleNotFoundError:
        return _LOGGER

    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - bark
    +    - torchaudio
    +  entry_points:
    +    generate_multi_speakers_audio:
    +      name: generate_multi_speakers_audio
    +      doc: Generate audio files from text files.
    +      parameters:
    +      - name: data_path
    +        type: str
    +        doc: Path to the text file or directory containing the text files to generate
    +          audio from.
    +      - name: speakers
    +        type: Union[List[str], Dict[str, int]]
    +        doc: List / Dict of speakers to generate audio for. If a list is given, the
    +          speakers will be assigned to channels in the order given. If dictionary,
    +          the keys will be the speakers and the values will be the channels.
    +      - name: available_voices
    +        type: List[str]
    +        doc: 'List of available voices to use for the generation. See here for the
    +          available voices: https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c'
    +      - name: output_directory
    +        type: str
    +        doc: Path to the directory to save the generated audio files to.
    +        default: null
    +      - name: use_gpu
    +        type: bool
    +        doc: Whether to use the GPU for the generation.
    +        default: true
    +      - name: use_small_models
    +        type: bool
    +        doc: Whether to use the small models for the generation.
    +        default: false
    +      - name: offload_cpu
    +        type: bool
    +        doc: To reduce the memory footprint, the models can be offloaded to the CPU
    +          after loading.
    +        default: false
    +      - name: sample_rate
    +        type: int
    +        doc: The sampling rate of the generated audio.
    +        default: 16000
    +      - name: file_format
    +        type: str
    +        doc: The format of the generated audio files.
    +        default: wav
    +      - name: verbose
    +        type: bool
    +        doc: Whether to print the progress of the generation.
    +        default: true
    +      - name: bits_per_sample
    +        type: Optional[int]
    +        doc: Changes the bit depth for the supported formats. Supported only in "wav"
    +          or "flac" formats.
    +        default: null
    +      outputs:
    +      - doc: 'A tuple of: - The output directory path. - The generated audio files
    +          dataframe. - The errors dictionary.'
    +        type: Tuple[str, pd.DataFrame, dict]
    +      lineno: 31
    +      has_varargs: false
    +      has_kwargs: false
    +  description: Generate audio file from text using different speakers
    +  default_handler: generate_multi_speakers_audio
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/static/item.html b/functions/master/text_to_audio_generator/1.2.0/static/item.html new file mode 100644 index 00000000..282a8a3a --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/static/item.html @@ -0,0 +1,51 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- data-preparation
    +- machine-learning
    +- pytorch
    +description: Generate audio file from text using different speakers
    +doc: ''
    +example: text_to_audio_generator.ipynb
    +generationDate: 2023-12-03:15-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: yonatans
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.5.1
    +name: text_to_audio_generator
    +platformVersion: 3.5.3
    +spec:
    +  filename: text_to_audio_generator.py
    +  handler: generate_multi_speakers_audio
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - bark
    +    - torchaudio
    +url: ''
    +version: 1.2.0
    +test_valid: True
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/static/source.html b/functions/master/text_to_audio_generator/1.2.0/static/source.html new file mode 100644 index 00000000..5ef177a1 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/static/source.html @@ -0,0 +1,290 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +import pathlib
    +import random
    +import tempfile
    +from typing import Dict, List, Optional, Tuple, Union
    +
    +import bark
    +import numpy as np
    +import pandas as pd
    +import torch
    +import torchaudio
    +import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def generate_multi_speakers_audio(
    +    data_path: str,
    +    speakers: Union[List[str], Dict[str, int]],
    +    available_voices: List[str],
    +    output_directory: str = None,
    +    use_gpu: bool = True,
    +    use_small_models: bool = False,
    +    offload_cpu: bool = False,
    +    sample_rate: int = 16000,
    +    file_format: str = "wav",
    +    verbose: bool = True,
    +    bits_per_sample: Optional[int] = None,
    +) -> Tuple[str, pd.DataFrame, dict]:
    +    """
    +    Generate audio files from text files.
    +
    +    :param data_path:           Path to the text file or directory containing the text files to generate audio from.
    +    :param speakers:            List / Dict of speakers to generate audio for.
    +                                If a list is given, the speakers will be assigned to channels in the order given.
    +                                If dictionary, the keys will be the speakers and the values will be the channels.
    +    :param available_voices:    List of available voices to use for the generation.
    +                        See here for the available voices:
    +                        https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
    +    :param output_directory:    Path to the directory to save the generated audio files to.
    +    :param use_gpu:             Whether to use the GPU for the generation.
    +    :param use_small_models:    Whether to use the small models for the generation.
    +    :param offload_cpu:         To reduce the memory footprint, the models can be offloaded to the CPU after loading.
    +    :param sample_rate:         The sampling rate of the generated audio.
    +    :param file_format:         The format of the generated audio files.
    +    :param verbose:             Whether to print the progress of the generation.
    +    :param bits_per_sample:     Changes the bit depth for the supported formats.
    +                                Supported only in "wav" or "flac" formats.
    +
    +    :returns:                   A tuple of:
    +                                - The output directory path.
    +                                - The generated audio files dataframe.
    +                                - The errors dictionary.
    +    """
    +
    +    global _LOGGER
    +    _LOGGER = _get_logger()
    +    # Get the input text files to turn to audio:
    +    data_path = pathlib.Path(data_path).absolute()
    +    text_files = _get_text_files(data_path=data_path)
    +
    +    # Load the bark models according to the given configurations:
    +    bark.preload_models(
    +        text_use_gpu=use_gpu,
    +        text_use_small=use_small_models,
    +        coarse_use_gpu=use_gpu,
    +        coarse_use_small=use_small_models,
    +        fine_use_gpu=use_gpu,
    +        fine_use_small=use_small_models,
    +        codec_use_gpu=use_gpu,
    +        force_reload=offload_cpu,
    +    )
    +
    +    # Check for per channel generation:
    +    if isinstance(speakers, dict):
    +        speaker_per_channel = True
    +        # Sort the given speakers by channels:
    +        speakers = {
    +            speaker: channel
    +            for speaker, channel in sorted(speakers.items(), key=lambda item: item[1])
    +        }
    +    else:
    +        speaker_per_channel = False
    +
    +    # Prepare the resampling module:
    +    resampler = torchaudio.transforms.Resample(
    +        orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
    +    )
    +
    +    # Prepare the gap between each speaker:
    +    gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE))
    +
    +    # Prepare the successes dataframe and errors dictionary to be returned:
    +    successes = []
    +    errors = {}
    +
    +    # Create the output directory:
    +    if output_directory is None:
    +        output_directory = tempfile.mkdtemp()
    +    output_directory = pathlib.Path(output_directory)
    +    if not output_directory.exists():
    +        output_directory.mkdir(exist_ok=True, parents=True)
    +
    +    # Start generating audio:
    +    # Go over the audio files and transcribe:
    +    for text_file in tqdm.tqdm(
    +        text_files, desc="Generating", unit="file", disable=not verbose
    +    ):
    +
    +        try:
    +            # Randomize voices for each speaker:
    +            chosen_voices = {}
    +            available_voices_copy = available_voices.copy()
    +            for speaker in speakers:
    +                voice = random.choice(available_voices_copy)
    +                chosen_voices[speaker] = voice
    +                available_voices_copy.remove(voice)
    +            # Read text:
    +            with open(text_file, "r") as fp:
    +                text = fp.read()
    +            # Prepare a holder for all the generated pieces (if per channel each speaker will have its own):
    +            audio_pieces = (
    +                {speaker: [] for speaker in speakers}
    +                if speaker_per_channel
    +                else {"all": []}
    +            )
    +
    +            # Generate audio per line:
    +            for line in text.splitlines():
    +                # Validate line is in correct speaker format:
    +
    +                if ": " not in line:
    +                    if verbose:
    +                        _LOGGER.warning(f"Skipping line: {line}")
    +                    continue
    +                # Split line to speaker and his words:
    +                current_speaker, sentences = line.split(": ", 1)
    +                # Validate speaker is known:
    +                if current_speaker not in speakers:
    +                    raise ValueError(
    +                        f"Unknown speaker: {current_speaker}. Given speakers are: {speakers}"
    +                    )
    +                for sentence in _split_line(line=sentences):
    +                    # Generate words audio:
    +                    audio = bark.generate_audio(
    +                        sentence,
    +                        history_prompt=chosen_voices[current_speaker],
    +                        silent=True,
    +                    )
    +                    if speaker_per_channel:
    +                        silence = np.zeros_like(audio)
    +                        for speaker in audio_pieces.keys():
    +                            if speaker == current_speaker:
    +                                audio_pieces[speaker] += [audio, gap_between_speakers]
    +                            else:
    +                                audio_pieces[speaker] += [silence, gap_between_speakers]
    +                    else:
    +                        audio_pieces["all"] += [audio, gap_between_speakers]
    +            # Construct a single audio array from all the pieces and channels:
    +
    +            audio = np.vstack(
    +                [np.concatenate(audio_pieces[speaker]) for speaker in speakers]
    +            ).astype(dtype=np.float32)
    +            # Resample:
    +            audio = torch.from_numpy(audio)
    +            audio = resampler(audio)
    +            # Save to audio file:
    +            audio_file = output_directory / f"{text_file.stem}.{file_format}"
    +
    +            torchaudio.save(
    +                uri=str(audio_file),
    +                src=audio,
    +                sample_rate=sample_rate,
    +                format=file_format,
    +                bits_per_sample=bits_per_sample,
    +            )
    +
    +            # Collect to the successes:
    +            successes.append([text_file.name, audio_file.name])
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            if verbose:
    +                _LOGGER.warning(f"Error in file: '{text_file.name}'")
    +            print(exception)
    +            errors[text_file.name] = str(exception)
    +
    +    # Construct the translations dataframe:
    +    successes = pd.DataFrame(
    +        successes,
    +        columns=["text_file", "audio_file"],
    +    )
    +
    +    # Print the head of the produced dataframe and return:
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(text_files)})\n"
    +            f"Translations summary:\n"
    +            f"{successes.head()}"
    +        )
    +    return str(output_directory), successes, errors
    +
    +
    +def _get_text_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        text_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        text_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return text_files
    +
    +
    +def _split_line(line: str, max_length: int = 250) -> List[str]:
    +    if len(line) < max_length:
    +        return [line]
    +
    +    sentences = [
    +        f"{sentence.strip()}." for sentence in line.split(".") if sentence.strip()
    +    ]
    +
    +    splits = []
    +    current_length = len(sentences[0])
    +    split = sentences[0]
    +    for sentence in sentences[1:]:
    +        if current_length + len(sentence) > max_length:
    +            splits.append(split)
    +            split = sentence
    +            current_length = len(sentence)
    +        else:
    +            current_length += len(sentence)
    +            split += " " + sentence
    +    if split:
    +        splits.append(split)
    +
    +    return splits
    +
    +
    +def _get_logger():
    +    global _LOGGER
    +    try:
    +        import mlrun
    +        # Check if MLRun is available:
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        return context.logger
    +    except ModuleNotFoundError:
    +        return _LOGGER
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/1.2.0/static/text_to_audio_generator.html b/functions/master/text_to_audio_generator/1.2.0/static/text_to_audio_generator.html new file mode 100644 index 00000000..e79429a1 --- /dev/null +++ b/functions/master/text_to_audio_generator/1.2.0/static/text_to_audio_generator.html @@ -0,0 +1,408 @@ + + + + + + + +text_to_audio_generator.text_to_audio_generator + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Source code for text_to_audio_generator.text_to_audio_generator

    + + + + \ No newline at end of file diff --git a/functions/master/text_to_audio_generator/latest/src/function.yaml b/functions/master/text_to_audio_generator/latest/src/function.yaml index df142d2e..88ef9cb8 100644 --- a/functions/master/text_to_audio_generator/latest/src/function.yaml +++ b/functions/master/text_to_audio_generator/latest/src/function.yaml @@ -2,13 +2,14 @@ kind: job metadata: name: text-to-audio-generator tag: '' - hash: 534e34d316098dcb345860a786ea013102150e67 + hash: 89fcaf3fab53e7b7fbba448a5e65c253d7fa66ed project: '' labels: author: yonatans categories: - data-preparation - machine-learning + - pytorch spec: command: '' args: [] diff --git a/functions/master/text_to_audio_generator/latest/src/item.yaml b/functions/master/text_to_audio_generator/latest/src/item.yaml index 4784a80d..efa8afc9 100644 --- a/functions/master/text_to_audio_generator/latest/src/item.yaml +++ b/functions/master/text_to_audio_generator/latest/src/item.yaml @@ -2,6 +2,7 @@ apiVersion: v1 categories: - data-preparation - machine-learning +- pytorch description: Generate audio file from text using different speakers doc: '' example: text_to_audio_generator.ipynb @@ -24,5 +25,5 @@ spec: - bark - torchaudio url: '' -version: 1.1.0 +version: 1.2.0 test_valid: True diff --git a/functions/master/text_to_audio_generator/latest/static/function.html b/functions/master/text_to_audio_generator/latest/static/function.html index b50784f8..5ed19c22 100644 --- a/functions/master/text_to_audio_generator/latest/static/function.html +++ b/functions/master/text_to_audio_generator/latest/static/function.html @@ -19,13 +19,14 @@ metadata: name: text-to-audio-generator tag: '' - hash: 534e34d316098dcb345860a786ea013102150e67 + hash: 89fcaf3fab53e7b7fbba448a5e65c253d7fa66ed project: '' labels: author: yonatans categories: - data-preparation - machine-learning + - pytorch spec: command: '' args: [] diff --git a/functions/master/text_to_audio_generator/latest/static/item.html b/functions/master/text_to_audio_generator/latest/static/item.html index d82f688a..282a8a3a 100644 --- a/functions/master/text_to_audio_generator/latest/static/item.html +++ b/functions/master/text_to_audio_generator/latest/static/item.html @@ -19,6 +19,7 @@ categories: - data-preparation - machine-learning +- pytorch description: Generate audio file from text using different speakers doc: '' example: text_to_audio_generator.ipynb @@ -41,7 +42,7 @@ - bark - torchaudio url: '' -version: 1.1.0 +version: 1.2.0 test_valid: True diff --git a/functions/master/transcribe/1.1.0/src/data/error_file.txt b/functions/master/transcribe/1.1.0/src/data/error_file.txt new file mode 100644 index 00000000..e69de29b diff --git a/functions/master/transcribe/1.1.0/src/data/speech_01.mp3 b/functions/master/transcribe/1.1.0/src/data/speech_01.mp3 new file mode 100644 index 00000000..ae0e5c82 Binary files /dev/null and b/functions/master/transcribe/1.1.0/src/data/speech_01.mp3 differ diff --git a/functions/master/transcribe/1.1.0/src/data/speech_02.mp3 b/functions/master/transcribe/1.1.0/src/data/speech_02.mp3 new file mode 100644 index 00000000..1d5e6c03 Binary files /dev/null and b/functions/master/transcribe/1.1.0/src/data/speech_02.mp3 differ diff --git a/functions/master/transcribe/1.1.0/src/function.yaml b/functions/master/transcribe/1.1.0/src/function.yaml new file mode 100644 index 00000000..d72751ad --- /dev/null +++ b/functions/master/transcribe/1.1.0/src/function.yaml @@ -0,0 +1,311 @@ +kind: job +metadata: + name: transcribe + tag: '' + hash: 
8810ac74045bd15cee15a2e4e89563e8e29908d3 + project: '' + labels: + author: yonatans + categories: + - data-preparation + - genai + - huggingface + - machine-learning +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import operator
import os
import tempfile
from functools import reduce, wraps
from multiprocessing import Process, Queue
from pathlib import Path
from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union

import pandas as pd
import torch
import torchaudio
from tqdm import tqdm
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    AutoModelForCausalLM,
    pipeline,
)
from transformers.utils import is_flash_attn_2_available


class BaseTask:
    """
    A task to write the transcription to file.
    """

    def __init__(
        self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path
    ):
        """
        Initialize the task.

        :param audio_file:           Path to the audio file that was transcribed.
        :param transcription_output: The transcription output from the pipeline. String means an exception was raised.
        :param text_file:            Path to the text file to write the transcription to.
        """
        # Store the parameters:
        self._audio_file = audio_file
        self._transcription_output = transcription_output
        self._text_file = text_file

        # Prepare the error variable:
        self._error: str = None

    def do_task(self):
        """
        Try to perform the task, storing an error if one occurred.
        """
        if isinstance(self._transcription_output, str):
            self._error = self._transcription_output
            return
        try:
            self._do_task()
        except Exception as exception:
            self._error = str(exception)

    def is_failed(self) -> bool:
        """
        Check if the task failed.

        :returns: Whether the task failed.
        """
        return self._error is not None

    def get_result(self) -> Tuple[str, str]:
        """
        Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the
        text file name.

        :returns: The task's result.
        """
        if self.is_failed():
            return self._audio_file.name, self._error
        return self._audio_file.name, self._text_file.name

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        return self.__class__.__name__, {
            "audio_file": self._audio_file,
            "transcription_output": self._transcription_output,
            "text_file": self._text_file,
        }

    def _do_task(self):
        """
        Perform the task - write the transcription to the stored file path.
        """
        # Avoid overwriting an existing file by appending a running index to the file name:
        i = 1
        while self._text_file.exists():
            i += 1
            self._text_file = (
                self._text_file.parent
                / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}"
            )
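        # (e.g. if "speech.txt" already exists, this task writes to "speech_2.txt", then "speech_3.txt", and so on)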

        # Make sure all directories are created:
        self._text_file.parent.mkdir(exist_ok=True, parents=True)

        # Write to file:
        with open(self._text_file, "w") as fp:
            fp.write(self._transcription_output["text"])


class SpeechDiarizationTask(BaseTask):
    """
    A task to write the transcription to file with respect to a given speech diarization.
    """

    class _DiarizationSegment(NamedTuple):
        """
        A speech diarization segment.
        """

        start: float
        end: float
        speaker: str

    class _WordTimestamp(NamedTuple):
        """
        A word with its start and end timestamps.
        """

        start: float
        end: float
        text: str

    def __init__(
        self,
        audio_file: Path,
        transcription_output: dict,
        text_file: Path,
        speech_diarization: List[Tuple[float, float, str]],
    ):
        """
        Initialize the task.

        :param audio_file:           Path to the audio file that was transcribed.
        :param transcription_output: The transcription output from the pipeline.
        :param text_file:            Path to the text file to write the transcription to.
        :param speech_diarization:   A speech diarization as a list of tuples: (start, end, speaker).
        """
        super().__init__(
            audio_file=audio_file,
            transcription_output=transcription_output,
            text_file=text_file,
        )
        self._speech_diarization = speech_diarization
        self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None
        self._last_chosen_index = 0

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        task_class, task_kwargs = super().to_tuple()
        return task_class, {
            **task_kwargs,
            "speech_diarization": self._speech_diarization,
        }

    def _do_task(self):
        """
        Perform the task - write the transcription to the stored file path with respect to the given speech diarization.
        """
        # Check if a speech diarization is given, if not, just write the transcription to file:
        if not self._speech_diarization:
            super()._do_task()
            return

        # Cast the chunks to word timestamps tuples:
        words = [
            SpeechDiarizationTask._WordTimestamp(
                start=chunk["timestamp"][0],
                end=chunk["timestamp"][1],
                text=chunk["text"],
            )
            for chunk in self._transcription_output["chunks"]
        ]

        # Cast speech diarization to segments tuples:
        self._segments = [
            SpeechDiarizationTask._DiarizationSegment(*segment)
            for segment in self._speech_diarization
        ]

        # Try to match the Whisper model predicted timestamps to the closest diarization segment (closest diarization
        # segment will be the most overlapping with the word, and if there is no overlap, the closest segment to the
        # word):
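        # The written text interleaves speaker labels and words, for example (labels come from the given diarization,
        # e.g. "speaker_0"):
        #   speaker_0: Hello, thank you for calling.
        #   speaker_1: Hi, I have a question about my order.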
        speaker = self._segments[self._last_chosen_index].speaker
        text = f"{speaker}:"
        for word in words:
            # Get the next diarization segment:
            self._get_next_segment(word=word)
            # Check if the segment is of the same speaker:
            if self._segments[self._last_chosen_index].speaker == speaker:
                # Collect the word:
                text += word.text
            else:
                # Append a newline and update the new speaker:
                speaker = self._segments[self._last_chosen_index].speaker
                text += f"\n{speaker}:{word.text}"

        # Update the transcription output with the new text to write it to file:
        self._transcription_output["text"] = text
        super()._do_task()

    def _get_next_segment(
        self,
        word: _WordTimestamp,
    ):
        """
        Get the next diarization segment the given word falls into. The `self._last_chosen_index` will be updated
        accordingly.

        :param word: The word timestamp to match to the next segment.
        """
        # If the last chosen segment is the last segment, return it:
        if self._last_chosen_index == len(self._segments) - 1:
            return

        # Get the last chosen diarization segment:
        last_chosen = self._segments[self._last_chosen_index]

        # None value may appear if the word is the last word in the audio file, or it was split during inference. In
        # that case, we'll set the last segment:
        if word.end is None:
            self._last_chosen_index = len(self._segments) - 1
            return

        # If the word ends before the last chosen segment:
        if word.end <= last_chosen.start:
            # Then it is still the closest segment
            return

        # We check if it ends inside the last chosen segment:
        if word.end < last_chosen.end:
            # Then it still is the closest segment
            return

        # The word ends after the segment, we need to collect all next segments up until the word ends before them:
        possible_segments = [self._last_chosen_index]
        for i in range(self._last_chosen_index + 1, len(self._segments)):
            if word.end > self._segments[i].end:
                possible_segments.append(i)
                continue
            possible_segments.append(i)
            break

        # Check for the most overlapping option:
        best_overlap = 0
        most_overlapping_segment_index = None
        for i in possible_segments:
            # If the word starts before segment:
            if word.start <= self._segments[i].start:
                # If it ends before the segment, there is an overlap from the start of the segment to the end of the
                # word:
                if word.end < self._segments[i].end:
                    overlap = word.end - self._segments[i].start
                else:
                    # The word is wrapping the segment, the overlap is the segment's length:
                    overlap = self._segments[i].end - self._segments[i].start
            # The word starts in segment, check if the word ends in it:
            elif word.end < self._segments[i].end:
                # The overlap is the word's length:
                overlap = word.end - word.start
            # The word starts in the segment but ends after it; the overlap is from the word's start to the segment's end:
            else:
                overlap = self._segments[i].end - word.start
            # Check for new best overlap:
            if overlap > best_overlap:
                best_overlap = overlap
                most_overlapping_segment_index = i
        if most_overlapping_segment_index is not None:
            self._last_chosen_index = most_overlapping_segment_index
            return

        # If there is no overlapping segment, return the closest segment:
        best_distance = None
        closest_segment_index = None
        for i in possible_segments:
            distance = (
                word.start - self._segments[i].end
                if word.start > self._segments[i].end
                else self._segments[i].start - word.end
            )
            if best_distance is None or distance < best_distance:
                best_distance = distance
                closest_segment_index = i
        self._last_chosen_index = closest_segment_index


class SpeechDiarizationPerChannelTask(BaseTask):
    """
    A task to write the transcription to file with respect to a given speech diarization per channel.
    """

    class _WordTimestamp(NamedTuple):
        """
        A word with its start and end timestamps and speaker label (channel the word was taken from).
        """

        start: float
        end: float
        speaker: str
        text: str

    def __init__(self, audio_file: Path, text_file: Path):
        """
        Initialize the task.

        :param audio_file: Path to the audio file that was transcribed.
        :param text_file:  Path to the text file to write the transcription to.
        """
        super().__init__(
            audio_file=audio_file, transcription_output={}, text_file=text_file
        )
        self._transcription_output_channels: List[Tuple[str, dict]] = []

    @property
    def transcription_output_channels(self) -> List[Tuple[str, dict]]:
        """
        Get the transcription output channels.

        :returns: The transcription output channels.
        """
        return self._transcription_output_channels

    def do_task(self):
        """
        Try to perform the task, storing an error if one occurred.
        """
        for _, channel_output in self._transcription_output_channels:
            if isinstance(channel_output, str):
                self._error = self._transcription_output_channels
                return
        super().do_task()

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        task_class, task_kwargs = super().to_tuple()
        task_kwargs.pop("transcription_output")
        return task_class, task_kwargs

    def _do_task(self):
        """
        Perform the task - write the transcription to the stored file path with respect to the given speech diarization
        per channel.
        """
        # Cast the chunks to word timestamps tuples:
        words_per_channel = [
            [
                SpeechDiarizationPerChannelTask._WordTimestamp(
                    start=chunk["timestamp"][0],
                    end=chunk["timestamp"][1],
                    speaker=speaker,
                    text=chunk["text"],
                )
                for chunk in output["chunks"]
            ]
            for speaker, output in self._transcription_output_channels
        ]

        # Merge and sort the words per channel by their start time:
        words = reduce(operator.add, words_per_channel)  # concatenate the word lists of all channels
        words.sort()

        # Write the transcription to file:
        current_speaker = words[0].speaker
        text = f"{current_speaker}:"
        for word in words:
            # Check if the word's speaker is different from the current one:
            if word.speaker != current_speaker:
                # Append a newline and update the new speaker:
                current_speaker = word.speaker
                text += f"\n{current_speaker}:"
            # Collect the word:
            text += word.text

        # Update the transcription output with the new text to write it to file:
        self._transcription_output["text"] = text
        super()._do_task()


class BatchProcessor:
    """
    A batch processor for batches of transcriptions. The batch processor creates tasks and is designed to work
    alongside the transcriber. It can be used with a multiprocessing queue, or the tasks can be run directly using
    the associated methods.
    """

    def __init__(self, audio_files: List[Path], output_directory: Path):
        """
        Initialize the batch processor.

        :param audio_files:      The list of all audio files to transcribe.
        :param output_directory: The output directory to write the transcriptions to.
        """
        # Store the parameters:
        self._audio_files = audio_files
        self._output_directory = output_directory

        # Prepare the batching variables:
        self._current_file_index = 0
        self._tasks: List[BaseTask] = []
        self._results: List[Tuple[bool, Tuple[str, str]]] = []

    def process_batch(self, batch: List[Union[dict, str]]):
        """
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        :param batch: The batch of transcriptions to process.
        """
        # Get the relevant files belonging to the given batch:
        current_files = self._get_current_files(batch_size=len(batch))

        # Build the transcription tasks:
        self._tasks.extend(
            [
                BaseTask(
                    audio_file=file,
                    transcription_output=batch[i],
                    text_file=self._output_directory / f"{file.stem}.txt",
                )
                for i, file in enumerate(current_files)
            ]
        )

    def get_tasks(self) -> List[BaseTask]:
        """
        Get the tasks to perform.

        :returns: The tasks to perform.
        """
        tasks = self._tasks
        self._tasks = []
        return tasks

    def do_tasks(self):
        """
        Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.
        """
        for task in self.get_tasks():
            task.do_task()
            self._results.append((task.is_failed(), task.get_result()))

    def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]:
        """
        Get the results of the tasks. The stored results are then cleared.

        :returns: The results of the tasks.
        """
        results = self._results
        self._results = []
        return results

    def _get_current_files(self, batch_size: int) -> List[Path]:
        """
        Get the current files to process.

        :param batch_size: The batch size to progress the current file index.

        :returns: The current files to process.
        """
        end_index = (
            self._current_file_index + batch_size
            if self._current_file_index + batch_size < len(self._audio_files)
            else len(self._audio_files)
        )
        current_files = self._audio_files[self._current_file_index : end_index]
        self._current_file_index = end_index
        return current_files


class SpeechDiarizationBatchProcessor(BatchProcessor):
    """
    A batch processor for batches of transcriptions with respect to a given speech diarization. The batch
    processor creates tasks and is designed to work alongside the transcriber. It can be used with a multiprocessing
    queue, or the tasks can be run directly using the associated methods.
    """

    def __init__(
        self, audio_files: List[Path], output_directory: Path, speech_diarization: dict
    ):
        """
        Initialize the batch processor.

        :param audio_files:        The list of all audio files to transcribe.
        :param output_directory:   The output directory to write the transcriptions to.
        :param speech_diarization: A speech diarization dictionary to pass along with each processed batch.
        """
        super().__init__(audio_files=audio_files, output_directory=output_directory)
        self._speech_diarization = speech_diarization
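        # Expected mapping (as consumed by `process_batch` below): {audio_file_name: [(start, end, speaker), ...]}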
        self._audio_files = audio_files

    def process_batch(self, batch: List[dict]):
        """
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        :param batch: The batch of transcriptions to process.
        """
        # Get the relevant files belonging to the given batch:
        current_files = self._get_current_files(batch_size=len(batch))

        # Build the diarization tasks:
        self._tasks.extend(
            [
                SpeechDiarizationTask(
                    audio_file=file,
                    transcription_output=batch[i],
                    text_file=self._output_directory / f"{file.stem}.txt",
                    speech_diarization=self._speech_diarization.get(file.name),
                )
                for i, file in enumerate(current_files)
            ]
        )


class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor):
    """
    A batch processor for batches of transcriptions per channel. The batch processor creates tasks with the given
    number of channels and is designed to work alongside the transcriber. It can be used with a multiprocessing
    queue, or the tasks can be run directly using the associated methods.
    """

    def __init__(
        self,
        audio_files: List[Path],
        output_directory: Path,
        n_channels: int,
        speakers: List[str],
    ):
        """
        Initialize the batch processor.

        :param audio_files:      The list of all audio files to transcribe.
        :param output_directory: The output directory to write the transcriptions to.
        :param n_channels:       The number of channels in each audio file to transcribe.
        :param speakers:         The speaker labels to use for each channel.
        """
        super().__init__(audio_files=audio_files, output_directory=output_directory)

        # Store the parameters:
        self._n_channels = n_channels
        self._speakers = speakers

        # Prepare a holder for the task in process, collecting channel outputs until all of its channels are covered:
        self._task_in_process: SpeechDiarizationPerChannelTask = None

    def process_batch(self, batch: List[dict]):
        """
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        :param batch: The batch of transcriptions to process.
        """
        # Go over the batch and create the tasks:
        for output in batch:
            # Check if there is a task in process:
            if not self._task_in_process:
                # Create a new task:
                self._task_in_process = SpeechDiarizationPerChannelTask(
                    audio_file=self._audio_files[self._current_file_index],
                    text_file=self._output_directory
                    / f"{self._audio_files[self._current_file_index].stem}.txt",
                )
            # Get the channel's speaker:
            speaker = self._speakers[
                len(self._task_in_process.transcription_output_channels)
            ]
            # Collect the channel into the processed task:
            self._task_in_process.transcription_output_channels.append(
                (speaker, output)
            )
            # Check if the task is fully covered (all channels are collected):
            if (
                len(self._task_in_process.transcription_output_channels)
                == self._n_channels
            ):
                # Collect the task and reset the task in process:
                self._tasks.append(self._task_in_process)
                self._current_file_index += 1
                self._task_in_process = None
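
# An illustrative sketch (not part of the original module): with `n_channels=2` and `speakers=["Agent", "Client"]`,
# every two consecutive pipeline outputs in a batch are collected into a single task - the first output is labeled
# "Agent" and the second "Client" - and only then is the task stored and the file index advanced. The output
# variables below are hypothetical pipeline outputs:
#
#     processor = PerChannelSpeechDiarizationBatchProcessor(
#         audio_files=[Path("stereo_call.wav")],
#         output_directory=Path("./transcriptions"),
#         n_channels=2,
#         speakers=["Agent", "Client"],
#     )
#     processor.process_batch(batch=[left_channel_output, right_channel_output])
#     processor.do_tasks()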


class Transcriber:
    """
    A transcription wrapper for Huggingface's ASR pipeline -
    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to
    use with OpenAI's Whisper models - https://huggingface.co/openai.
    """

    def __init__(
        self,
        model_name: str,
        device: str = None,
        use_flash_attention_2: bool = None,
        use_better_transformers: bool = None,
        assistant_model: str = None,
        max_new_tokens: int = 128,
        chunk_length_s: int = 30,
        batch_size: int = 2,
        spoken_language: str = None,
        translate_to_english: bool = False,
        return_timestamps: Union[bool, Literal["word"]] = False,
        per_channel_transcription: int = 0,
    ):
        """
        Initialize the transcriber.

        :param model_name:                The model name to use. Should be one of OpenAI's Whisper models for
                                          best results (for example "tiny", "base", "large", etc.).
        :param device:                    The device to use for inference. If not given, will use GPU if available.
        :param use_flash_attention_2:     Whether to use the Flash Attention 2 implementation. It can be used only with
                                          one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
                                          will be available soon.

                                          Note: If both `use_flash_attention_2` and
                                          `use_better_transformers` are `None`, the optimization will be chosen
                                          automatically according to the available resources.

        :param use_better_transformers:   Whether to use the Better Transformers library to further optimize the model.
                                          Should be used for all use cases that do not support flash attention 2.

                                          Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                          `None`, the optimization will be chosen automatically according to the
                                          available resources.
        :param assistant_model:           The assistant model name to use for inference. Notice that the optimizations
                                          (flash attention 2 and better transformers) will be applied for the assistant
                                          as well. Should be a model from Huggingface's distil-whisper (see here for
                                          more information: https://github.com/huggingface/distil-whisper).
        :param max_new_tokens:            The maximum number of new tokens to generate. This is used to limit the
                                          generation length. Default is 128 tokens.
        :param chunk_length_s:            The audio chunk to split the audio to (in seconds). Default is 30 seconds.
        :param batch_size:                The batch size to use for inference. Default is 2.
        :param spoken_language:           The language spoken in the audio, to hint Whisper what to expect. If None,
                                          it will try to detect the language for each chunk.
        :param translate_to_english:      Whether to translate the transcriptions to English. Default is False.
        :param return_timestamps:         Whether to return the timestamps of the words. If "word", will return the
                                          timestamps of each word. If True will return the timestamps of each chunk.
                                          Default is False. Aimed to be used for speech diarization.
        :param per_channel_transcription: Whether to transcribe each channel separately. To run per-channel
                                          transcription, pass the number of channels expected for each audio file
                                          here. 0 means regular transcription (channels are merged).

                                          Note: If `per_channel_transcription` is not 0, `batch_size` refers to the
                                          number of channels and not audio files. Aimed to be used for per-channel
                                          speech diarization.
        """
        # Store loading parameters:
        self._model_name = model_name
        self._device = device
        self._use_flash_attention_2 = use_flash_attention_2
        self._use_better_transformers = use_better_transformers
        self._max_new_tokens = max_new_tokens
        self._chunk_length_s = chunk_length_s
        self._batch_size = batch_size
        self._return_timestamps = return_timestamps
        self._per_channel_transcription = per_channel_transcription

        # Store generation parameters:
        self._assistant_model = assistant_model
        self._spoken_language = spoken_language
        self._translate_to_english = translate_to_english

        # Prepare the transcription objects:
        self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None
        self._generate_kwargs: dict = None

    def load(self):
        """
        Load the transcriber. Must be called before transcribing.
        """
        # Set the device and data type to use (prefer GPU if available):
        device = torch.device(
            self._device or ("cuda" if torch.cuda.is_available() else "cpu")
        )
        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

        # Choose the optimization to use (in case the user did not specify any):
        if (
            self._use_flash_attention_2 is None
            and self._use_better_transformers is None
        ):
            # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture
            # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla):
            if device.type == "cuda" and is_flash_attn_2_available():
                cuda_device_name = torch.cuda.get_device_properties(device).name
                if any(
                    cuda_device_name.startswith(gpu_name)
                    for gpu_name in [
                        "NVIDIA A",  # For Ampere architecture (e.g. A10, A30, A100)
                        "NVIDIA H",  # For Hopper architecture (e.g. H100)
                        "NVIDIA L",  # For Ada Lovelace architecture (e.g. L4, L40)
                        "NVIDIA RTX 30",  # For Ada Lovelace architecture (RTX 30 series)
                        "NVIDIA RTX 40",  # For Ada Lovelace architecture (RTX 40 series)
                        "NVIDIA RTX 50",  # For Ada Lovelace architecture (RTX 50 series)
                        # Will be supported soon according to FlashAttention GitHub repo:
                        # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
                        # "NVIDIA T4",  # For Turing architecture (only T4)
                        # "NVIDIA RTX 20",  # For Turing architecture (RTX 20 series)
                    ]
                ):
                    self._use_flash_attention_2 = True
                else:
                    self._use_better_transformers = True
            else:
                self._use_better_transformers = True

        # Build the optimizations kwargs:
        model_kwargs = {
            "low_cpu_mem_usage": True,
            "use_safetensors": True,
        }
        if self._use_flash_attention_2:
            if _LOGGER:
                _LOGGER.info(
                    "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via "
                    "`pip install -U flash-attn --no-build-isolation`"
                )
            model_kwargs["attn_implementation"] = "flash_attention_2"
        elif self._use_better_transformers:
            if _LOGGER:
                _LOGGER.info(
                    "Using BetterTransformers optimization - make sure the `optimum` package is installed via "
                    "`pip install -U optimum`"
                )
            model_kwargs["attn_implementation"] = "sdpa"

        # Initialize the speech recognition pipeline:
        self._transcription_pipeline = pipeline(
            task="automatic-speech-recognition",
            model=self._model_name,
            model_kwargs=model_kwargs.copy(),
            batch_size=self._batch_size,
            max_new_tokens=self._max_new_tokens,
            chunk_length_s=self._chunk_length_s,
            return_timestamps=self._return_timestamps,
            torch_dtype=torch_dtype,
            device=device,
        )

        # Prepare the generation kwargs:
        self._generate_kwargs = {
            "language": self._spoken_language,
            "task": "translate" if self._translate_to_english else "transcribe",
        }

        # Initialize the assistant model (if needed):
        if self._assistant_model:
            assistant_model = AutoModelForCausalLM.from_pretrained(
                self._assistant_model, torch_dtype=torch_dtype, **model_kwargs
            )
            assistant_model.to(device)
            self._generate_kwargs["assistant_model"] = assistant_model
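
    # Note (an illustrative aside, not original code): depending on which optimization `load` ends up selecting,
    # the `model_kwargs` passed to the pipeline is one of:
    #
    #     {"low_cpu_mem_usage": True, "use_safetensors": True, "attn_implementation": "flash_attention_2"}
    #     {"low_cpu_mem_usage": True, "use_safetensors": True, "attn_implementation": "sdpa"}
    #     {"low_cpu_mem_usage": True, "use_safetensors": True}  # both optimizations explicitly disabled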

    def transcribe(
        self,
        audio_files: List[Path],
        batch_processor: BatchProcessor = None,
        batches_queue: Queue = None,
        verbose: bool = False,
    ) -> Union[List[List[dict]], None]:
        """
        Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further
        processing like writing to text files. If no queue or batch processor is given, the transcriptions outputs from
        the pipeline will be returned. Otherwise, `None` is returned.

        :param audio_files:     The audio files to transcribe.
        :param batch_processor: A batch processor.
        :param batches_queue:   A multiprocessing queue to put the batches in.
        :param verbose:         Whether to show a progress bar. Default is False.

        :returns: The transcriptions outputs from the pipeline if no queue or batch processor is given, otherwise,
                  `None`.
        """
        # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with
        # Huggingface's pipelines as they preload each input while inference is running):
        def audio_iterator() -> Generator[Union[dict, str], None, None]:
            if self._per_channel_transcription:
                for audio_file in audio_files:
                    audio, sampling_rate = torchaudio.load(str(audio_file))
                    audio = audio.numpy()
                    for channel in audio:
                        yield {"raw": channel, "sampling_rate": sampling_rate}
            else:
                for audio_file in audio_files:
                    yield str(audio_file)

        # Create a batch iterator:
        def batch_iterator() -> Generator[List[Union[dict, str]], None, None]:
            batch = []
            for audio in audio_iterator():
                batch.append(audio)
                if len(batch) == self._batch_size:
                    yield batch
                    batch = []
            if batch:
                yield batch

        # Prepare a list to collect the outputs (returned only when no queue or batch processor is given):
        outputs = []

        # Infer through the pipeline:
        for input_batch in tqdm(
            batch_iterator() if self._batch_size > 1 else audio_iterator(),
            desc="Transcribing",
            unit="channel" if self._per_channel_transcription else "audio file",
            total=(
                (
                    (len(audio_files) // self._batch_size)
                    + (len(audio_files) % self._batch_size != 0)
                )
                * (self._per_channel_transcription or 1)
            ),
            disable=not verbose,
        ):
            # Infer:
            try:
                output_batch = self._transcription_pipeline(
                    input_batch,
                    generate_kwargs=self._generate_kwargs,
                )
            except Exception as exception:
                # Collect the exception:
                output_batch = str(exception)
                # Align to batch size:
                output_batch = (
                    [output_batch] * len(input_batch)
                    if isinstance(input_batch, list)
                    else [output_batch]
                )
            # To align with batching, if batch size is 1, wrap the output with a list:
            if isinstance(output_batch, dict):
                output_batch = [output_batch]
            # If a batch processor is given, process the batch:
            if batch_processor:
                # Process it directly:
                batch_processor.process_batch(batch=output_batch)
                batch_processor.do_tasks()
            elif batches_queue:
                # Otherwise, queue the batch:
                batches_queue.put(output_batch)
            else:
                # Otherwise, collect the output as is without processing:
                outputs.append(output_batch)

        # Check if given a multiprocessing queue or a batch processor:
        if batches_queue:
            batches_queue.put(_MULTIPROCESSING_STOP_MARK)

        return outputs if not batch_processor else None
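
# A minimal usage sketch for the class above (illustrative; the file paths are hypothetical). Without a batch
# processor or a queue, the raw pipeline outputs are returned:
#
#     transcriber = Transcriber(model_name="openai/whisper-tiny", batch_size=2)
#     transcriber.load()
#     outputs = transcriber.transcribe(
#         audio_files=[Path("speech_01.mp3"), Path("speech_02.mp3")], verbose=True
#     )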


#: The value to send into multiprocessing queues to stop the process:
_MULTIPROCESSING_STOP_MARK = "STOP"


def _multiprocessing_process_batches(
    batch_processor: BatchProcessor,
    batches_queue: Queue,
    tasks_queue: Queue,
    n_task_completers: int,
):
    """
    Process the batches in the given batches queue and put the tasks in the given tasks queue. The function will stop
    when the given batches queue receives the stop mark. It is aimed to be used with multiprocessing as a process.

    :param batch_processor:   A batch processor to process the batches.
    :param batches_queue:     A queue to get the batches from.
    :param tasks_queue:       A queue to put the tasks in.
    :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks`
                              function). A stop mark will be sent to the tasks queue for each task completer.
    """
    while True:
        # Get the batch:
        batch: List[dict] = batches_queue.get()
        if batch == _MULTIPROCESSING_STOP_MARK:
            break

        # Process the batch:
        batch_processor.process_batch(batch=batch)

        # Get the tasks:
        tasks = batch_processor.get_tasks()

        # Queue the tasks:
        for task in tasks:
            tasks_queue.put(task.to_tuple())

    # Mark the end of the batches:
    for _ in range(n_task_completers):
        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)


def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue):
    """
    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
    the given tasks queue receives the stop mark. It is aimed to be used with multiprocessing as a process.

    :param tasks_queue:   A queue to get the tasks from.
    :param results_queue: A queue to put the results in.
    """
    tasks_map = {
        BaseTask.__name__: BaseTask,
        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
        SpeechDiarizationPerChannelTask.__name__: SpeechDiarizationPerChannelTask,
    }

    while True:
        # Get the task:
        task = tasks_queue.get()
        if task == _MULTIPROCESSING_STOP_MARK:
            break

        # Reconstruct the task:
        task_class, task_kwargs = task
        task = tasks_map[task_class](**task_kwargs)

        # Complete the task:
        task.do_task()
        results_queue.put((task.is_failed(), task.get_result()))

    # Mark the end of the tasks:
    results_queue.put(_MULTIPROCESSING_STOP_MARK)
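
# A sketch of the multiprocessing topology the two workers above assume (mirroring how `_parallel_run` wires them
# below): one transcriber process feeds `batches_queue`, a single batch-processing worker turns batches into task
# tuples on `tasks_queue`, and N task-completion workers write the text files and push `(is_failed, result)` tuples
# onto `results_queue`:
#
#     transcriber --> batches_queue --> _multiprocessing_process_batches
#                                           --> tasks_queue --> _multiprocessing_complete_tasks (x N)
#                                                                   --> results_queue --> main process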


# Get the global logger:
_LOGGER = logging.getLogger()


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    # Check if MLRun is available, set the global logger to MLRun's:
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, str):
                    input_argument = _get_audio_files(
                        data_path=Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Save the output directory of this worker:
            output_directory = Path(output[0])

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)

            # Join the data from all workers:
            if rank == 0:
                context.logger.info("Collecting data from workers to root worker.")

                # Check if there are different output directories:
                output_directories = set([Path(out_dir) for out_dir, _, _ in output])
                for r in range(1, size):
                    # True means the other workers should pass their files to the root worker (rank 0):
                    comm.send(len(output_directories) != 1, dest=r)

                # If there are different output directories, listen to the other workers:
                if len(output_directories) != 1:
                    # Collect the files from the other workers:
                    files = []
                    for r in range(1, size):
                        files.extend(comm.recv(source=r))
                    # Write the files to the root worker's output directory:
                    for file_name, file_content in files:
                        with open(output_directory / file_name, "w") as f:
                            f.write(file_content)

                # Concatenate the dataframes:
                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)

                # Concatenate the errors dictionaries:
                errors_dictionary = reduce(
                    operator.ior, [err for _, _, err in output], {}
                )

                return str(output_directory), dataframe, errors_dictionary

            # Listen to rank 0 to see if there are different output directories and this rank needs to send its files
            # to it:
            if comm.recv(source=0):
                files = []
                for file in os.listdir(output_directory):
                    with open(output_directory / file, "r") as f:
                        files.append((file, f.read()))
                comm.send(files, dest=0)
            return None

        return wrapper

    return decorator
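
# A worked example of the chunking above (illustrative): with 10 audio files and an MPI world size of 3,
# `even_chunk_size` is 10 // 3 = 3, so rank 0 processes files [0:3], rank 1 processes files [3:6], and the last
# rank (rank 2) processes files [6:10], picking up the remainder.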


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def transcribe(
    # Input / Output kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    output_directory: str = None,
    # Model loading kwargs:
    model_name: str = "openai/whisper-tiny",
    device: str = None,
    use_flash_attention_2: bool = None,
    use_better_transformers: bool = None,
    # Generation kwargs:
    assistant_model: str = None,
    max_new_tokens: int = 128,
    chunk_length_s: int = 30,
    batch_size: int = 8,
    spoken_language: str = None,
    translate_to_english: bool = False,
    # Diarization kwargs:
    speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None,
    speech_diarize_per_channel: int = None,
    speaker_labels: List[str] = None,
    # Other kwargs:
    use_multiprocessing: Union[bool, int] = False,
    verbose: bool = False,
):
    """
    Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed
    text files and a dataframe containing the following columns:

    * audio_file - The audio file path.
    * transcription_file - The transcribed text file name in the output directory.

    The transcription is based on Huggingface's ASR pipeline -
    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and
    is tested with OpenAI's Whisper models - https://huggingface.co/openai.

    If one of the speaker diarization parameters is given (either `speech_diarization` or
    `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker will
    be written on a separate line::

        speaker_1: text
        speaker_2: text
        speaker_1: text
        ...

    :param data_path:                  A directory of audio files or a single file or a list of files to transcribe.
    :param output_directory:           Path to a directory to save all transcription text files to. If not given,
                                       the transcriptions will be saved to a temporary directory.
    :param model_name:                 The model name to use. Should be one of OpenAI's Whisper models for
                                       best results (for example "tiny", "base", "large", etc.). See here for more
                                       information: https://huggingface.co/openai?search_models=whisper.
    :param device:                     The device to use for inference. If not given, will use GPU if available.
    :param use_flash_attention_2:      Whether to use the Flash Attention 2 implementation. It can be used only with
                                       one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
                                       will be available soon.

                                       Note: If both `use_flash_attention_2` and
                                       `use_better_transformers` are `None`, the optimization will be chosen
                                       automatically according to the available resources.

    :param use_better_transformers:    Whether to use the Better Transformers library to further optimize the model.
                                       Should be used for all use cases that do not support flash attention 2.

                                       Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                       `None`, the optimization will be chosen automatically according to the
                                       available resources.
    :param assistant_model:            The assistant model name to use for inference. Notice that the optimizations
                                       (flash attention 2 and better transformers) will be applied for the assistant as
                                       well. Should be a model from Huggingface's distil-whisper (see here for more
                                       information: https://github.com/huggingface/distil-whisper).

                                       Note: Currently an assistant model is only usable with batch size of 1.
    :param max_new_tokens:             The maximum number of new tokens to generate. This is used to limit the
                                       generation length. Default is 128 tokens.
    :param chunk_length_s:             The audio chunk to split the audio to (in seconds). Default is 30 seconds.
    :param batch_size:                 The batch size to use for inference. Default is 8.
    :param spoken_language:            The language spoken in the audio, to hint Whisper what to expect. If None, it
                                       will try to detect it.
    :param translate_to_english:       Whether to translate the transcriptions to English.
    :param speech_diarization:         A speech diarization dictionary with the file names to transcribe as keys and
                                       their diarization as value. The diarization is a list of tuples:
                                       (start, end, speaker). An example
                                       for a diarization dictionary::

                                           {
                                               "audio_file_name": [
                                                   (0.0, 2.0, "Agent"),
                                                   (2.0, 4.0, "Client"),
                                                   ...
                                               ],
                                               ...
                                           }

                                       Note: The diarization must cover the entire duration of the audio file (at
                                       least as far as Whisper predicts words).
    :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to
                                       a separate channel in the audio. Notice: This will make the transcription
                                       slower as each channel will be transcribed separately. If a speech diarization
                                       is passed (via the `speech_diarization` parameter), this parameter is
                                       ignored.
    :param speaker_labels:             A list of speaker labels by channel order to use for writing the
                                       transcription with respect to per channel speech diarization. This won't be
                                       used together with a given speech diarization (via the `speech_diarization`
                                       parameter).
    :param use_multiprocessing:        Whether to use multiprocessing to transcribe the audio files. Can be either a
                                       boolean value or an integer. If `True`, will use the default amount of workers
                                       (3): 1 for transcription, 1 for batch processing and 1 for task completion (such
                                       as speech diarization and writing to files). To control the amount of tasks
                                       completion workers, an integer can be provided to specify the amount of workers.
                                       If `False`, a single process will be used. Default is `False`.
    :param verbose:                    Whether to print the progress of the transcription. Default is `False`.
    """
    global _LOGGER

    # Get the input audio files to transcribe:
    if verbose:
        _LOGGER.info("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    if verbose:
        _LOGGER.info(f"Collected {len(audio_files)} audio files.")

    # Get the output directory:
    if output_directory is None:
        if verbose:
            _LOGGER.info("No output directory given, using temporary directory.")
        output_directory = tempfile.mkdtemp()
    output_directory = Path(output_directory).absolute()
    output_directory.mkdir(exist_ok=True, parents=True)
    if verbose:
        _LOGGER.info(f"Transcriptions will be saved to: {output_directory}")

    # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization,
    # speech diarization per channel):
    if speech_diarization:
        batch_processor = SpeechDiarizationBatchProcessor(
            audio_files=audio_files,
            output_directory=output_directory,
            speech_diarization=speech_diarization,
        )
    elif speech_diarize_per_channel:
        batch_processor = PerChannelSpeechDiarizationBatchProcessor(
            audio_files=audio_files,
            output_directory=output_directory,
            n_channels=speech_diarize_per_channel,
            speakers=speaker_labels,
        )
    else:
        batch_processor = BatchProcessor(
            audio_files=audio_files,
            output_directory=output_directory,
        )

    # Initialize the transcription pipeline:
    transcriber = Transcriber(
        device=device,
        use_flash_attention_2=use_flash_attention_2,
        use_better_transformers=use_better_transformers,
        assistant_model=assistant_model,
        model_name=model_name,
        max_new_tokens=max_new_tokens,
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
        return_timestamps=(
            "word"
            if speech_diarization is not None or speech_diarize_per_channel is not None
            else False
        ),
        per_channel_transcription=speech_diarize_per_channel or 0,
        spoken_language=spoken_language,
        translate_to_english=translate_to_english,
    )

    # Run the transcription:
    if use_multiprocessing:
        results = _parallel_run(
            n_workers=use_multiprocessing
            if isinstance(use_multiprocessing, int)
            else 1,
            audio_files=audio_files,
            batch_processor=batch_processor,
            transcriber=transcriber,
            verbose=verbose,
        )
    else:
        results = _run(
            audio_files=audio_files,
            batch_processor=batch_processor,
            transcriber=transcriber,
            verbose=verbose,
        )

    # Process the results:
    if verbose:
        _LOGGER.info("Summarizing the results.")
    successes = []
    errors = {}
    for is_error, result in results:
        if is_error:
            errors[result[0]] = result[1]
        else:
            successes.append(result)
    successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"])
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(audio_files)})\n"
            f"Transcriptions summary:\n"
            f"{successes.head()}"
        )

    return str(output_directory), successes, errors
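
# A minimal usage sketch (illustrative; the data paths are hypothetical). Calling the handler directly, as the
# accompanying tests do, returns the output directory, the summary dataframe and the errors dictionary:
#
#     output_dir, df, errors = transcribe(
#         data_path="./data",
#         output_directory="./output",
#         model_name="openai/whisper-tiny",
#         device="cpu",
#         verbose=True,
#     )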


def _get_audio_files(
    data_path: Union[Path, str, list],
) -> List[Path]:
    """
    Get the audio files to transcribe. If a path to a directory is given, all files in the directory will be collected.

    :param data_path: The data path to collect the audio files from.

    :returns: The audio files list.
    """
    # Check if given a list of paths:
    if isinstance(data_path, list):
        audio_files = []
        for path in data_path:
            audio_files.extend(_get_audio_files(data_path=path))
        return audio_files

    # Check if given a single string path to cast it to a `pathlib.Path`:
    if isinstance(data_path, str):
        data_path = Path(data_path).absolute()

    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        audio_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        audio_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
            f"file. Given: {str(data_path)} "
        )

    return audio_files


def _run(
    audio_files: List[Path],
    batch_processor: BatchProcessor,
    transcriber: Transcriber,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, str]]]:
    """
    Run the transcription without multiprocessing.

    :param audio_files:     The audio files to transcribe.
    :param batch_processor: The batch processor to use.
    :param transcriber:     The transcriber to use.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Load the transcription pipeline:
    if verbose:
        _LOGGER.info(f"Loading the transcription pipeline.")
    transcriber.load()
    if verbose:
        _LOGGER.info("Transcription pipeline loaded.")

    # Transcribe the files:
    transcriber.transcribe(
        audio_files=audio_files,
        batch_processor=batch_processor,
        verbose=verbose,
    )

    # Return the results:
    return batch_processor.get_results()


def _parallel_run(
    n_workers: int,
    audio_files: List[Path],
    batch_processor: BatchProcessor,
    transcriber: Transcriber,
    verbose: bool,
):
    """
    Run the transcription with multiprocessing.

    :param n_workers:       The amount of workers to use as task completers.
    :param audio_files:     The audio files to transcribe.
    :param batch_processor: The batch processor to use.
    :param transcriber:     The transcriber to use.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Initialize the multiprocessing queues:
    batches_queue = Queue()
    tasks_queue = Queue()
    results_queue = Queue()

    # Initialize the multiprocessing processes:
    batch_processing_process = Process(
        target=_multiprocessing_process_batches,
        kwargs={
            "batch_processor": batch_processor,
            "batches_queue": batches_queue,
            "tasks_queue": tasks_queue,
            "n_task_completers": n_workers,
        },
    )
    task_completion_processes = [
        Process(
            target=_multiprocessing_complete_tasks,
            kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue},
        )
        for _ in range(n_workers)
    ]

    # Start the multiprocessing processes:
    batch_processing_process.start()
    for p in task_completion_processes:
        p.start()

    # Load the transcription pipeline:
    if verbose:
        _LOGGER.info(f"Loading the transcription pipeline.")
    transcriber.load()
    if verbose:
        _LOGGER.info("Transcription pipeline loaded.")

    # Transcribe the files:
    transcriber.transcribe(
        audio_files=audio_files, batches_queue=batches_queue, verbose=verbose
    )

    # Collect the results:
    results = []
    stop_marks_counter = 0
    while True:
        # Get a result from the queue:
        result: Tuple[bool, Tuple[str, str]] = results_queue.get()
        if result == _MULTIPROCESSING_STOP_MARK:
            stop_marks_counter += 1
            if stop_marks_counter == n_workers:
                break
        else:
            # Collect the result:
            results.append(result)

    # Wait for the processes to finish:
    batch_processing_process.join()
    for p in task_completion_processes:
        p.join()

    return results + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - transformers + - tqdm + - torchaudio + - torch + - accelerate + entry_points: + do_task: + name: do_task + doc: Try to perform the task storing an error if occurred. + parameters: + - name: self + outputs: [] + lineno: 348 + has_varargs: false + has_kwargs: false + is_failed: + name: is_failed + doc: Check if the task failed. + parameters: + - name: self + outputs: + - doc: Whether the task failed. + type: bool + lineno: 70 + has_varargs: false + has_kwargs: false + get_result: + name: get_result + doc: 'Get the result of the task. If the task failed, the error will be returned, + otherwise, the result will be the + + text file name.' + parameters: + - name: self + outputs: + - doc: The task's result. + type: Tuple[str, str] + lineno: 78 + has_varargs: false + has_kwargs: false + to_tuple: + name: to_tuple + doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing + to pass in queue). + parameters: + - name: self + outputs: + - doc: The converted task. + type: Tuple[str, dict] + lineno: 358 + has_varargs: false + has_kwargs: false + transcription_output_channels: + name: transcription_output_channels + doc: Get the transcription output channels. + parameters: + - name: self + outputs: + - doc: The transcription output channels. + type: List[Tuple[str, dict]] + lineno: 340 + has_varargs: false + has_kwargs: false + process_batch: + name: process_batch + doc: 'Process a batch of transcriptions. Tasks related to the given batch will + be created and stored in the batch + + processor.' + parameters: + - name: self + - name: batch + type: List[dict] + doc: The batch of transcriptions to process. + outputs: [] + lineno: 575 + has_varargs: false + has_kwargs: false + get_tasks: + name: get_tasks + doc: Get the tasks to perform. + parameters: + - name: self + outputs: + - doc: The tasks to perform. + type: List[BaseTask] + lineno: 453 + has_varargs: false + has_kwargs: false + do_tasks: + name: do_tasks + doc: Perform the tasks. Should be used if no multiprocessing queue is given + to a transcriber. + parameters: + - name: self + outputs: [] + lineno: 463 + has_varargs: false + has_kwargs: false + get_results: + name: get_results + doc: Get the results of the tasks. The stored results are then cleared. + parameters: + - name: self + outputs: + - doc: The results of the tasks. + type: List[Tuple[bool, Tuple[str, str]]] + lineno: 471 + has_varargs: false + has_kwargs: false + load: + name: load + doc: Load the transcriber. Must be called before transcribing. + parameters: + - name: self + outputs: [] + lineno: 695 + has_varargs: false + has_kwargs: false + transcribe: + name: transcribe + doc: "Transcribe audio files into text files and collect additional data. 
The\ + \ end result is a directory of transcribed\ntext files and a dataframe containing\ + \ the following columns:\n\n* audio_file - The audio file path.\n* transcription_file\ + \ - The transcribed text file name in the output directory.\n\nThe transcription\ + \ is based on Huggingface's ASR pipeline -\nhttps://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline\ + \ and\nis tested with OpenAI's Whisper models - https://huggingface.co/openai.\n\ + \nIf one of the speaker diarization parameters are given (either `speech_diarization`\ + \ or\n`speech_diarize_per_channel`), the transcription will be written in\ + \ a conversation format, where each speaker will\nbe written in a separate\ + \ line::\n\n speaker_1: text\n speaker_2: text\n speaker_1: text\n\ + \ ..." + parameters: + - name: data_path + type: Union[str, Path, List[Union[str, Path]]] + doc: A directory of audio files or a single file or a list of files to transcribe. + - name: output_directory + type: str + doc: Path to a directory to save all transcribed audio files. If not given, + will save the transcribed files in a temporary directory. + default: null + - name: model_name + type: str + doc: 'The model name to use. Should be a model from the OpenAI''s Whisper + models for best results (for example "tiny", "base", "large", etc.). See + here for more information: https://huggingface.co/openai?search_models=whisper.' + default: openai/whisper-tiny + - name: device + type: str + doc: The device to use for inference. If not given, will use GPU if available. + default: null + - name: use_flash_attention_2 + type: bool + doc: 'Whether to use the Flash Attention 2 implementation. It can be used + only with one of the following GPUs: Nvidia H series and Nvidia A series. + T4 support will be available soon.' + default: null + - name: use_better_transformers + type: bool + doc: Whether to use the Better Transformers library to further optimize the + model. Should be used for all use cases that do not support flash attention + 2. + default: null + - name: assistant_model + type: str + doc: 'The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant + as well. Should be a model from Huggingface''s distil-whisper (see here + for more information: https://github.com/huggingface/distil-whisper).' + default: null + - name: max_new_tokens + type: int + doc: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. + default: 128 + - name: chunk_length_s + type: int + doc: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + default: 30 + - name: batch_size + type: int + doc: The batch size to use for inference. Default is 2. + default: 8 + - name: spoken_language + type: str + doc: Aim whisper to know what language is spoken. If None, it will try to + detect it. + default: null + - name: translate_to_english + type: bool + doc: Whether to translate the transcriptions to English. + default: false + - name: speech_diarization + type: Dict[str, List[Tuple[float, float, str]]] + doc: 'A speech diarization dictionary with the file names to transcribe as + keys and their diarization as value. The diarization is a list of tuples: + (start, end, speaker). An example for a diarization dictionary::' + default: null + - name: speech_diarize_per_channel + type: int + doc: 'Perform speech diarization per channel. 
Each speaker is expected to + belong to a separate channel in the audio. Notice: This will make the transcription + slower as each channel wil be transcribed separatly. If a speech diarization + is passed (via the `speech_diarization` parameter), this parameter is ignored.' + default: null + - name: speaker_labels + type: List[str] + doc: A list of speaker labels by channel order to use for writing the transcription + with respect to per channel speech diarization. This won't be used together + with a given speech diarization (via the `speech_diarization` parameter). + default: null + - name: use_multiprocessing + type: Union[bool, int] + doc: 'Whether to use multiprocessing to transcribe the audio files. Can be + either a boolean value or an integer. If `True`, will use the default amount + of workers (3): 1 for transcription, 1 for batch processing and 1 for task + completion (such as speech diarization and writing to files). To control + the amount of tasks completion workers, an integer can be provided to specify + the amount of workers. `False`, will use a single process. Default is `False`.' + default: false + - name: verbose + type: bool + doc: Whether to print the progress of the transcription. Default is `False`. + default: false + outputs: [] + lineno: 1097 + has_varargs: false + has_kwargs: false + audio_iterator: + name: audio_iterator + doc: '' + parameters: [] + outputs: + - type: Generator[Union[dict, str], None, None] + lineno: 804 + has_varargs: false + has_kwargs: false + batch_iterator: + name: batch_iterator + doc: '' + parameters: [] + outputs: + - type: Generator[List[Union[dict, str]], None, None] + lineno: 816 + has_varargs: false + has_kwargs: false + open_mpi_handler: + name: open_mpi_handler + doc: '' + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + outputs: [] + lineno: 957 + has_varargs: false + has_kwargs: false + decorator: + name: decorator + doc: '' + parameters: + - name: handler + outputs: [] + lineno: 969 + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + parameters: [] + outputs: [] + lineno: 974 + has_varargs: false + has_kwargs: true + description: Transcribe audio files into text files + default_handler: transcribe + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/transcribe/1.1.0/src/item.yaml b/functions/master/transcribe/1.1.0/src/item.yaml new file mode 100644 index 00000000..7fddcf95 --- /dev/null +++ b/functions/master/transcribe/1.1.0/src/item.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +categories: +- data-preparation +- genai +- huggingface +- machine-learning +description: Transcribe audio files into text files +doc: '' +example: transcribe.ipynb +generationDate: 2023-07-13:11-20 +hidden: false +icon: '' +labels: + author: yonatans +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.5.1 +name: transcribe +platformVersion: 3.5.3 +spec: + filename: transcribe.py + handler: transcribe + image: mlrun/mlrun + kind: job + requirements: + - transformers + - tqdm + - torchaudio + - torch + - accelerate +url: '' +version: 1.1.0 \ No newline at end of file diff --git a/functions/master/transcribe/1.1.0/src/requirements.txt b/functions/master/transcribe/1.1.0/src/requirements.txt new file mode 100644 index 00000000..d16bfc9d --- /dev/null +++ 
b/functions/master/transcribe/1.1.0/src/requirements.txt @@ -0,0 +1,5 @@ +transformers +torch +torchaudio +tqdm +accelerate \ No newline at end of file diff --git a/functions/master/transcribe/1.1.0/src/test_transcribe.py b/functions/master/transcribe/1.1.0/src/test_transcribe.py new file mode 100644 index 00000000..f70b3856 --- /dev/null +++ b/functions/master/transcribe/1.1.0/src/test_transcribe.py @@ -0,0 +1,104 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import pathlib +import tempfile +from difflib import SequenceMatcher + +import mlrun +import pytest + + +expected_outputs = [ + "This is a speech to text test.", + "In the heart of the stadium, " + "cheers paint the air as the ball weaves its tale across the pitch. " + "With each kick, players chase their dreams, guided by the rhythmic dance of teamwork. " + "The crowd roars, a symphony of passion, " + "as the game writes its unpredictable story on the field of destiny.", +] +models = [ + + "openai/whisper-tiny", +] + + +@pytest.mark.skipif(os.system("which ffmpeg") != 0, reason="ffmpeg not installed") +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize("audio_path", ["./data", "./data/speech_01.mp3"]) +def test_transcribe(model_name: str, audio_path: str): + # Setting variables and importing function: + artifact_path = tempfile.mkdtemp() + project = mlrun.get_or_create_project("test") + transcribe_function = project.set_function("transcribe.py", "transcribe", kind="job", image="mlrun/mlrun") + # transcribe_function = mlrun.import_function("function.yaml") + temp_dir = tempfile.mkdtemp() + + # Running transcribe function: + transcribe_run = transcribe_function.run( + handler="transcribe", + params={ + "data_path": audio_path, + "model_name": model_name, + "device": "cpu", + "output_directory": temp_dir, + }, + local=True, + returns=["output_dir: path", "dataset: dataset", "errored_files"], + artifact_path=artifact_path, + ) + + artifact_path += ( + f"/{transcribe_run.metadata.name}/{transcribe_run.metadata.iteration}/" + ) + + # Getting actual files from run (text and errored): + input_files = ( + os.listdir(audio_path) + if pathlib.Path(audio_path).is_dir() + else [pathlib.Path(audio_path).name] + ) + expected_text_files = sorted([f for f in input_files if f.endswith("mp3")]) + error_files = list(set(input_files) - set(expected_text_files)) + expected_text_files = [f.replace("mp3", "txt") for f in expected_text_files] + text_files = sorted(os.listdir(temp_dir)) + + # Check that the text files are saved in output_directory: + assert text_files == expected_text_files + + # Check that the transcribed text was approximately (90%) generated from audio: + for text_file, expected in zip(text_files, expected_outputs): + with open(os.path.join(temp_dir, text_file), "r") as f: + output = f.readlines()[0] + ratio = SequenceMatcher(None, expected, output).ratio() + assert ratio >= 0.9 + + # Check that the dataframe is in the correct size: + df = mlrun.get_dataitem(artifact_path + 
"dataset.parquet").as_df() + assert len(df) == len(expected_text_files) + + # Check errored files: + if isinstance(transcribe_run.outputs["errored_files"], str): + actual_errored_files = [] + else: + actual_errored_files = [ + os.path.basename(errored) + for errored in transcribe_run.outputs["errored_files"].keys() + ] + assert actual_errored_files == error_files + + # Check output_dir: + zip_dir = mlrun.get_dataitem(artifact_path + "output_dir.zip") + assert zip_dir.kind == "file" diff --git a/functions/master/transcribe/1.1.0/src/transcribe.ipynb b/functions/master/transcribe/1.1.0/src/transcribe.ipynb new file mode 100644 index 00000000..5671160c --- /dev/null +++ b/functions/master/transcribe/1.1.0/src/transcribe.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a80305ba-ffff-4116-aa46-5c1b67368239", + "metadata": {}, + "source": [ + "# Transcribe tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bdb947f0-5b9a-492d-9676-374c38eee14a", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:13:48.565039Z", + "end_time": "2023-07-16T17:14:01.952515Z" + } + }, + "outputs": [], + "source": [ + "import tempfile\n", + "import mlrun" + ] + }, + { + "cell_type": "markdown", + "id": "b7364965-8dcd-419a-8764-dd0c87edb9f8", + "metadata": {}, + "source": [ + "## Importing the transcribe function from hub\n", + "\n", + "To import the function directly from hub, use:\n", + "```python \n", + "transcribe_fn = mlrun.import_function(\"hub://transcribe\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "artifact_path = tempfile.mkdtemp()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:14:01.954022Z", + "end_time": "2023-07-16T17:14:01.955760Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2d9a80a2-8448-49cd-a92f-1ab2072fc720", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:14:01.956508Z", + "end_time": "2023-07-16T17:14:01.966758Z" + } + }, + "outputs": [], + "source": [ + "transcribe_fn = mlrun.import_function(\"function.yaml\")" + ] + }, + { + "cell_type": "markdown", + "id": "7fcb6c8a-f83b-42d9-b02e-9187e85fe232", + "metadata": {}, + "source": [ + "## Running transcribe" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1570b05f-cfb7-466d-84c8-98f4c9d54ad4", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:14:01.969912Z", + "end_time": "2023-07-16T17:14:12.724086Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:01,968 [info] Storing function: {'name': 'transcribe-transcribe', 'uid': 'd1384cb679bc4c178b0195d964b628a8', 'db': None}\n", + "> 2023-07-16 17:14:01,969 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:01,969 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:01,970 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:01,970 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:01,972 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:01,972 [warning] MLRUN_DBPATH is not set. 
Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:09,804 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:09,805 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:09,805 [info] Loading whisper model: 'tiny'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\n", + "IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:10,374 [info] Model loaded.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Transcribing: 67%|██████▋ | 2/3 [00:02<00:01, 1.04s/file]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,556 [warning] Error in file: '/Users/Yonatan_Shelach/projects/functions/transcribe/data/error_file.txt'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Transcribing: 100%|██████████| 3/3 [00:02<00:00, 1.39file/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,566 [info] Done:\n", + " audio_file transcription_file language length rate_of_speech\n", + "0 speech_01.mp3 speech_01.txt en 2.011333 3.480278\n", + "1 speech_02.mp3 speech_02.txt en 20.793500 2.548873\n", + "> 2023-07-16 17:14:12,596 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,597 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:12,659 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,660 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:12,671 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,672 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,707 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,707 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:12,708 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,708 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n" + ] + }, + { + "data": { + "text/plain": "", + "text/html": "\n
project: default | uid: ...b628a8 | iter: 0 | start: Jul 16 14:14:01 | state: completed | name: transcribe-transcribe
labels: kind=, owner=Yonatan_Shelach, host=M-QWXQJK77Q0
parameters: model_name=tiny, audio_files_directory=./data, decoding_options={'fp16': False}, output_directory=./output
artifacts: transcriptions, transcriptions_df, transcriptions_errors
    \n" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": "", + "text/html": " > to track results use the .show() or .logs() methods " + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,721 [info] Run execution finished: {'status': 'completed', 'name': 'transcribe-transcribe'}\n" + ] + } + ], + "source": [ + "transcribe_run = transcribe_fn.run(\n", + " handler=\"transcribe\",\n", + " params={\n", + " \"model_name\": \"tiny\",\n", + " \"input_path\": \"./data\",\n", + " \"decoding_options\": {\"fp16\": False},\n", + " \"output_directory\": \"./output\",\n", + " },\n", + " returns=[\n", + " \"transcriptions: path\",\n", + " \"transcriptions_df: dataset\",\n", + " {\"key\": \"transcriptions_errors\", \"artifact_type\": \"file\", \"file_format\": \"yaml\"},\n", + " ],\n", + " local=True,\n", + " artifact_path=artifact_path,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "407d1e6c-d2a4-42e7-b3e2-c51138cb30ea", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:14:12.726898Z", + "end_time": "2023-07-16T17:14:12.745521Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "{'transcriptions': 'store://artifacts/default/transcribe-transcribe_transcriptions:d1384cb679bc4c178b0195d964b628a8',\n 'transcriptions_df': 'store://artifacts/default/transcribe-transcribe_transcriptions_df:d1384cb679bc4c178b0195d964b628a8',\n 'transcriptions_errors': 'store://artifacts/default/transcribe-transcribe_transcriptions_errors:d1384cb679bc4c178b0195d964b628a8'}" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcribe_run.outputs" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Notice**: If connected to mlrun server, you can simply use:\n", + "\n", + "```python\n", + "df = transcribe_run.artifact(\"transcriptions_df\")\n", + "```" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "artifact_path += f\"/{transcribe_run.metadata.name}/{transcribe_run.metadata.iteration}/\"" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:14:12.730064Z", + "end_time": "2023-07-16T17:14:12.748292Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [], + "source": [ + "df = mlrun.get_dataitem(artifact_path + \"transcriptions_df.parquet\").as_df()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:25:02.712455Z", + "end_time": "2023-07-16T17:25:02.719538Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [ + { + "data": { + "text/plain": " audio_file transcription_file language length rate_of_speech\n0 speech_01.mp3 speech_01.txt en 2.011333 3.480278\n1 speech_02.mp3 speech_02.txt en 20.793500 2.548873", + "text/html": "
    " + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:25:07.878158Z", + "end_time": "2023-07-16T17:25:07.880514Z" + } + } + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/transcribe/1.1.0/src/transcribe.py b/functions/master/transcribe/1.1.0/src/transcribe.py new file mode 100644 index 00000000..9cabcb1e --- /dev/null +++ b/functions/master/transcribe/1.1.0/src/transcribe.py @@ -0,0 +1,1464 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import operator +import os +import tempfile +from functools import reduce, wraps +from multiprocessing import Process, Queue +from pathlib import Path +from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union + +import pandas as pd +import torch +import torchaudio +from tqdm import tqdm +from transformers import ( + AutomaticSpeechRecognitionPipeline, + AutoModelForCausalLM, + pipeline, +) +from transformers.utils import is_flash_attn_2_available + + +class BaseTask: + """ + A task to write the transcription to file. + """ + + def __init__( + self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path + ): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param transcription_output: The transcription output from the pipeline. String means an exception was raised. + :param text_file: Path to the text file to write the transcription to. + """ + # Store the parameters: + self._audio_file = audio_file + self._transcription_output = transcription_output + self._text_file = text_file + + # Prepare the error variable: + self._error: str = None + + def do_task(self): + """ + Try to perform the task storing an error if occurred. + """ + if isinstance(self._transcription_output, str): + self._error = self._transcription_output + return + try: + self._do_task() + except Exception as exception: + self._error = str(exception) + + def is_failed(self) -> bool: + """ + Check if the task failed. + + :returns: Whether the task failed. + """ + return self._error is not None + + def get_result(self) -> Tuple[str, str]: + """ + Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the + text file name. + + :returns: The task's result. 
+ """ + if self.is_failed(): + return self._audio_file.name, self._error + return self._audio_file.name, self._text_file.name + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + return self.__class__.__name__, { + "audio_file": self._audio_file, + "transcription_output": self._transcription_output, + "text_file": self._text_file, + } + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path. + """ + # Checking for no duplications: + i = 1 + while self._text_file.exists(): + i += 1 + self._text_file = ( + self._text_file.parent + / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}" + ) + + # Make sure all directories are created: + self._text_file.parent.mkdir(exist_ok=True, parents=True) + + # Write to file: + with open(self._text_file, "w") as fp: + fp.write(self._transcription_output["text"]) + + +class SpeechDiarizationTask(BaseTask): + """ + A task to write the transcription to file with respect to a given speech diarization. + """ + + class _DiarizationSegment(NamedTuple): + """ + A speech diarization segment. + """ + + start: float + end: float + speaker: str + + class _WordTimestamp(NamedTuple): + """ + A word with its start and end timestamps. + """ + + start: float + end: float + text: str + + def __init__( + self, + audio_file: Path, + transcription_output: dict, + text_file: Path, + speech_diarization: List[Tuple[float, float, str]], + ): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param transcription_output: The transcription output from the pipeline. + :param text_file: Path to the text file to write the transcription to. + :param speech_diarization: A speech diarization as a list of tuples: (start, end, speaker). + """ + super().__init__( + audio_file=audio_file, + transcription_output=transcription_output, + text_file=text_file, + ) + self._speech_diarization = speech_diarization + self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None + self._last_chosen_index = 0 + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + task_class, task_kwargs = super().to_tuple() + return task_class, { + **task_kwargs, + "speech_diarization": self._speech_diarization, + } + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path with respect to the given speech diarization. 
+ """ + # Check if a speech diarization is given, if not, just write the transcription to file: + if not self._speech_diarization: + super()._do_task() + return + + # Cast the chunks to word timestamps tuples: + words = [ + SpeechDiarizationTask._WordTimestamp( + start=chunk["timestamp"][0], + end=chunk["timestamp"][1], + text=chunk["text"], + ) + for chunk in self._transcription_output["chunks"] + ] + + # Cast speech diarization to segments tuples: + self._segments = [ + SpeechDiarizationTask._DiarizationSegment(*segment) + for segment in self._speech_diarization + ] + + # Try to match the Whisper model predicted timestamps to the closest diarization segment (closest diarization + # segment will be the most overlapping with the word, and if there is no overlap, the closest segment to the + # word): + speaker = self._segments[self._last_chosen_index].speaker + text = f"{speaker}:" + for word in words: + # Get the next diarization segment: + self._get_next_segment(word=word) + # Check if the segment is of the same speaker: + if self._segments[self._last_chosen_index].speaker == speaker: + # Collect the word: + text += word.text + else: + # Append a newline and update the new speaker: + speaker = self._segments[self._last_chosen_index].speaker + text += f"\n{speaker}:{word.text}" + + # Update the transcription output with the new text to write it to file: + self._transcription_output["text"] = text + super()._do_task() + + def _get_next_segment( + self, + word: _WordTimestamp, + ): + """ + Get the next diarization segment the given word falls into. The `self._last_chosen_index` will be updated + accordingly. + + :param word: The word timestamp to match to the next segment. + """ + # If the last chosen segment is the last segment, return it: + if self._last_chosen_index == len(self._segments) - 1: + return + + # Get the last chosen diarization segment: + last_chosen = self._segments[self._last_chosen_index] + + # None value may appear if the word is the last word in the audio file, or it was split during inference. 
In + # that case, we'll set the last segment: + if word.end is None: + self._last_chosen_index = len(self._segments) - 1 + return + + # If the word ends before the last chosen segment: + if word.end <= last_chosen.start: + # Then it is still the closest segment + return + + # We check if it ends inside the last chosen segment: + if word.end < last_chosen.end: + # Then it still is the closest segment + return + + # The word ends after the segment, we need to collect all next segments up until the word ends before them: + possible_segments = [self._last_chosen_index] + for i in range(self._last_chosen_index + 1, len(self._segments)): + if word.end > self._segments[i].end: + possible_segments.append(i) + continue + possible_segments.append(i) + break + + # Check for the most overlapping option: + best_overlap = 0 + most_overlapping_segment_index = None + for i in possible_segments: + # If the word starts before segment: + if word.start <= self._segments[i].start: + # If it ends before the segment, there is an overlap from the start of the segment to the end of the + # word: + if word.end < self._segments[i].end: + overlap = word.end - self._segments[i].start + else: + # The word is wrapping the segment, the overlap is the segment's length: + overlap = self._segments[i].end - self._segments[i].start + # The word starts in segment, check if the word ends in it: + elif word.end < self._segments[i].end: + # The overlap is the word's length: + overlap = word.end - word.start + # The word start in segment but ends after it, the overlap is from the word's start to the segment's end: + else: + overlap = self._segments[i].end - word.start + # Check for new best overlap: + if overlap > best_overlap: + best_overlap = overlap + most_overlapping_segment_index = i + if most_overlapping_segment_index is not None: + self._last_chosen_index = most_overlapping_segment_index + return + + # If there is no overlapping segment, return the closest segment: + best_distance = None + closest_segment_index = None + for i in possible_segments: + distance = ( + word.start - self._segments[i].end + if word.start > self._segments[i].end + else self._segments[i].start - word.end + ) + if best_distance is None or distance < best_distance: + best_distance = distance + closest_segment_index = i + self._last_chosen_index = closest_segment_index + + +class SpeechDiarizationPerChannelTask(BaseTask): + """ + A task to write the transcription to file with respect to a given speech diarization per channel. + """ + + class _WordTimestamp(NamedTuple): + """ + A word with its start and end timestamps and speaker label (channel the word was taken from). + """ + + start: float + end: float + speaker: str + text: str + + def __init__(self, audio_file: Path, text_file: Path): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param text_file: Path to the text file to write the transcription to. + """ + super().__init__( + audio_file=audio_file, transcription_output={}, text_file=text_file + ) + self._transcription_output_channels: List[Tuple[str, dict]] = [] + + @property + def transcription_output_channels(self) -> List[Tuple[str, dict]]: + """ + Get the transcription output channels. + + :returns: The transcription output channels. + """ + return self._transcription_output_channels + + def do_task(self): + """ + Try to perform the task storing an error if occurred. 
+ """ + for _, channel_output in self._transcription_output_channels: + if isinstance(channel_output, str): + self._error = self._transcription_output_channels + return + super().do_task() + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + task_class, task_kwargs = super().to_tuple() + task_kwargs.pop("transcription_output") + return task_class, task_kwargs + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path with respect to the given speech diarization + per channel. + """ + # Cast the chunks to word timestamps tuples: + words_per_channel = [ + [ + SpeechDiarizationPerChannelTask._WordTimestamp( + start=chunk["timestamp"][0], + end=chunk["timestamp"][1], + speaker=speaker, + text=chunk["text"], + ) + for chunk in output["chunks"] + ] + for speaker, output in self._transcription_output_channels + ] + + # Merge and sort the words per channel by their start time: + words = operator.add(*words_per_channel) + words.sort() + + # Write the transcription to file: + current_speaker = words[0].speaker + text = f"{current_speaker}:" + for word in words: + # Check if the word's speaker is different from the current one: + if word.speaker != current_speaker: + # Append a newline and update the new speaker: + current_speaker = word.speaker + text += f"\n{current_speaker}:" + # Collect the word: + text += word.text + + # Update the transcription output with the new text to write it to file: + self._transcription_output["text"] = text + super()._do_task() + + +class BatchProcessor: + """ + A batch processor to process batches of transcriptions. The batch processor is creating tasks and is aimed to be + working along the transcriber. It can be used with multiprocessing queue or run the tasks directly using the + associated methods. + """ + + def __init__(self, audio_files: List[Path], output_directory: Path): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + """ + # Store the parameters: + self._audio_files = audio_files + self._output_directory = output_directory + + # Prepare the batching variables: + self._current_file_index = 0 + self._tasks: List[BaseTask] = [] + self._results: List[Tuple[bool, Tuple[str, str]]] = [] + + def process_batch(self, batch: List[Union[dict, str]]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Get the relevant files belongs to the given batch: + current_files = self._get_current_files(batch_size=len(batch)) + + # Build the diarization tasks: + self._tasks.extend( + [ + BaseTask( + audio_file=file, + transcription_output=batch[i], + text_file=self._output_directory / f"{file.stem}.txt", + ) + for i, file in enumerate(current_files) + ] + ) + + def get_tasks(self) -> List[BaseTask]: + """ + Get the tasks to perform. + + :returns: The tasks to perform. + """ + tasks = self._tasks + self._tasks = [] + return tasks + + def do_tasks(self): + """ + Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber. 
+ """ + for task in self.get_tasks(): + task.do_task() + self._results.append((task.is_failed(), task.get_result())) + + def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Get the results of the tasks. The stored results are then cleared. + + :returns: The results of the tasks. + """ + results = self._results + self._results = [] + return results + + def _get_current_files(self, batch_size: int) -> List[Path]: + """ + Get the current files to process. + + :param batch_size: The batch size to progress the current file index. + + :returns: The current files to process. + """ + end_index = ( + self._current_file_index + batch_size + if self._current_file_index + batch_size < len(self._audio_files) + else len(self._audio_files) + ) + current_files = self._audio_files[self._current_file_index : end_index] + self._current_file_index = end_index + return current_files + + +class SpeechDiarizationBatchProcessor(BatchProcessor): + """ + A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch + processor is creating tasks and is aimed to be working along the transcriber. It can be used with multiprocessing + queue or run the tasks directly using the associated methods. + """ + + def __init__( + self, audio_files: List[Path], output_directory: Path, speech_diarization: dict + ): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + :param speech_diarization: A speech diarization dictionary to pass along with each processed batch. + """ + super().__init__(audio_files=audio_files, output_directory=output_directory) + self._speech_diarization = speech_diarization + self._audio_files = audio_files + + def process_batch(self, batch: List[dict]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Get the relevant files belongs to the given batch: + current_files = self._get_current_files(batch_size=len(batch)) + + # Build the diarization tasks: + self._tasks.extend( + [ + SpeechDiarizationTask( + audio_file=file, + transcription_output=batch[i], + text_file=self._output_directory / f"{file.stem}.txt", + speech_diarization=self._speech_diarization.get(file.name), + ) + for i, file in enumerate(current_files) + ] + ) + + +class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor): + """ + A batch processor to process batches of transcriptions per channel. The batch processor is creating tasks with the + selected amount of channels given and is aimed to be working along the transcriber. It can be used with + multiprocessing queue or run the tasks directly using the associated methods. + """ + + def __init__( + self, + audio_files: List[Path], + output_directory: Path, + n_channels: int, + speakers: List[str], + ): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + :param n_channels: The number of channels in each audio file to transcribe. + :param speakers: The speakers labels to use for each channel. 
+ """ + super().__init__(audio_files=audio_files, output_directory=output_directory) + + # Store the parameters: + self._n_channels = n_channels + self._speakers = speakers + + # Prepare a channel buffer to store the channels until the current task created is fully covered: + self._task_in_process: SpeechDiarizationPerChannelTask = None + + def process_batch(self, batch: List[dict]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Go over the batch and create the tasks: + for output in batch: + # Check if there is a task in process: + if not self._task_in_process: + # Create a new task: + self._task_in_process = SpeechDiarizationPerChannelTask( + audio_file=self._audio_files[self._current_file_index], + text_file=self._output_directory + / f"{self._audio_files[self._current_file_index].stem}.txt", + ) + # Get the channel's speaker: + speaker = self._speakers[ + len(self._task_in_process.transcription_output_channels) + ] + # Collect the channel into the processed task: + self._task_in_process.transcription_output_channels.append( + (speaker, output) + ) + # Check if the task is fully covered (all channels are collected): + if ( + len(self._task_in_process.transcription_output_channels) + == self._n_channels + ): + # Collect the task and reset the task in process: + self._tasks.append(self._task_in_process) + self._current_file_index += 1 + self._task_in_process = None + + +class Transcriber: + """ + A transcription wrapper for the Huggingface's ASR pipeline - + https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to + use with OpenAI's Whisper models - https://huggingface.co/openai. + """ + + def __init__( + self, + model_name: str, + device: str = None, + use_flash_attention_2: bool = None, + use_better_transformers: bool = None, + assistant_model: str = None, + max_new_tokens: int = 128, + chunk_length_s: int = 30, + batch_size: int = 2, + spoken_language: str = None, + translate_to_english: bool = False, + return_timestamps: Union[bool, Literal["word"]] = False, + per_channel_transcription: int = 0, + ): + """ + Initialize the transcriber. + + :param model_name: The model name to use. Should be a model from the OpenAI's Whisper models for + best results (for example "tiny", "base", "large", etc.). + :param device: The device to use for inference. If not given, will use GPU if available. + :param use_flash_attention_2: Whether to use the Flash Attention 2 implementation. It can be used only with + one of the following GPUs: Nvidia H series and Nvidia A series. T4 support + will be available soon. + + Note: If both `use_flash_attention_2` and + `use_better_transformers` are `None`, the optimization will be chosen + automatically according to the available resources. + + :param use_better_transformers: Whether to use the Better Transformers library to further optimize the model. + Should be used for all use cases that do not support flash attention 2. + + Note: If both `use_flash_attention_2` and `use_better_transformers` are + `None`, the optimization will be chosen automatically according to the + available resources. + :param assistant_model: The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant + as well. 
Should be a model from Huggingface's distil-whisper (see here for + more information: https://github.com/huggingface/distil-whisper). + :param max_new_tokens: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. + :param chunk_length_s: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + :param batch_size: The batch size to use for inference. Default is 2. + :param spoken_language: Aim whisper to know what language is spoken. If None, it will try to detect it + for each chunk. + :param translate_to_english: Whether to translate the transcriptions to English. Default is False. + :param return_timestamps: Whether to return the timestamps of the words. If "word", will return the + timestamps of each word. If True will return the timestamps of each chunk. + Default is False. Aimed to be used for speech diarization. + :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel + transcription, pass the number of channels expected for each audio file here. + 0 means regular transcription (merge channels). + + Note: If `per_channel_transcription` is not 0, `batch_size` must be treated to + be the number of channels and not audio files. Aimed to be used for per + channel speech diarization. + """ + # Store loading parameters: + self._model_name = model_name + self._device = device + self._use_flash_attention_2 = use_flash_attention_2 + self._use_better_transformers = use_better_transformers + self._max_new_tokens = max_new_tokens + self._chunk_length_s = chunk_length_s + self._batch_size = batch_size + self._return_timestamps = return_timestamps + self._per_channel_transcription = per_channel_transcription + + # Store generation parameters: + self._assistant_model = assistant_model + self._spoken_language = spoken_language + self._translate_to_english = translate_to_english + + # Prepare the transcription objects: + self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None + self._generate_kwargs: dict = None + + def load(self): + """ + Load the transcriber. Must be called before transcribing. + """ + # Set the device and data type to use (prefer GPU if available): + device = torch.device( + self._device or "cuda" if torch.cuda.is_available() else "cpu" + ) + torch_dtype = torch.float16 if device.type == "cuda" else torch.float32 + + # Choose the optimization to use (in case the user did not specify any): + if ( + self._use_flash_attention_2 is None + and self._use_better_transformers is None + ): + # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture + # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla): + if device.type == "cuda" and is_flash_attn_2_available(): + cuda_device_name = torch.cuda.get_device_properties(device).name + if any( + cuda_device_name.startswith(gpu_name) + for gpu_name in [ + "NVIDIA A", # For Ampere architecture (e.g. A10, A30, A100) + "NVIDIA H", # For Hopper architecture (e.g. H100) + "NVIDIA L", # For Ada Lovelace architecture (e.g. 
L4, L40) + "NVIDIA RTX 30", # For Ada Lovelace architecture (RTX 30 series) + "NVIDIA RTX 40", # For Ada Lovelace architecture (RTX 40 series) + "NVIDIA RTX 50", # For Ada Lovelace architecture (RTX 50 series) + # Will be supported soon according to FlashAttention GitHub repo: + # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features + # "NVIDIA T4", # For Turing architecture (only T4) + # "NVIDIA RTX 20", # For Turing architecture (RTX 20 series) + ] + ): + self._use_flash_attention_2 = True + else: + self._use_better_transformers = True + else: + self._use_better_transformers = True + + # Build the optimizations kwargs: + model_kwargs = { + "low_cpu_mem_usage": True, + "use_safetensors": True, + } + if self._use_flash_attention_2: + if _LOGGER: + _LOGGER.info( + "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via " + "`pip install -U flash-attn --no-build-isolation`" + ) + model_kwargs["attn_implementation"] = "flash_attention_2" + elif self._use_better_transformers: + if _LOGGER: + _LOGGER.info( + "Using BetterTransformers optimization - make sure the `optimum` package is installed via " + "`pip install -U optimum`" + ) + model_kwargs["attn_implementation"] = "sdpa" + + # Initialize the speech recognition pipeline: + self._transcription_pipeline = pipeline( + task="automatic-speech-recognition", + model=self._model_name, + model_kwargs=model_kwargs.copy(), + batch_size=self._batch_size, + max_new_tokens=self._max_new_tokens, + chunk_length_s=self._chunk_length_s, + return_timestamps=self._return_timestamps, + torch_dtype=torch_dtype, + device=device, + ) + + # Prepare the generation kwargs: + self._generate_kwargs = { + "language": self._spoken_language, + "task": "translate" if self._translate_to_english else "transcribe", + } + + # Initialize the assistant model (if needed): + if self._assistant_model: + assistant_model = AutoModelForCausalLM.from_pretrained( + self._assistant_model, torch_dtype=torch_dtype, **model_kwargs + ) + assistant_model.to(device) + self._generate_kwargs["assistant_model"] = assistant_model + + def transcribe( + self, + audio_files: List[Path], + batch_processor: BatchProcessor = None, + batches_queue: Queue = None, + verbose: bool = False, + ) -> Union[List[List[dict]], None]: + """ + Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further + processing like writing to text files. If no queue or batch processor is given, the transcriptions outputs from + the pipeline will be returned. Otherwise, `None` is returned. + + :param audio_files: The audio files to transcribe. + :param batch_processor: A batch processor. + :param batches_queue: A multiprocessing queue to put the batches in. + :param verbose: Whether to show a progress bar. Default is False. + + :returns: The transcriptions outputs from the pipeline if no queue or batch processor is given, otherwise, + `None`. 
+ """ + # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with + # Huggingface's pipelines as they preload each input while inference is running): + def audio_iterator() -> Generator[Union[dict, str], None, None]: + if self._per_channel_transcription: + for audio_file in audio_files: + audio, sampling_rate = torchaudio.load(str(audio_file)) + audio = audio.numpy() + for channel in audio: + yield {"raw": channel, "sampling_rate": sampling_rate} + else: + for audio_file in audio_files: + yield str(audio_file) + + # Create a batch iterator: + def batch_iterator() -> Generator[List[Union[dict, str]], None, None]: + batch = [] + for audio in audio_iterator(): + batch.append(audio) + if len(batch) == self._batch_size: + yield batch + batch = [] + if batch: + yield batch + + # Prepare the successes dataframe and errors dictionary to be returned: + outputs = [] + + # Infer through the pipeline: + for input_batch in tqdm( + batch_iterator() if self._batch_size > 1 else audio_iterator(), + desc="Transcribing", + unit="channel" if self._per_channel_transcription else "audio file", + total=( + ( + (len(audio_files) // self._batch_size) + + (len(audio_files) % self._batch_size != 0) + ) + * (self._per_channel_transcription or 1) + ), + disable=not verbose, + ): + # Infer: + try: + output_batch = self._transcription_pipeline( + input_batch, + generate_kwargs=self._generate_kwargs, + ) + except Exception as exception: + # Collect the exception: + output_batch = str(exception) + # Align to batch size: + output_batch = ( + [output_batch] * len(input_batch) + if isinstance(input_batch, list) + else [output_batch] + ) + # To align with batching, if batch size is 1, wrap the output with a list: + if isinstance(output_batch, dict): + output_batch = [output_batch] + # If a batch processor is given, process the batch: + if batch_processor: + # Process it directly: + batch_processor.process_batch(batch=output_batch) + batch_processor.do_tasks() + elif batches_queue: + # Otherwise, queue the batch: + batches_queue.put(output_batch) + else: + # Otherwise, collect the output as is without processing: + outputs.append(output_batch) + + # Check if given a multiprocessing queue or a batch processor: + if batches_queue: + batches_queue.put(_MULTIPROCESSING_STOP_MARK) + + return outputs if not batch_processor else None + + +#: The value to send into multiprocessing queues to stop the process: +_MULTIPROCESSING_STOP_MARK = "STOP" + + +def _multiprocessing_process_batches( + batch_processor: BatchProcessor, + batches_queue: Queue, + tasks_queue: Queue, + n_task_completers: int, +): + """ + Process the batches in the given batches queue and put the tasks in the given tasks queue. The function will stop + when the given batches queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param batch_processor: A batch processor to process the batches. + :param batches_queue: A queue to get the batches from. + :param tasks_queue: A queue to put the tasks in. + :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks` + function). A stop mark will be sent to the tasks queue for each task completer. 
+ """ + while True: + # Get the batch: + batch: List[dict] = batches_queue.get() + if batch == _MULTIPROCESSING_STOP_MARK: + break + + # Process the batch: + batch_processor.process_batch(batch=batch) + + # Get the tasks: + tasks = batch_processor.get_tasks() + + # Queue the tasks: + for task in tasks: + tasks_queue.put(task.to_tuple()) + + # Mark the end of the batches: + for _ in range(n_task_completers): + tasks_queue.put(_MULTIPROCESSING_STOP_MARK) + + +def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue): + """ + Complete the tasks in the given queue and put the results in the given results queue. The function will stop when + the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param tasks_queue: A queue to get the tasks from. + :param results_queue: A queue to put the results in. + """ + tasks_map = { + BaseTask.__name__: BaseTask, + SpeechDiarizationTask.__name__: SpeechDiarizationTask, + SpeechDiarizationPerChannelTask.__name__: SpeechDiarizationPerChannelTask, + } + + while True: + # Get the task: + task = tasks_queue.get() + if task == _MULTIPROCESSING_STOP_MARK: + break + + # Reconstruct the task: + task_class, task_kwargs = task + task = tasks_map[task_class](**task_kwargs) + + # Complete the task: + task.do_task() + results_queue.put((task.is_failed(), task.get_result())) + + # Mark the end of the tasks: + results_queue.put(_MULTIPROCESSING_STOP_MARK) + + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." 
+ ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Save the output directory of this worker: + output_directory = Path(output[0]) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + + # Join the data from all workers: + if rank == 0: + context.logger.info("Collecting data from workers to root worker.") + + # Check if there are different output directories: + output_directories = set([Path(out_dir) for out_dir, _, _ in output]) + for r in range(1, size): + # True means the other workers should pass their files to the root worker (rank 0): + comm.send(len(output_directories) != 1, dest=r) + + # If there are different output directories, listen to the other workers: + if len(output_directories) != 1: + # Collect the files from the other workers: + files = [] + for r in range(1, size): + files.extend(comm.recv(source=r)) + # Write the files to the root worker's output directory: + for file_name, file_content in files: + with open(output_directory / file_name, "w") as f: + f.write(file_content) + + # Concatenate the dataframes: + dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0) + + # Concatenate the errors dictionaries: + errors_dictionary = reduce( + operator.ior, [err for _, _, err in output], {} + ) + + return str(output_directory), dataframe, errors_dictionary + + # Listen to rank 0 to see if there are different output directories and this rank need to send its files to + # it: + if comm.recv(source=0): + files = [] + for file in os.listdir(output_directory): + with open(output_directory / file, "r") as f: + files.append((file, f.read())) + comm.send(files, dest=0) + return None + + return wrapper + + return decorator + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." 
+ ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def transcribe( + # Input / Output kwargs: + data_path: Union[str, Path, List[Union[str, Path]]], + output_directory: str = None, + # Model loading kwargs: + model_name: str = "openai/whisper-tiny", + device: str = None, + use_flash_attention_2: bool = None, + use_better_transformers: bool = None, + # Generation kwargs: + assistant_model: str = None, + max_new_tokens: int = 128, + chunk_length_s: int = 30, + batch_size: int = 8, + spoken_language: str = None, + translate_to_english: bool = False, + # Diarization kwargs: + speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None, + speech_diarize_per_channel: int = None, + speaker_labels: List[str] = None, + # Other kwargs: + use_multiprocessing: Union[bool, int] = False, + verbose: bool = False, +): + """ + Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed + text files and a dataframe containing the following columns: + + * audio_file - The audio file path. + * transcription_file - The transcribed text file name in the output directory. + + The transcription is based on Huggingface's ASR pipeline - + https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and + is tested with OpenAI's Whisper models - https://huggingface.co/openai. + + If one of the speaker diarization parameters are given (either `speech_diarization` or + `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker will + be written in a separate line:: + + speaker_1: text + speaker_2: text + speaker_1: text + ... + + :param data_path: A directory of audio files or a single file or a list of files to transcribe. + :param output_directory: Path to a directory to save all transcribed audio files. If not given, will save + the transcribed files in a temporary directory. + :param model_name: The model name to use. Should be a model from the OpenAI's Whisper models for + best results (for example "tiny", "base", "large", etc.). See here for more + information: https://huggingface.co/openai?search_models=whisper. + :param device: The device to use for inference. If not given, will use GPU if available. + :param use_flash_attention_2: Whether to use the Flash Attention 2 implementation. It can be used only with + one of the following GPUs: Nvidia H series and Nvidia A series. T4 support + will be available soon. + + Note: If both `use_flash_attention_2` and + `use_better_transformers` are `None`, the optimization will be chosen + automatically according to the available resources. + + :param use_better_transformers: Whether to use the Better Transformers library to further optimize the model. + Should be used for all use cases that do not support flash attention 2. + + Note: If both `use_flash_attention_2` and `use_better_transformers` are + `None`, the optimization will be chosen automatically according to the + available resources. + :param assistant_model: The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant as + well. 
Should be a model from Huggingface's distil-whisper (see here for more + information: https://github.com/huggingface/distil-whisper). + + Note: Currently an assistant model is only usable with batch size of 1. + :param max_new_tokens: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. + :param chunk_length_s: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + :param batch_size: The batch size to use for inference. Default is 2. + :param spoken_language: Aim whisper to know what language is spoken. If None, it will try to detect + it. + :param translate_to_english: Whether to translate the transcriptions to English. + :param speech_diarization: A speech diarization dictionary with the file names to transcribe as keys and + their diarization as value. The diarization is a list of tuples: + (start, end, speaker). An example + for a diarization dictionary:: + + { + "audio_file_name": [ + { + "start": 0.0, + "end": 2.0, + "speaker": "Agent", + }, + { + "start": 2.0, + "end": 4.0, + "speaker": "Client", + }, + ... + ], + ... + } + + Note: The diarization must be for the entire duration of the audio file (as long + as Whisper is predicting words up until then. + :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to + a separate channel in the audio. Notice: This will make the transcription + slower as each channel wil be transcribed separatly. If a speech diarization + is passed (via the `speech_diarization` parameter), this parameter is + ignored. + :param speaker_labels: A list of speaker labels by channel order to use for writing the + transcription with respect to per channel speech diarization. This won't be + used together with a given speech diarization (via the `speech_diarization` + parameter). + :param use_multiprocessing: Whether to use multiprocessing to transcribe the audio files. Can be either a + boolean value or an integer. If `True`, will use the default amount of workers + (3): 1 for transcription, 1 for batch processing and 1 for task completion (such + as speech diarization and writing to files). To control the amount of tasks + completion workers, an integer can be provided to specify the amount of workers. + `False`, will use a single process. Default is `False`. + :param verbose: Whether to print the progress of the transcription. Default is `False`. 
+ """ + global _LOGGER + + # Get the input audio files to transcribe: + if verbose: + _LOGGER.info("Collecting audio files.") + audio_files = _get_audio_files(data_path=data_path) + if verbose: + _LOGGER.info(f"Collected {len(audio_files)} audio files.") + + # Get the output directory: + if output_directory is None: + if verbose: + _LOGGER.info("No output directory given, using temporary directory.") + output_directory = tempfile.mkdtemp() + output_directory = Path(output_directory).absolute() + output_directory.mkdir(exist_ok=True, parents=True) + if verbose: + _LOGGER.info(f"Transcriptions will be saved to: {output_directory}") + + # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization, + # speech diarization per channel): + if speech_diarization: + batch_processor = SpeechDiarizationBatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + speech_diarization=speech_diarization, + ) + elif speech_diarize_per_channel: + batch_processor = PerChannelSpeechDiarizationBatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + n_channels=speech_diarize_per_channel, + speakers=speaker_labels, + ) + else: + batch_processor = BatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + ) + + # Initialize the transcription pipeline: + transcriber = Transcriber( + device=device, + use_flash_attention_2=use_flash_attention_2, + use_better_transformers=use_better_transformers, + assistant_model=assistant_model, + model_name=model_name, + max_new_tokens=max_new_tokens, + chunk_length_s=chunk_length_s, + batch_size=batch_size, + return_timestamps=( + "word" + if speech_diarization is not None or speech_diarize_per_channel is not None + else False + ), + per_channel_transcription=speech_diarize_per_channel or 0, + spoken_language=spoken_language, + translate_to_english=translate_to_english, + ) + + # Run the transcription: + if use_multiprocessing: + results = _parallel_run( + n_workers=use_multiprocessing + if isinstance(use_multiprocessing, int) + else 1, + audio_files=audio_files, + batch_processor=batch_processor, + transcriber=transcriber, + verbose=verbose, + ) + else: + results = _run( + audio_files=audio_files, + batch_processor=batch_processor, + transcriber=transcriber, + verbose=verbose, + ) + + # Process the results: + if verbose: + _LOGGER.info("Summarizing the results.") + successes = [] + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes.append(result) + successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"]) + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(audio_files)})\n" + f"Transcriptions summary:\n" + f"{successes.head()}" + ) + + return str(output_directory), successes, errors + + +def _get_audio_files( + data_path: Union[Path, str, list], +) -> List[Path]: + """ + Get the audio files to transcribe. If a path to a directory is given, all files in the directory will be collected. + + :param data_path: The data path to collect the audio files from. + + :returns: The audio files list. 
+ """ + # Check if given a list of paths: + if isinstance(data_path, list): + audio_files = [] + for path in data_path: + audio_files.extend(_get_audio_files(data_path=path)) + return audio_files + + # Check if given a single string path to cast it to a `pathlib.Path`: + if isinstance(data_path, str): + data_path = Path(data_path).absolute() + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a " + f"file. Given: {str(data_path)} " + ) + + return audio_files + + +def _run( + audio_files: List[Path], + batch_processor: BatchProcessor, + transcriber: Transcriber, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run the transcription without multiprocessing. + + :param audio_files: The audio files to transcribe. + :param batch_processor: The batch processor to use. + :param transcriber: The transcriber to use. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Load the transcription pipeline: + if verbose: + _LOGGER.info(f"Loading the transcription pipeline.") + transcriber.load() + if verbose: + _LOGGER.info("Transcription pipeline loaded.") + + # Transcribe the files: + transcriber.transcribe( + audio_files=audio_files, + batch_processor=batch_processor, + verbose=verbose, + ) + + # Return the results: + return batch_processor.get_results() + + +def _parallel_run( + n_workers: int, + audio_files: List[Path], + batch_processor: BatchProcessor, + transcriber: Transcriber, + verbose: bool, +): + """ + Run the transcription with multiprocessing. + + :param n_workers: The amount of workers to use as task completers. + :param audio_files: The audio files to transcribe. + :param batch_processor: The batch processor to use. + :param transcriber: The transcriber to use. + :param verbose: Verbosity. + + :returns: The collected results. 
+ """ + # Initialize the multiprocessing queues: + batches_queue = Queue() + tasks_queue = Queue() + results_queue = Queue() + + # Initialize the multiprocessing processes: + batch_processing_process = Process( + target=_multiprocessing_process_batches, + kwargs={ + "batch_processor": batch_processor, + "batches_queue": batches_queue, + "tasks_queue": tasks_queue, + "n_task_completers": n_workers, + }, + ) + task_completion_processes = [ + Process( + target=_multiprocessing_complete_tasks, + kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue}, + ) + for _ in range(n_workers) + ] + + # Start the multiprocessing processes: + batch_processing_process.start() + for p in task_completion_processes: + p.start() + + # Load the transcription pipeline: + if verbose: + _LOGGER.info(f"Loading the transcription pipeline.") + transcriber.load() + if verbose: + _LOGGER.info("Transcription pipeline loaded.") + + # Transcribe the files: + transcriber.transcribe( + audio_files=audio_files, batches_queue=batches_queue, verbose=verbose + ) + + # Collect the results: + results = [] + stop_marks_counter = 0 + while True: + # Get a result from the queue: + result: Tuple[bool, Tuple[str, str]] = results_queue.get() + if result == _MULTIPROCESSING_STOP_MARK: + stop_marks_counter += 1 + if stop_marks_counter == n_workers: + break + else: + # Collect the result: + results.append(result) + + # Wait for the processes to finish: + results_queue.empty() + batch_processing_process.join() + for p in task_completion_processes: + p.join() + + return results \ No newline at end of file diff --git a/functions/master/transcribe/1.1.0/static/documentation.html b/functions/master/transcribe/1.1.0/static/documentation.html new file mode 100644 index 00000000..d2c18f28 --- /dev/null +++ b/functions/master/transcribe/1.1.0/static/documentation.html @@ -0,0 +1,537 @@ + + + + + + + +transcribe package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    transcribe package#

    +
    +

    Submodules#

    +
    +
    +

    transcribe.transcribe module#

    +
    +
    +class transcribe.transcribe.BaseTask(audio_file: pathlib.Path, transcription_output: Union[dict, str], text_file: pathlib.Path)[source]#
    +

    Bases: object

    +

    A task to write the transcription to file.

    +
    +
    +do_task()[source]#
    +

    Try to perform the task storing an error if occurred.

    +
    +
    +
+get_result() → Tuple[str, str][source]#
    +

    Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the +text file name.

    +
    +
    Returns
    +

    The task’s result.

    +
    +
    +
    +
    +
+is_failed() → bool[source]#
    +

    Check if the task failed.

    +
    +
    Returns
    +

    Whether the task failed.

    +
    +
    +
    +
    +
+to_tuple() → Tuple[str, dict][source]#
    +

    Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

    +
    +
    Returns
    +

    The converted task.

    +
    +
    +
    +
    +
    +
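A minimal sketch of the task lifecycle described above, assuming the `transcribe.py` module added in this diff is importable as `transcribe`; the file names used here are illustrative only:

```python
from pathlib import Path

from transcribe import BaseTask

# A successful pipeline output is a dict with a "text" key; a plain string marks an error.
task = BaseTask(
    audio_file=Path("speech_01.mp3"),
    transcription_output={"text": "hello world"},
    text_file=Path("output/speech_01.txt"),
)
task.do_task()  # writes the text file, storing any exception instead of raising it
if not task.is_failed():
    audio_name, text_name = task.get_result()  # ("speech_01.mp3", "speech_01.txt")
```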
    +class transcribe.transcribe.BatchProcessor(audio_files: List[pathlib.Path], output_directory: pathlib.Path)[source]#
    +

    Bases: object

    +

    A batch processor to process batches of transcriptions. The batch processor is creating tasks and is aimed to be +working along the transcriber. It can be used with multiprocessing queue or run the tasks directly using the +associated methods.

    +
    +
    +do_tasks()[source]#
    +

    Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.

    +
    +
    +
+get_results() → List[Tuple[bool, Tuple[str, str]]][source]#
    +

    Get the results of the tasks. The stored results are then cleared.

    +
    +
    Returns
    +

    The results of the tasks.

    +
    +
    +
    +
    +
+get_tasks() → List[transcribe.transcribe.BaseTask][source]#
    +

    Get the tasks to perform.

    +
    +
    Returns
    +

    The tasks to perform.

    +
    +
    +
    +
    +
    +process_batch(batch: List[Union[dict, str]])[source]#
    +

    Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch +processor.

    +
    +
    Parameters
    +

    batch – The batch of transcriptions to process.

    +
    +
    +
    +
    +
    +
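As a hedged sketch of the single-process flow this class supports (mirroring the `_run` helper in `transcribe.py` above), the transcriber hands each batch to the processor, which writes the text files and collects the results. The data directory and model choice below are assumptions for illustration:

```python
from pathlib import Path

from transcribe import BatchProcessor, Transcriber

audio_files = sorted(Path("./data").glob("*.mp3"))  # assumed input files
processor = BatchProcessor(audio_files=audio_files, output_directory=Path("./output"))

transcriber = Transcriber(model_name="openai/whisper-tiny", batch_size=2)
transcriber.load()  # must be called before transcribing
transcriber.transcribe(audio_files=audio_files, batch_processor=processor, verbose=True)

# Each result is (is_failed, (audio_file_name, text_file_name_or_error)):
for failed, (audio_name, outcome) in processor.get_results():
    print(audio_name, "->", outcome)
```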
class transcribe.transcribe.PerChannelSpeechDiarizationBatchProcessor(audio_files: List[pathlib.Path], output_directory: pathlib.Path, n_channels: int, speakers: List[str])

    Bases: transcribe.transcribe.BatchProcessor

    A batch processor to process batches of transcriptions per channel. The batch processor creates tasks with the
    selected amount of channels given and is aimed to work alongside the transcriber. It can be used with a
    multiprocessing queue or run the tasks directly using the associated methods.

    process_batch(batch: List[dict])
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        Parameters: batch – The batch of transcriptions to process.
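A per-channel sketch (illustrative only; the speaker labels and file name are assumptions): the transcriber is configured to transcribe each channel separately, and the processor stitches the channels into a single conversation-style text file.

    from pathlib import Path
    from transcribe.transcribe import PerChannelSpeechDiarizationBatchProcessor, Transcriber

    audio_files = [Path("call_01.wav")]  # stereo recording, one speaker per channel
    processor = PerChannelSpeechDiarizationBatchProcessor(
        audio_files=audio_files,
        output_directory=Path("./output"),
        n_channels=2,
        speakers=["Agent", "Client"],
    )
    transcriber = Transcriber(
        model_name="openai/whisper-tiny",
        per_channel_transcription=2,   # batch size is counted in channels here
        batch_size=2,
        return_timestamps="word",      # word timestamps are needed to merge the channels
    )
    transcriber.load()
    transcriber.transcribe(audio_files=audio_files, batch_processor=processor)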
class transcribe.transcribe.SpeechDiarizationBatchProcessor(audio_files: List[pathlib.Path], output_directory: pathlib.Path, speech_diarization: dict)

    Bases: transcribe.transcribe.BatchProcessor

    A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch
    processor creates tasks and is aimed to work alongside the transcriber. It can be used with a multiprocessing
    queue or run the tasks directly using the associated methods.

    process_batch(batch: List[dict])
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        Parameters: batch – The batch of transcriptions to process.
class transcribe.transcribe.SpeechDiarizationPerChannelTask(audio_file: pathlib.Path, text_file: pathlib.Path)

    Bases: transcribe.transcribe.BaseTask

    A task to write the transcription to file with respect to a given speech diarization per channel.

    do_task()
        Try to perform the task, storing an error if one occurred.

    to_tuple() -> Tuple[str, dict]
        Convert the task to a tuple so it can be reconstructed later (used to pass the task through a
        multiprocessing queue).

        Returns: The converted task.

    property transcription_output_channels: List[Tuple[str, dict]]
        Get the transcription output channels.

        Returns: The transcription output channels.
class transcribe.transcribe.SpeechDiarizationTask(audio_file: pathlib.Path, transcription_output: dict, text_file: pathlib.Path, speech_diarization: List[Tuple[float, float, str]])

    Bases: transcribe.transcribe.BaseTask

    A task to write the transcription to file with respect to a given speech diarization.

    to_tuple() -> Tuple[str, dict]
        Convert the task to a tuple so it can be reconstructed later (used to pass the task through a
        multiprocessing queue).

        Returns: The converted task.
class transcribe.transcribe.Transcriber(model_name: str, device: Optional[str] = None, use_flash_attention_2: Optional[bool] = None, use_better_transformers: Optional[bool] = None, assistant_model: Optional[str] = None, max_new_tokens: int = 128, chunk_length_s: int = 30, batch_size: int = 2, spoken_language: Optional[str] = None, translate_to_english: bool = False, return_timestamps: Union[bool, Literal["word"]] = False, per_channel_transcription: int = 0)

    Bases: object

    A transcription wrapper for Huggingface's ASR pipeline -
    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline -
    to use with OpenAI's Whisper models - https://huggingface.co/openai.

    load()
        Load the transcriber. Must be called before transcribing.

    transcribe(audio_files: List[pathlib.Path], batch_processor: Optional[transcribe.transcribe.BatchProcessor] = None, batches_queue: Optional[multiprocessing.context.BaseContext.Queue] = None, verbose: bool = False) -> Optional[List[List[dict]]]
        Transcribe the given audio files. The transcriptions are sent to a queue or a batch processor for further
        processing, such as writing to text files. If no queue or batch processor is given, the transcription
        outputs from the pipeline are returned; otherwise, None is returned.

        Parameters:

        • audio_files – The audio files to transcribe.
        • batch_processor – A batch processor.
        • batches_queue – A multiprocessing queue to put the batches in.
        • verbose – Whether to show a progress bar. Default is False.

        Returns: The transcription outputs from the pipeline if no queue or batch processor is given, otherwise
        None.
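A small sketch of using the wrapper directly (the file list is an illustrative assumption): with no queue or batch processor, the raw pipeline outputs are returned.

    from pathlib import Path
    from transcribe.transcribe import Transcriber

    transcriber = Transcriber(model_name="openai/whisper-tiny", batch_size=2)
    transcriber.load()
    outputs = transcriber.transcribe(
        audio_files=[Path("speech_01.mp3"), Path("speech_02.mp3")],
        verbose=True,
    )
    # outputs is a list of batches, each a list of pipeline outputs such as {"text": "..."}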
transcribe.transcribe.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Optional[Dict[str, Any]] = None)

transcribe.transcribe.transcribe(data_path: Union[str, pathlib.Path, List[Union[str, pathlib.Path]]], output_directory: Optional[str] = None, model_name: str = 'openai/whisper-tiny', device: Optional[str] = None, use_flash_attention_2: Optional[bool] = None, use_better_transformers: Optional[bool] = None, assistant_model: Optional[str] = None, max_new_tokens: int = 128, chunk_length_s: int = 30, batch_size: int = 8, spoken_language: Optional[str] = None, translate_to_english: bool = False, speech_diarization: Optional[Dict[str, List[Tuple[float, float, str]]]] = None, speech_diarize_per_channel: Optional[int] = None, speaker_labels: Optional[List[str]] = None, use_multiprocessing: Union[bool, int] = False, verbose: bool = False)

    Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed
    text files and a dataframe containing the following columns:

    • audio_file – The audio file path.
    • transcription_file – The transcribed text file name in the output directory.

    The transcription is based on Huggingface's ASR pipeline -
    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline -
    and is tested with OpenAI's Whisper models - https://huggingface.co/openai.

    If one of the speaker diarization parameters is given (either speech_diarization or speech_diarize_per_channel),
    the transcription is written in a conversation format, where each speaker is written on a separate line:

        speaker_1: text
        speaker_2: text
        speaker_1: text
        ...

    Parameters:

    • data_path – A directory of audio files, a single file, or a list of files to transcribe.
    • output_directory – Path to a directory to save all transcribed audio files. If not given, the transcribed
      files are saved in a temporary directory.
    • model_name – The model name to use. Should be one of OpenAI's Whisper models for best results (for example
      "tiny", "base", "large", etc.). See here for more information:
      https://huggingface.co/openai?search_models=whisper.
    • device – The device to use for inference. If not given, GPU is used if available.
    • use_flash_attention_2 – Whether to use the Flash Attention 2 implementation. It can be used only with one of
      the following GPUs: Nvidia H series and Nvidia A series. T4 support will be available soon.

      Note: If both use_flash_attention_2 and use_better_transformers are None, the optimization will be chosen
      automatically according to the available resources.

    • use_better_transformers – Whether to use the Better Transformers library to further optimize the model. Should
      be used for all use cases that do not support flash attention 2.

      Note: If both use_flash_attention_2 and use_better_transformers are None, the optimization will be chosen
      automatically according to the available resources.

    • assistant_model – The assistant model name to use for inference. Notice that the optimizations (flash
      attention 2 and better transformers) will be applied to the assistant as well. Should be a model from
      Huggingface's distil-whisper (see here for more information: https://github.com/huggingface/distil-whisper).

      Note: Currently an assistant model is only usable with a batch size of 1.

    • max_new_tokens – The maximum number of new tokens to generate. This is used to limit the generation length.
      Default is 128 tokens.
    • chunk_length_s – The audio chunk to split the audio into (in seconds). Default is 30 seconds.
    • batch_size – The batch size to use for inference. Default is 8.
    • spoken_language – Tell Whisper which language is spoken. If None, it will try to detect it.
    • translate_to_english – Whether to translate the transcriptions to English.
    • speech_diarization – A speech diarization dictionary with the file names to transcribe as keys and their
      diarization as value. The diarization is a list of tuples: (start, end, speaker). An example for a diarization
      dictionary:

          {
              "audio_file_name": [
                  {
                      "start": 0.0,
                      "end": 2.0,
                      "speaker": "Agent",
                  },
                  {
                      "start": 2.0,
                      "end": 4.0,
                      "speaker": "Client",
                  },
              ],
          }

      Note: The diarization must cover the entire duration of the audio file (as long as Whisper is predicting words
      up until then).

    • speech_diarize_per_channel – Perform speech diarization per channel. Each speaker is expected to belong to a
      separate channel in the audio. Notice: this will make the transcription slower, as each channel will be
      transcribed separately. If a speech diarization is passed (via the speech_diarization parameter), this
      parameter is ignored.
    • speaker_labels – A list of speaker labels, by channel order, to use for writing the transcription with respect
      to per-channel speech diarization. This won't be used together with a given speech diarization (via the
      speech_diarization parameter).
    • use_multiprocessing – Whether to use multiprocessing to transcribe the audio files. Can be either a boolean
      value or an integer. If True, the default amount of workers (3) is used: 1 for transcription, 1 for batch
      processing and 1 for task completion (such as speech diarization and writing to files). To control the amount
      of task completion workers, an integer can be provided. False will use a single process. Default is False.
    • verbose – Whether to print the progress of the transcription. Default is False.
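A hedged invocation sketch (the data path, speakers and timings are illustrative; the exact return structure is an assumption inferred from the `returns` configured in the tutorial below):

    from transcribe.transcribe import transcribe

    results = transcribe(
        data_path="./data",
        output_directory="./output",
        model_name="openai/whisper-tiny",
        speech_diarization={
            "speech_01.mp3": [(0.0, 2.0, "Agent"), (2.0, 4.0, "Client")],
        },
        verbose=True,
    )
    # `results` is assumed to carry the transcriptions directory, the dataframe and an errors mapping.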

Module contents
    + + + + \ No newline at end of file diff --git a/functions/master/transcribe/1.1.0/static/example.html b/functions/master/transcribe/1.1.0/static/example.html new file mode 100644 index 00000000..6caa1b3d --- /dev/null +++ b/functions/master/transcribe/1.1.0/static/example.html @@ -0,0 +1,599 @@ + + + + + + + +Transcribe tutorial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Transcribe tutorial

    import tempfile
    import mlrun

Importing the transcribe function from hub

To import the function directly from hub, use:

    transcribe_fn = mlrun.import_function("hub://transcribe")

In this tutorial a temporary artifact path is created and the function is imported from the local function.yaml instead:

    artifact_path = tempfile.mkdtemp()

    transcribe_fn = mlrun.import_function("function.yaml")

Running transcribe

    transcribe_run = transcribe_fn.run(
        handler="transcribe",
        params={
            "model_name": "tiny",
            "input_path": "./data",
            "decoding_options": {"fp16": False},
            "output_directory": "./output",
        },
        returns=[
            "transcriptions: path",
            "transcriptions_df: dataset",
            {"key": "transcriptions_errors", "artifact_type": "file", "file_format": "yaml"},
        ],
        local=True,
        artifact_path=artifact_path,
    )
    > 2023-07-16 17:14:01,968 [info] Storing function: {'name': 'transcribe-transcribe', 'uid': 'd1384cb679bc4c178b0195d964b628a8', 'db': None}
    +> 2023-07-16 17:14:01,969 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:01,969 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:01,970 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:01,970 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:01,972 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:01,972 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:09,804 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:09,805 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:09,805 [info] Loading whisper model: 'tiny'
    +
    +
    +
    The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
    +IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
    +
    +
    +
    > 2023-07-16 17:14:10,374 [info] Model loaded.
    +
    +
    +
    Transcribing:  67%|██████▋   | 2/3 [00:02<00:01,  1.04s/file]
    +
    +
    +
    > 2023-07-16 17:14:12,556 [warning] Error in file: '/Users/Yonatan_Shelach/projects/functions/transcribe/data/error_file.txt'
    +
    +
    +
    Transcribing: 100%|██████████| 3/3 [00:02<00:00,  1.39file/s]
    +
    +
    +
    > 2023-07-16 17:14:12,566 [info] Done:
    +      audio_file transcription_file language     length  rate_of_speech
    +0  speech_01.mp3      speech_01.txt       en   2.011333        3.480278
    +1  speech_02.mp3      speech_02.txt       en  20.793500        2.548873
    +> 2023-07-16 17:14:12,596 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,597 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:12,659 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,660 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:12,671 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,672 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +
    +
    +
    
    +
    +
    +
    > 2023-07-16 17:14:12,707 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,707 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:12,708 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,708 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +
    +
    +
    +
    +
    project:    default
    uid:        ...b628a8
    iter:       0
    start:      Jul 16 14:14:01
    state:      completed
    name:       transcribe-transcribe
    labels:     kind=, owner=Yonatan_Shelach, host=M-QWXQJK77Q0
    parameters: model_name=tiny, audio_files_directory=./data, decoding_options={'fp16': False}, output_directory=./output
    artifacts:  transcriptions, transcriptions_df, transcriptions_errors
    
    +
    +
    +
    > to track results use the .show() or .logs() methods
    > 2023-07-16 17:14:12,721 [info] Run execution finished: {'status': 'completed', 'name': 'transcribe-transcribe'}
    +
    +
    +
    +
    +
    +
    +
    transcribe_run.outputs
    +
    +
    +
    +
    +
    {'transcriptions': 'store://artifacts/default/transcribe-transcribe_transcriptions:d1384cb679bc4c178b0195d964b628a8',
    + 'transcriptions_df': 'store://artifacts/default/transcribe-transcribe_transcriptions_df:d1384cb679bc4c178b0195d964b628a8',
    + 'transcriptions_errors': 'store://artifacts/default/transcribe-transcribe_transcriptions_errors:d1384cb679bc4c178b0195d964b628a8'}
    +
    +
    +
    +
    +

    Notice: If connected to mlrun server, you can simply use:

    +
    df = transcribe_run.artifact("transcriptions_df")
    +
    +
    +
    +
    +
    artifact_path += f"/{transcribe_run.metadata.name}/{transcribe_run.metadata.iteration}/"
    +
    +
    +
    +
    +
    +
    +
    df = mlrun.get_dataitem(artifact_path + "transcriptions_df.parquet").as_df()
    +
    +
    +
    +
    +
    +
    +
    df.head()
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          audio_file transcription_file language     length  rate_of_speech
    0  speech_01.mp3      speech_01.txt       en   2.011333        3.480278
    1  speech_02.mp3      speech_02.txt       en  20.793500        2.548873
    + + + + \ No newline at end of file diff --git a/functions/master/transcribe/1.1.0/static/function.html b/functions/master/transcribe/1.1.0/static/function.html new file mode 100644 index 00000000..befb810a --- /dev/null +++ b/functions/master/transcribe/1.1.0/static/function.html @@ -0,0 +1,333 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  name: transcribe
    +  tag: ''
    +  hash: 8810ac74045bd15cee15a2e4e89563e8e29908d3
    +  project: ''
    +  labels:
    +    author: yonatans
    +  categories:
    +  - data-preparation
    +  - genai
    +  - huggingface
    +  - machine-learning
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import operator
import os
import tempfile
from functools import reduce, wraps
from multiprocessing import Process, Queue
from pathlib import Path
from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union

import pandas as pd
import torch
import torchaudio
from tqdm import tqdm
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    AutoModelForCausalLM,
    pipeline,
)
from transformers.utils import is_flash_attn_2_available


class BaseTask:
    """
    A task to write the transcription to file.
    """

    def __init__(
        self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path
    ):
        """
        Initialize the task.

        :param audio_file:           Path to the audio file that was transcribed.
        :param transcription_output: The transcription output from the pipeline. String means an exception was raised.
        :param text_file:            Path to the text file to write the transcription to.
        """
        # Store the parameters:
        self._audio_file = audio_file
        self._transcription_output = transcription_output
        self._text_file = text_file

        # Prepare the error variable:
        self._error: str = None

    def do_task(self):
        """
        Try to perform the task, storing an error if one occurred.
        """
        if isinstance(self._transcription_output, str):
            self._error = self._transcription_output
            return
        try:
            self._do_task()
        except Exception as exception:
            self._error = str(exception)

    def is_failed(self) -> bool:
        """
        Check if the task failed.

        :returns: Whether the task failed.
        """
        return self._error is not None

    def get_result(self) -> Tuple[str, str]:
        """
        Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the
        text file name.

        :returns: The task's result.
        """
        if self.is_failed():
            return self._audio_file.name, self._error
        return self._audio_file.name, self._text_file.name

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        return self.__class__.__name__, {
            "audio_file": self._audio_file,
            "transcription_output": self._transcription_output,
            "text_file": self._text_file,
        }

    def _do_task(self):
        """
        Perform the task - write the transcription to the stored file path.
        """
        # Checking for no duplications:
        i = 1
        while self._text_file.exists():
            i += 1
            self._text_file = (
                self._text_file.parent
                / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}"
            )

        # Make sure all directories are created:
        self._text_file.parent.mkdir(exist_ok=True, parents=True)

        # Write to file:
        with open(self._text_file, "w") as fp:
            fp.write(self._transcription_output["text"])


class SpeechDiarizationTask(BaseTask):
    """
    A task to write the transcription to file with respect to a given speech diarization.
    """

    class _DiarizationSegment(NamedTuple):
        """
        A speech diarization segment.
        """

        start: float
        end: float
        speaker: str

    class _WordTimestamp(NamedTuple):
        """
        A word with its start and end timestamps.
        """

        start: float
        end: float
        text: str

    def __init__(
        self,
        audio_file: Path,
        transcription_output: dict,
        text_file: Path,
        speech_diarization: List[Tuple[float, float, str]],
    ):
        """
        Initialize the task.

        :param audio_file:           Path to the audio file that was transcribed.
        :param transcription_output: The transcription output from the pipeline.
        :param text_file:            Path to the text file to write the transcription to.
        :param speech_diarization:   A speech diarization as a list of tuples: (start, end, speaker).
        """
        super().__init__(
            audio_file=audio_file,
            transcription_output=transcription_output,
            text_file=text_file,
        )
        self._speech_diarization = speech_diarization
        self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None
        self._last_chosen_index = 0

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        task_class, task_kwargs = super().to_tuple()
        return task_class, {
            **task_kwargs,
            "speech_diarization": self._speech_diarization,
        }

    def _do_task(self):
        """
        Perform the task - write the transcription to the stored file path with respect to the given speech diarization.
        """
        # Check if a speech diarization is given, if not, just write the transcription to file:
        if not self._speech_diarization:
            super()._do_task()
            return

        # Cast the chunks to word timestamps tuples:
        words = [
            SpeechDiarizationTask._WordTimestamp(
                start=chunk["timestamp"][0],
                end=chunk["timestamp"][1],
                text=chunk["text"],
            )
            for chunk in self._transcription_output["chunks"]
        ]

        # Cast speech diarization to segments tuples:
        self._segments = [
            SpeechDiarizationTask._DiarizationSegment(*segment)
            for segment in self._speech_diarization
        ]

        # Try to match the Whisper model predicted timestamps to the closest diarization segment (closest diarization
        # segment will be the most overlapping with the word, and if there is no overlap, the closest segment to the
        # word):
        speaker = self._segments[self._last_chosen_index].speaker
        text = f"{speaker}:"
        for word in words:
            # Get the next diarization segment:
            self._get_next_segment(word=word)
            # Check if the segment is of the same speaker:
            if self._segments[self._last_chosen_index].speaker == speaker:
                # Collect the word:
                text += word.text
            else:
                # Append a newline and update the new speaker:
                speaker = self._segments[self._last_chosen_index].speaker
                text += f"\n{speaker}:{word.text}"

        # Update the transcription output with the new text to write it to file:
        self._transcription_output["text"] = text
        super()._do_task()

    def _get_next_segment(
        self,
        word: _WordTimestamp,
    ):
        """
        Get the next diarization segment the given word falls into. The `self._last_chosen_index` will be updated
        accordingly.

        :param word: The word timestamp to match to the next segment.
        """
        # If the last chosen segment is the last segment, return it:
        if self._last_chosen_index == len(self._segments) - 1:
            return

        # Get the last chosen diarization segment:
        last_chosen = self._segments[self._last_chosen_index]

        # None value may appear if the word is the last word in the audio file, or it was split during inference. In
        # that case, we'll set the last segment:
        if word.end is None:
            self._last_chosen_index = len(self._segments) - 1
            return

        # If the word ends before the last chosen segment:
        if word.end <= last_chosen.start:
            # Then it is still the closest segment
            return

        # We check if it ends inside the last chosen segment:
        if word.end < last_chosen.end:
            # Then it still is the closest segment
            return

        # The word ends after the segment, we need to collect all next segments up until the word ends before them:
        possible_segments = [self._last_chosen_index]
        for i in range(self._last_chosen_index + 1, len(self._segments)):
            if word.end > self._segments[i].end:
                possible_segments.append(i)
                continue
            possible_segments.append(i)
            break

        # Check for the most overlapping option:
        best_overlap = 0
        most_overlapping_segment_index = None
        for i in possible_segments:
            # If the word starts before segment:
            if word.start <= self._segments[i].start:
                # If it ends before the segment, there is an overlap from the start of the segment to the end of the
                # word:
                if word.end < self._segments[i].end:
                    overlap = word.end - self._segments[i].start
                else:
                    # The word is wrapping the segment, the overlap is the segment's length:
                    overlap = self._segments[i].end - self._segments[i].start
            # The word starts in segment, check if the word ends in it:
            elif word.end < self._segments[i].end:
                # The overlap is the word's length:
                overlap = word.end - word.start
            # The word starts in the segment but ends after it, so the overlap is from the word's start to the segment's end:
            else:
                overlap = self._segments[i].end - word.start
            # Check for new best overlap:
            if overlap > best_overlap:
                best_overlap = overlap
                most_overlapping_segment_index = i
        if most_overlapping_segment_index is not None:
            self._last_chosen_index = most_overlapping_segment_index
            return

        # If there is no overlapping segment, return the closest segment:
        best_distance = None
        closest_segment_index = None
        for i in possible_segments:
            distance = (
                word.start - self._segments[i].end
                if word.start > self._segments[i].end
                else self._segments[i].start - word.end
            )
            if best_distance is None or distance < best_distance:
                best_distance = distance
                closest_segment_index = i
        self._last_chosen_index = closest_segment_index
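        # Example: given the segments (0.0, 2.0, "Agent") and (2.0, 4.0, "Client") and a
        # word with timestamps (start=1.8, end=2.3), the overlap with the first segment is
        # 2.0 - 1.8 = 0.2 and with the second 2.3 - 2.0 = 0.3, so the word is attributed to
        # the "Client" segment and `self._last_chosen_index` becomes 1.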


class SpeechDiarizationPerChannelTask(BaseTask):
    """
    A task to write the transcription to file with respect to a given speech diarization per channel.
    """

    class _WordTimestamp(NamedTuple):
        """
        A word with its start and end timestamps and speaker label (channel the word was taken from).
        """

        start: float
        end: float
        speaker: str
        text: str

    def __init__(self, audio_file: Path, text_file: Path):
        """
        Initialize the task.

        :param audio_file: Path to the audio file that was transcribed.
        :param text_file:  Path to the text file to write the transcription to.
        """
        super().__init__(
            audio_file=audio_file, transcription_output={}, text_file=text_file
        )
        self._transcription_output_channels: List[Tuple[str, dict]] = []

    @property
    def transcription_output_channels(self) -> List[Tuple[str, dict]]:
        """
        Get the transcription output channels.

        :returns: The transcription output channels.
        """
        return self._transcription_output_channels

    def do_task(self):
        """
        Try to perform the task, storing an error if one occurred.
        """
        for _, channel_output in self._transcription_output_channels:
            if isinstance(channel_output, str):
                self._error = channel_output
                return
        super().do_task()

    def to_tuple(self) -> Tuple[str, dict]:
        """
        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

        :returns: The converted task.
        """
        task_class, task_kwargs = super().to_tuple()
        task_kwargs.pop("transcription_output")
        return task_class, task_kwargs

    def _do_task(self):
        """
        Perform the task - write the transcription to the stored file path with respect to the given speech diarization
        per channel.
        """
        # Cast the chunks to word timestamps tuples:
        words_per_channel = [
            [
                SpeechDiarizationPerChannelTask._WordTimestamp(
                    start=chunk["timestamp"][0],
                    end=chunk["timestamp"][1],
                    speaker=speaker,
                    text=chunk["text"],
                )
                for chunk in output["chunks"]
            ]
            for speaker, output in self._transcription_output_channels
        ]

        # Merge and sort the words per channel by their start time:
        words = operator.add(*words_per_channel)
        words.sort()

        # Write the transcription to file:
        current_speaker = words[0].speaker
        text = f"{current_speaker}:"
        for word in words:
            # Check if the word's speaker is different from the current one:
            if word.speaker != current_speaker:
                # Append a newline and update the new speaker:
                current_speaker = word.speaker
                text += f"\n{current_speaker}:"
            # Collect the word:
            text += word.text

        # Update the transcription output with the new text to write it to file:
        self._transcription_output["text"] = text
        super()._do_task()


class BatchProcessor:
    """
    A batch processor to process batches of transcriptions. The batch processor is creating tasks and is aimed to be
    working along the transcriber. It can be used with multiprocessing queue or run the tasks directly using the
    associated methods.
    """

    def __init__(self, audio_files: List[Path], output_directory: Path):
        """
        Initialize the batch processor.

        :param audio_files:      The list of all audio files to transcribe.
        :param output_directory: The output directory to write the transcriptions to.
        """
        # Store the parameters:
        self._audio_files = audio_files
        self._output_directory = output_directory

        # Prepare the batching variables:
        self._current_file_index = 0
        self._tasks: List[BaseTask] = []
        self._results: List[Tuple[bool, Tuple[str, str]]] = []

    def process_batch(self, batch: List[Union[dict, str]]):
        """
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        :param batch: The batch of transcriptions to process.
        """
        # Get the relevant files belonging to the given batch:
        current_files = self._get_current_files(batch_size=len(batch))

        # Build the diarization tasks:
        self._tasks.extend(
            [
                BaseTask(
                    audio_file=file,
                    transcription_output=batch[i],
                    text_file=self._output_directory / f"{file.stem}.txt",
                )
                for i, file in enumerate(current_files)
            ]
        )

    def get_tasks(self) -> List[BaseTask]:
        """
        Get the tasks to perform.

        :returns: The tasks to perform.
        """
        tasks = self._tasks
        self._tasks = []
        return tasks

    def do_tasks(self):
        """
        Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.
        """
        for task in self.get_tasks():
            task.do_task()
            self._results.append((task.is_failed(), task.get_result()))

    def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]:
        """
        Get the results of the tasks. The stored results are then cleared.

        :returns: The results of the tasks.
        """
        results = self._results
        self._results = []
        return results

    def _get_current_files(self, batch_size: int) -> List[Path]:
        """
        Get the current files to process.

        :param batch_size: The batch size to progress the current file index.

        :returns: The current files to process.
        """
        end_index = (
            self._current_file_index + batch_size
            if self._current_file_index + batch_size < len(self._audio_files)
            else len(self._audio_files)
        )
        current_files = self._audio_files[self._current_file_index : end_index]
        self._current_file_index = end_index
        return current_files


class SpeechDiarizationBatchProcessor(BatchProcessor):
    """
    A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch
    processor is creating tasks and is aimed to be working along the transcriber. It can be used with multiprocessing
    queue or run the tasks directly using the associated methods.
    """

    def __init__(
        self, audio_files: List[Path], output_directory: Path, speech_diarization: dict
    ):
        """
        Initialize the batch processor.

        :param audio_files:        The list of all audio files to transcribe.
        :param output_directory:   The output directory to write the transcriptions to.
        :param speech_diarization: A speech diarization dictionary to pass along with each processed batch.
        """
        super().__init__(audio_files=audio_files, output_directory=output_directory)
        self._speech_diarization = speech_diarization
        self._audio_files = audio_files

    def process_batch(self, batch: List[dict]):
        """
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        :param batch: The batch of transcriptions to process.
        """
        # Get the relevant files belonging to the given batch:
        current_files = self._get_current_files(batch_size=len(batch))

        # Build the diarization tasks:
        self._tasks.extend(
            [
                SpeechDiarizationTask(
                    audio_file=file,
                    transcription_output=batch[i],
                    text_file=self._output_directory / f"{file.stem}.txt",
                    speech_diarization=self._speech_diarization.get(file.name),
                )
                for i, file in enumerate(current_files)
            ]
        )


class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor):
    """
    A batch processor to process batches of transcriptions per channel. The batch processor is creating tasks with the
    selected amount of channels given and is aimed to be working along the transcriber. It can be used with
    multiprocessing queue or run the tasks directly using the associated methods.
    """

    def __init__(
        self,
        audio_files: List[Path],
        output_directory: Path,
        n_channels: int,
        speakers: List[str],
    ):
        """
        Initialize the batch processor.

        :param audio_files:      The list of all audio files to transcribe.
        :param output_directory: The output directory to write the transcriptions to.
        :param n_channels:       The number of channels in each audio file to transcribe.
        :param speakers:         The speakers labels to use for each channel.
        """
        super().__init__(audio_files=audio_files, output_directory=output_directory)

        # Store the parameters:
        self._n_channels = n_channels
        self._speakers = speakers

        # Prepare a channel buffer to store the channels until the current task created is fully covered:
        self._task_in_process: SpeechDiarizationPerChannelTask = None

    def process_batch(self, batch: List[dict]):
        """
        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
        processor.

        :param batch: The batch of transcriptions to process.
        """
        # Go over the batch and create the tasks:
        for output in batch:
            # Check if there is a task in process:
            if not self._task_in_process:
                # Create a new task:
                self._task_in_process = SpeechDiarizationPerChannelTask(
                    audio_file=self._audio_files[self._current_file_index],
                    text_file=self._output_directory
                    / f"{self._audio_files[self._current_file_index].stem}.txt",
                )
            # Get the channel's speaker:
            speaker = self._speakers[
                len(self._task_in_process.transcription_output_channels)
            ]
            # Collect the channel into the processed task:
            self._task_in_process.transcription_output_channels.append(
                (speaker, output)
            )
            # Check if the task is fully covered (all channels are collected):
            if (
                len(self._task_in_process.transcription_output_channels)
                == self._n_channels
            ):
                # Collect the task and reset the task in process:
                self._tasks.append(self._task_in_process)
                self._current_file_index += 1
                self._task_in_process = None


class Transcriber:
    """
    A transcription wrapper for the Huggingface's ASR pipeline -
    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to
    use with OpenAI's Whisper models - https://huggingface.co/openai.
    """

    def __init__(
        self,
        model_name: str,
        device: str = None,
        use_flash_attention_2: bool = None,
        use_better_transformers: bool = None,
        assistant_model: str = None,
        max_new_tokens: int = 128,
        chunk_length_s: int = 30,
        batch_size: int = 2,
        spoken_language: str = None,
        translate_to_english: bool = False,
        return_timestamps: Union[bool, Literal["word"]] = False,
        per_channel_transcription: int = 0,
    ):
        """
        Initialize the transcriber.

        :param model_name:                The model name to use. Should be a model from the OpenAI's Whisper models for
                                          best results (for example "tiny", "base", "large", etc.).
        :param device:                    The device to use for inference. If not given, will use GPU if available.
        :param use_flash_attention_2:     Whether to use the Flash Attention 2 implementation. It can be used only with
                                          one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
                                          will be available soon.

                                          Note: If both `use_flash_attention_2` and
                                          `use_better_transformers` are `None`, the optimization will be chosen
                                          automatically according to the available resources.

        :param use_better_transformers:   Whether to use the Better Transformers library to further optimize the model.
                                          Should be used for all use cases that do not support flash attention 2.

                                          Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                          `None`, the optimization will be chosen automatically according to the
                                          available resources.
        :param assistant_model:           The assistant model name to use for inference. Notice that the optimizations
                                          (flash attention 2 and better transformers) will be applied for the assistant
                                          as well. Should be a model from Huggingface's distil-whisper (see here for
                                          more information: https://github.com/huggingface/distil-whisper).
        :param max_new_tokens:            The maximum number of new tokens to generate. This is used to limit the
                                          generation length. Default is 128 tokens.
        :param chunk_length_s:            The audio chunk to split the audio to (in seconds). Default is 30 seconds.
        :param batch_size:                The batch size to use for inference. Default is 2.
        :param spoken_language:           Tell Whisper which language is spoken. If None, it will try to detect it
                                          for each chunk.
        :param translate_to_english:      Whether to translate the transcriptions to English. Default is False.
        :param return_timestamps:         Whether to return the timestamps of the words. If "word", will return the
                                          timestamps of each word. If True will return the timestamps of each chunk.
                                          Default is False. Aimed to be used for speech diarization.
        :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel
                                          transcription, pass the number of channels expected for each audio file here.
                                          0 means regular transcription (merge channels).

                                          Note: If `per_channel_transcription` is not 0, `batch_size` is treated as the
                                          number of channels and not audio files. Aimed to be used for per
                                          channel speech diarization.
        """
        # Store loading parameters:
        self._model_name = model_name
        self._device = device
        self._use_flash_attention_2 = use_flash_attention_2
        self._use_better_transformers = use_better_transformers
        self._max_new_tokens = max_new_tokens
        self._chunk_length_s = chunk_length_s
        self._batch_size = batch_size
        self._return_timestamps = return_timestamps
        self._per_channel_transcription = per_channel_transcription

        # Store generation parameters:
        self._assistant_model = assistant_model
        self._spoken_language = spoken_language
        self._translate_to_english = translate_to_english

        # Prepare the transcription objects:
        self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None
        self._generate_kwargs: dict = None

    def load(self):
        """
        Load the transcriber. Must be called before transcribing.
        """
        # Set the device and data type to use (prefer GPU if available):
        device = torch.device(
            self._device or ("cuda" if torch.cuda.is_available() else "cpu")
        )
        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

        # Choose the optimization to use (in case the user did not specify any):
        if (
            self._use_flash_attention_2 is None
            and self._use_better_transformers is None
        ):
            # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture
            # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla):
            if device.type == "cuda" and is_flash_attn_2_available():
                cuda_device_name = torch.cuda.get_device_properties(device).name
                if any(
                    cuda_device_name.startswith(gpu_name)
                    for gpu_name in [
                        "NVIDIA A",  # For Ampere architecture (e.g. A10, A30, A100)
                        "NVIDIA H",  # For Hopper architecture (e.g. H100)
                        "NVIDIA L",  # For Ada Lovelace architecture (e.g. L4, L40)
                        "NVIDIA RTX 30",  # For Ada Lovelace architecture (RTX 30 series)
                        "NVIDIA RTX 40",  # For Ada Lovelace architecture (RTX 40 series)
                        "NVIDIA RTX 50",  # For Ada Lovelace architecture (RTX 50 series)
                        # Will be supported soon according to FlashAttention GitHub repo:
                        # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
                        # "NVIDIA T4",  # For Turing architecture (only T4)
                        # "NVIDIA RTX 20",  # For Turing architecture (RTX 20 series)
                    ]
                ):
                    self._use_flash_attention_2 = True
                else:
                    self._use_better_transformers = True
            else:
                self._use_better_transformers = True

        # Build the optimizations kwargs:
        model_kwargs = {
            "low_cpu_mem_usage": True,
            "use_safetensors": True,
        }
        if self._use_flash_attention_2:
            if _LOGGER:
                _LOGGER.info(
                    "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via "
                    "`pip install -U flash-attn --no-build-isolation`"
                )
            model_kwargs["attn_implementation"] = "flash_attention_2"
        elif self._use_better_transformers:
            if _LOGGER:
                _LOGGER.info(
                    "Using BetterTransformers optimization - make sure the `optimum` package is installed via "
                    "`pip install -U optimum`"
                )
            model_kwargs["attn_implementation"] = "sdpa"

        # Initialize the speech recognition pipeline:
        self._transcription_pipeline = pipeline(
            task="automatic-speech-recognition",
            model=self._model_name,
            model_kwargs=model_kwargs.copy(),
            batch_size=self._batch_size,
            max_new_tokens=self._max_new_tokens,
            chunk_length_s=self._chunk_length_s,
            return_timestamps=self._return_timestamps,
            torch_dtype=torch_dtype,
            device=device,
        )

        # Prepare the generation kwargs:
        self._generate_kwargs = {
            "language": self._spoken_language,
            "task": "translate" if self._translate_to_english else "transcribe",
        }

        # Initialize the assistant model (if needed):
        if self._assistant_model:
            assistant_model = AutoModelForCausalLM.from_pretrained(
                self._assistant_model, torch_dtype=torch_dtype, **model_kwargs
            )
            assistant_model.to(device)
            self._generate_kwargs["assistant_model"] = assistant_model

    def transcribe(
        self,
        audio_files: List[Path],
        batch_processor: BatchProcessor = None,
        batches_queue: Queue = None,
        verbose: bool = False,
    ) -> Union[List[List[dict]], None]:
        """
        Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further
        processing like writing to text files. If no queue or batch processor is given, the transcriptions outputs from
        the pipeline will be returned. Otherwise, `None` is returned.

        :param audio_files:     The audio files to transcribe.
        :param batch_processor: A batch processor.
        :param batches_queue:   A multiprocessing queue to put the batches in.
        :param verbose:         Whether to show a progress bar. Default is False.

        :returns: The transcriptions outputs from the pipeline if no queue or batch processor is given, otherwise,
                  `None`.
        """
        # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with
        # Huggingface's pipelines as they preload each input while inference is running):
        def audio_iterator() -> Generator[Union[dict, str], None, None]:
            if self._per_channel_transcription:
                for audio_file in audio_files:
                    audio, sampling_rate = torchaudio.load(str(audio_file))
                    audio = audio.numpy()
                    for channel in audio:
                        yield {"raw": channel, "sampling_rate": sampling_rate}
            else:
                for audio_file in audio_files:
                    yield str(audio_file)

        # Create a batch iterator:
        def batch_iterator() -> Generator[List[Union[dict, str]], None, None]:
            batch = []
            for audio in audio_iterator():
                batch.append(audio)
                if len(batch) == self._batch_size:
                    yield batch
                    batch = []
            if batch:
                yield batch

        # Prepare the successes dataframe and errors dictionary to be returned:
        outputs = []

        # Infer through the pipeline:
        for input_batch in tqdm(
            batch_iterator() if self._batch_size > 1 else audio_iterator(),
            desc="Transcribing",
            unit="channel" if self._per_channel_transcription else "audio file",
            total=(
                (
                    (len(audio_files) // self._batch_size)
                    + (len(audio_files) % self._batch_size != 0)
                )
                * (self._per_channel_transcription or 1)
            ),
            disable=not verbose,
        ):
            # Infer:
            try:
                output_batch = self._transcription_pipeline(
                    input_batch,
                    generate_kwargs=self._generate_kwargs,
                )
            except Exception as exception:
                # Collect the exception:
                output_batch = str(exception)
                # Align to batch size:
                output_batch = (
                    [output_batch] * len(input_batch)
                    if isinstance(input_batch, list)
                    else [output_batch]
                )
            # To align with batching, if batch size is 1, wrap the output with a list:
            if isinstance(output_batch, dict):
                output_batch = [output_batch]
            # If a batch processor is given, process the batch:
            if batch_processor:
                # Process it directly:
                batch_processor.process_batch(batch=output_batch)
                batch_processor.do_tasks()
            elif batches_queue:
                # Otherwise, queue the batch:
                batches_queue.put(output_batch)
            else:
                # Otherwise, collect the output as is without processing:
                outputs.append(output_batch)

        # Check if given a multiprocessing queue or a batch processor:
        if batches_queue:
            batches_queue.put(_MULTIPROCESSING_STOP_MARK)

        return outputs if not batch_processor else None


#: The value to send into multiprocessing queues to stop the process:
_MULTIPROCESSING_STOP_MARK = "STOP"


def _multiprocessing_process_batches(
    batch_processor: BatchProcessor,
    batches_queue: Queue,
    tasks_queue: Queue,
    n_task_completers: int,
):
    """
    Process the batches in the given batches queue and put the tasks in the given tasks queue. The function will stop
    when the given batches queue receives the stop mark. It is aimed to be used with multiprocessing as a process.

    :param batch_processor:   A batch processor to process the batches.
    :param batches_queue:     A queue to get the batches from.
    :param tasks_queue:       A queue to put the tasks in.
    :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks`
                              function). A stop mark will be sent to the tasks queue for each task completer.
    """
    while True:
        # Get the batch:
        batch: List[dict] = batches_queue.get()
        if batch == _MULTIPROCESSING_STOP_MARK:
            break

        # Process the batch:
        batch_processor.process_batch(batch=batch)

        # Get the tasks:
        tasks = batch_processor.get_tasks()

        # Queue the tasks:
        for task in tasks:
            tasks_queue.put(task.to_tuple())

    # Mark the end of the batches:
    for _ in range(n_task_completers):
        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)


def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue):
    """
    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
    the given tasks queue receives the stop mark. It is aimed to be used with multiprocessing as a process.

    :param tasks_queue:   A queue to get the tasks from.
    :param results_queue: A queue to put the results in.
    """
    tasks_map = {
        BaseTask.__name__: BaseTask,
        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
        SpeechDiarizationPerChannelTask.__name__: SpeechDiarizationPerChannelTask,
    }

    while True:
        # Get the task:
        task = tasks_queue.get()
        if task == _MULTIPROCESSING_STOP_MARK:
            break

        # Reconstruct the task:
        task_class, task_kwargs = task
        task = tasks_map[task_class](**task_kwargs)

        # Complete the task:
        task.do_task()
        results_queue.put((task.is_failed(), task.get_result()))

    # Mark the end of the tasks:
    results_queue.put(_MULTIPROCESSING_STOP_MARK)


# Get the global logger:
_LOGGER = logging.getLogger()


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    # Check if MLRun is available, set the global logger to MLRun's:
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, str):
                    input_argument = _get_audio_files(
                        data_path=Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
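                # For example: with 10 inputs and size=3 workers, even_chunk_size is 3, so rank 0
                # takes indices [0:3], rank 1 takes [3:6], and the last rank takes the remainder,
                # [6:10].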
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Save the output directory of this worker:
            output_directory = Path(output[0])

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)

            # Join the data from all workers:
            if rank == 0:
                context.logger.info("Collecting data from workers to root worker.")

                # Check if there are different output directories:
                output_directories = set([Path(out_dir) for out_dir, _, _ in output])
                for r in range(1, size):
                    # True means the other workers should pass their files to the root worker (rank 0):
                    comm.send(len(output_directories) != 1, dest=r)

                # If there are different output directories, listen to the other workers:
                if len(output_directories) != 1:
                    # Collect the files from the other workers:
                    files = []
                    for r in range(1, size):
                        files.extend(comm.recv(source=r))
                    # Write the files to the root worker's output directory:
                    for file_name, file_content in files:
                        with open(output_directory / file_name, "w") as f:
                            f.write(file_content)

                # Concatenate the dataframes:
                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)

                # Concatenate the errors dictionaries:
                errors_dictionary = reduce(
                    operator.ior, [err for _, _, err in output], {}
                )

                return str(output_directory), dataframe, errors_dictionary

            # Listen to rank 0 to see if there are different output directories and this rank needs to send its files
            # to it:
            if comm.recv(source=0):
                files = []
                for file in os.listdir(output_directory):
                    with open(output_directory / file, "r") as f:
                        files.append((file, f.read()))
                comm.send(files, dest=0)
            return None

        return wrapper

    return decorator


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def transcribe(
    # Input / Output kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    output_directory: str = None,
    # Model loading kwargs:
    model_name: str = "openai/whisper-tiny",
    device: str = None,
    use_flash_attention_2: bool = None,
    use_better_transformers: bool = None,
    # Generation kwargs:
    assistant_model: str = None,
    max_new_tokens: int = 128,
    chunk_length_s: int = 30,
    batch_size: int = 8,
    spoken_language: str = None,
    translate_to_english: bool = False,
    # Diarization kwargs:
    speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None,
    speech_diarize_per_channel: int = None,
    speaker_labels: List[str] = None,
    # Other kwargs:
    use_multiprocessing: Union[bool, int] = False,
    verbose: bool = False,
):
    """
    Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed
    text files and a dataframe containing the following columns:

    * audio_file - The audio file path.
    * transcription_file - The transcribed text file name in the output directory.

    The transcription is based on Huggingface's ASR pipeline -
    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and
    is tested with OpenAI's Whisper models - https://huggingface.co/openai.

    If one of the speaker diarization parameters is given (either `speech_diarization` or
    `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker will
    be written in a separate line::

        speaker_1: text
        speaker_2: text
        speaker_1: text
        ...

    :param data_path:                  A directory of audio files or a single file or a list of files to transcribe.
    :param output_directory:           Path to a directory to save all transcribed audio files. If not given, will save
                                       the transcribed files in a temporary directory.
    :param model_name:                 The model name to use. Should be a model from OpenAI's Whisper models for
                                       best results (for example "tiny", "base", "large", etc.). See here for more
                                       information: https://huggingface.co/openai?search_models=whisper.
    :param device:                     The device to use for inference. If not given, will use GPU if available.
    :param use_flash_attention_2:      Whether to use the Flash Attention 2 implementation. It can be used only with
                                       one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
                                       will be available soon.

                                       Note: If both `use_flash_attention_2` and
                                       `use_better_transformers` are `None`, the optimization will be chosen
                                       automatically according to the available resources.

    :param use_better_transformers:    Whether to use the Better Transformers library to further optimize the model.
                                       Should be used for all use cases that do not support flash attention 2.

                                       Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                       `None`, the optimization will be chosen automatically according to the
                                       available resources.
    :param assistant_model:            The assistant model name to use for inference. Notice that the optimizations
                                       (flash attention 2 and better transformers) will be applied for the assistant as
                                       well. Should be a model from Huggingface's distil-whisper (see here for more
                                       information: https://github.com/huggingface/distil-whisper).

                                       Note: Currently an assistant model is only usable with batch size of 1.
    :param max_new_tokens:             The maximum number of new tokens to generate. This is used to limit the
                                       generation length. Default is 128 tokens.
    :param chunk_length_s:             The chunk length (in seconds) to split the audio into. Default is 30 seconds.
    :param batch_size:                 The batch size to use for inference. Default is 8.
    :param spoken_language:            The language spoken in the audio, to guide Whisper. If None, the language will
                                       be detected automatically.
    :param translate_to_english:       Whether to translate the transcriptions to English.
    :param speech_diarization:         A speech diarization dictionary with the file names to transcribe as keys and
                                       their diarization as value. The diarization is a list of tuples:
                                       (start, end, speaker). An example
                                       for a diarization dictionary::

                                       {
                                           "audio_file_name": [
                                               {
                                                   "start": 0.0,
                                                   "end": 2.0,
                                                   "speaker": "Agent",
                                               },
                                               {
                                                   "start": 2.0,
                                                   "end": 4.0,
                                                   "speaker": "Client",
                                               },
                                               ...
                                           ],
                                           ...
                                       }

                                       Note: The diarization must cover the entire duration of the audio file (as long
                                       as Whisper is predicting words up until then).
    :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to
                                       a separate channel in the audio. Notice: This will make the transcription
                                       slower as each channel will be transcribed separately. If a speech diarization
                                       is passed (via the `speech_diarization` parameter), this parameter is
                                       ignored.
    :param speaker_labels:             A list of speaker labels by channel order to use for writing the
                                       transcription with respect to per channel speech diarization. This won't be
                                       used together with a given speech diarization (via the `speech_diarization`
                                       parameter).
    :param use_multiprocessing:        Whether to use multiprocessing to transcribe the audio files. Can be either a
                                       boolean value or an integer. If `True`, will use the default amount of workers
                                       (3): 1 for transcription, 1 for batch processing and 1 for task completion (such
                                       as speech diarization and writing to files). To control the number of task
                                       completion workers, an integer can be provided to specify the number of workers.
                                       If `False`, a single process will be used. Default is `False`.
    :param verbose:                    Whether to print the progress of the transcription. Default is `False`.
    """
    global _LOGGER

    # Get the input audio files to transcribe:
    if verbose:
        _LOGGER.info("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    if verbose:
        _LOGGER.info(f"Collected {len(audio_files)} audio files.")

    # Get the output directory:
    if output_directory is None:
        if verbose:
            _LOGGER.info("No output directory given, using temporary directory.")
        output_directory = tempfile.mkdtemp()
    output_directory = Path(output_directory).absolute()
    output_directory.mkdir(exist_ok=True, parents=True)
    if verbose:
        _LOGGER.info(f"Transcriptions will be saved to: {output_directory}")

    # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization,
    # speech diarization per channel):
    if speech_diarization:
        batch_processor = SpeechDiarizationBatchProcessor(
            audio_files=audio_files,
            output_directory=output_directory,
            speech_diarization=speech_diarization,
        )
    elif speech_diarize_per_channel:
        batch_processor = PerChannelSpeechDiarizationBatchProcessor(
            audio_files=audio_files,
            output_directory=output_directory,
            n_channels=speech_diarize_per_channel,
            speakers=speaker_labels,
        )
    else:
        batch_processor = BatchProcessor(
            audio_files=audio_files,
            output_directory=output_directory,
        )

    # Initialize the transcription pipeline:
    transcriber = Transcriber(
        device=device,
        use_flash_attention_2=use_flash_attention_2,
        use_better_transformers=use_better_transformers,
        assistant_model=assistant_model,
        model_name=model_name,
        max_new_tokens=max_new_tokens,
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
        return_timestamps=(
            "word"
            if speech_diarization is not None or speech_diarize_per_channel is not None
            else False
        ),
        per_channel_transcription=speech_diarize_per_channel or 0,
        spoken_language=spoken_language,
        translate_to_english=translate_to_english,
    )

    # Run the transcription:
    if use_multiprocessing:
        results = _parallel_run(
            n_workers=use_multiprocessing
            if isinstance(use_multiprocessing, int)
            else 1,
            audio_files=audio_files,
            batch_processor=batch_processor,
            transcriber=transcriber,
            verbose=verbose,
        )
    else:
        results = _run(
            audio_files=audio_files,
            batch_processor=batch_processor,
            transcriber=transcriber,
            verbose=verbose,
        )

    # Process the results:
    if verbose:
        _LOGGER.info("Summarizing the results.")
    successes = []
    errors = {}
    for is_error, result in results:
        if is_error:
            errors[result[0]] = result[1]
        else:
            successes.append(result)
    successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"])
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(audio_files)})\n"
            f"Transcriptions summary:\n"
            f"{successes.head()}"
        )

    return str(output_directory), successes, errors
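
# Example usage of `transcribe` (a minimal sketch; the paths below are hypothetical):
#
#     transcriptions_dir, transcriptions_df, errors = transcribe(
#         data_path="./audio_calls",
#         output_directory="./transcriptions",
#         model_name="openai/whisper-tiny",
#         batch_size=8,
#         verbose=True,
#     )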


def _get_audio_files(
    data_path: Union[Path, str, list],
) -> List[Path]:
    """
    Get the audio files to transcribe. If a path to a directory is given, all files in the directory will be collected.

    :param data_path: The data path to collect the audio files from.

    :returns: The audio files list.
    """
    # Check if given a list of paths:
    if isinstance(data_path, list):
        audio_files = []
        for path in data_path:
            audio_files.extend(_get_audio_files(data_path=path))
        return audio_files

    # Check if given a single string path to cast it to a `pathlib.Path`:
    if isinstance(data_path, str):
        data_path = Path(data_path).absolute()

    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        audio_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        audio_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
            f"file. Given: {str(data_path)} "
        )

    return audio_files
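
# For example (hypothetical layout): given a directory "./audio_calls" containing "a.wav" and
# "b.mp3", `_get_audio_files("./audio_calls")` returns both file paths as `pathlib.Path` objects.
# A single file path yields a one-item list, and a list of paths is collected recursively.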


def _run(
    audio_files: List[Path],
    batch_processor: BatchProcessor,
    transcriber: Transcriber,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, str]]]:
    """
    Run the transcription without multiprocessing.

    :param audio_files:     The audio files to transcribe.
    :param batch_processor: The batch processor to use.
    :param transcriber:     The transcriber to use.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Load the transcription pipeline:
    if verbose:
        _LOGGER.info(f"Loading the transcription pipeline.")
    transcriber.load()
    if verbose:
        _LOGGER.info("Transcription pipeline loaded.")

    # Transcribe the files:
    transcriber.transcribe(
        audio_files=audio_files,
        batch_processor=batch_processor,
        verbose=verbose,
    )

    # Return the results:
    return batch_processor.get_results()


def _parallel_run(
    n_workers: int,
    audio_files: List[Path],
    batch_processor: BatchProcessor,
    transcriber: Transcriber,
    verbose: bool,
):
    """
    Run the transcription with multiprocessing.

    :param n_workers:       The amount of workers to use as task completers.
    :param audio_files:     The audio files to transcribe.
    :param batch_processor: The batch processor to use.
    :param transcriber:     The transcriber to use.
    :param verbose:         Verbosity.

    :returns: The collected results.
    """
    # Initialize the multiprocessing queues:
    batches_queue = Queue()
    tasks_queue = Queue()
    results_queue = Queue()
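    # Data flow: the main process transcribes and puts raw output batches into `batches_queue`;
    # the batch processing process turns each batch into tasks and puts them into `tasks_queue`;
    # each of the task completion processes executes tasks and puts (is_failed, result) tuples
    # into `results_queue`. Stop marks propagate through the queues in the same order so that
    # every consumer shuts down once transcription is done.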

    # Initialize the multiprocessing processes:
    batch_processing_process = Process(
        target=_multiprocessing_process_batches,
        kwargs={
            "batch_processor": batch_processor,
            "batches_queue": batches_queue,
            "tasks_queue": tasks_queue,
            "n_task_completers": n_workers,
        },
    )
    task_completion_processes = [
        Process(
            target=_multiprocessing_complete_tasks,
            kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue},
        )
        for _ in range(n_workers)
    ]

    # Start the multiprocessing processes:
    batch_processing_process.start()
    for p in task_completion_processes:
        p.start()

    # Load the transcription pipeline:
    if verbose:
        _LOGGER.info(f"Loading the transcription pipeline.")
    transcriber.load()
    if verbose:
        _LOGGER.info("Transcription pipeline loaded.")

    # Transcribe the files:
    transcriber.transcribe(
        audio_files=audio_files, batches_queue=batches_queue, verbose=verbose
    )

    # Collect the results:
    results = []
    stop_marks_counter = 0
    while True:
        # Get a result from the queue:
        result: Tuple[bool, Tuple[str, str]] = results_queue.get()
        if result == _MULTIPROCESSING_STOP_MARK:
            stop_marks_counter += 1
            if stop_marks_counter == n_workers:
                break
        else:
            # Collect the result:
            results.append(result)

    # Wait for the processes to finish:
    batch_processing_process.join()
    for p in task_completion_processes:
        p.join()

    return results
    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - transformers
    +    - tqdm
    +    - torchaudio
    +    - torch
    +    - accelerate
    +  entry_points:
    +    do_task:
    +      name: do_task
    +      doc: Try to perform the task, storing an error if one occurred.
    +      parameters:
    +      - name: self
    +      outputs: []
    +      lineno: 348
    +      has_varargs: false
    +      has_kwargs: false
    +    is_failed:
    +      name: is_failed
    +      doc: Check if the task failed.
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: Whether the task failed.
    +        type: bool
    +      lineno: 70
    +      has_varargs: false
    +      has_kwargs: false
    +    get_result:
    +      name: get_result
    +      doc: 'Get the result of the task. If the task failed, the error will be returned,
    +        otherwise, the result will be the
    +
    +        text file name.'
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The task's result.
    +        type: Tuple[str, str]
    +      lineno: 78
    +      has_varargs: false
    +      has_kwargs: false
    +    to_tuple:
    +      name: to_tuple
    +      doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing
    +        to pass in queue).
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The converted task.
    +        type: Tuple[str, dict]
    +      lineno: 358
    +      has_varargs: false
    +      has_kwargs: false
    +    transcription_output_channels:
    +      name: transcription_output_channels
    +      doc: Get the transcription output channels.
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The transcription output channels.
    +        type: List[Tuple[str, dict]]
    +      lineno: 340
    +      has_varargs: false
    +      has_kwargs: false
    +    process_batch:
    +      name: process_batch
    +      doc: 'Process a batch of transcriptions. Tasks related to the given batch will
    +        be created and stored in the batch
    +
    +        processor.'
    +      parameters:
    +      - name: self
    +      - name: batch
    +        type: List[dict]
    +        doc: The batch of transcriptions to process.
    +      outputs: []
    +      lineno: 575
    +      has_varargs: false
    +      has_kwargs: false
    +    get_tasks:
    +      name: get_tasks
    +      doc: Get the tasks to perform.
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The tasks to perform.
    +        type: List[BaseTask]
    +      lineno: 453
    +      has_varargs: false
    +      has_kwargs: false
    +    do_tasks:
    +      name: do_tasks
    +      doc: Perform the tasks. Should be used if no multiprocessing queue is given
    +        to a transcriber.
    +      parameters:
    +      - name: self
    +      outputs: []
    +      lineno: 463
    +      has_varargs: false
    +      has_kwargs: false
    +    get_results:
    +      name: get_results
    +      doc: Get the results of the tasks. The stored results are then cleared.
    +      parameters:
    +      - name: self
    +      outputs:
    +      - doc: The results of the tasks.
    +        type: List[Tuple[bool, Tuple[str, str]]]
    +      lineno: 471
    +      has_varargs: false
    +      has_kwargs: false
    +    load:
    +      name: load
    +      doc: Load the transcriber. Must be called before transcribing.
    +      parameters:
    +      - name: self
    +      outputs: []
    +      lineno: 695
    +      has_varargs: false
    +      has_kwargs: false
    +    transcribe:
    +      name: transcribe
    +      doc: "Transcribe audio files into text files and collect additional data. The\
    +        \ end result is a directory of transcribed\ntext files and a dataframe containing\
    +        \ the following columns:\n\n* audio_file - The audio file path.\n* transcription_file\
    +        \ - The transcribed text file name in the output directory.\n\nThe transcription\
    +        \ is based on Huggingface's ASR pipeline -\nhttps://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline\
    +        \ and\nis tested with OpenAI's Whisper models - https://huggingface.co/openai.\n\
    +        \nIf one of the speaker diarization parameters are given (either `speech_diarization`\
    +        \ or\n`speech_diarize_per_channel`), the transcription will be written in\
    +        \ a conversation format, where each speaker will\nbe written in a separate\
    +        \ line::\n\n    speaker_1: text\n    speaker_2: text\n    speaker_1: text\n\
    +        \    ..."
    +      parameters:
    +      - name: data_path
    +        type: Union[str, Path, List[Union[str, Path]]]
    +        doc: A directory of audio files or a single file or a list of files to transcribe.
    +      - name: output_directory
    +        type: str
    +        doc: Path to a directory to save all transcribed audio files. If not given,
    +          will save the transcribed files in a temporary directory.
    +        default: null
    +      - name: model_name
    +        type: str
    +        doc: 'The model name to use. Should be a model from OpenAI''s Whisper
    +          models for best results (for example "tiny", "base", "large", etc.). See
    +          here for more information: https://huggingface.co/openai?search_models=whisper.'
    +        default: openai/whisper-tiny
    +      - name: device
    +        type: str
    +        doc: The device to use for inference. If not given, will use GPU if available.
    +        default: null
    +      - name: use_flash_attention_2
    +        type: bool
    +        doc: 'Whether to use the Flash Attention 2 implementation. It can be used
    +          only with one of the following GPUs: Nvidia H series and Nvidia A series.
    +          T4 support will be available soon.'
    +        default: null
    +      - name: use_better_transformers
    +        type: bool
    +        doc: Whether to use the Better Transformers library to further optimize the
    +          model. Should be used for all use cases that do not support flash attention
    +          2.
    +        default: null
    +      - name: assistant_model
    +        type: str
    +        doc: 'The assistant model name to use for inference. Notice that the optimizations
    +          (flash attention 2 and better transformers) will be applied for the assistant
    +          as well. Should be a model from Huggingface''s distil-whisper (see here
    +          for more information: https://github.com/huggingface/distil-whisper).'
    +        default: null
    +      - name: max_new_tokens
    +        type: int
    +        doc: The maximum number of new tokens to generate. This is used to limit the
    +          generation length. Default is 128 tokens.
    +        default: 128
    +      - name: chunk_length_s
    +        type: int
    +        doc: The chunk length (in seconds) to split the audio into. Default is 30 seconds.
    +        default: 30
    +      - name: batch_size
    +        type: int
    +        doc: The batch size to use for inference. Default is 8.
    +        default: 8
    +      - name: spoken_language
    +        type: str
    +        doc: The language spoken in the audio, to guide Whisper. If None, the language
    +          will be detected automatically.
    +        default: null
    +      - name: translate_to_english
    +        type: bool
    +        doc: Whether to translate the transcriptions to English.
    +        default: false
    +      - name: speech_diarization
    +        type: Dict[str, List[Tuple[float, float, str]]]
    +        doc: 'A speech diarization dictionary with the file names to transcribe as
    +          keys and their diarization as value. The diarization is a list of tuples:
    +          (start, end, speaker). An example for a diarization dictionary::'
    +        default: null
    +      - name: speech_diarize_per_channel
    +        type: int
    +        doc: 'Perform speech diarization per channel. Each speaker is expected to
    +          belong to a separate channel in the audio. Notice: This will make the transcription
    +          slower as each channel will be transcribed separately. If a speech diarization
    +          is passed (via the `speech_diarization` parameter), this parameter is ignored.'
    +        default: null
    +      - name: speaker_labels
    +        type: List[str]
    +        doc: A list of speaker labels by channel order to use for writing the transcription
    +          with respect to per channel speech diarization. This won't be used together
    +          with a given speech diarization (via the `speech_diarization` parameter).
    +        default: null
    +      - name: use_multiprocessing
    +        type: Union[bool, int]
    +        doc: 'Whether to use multiprocessing to transcribe the audio files. Can be
    +          either a boolean value or an integer. If `True`, will use the default amount
    +          of workers (3): 1 for transcription, 1 for batch processing and 1 for task
    +          completion (such as speech diarization and writing to files). To control
    +          the number of task completion workers, an integer can be provided to specify
    +          the number of workers. If `False`, a single process will be used. Default is `False`.'
    +        default: false
    +      - name: verbose
    +        type: bool
    +        doc: Whether to print the progress of the transcription. Default is `False`.
    +        default: false
    +      outputs: []
    +      lineno: 1097
    +      has_varargs: false
    +      has_kwargs: false
    +    audio_iterator:
    +      name: audio_iterator
    +      doc: ''
    +      parameters: []
    +      outputs:
    +      - type: Generator[Union[dict, str], None, None]
    +      lineno: 804
    +      has_varargs: false
    +      has_kwargs: false
    +    batch_iterator:
    +      name: batch_iterator
    +      doc: ''
    +      parameters: []
    +      outputs:
    +      - type: Generator[List[Union[dict, str]], None, None]
    +      lineno: 816
    +      has_varargs: false
    +      has_kwargs: false
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      doc: ''
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      outputs: []
    +      lineno: 957
    +      has_varargs: false
    +      has_kwargs: false
    +    decorator:
    +      name: decorator
    +      doc: ''
    +      parameters:
    +      - name: handler
    +      outputs: []
    +      lineno: 969
    +      has_varargs: false
    +      has_kwargs: false
    +    wrapper:
    +      name: wrapper
    +      doc: ''
    +      parameters: []
    +      outputs: []
    +      lineno: 974
    +      has_varargs: false
    +      has_kwargs: true
    +  description: Transcribe audio files into text files
    +  default_handler: transcribe
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false
    +
    +        
    +    
\ No newline at end of file
diff --git a/functions/master/transcribe/1.1.0/static/item.html b/functions/master/transcribe/1.1.0/static/item.html
new file mode 100644
index 00000000..fddc2e3e
--- /dev/null
+++ b/functions/master/transcribe/1.1.0/static/item.html
@@ -0,0 +1,53 @@
    +        
    +apiVersion: v1
    +categories:
    +- data-preparation
    +- genai
    +- huggingface
    +- machine-learning
    +description: Transcribe audio files into text files
    +doc: ''
    +example: transcribe.ipynb
    +generationDate: 2023-07-13:11-20
    +hidden: false
    +icon: ''
    +labels:
    +  author: yonatans
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.5.1
    +name: transcribe
    +platformVersion: 3.5.3
    +spec:
    +  filename: transcribe.py
    +  handler: transcribe
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - transformers
    +    - tqdm
    +    - torchaudio
    +    - torch
    +    - accelerate
    +url: ''
    +version: 1.1.0
    +        
    +    
\ No newline at end of file
diff --git a/functions/master/transcribe/1.1.0/static/source.html b/functions/master/transcribe/1.1.0/static/source.html
new file mode 100644
index 00000000..534b397d
--- /dev/null
+++ b/functions/master/transcribe/1.1.0/static/source.html
@@ -0,0 +1,1485 @@
    +        
    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +import operator
    +import os
    +import tempfile
    +from functools import reduce, wraps
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union
    +
    +import pandas as pd
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +from transformers import (
    +    AutomaticSpeechRecognitionPipeline,
    +    AutoModelForCausalLM,
    +    pipeline,
    +)
    +from transformers.utils import is_flash_attn_2_available
    +
    +
    +class BaseTask:
    +    """
    +    A task to write the transcription to file.
    +    """
    +
    +    def __init__(
    +        self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path
    +    ):
    +        """
    +        Initialize the task.
    +
    +        :param audio_file:           Path to the audio file that was transcribed.
    +        :param transcription_output: The transcription output from the pipeline. String means an exception was raised.
    +        :param text_file:            Path to the text file to write the transcription to.
    +        """
    +        # Store the parameters:
    +        self._audio_file = audio_file
    +        self._transcription_output = transcription_output
    +        self._text_file = text_file
    +
    +        # Prepare the error variable:
    +        self._error: str = None
    +
    +    def do_task(self):
    +        """
    +        Try to perform the task, storing an error if one occurred.
    +        """
    +        if isinstance(self._transcription_output, str):
    +            self._error = self._transcription_output
    +            return
    +        try:
    +            self._do_task()
    +        except Exception as exception:
    +            self._error = str(exception)
    +
    +    def is_failed(self) -> bool:
    +        """
    +        Check if the task failed.
    +
    +        :returns: Whether the task failed.
    +        """
    +        return self._error is not None
    +
    +    def get_result(self) -> Tuple[str, str]:
    +        """
    +        Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the
    +        text file name.
    +
    +        :returns: The task's result.
    +        """
    +        if self.is_failed():
    +            return self._audio_file.name, self._error
    +        return self._audio_file.name, self._text_file.name
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        return self.__class__.__name__, {
    +            "audio_file": self._audio_file,
    +            "transcription_output": self._transcription_output,
    +            "text_file": self._text_file,
    +        }
    +
    +    def _do_task(self):
    +        """
    +        Perform the task - write the transcription to the stored file path.
    +        """
    +        # Check for duplications to avoid overwriting an existing file:
    +        i = 1
    +        while self._text_file.exists():
    +            i += 1
    +            self._text_file = (
    +                self._text_file.parent
    +                / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}"
    +            )
    +
    +        # Make sure all directories are created:
    +        self._text_file.parent.mkdir(exist_ok=True, parents=True)
    +
    +        # Write to file:
    +        with open(self._text_file, "w") as fp:
    +            fp.write(self._transcription_output["text"])
    +
    +
    +class SpeechDiarizationTask(BaseTask):
    +    """
    +    A task to write the transcription to file with respect to a given speech diarization.
    +    """
    +
    +    class _DiarizationSegment(NamedTuple):
    +        """
    +        A speech diarization segment.
    +        """
    +
    +        start: float
    +        end: float
    +        speaker: str
    +
    +    class _WordTimestamp(NamedTuple):
    +        """
    +        A word with its start and end timestamps.
    +        """
    +
    +        start: float
    +        end: float
    +        text: str
    +
    +    def __init__(
    +        self,
    +        audio_file: Path,
    +        transcription_output: dict,
    +        text_file: Path,
    +        speech_diarization: List[Tuple[float, float, str]],
    +    ):
    +        """
    +        Initialize the task.
    +
    +        :param audio_file:           Path to the audio file that was transcribed.
    +        :param transcription_output: The transcription output from the pipeline.
    +        :param text_file:            Path to the text file to write the transcription to.
    +        :param speech_diarization:   A speech diarization as a list of tuples: (start, end, speaker).
    +        """
    +        super().__init__(
    +            audio_file=audio_file,
    +            transcription_output=transcription_output,
    +            text_file=text_file,
    +        )
    +        self._speech_diarization = speech_diarization
    +        self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None
    +        self._last_chosen_index = 0
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        task_class, task_kwargs = super().to_tuple()
    +        return task_class, {
    +            **task_kwargs,
    +            "speech_diarization": self._speech_diarization,
    +        }
    +
    +    def _do_task(self):
    +        """
    +        Perform the task - write the transcription to the stored file path with respect to the given speech diarization.
    +        """
    +        # Check if a speech diarization is given, if not, just write the transcription to file:
    +        if not self._speech_diarization:
    +            super()._do_task()
    +            return
    +
    +        # Cast the chunks to word timestamps tuples:
    +        words = [
    +            SpeechDiarizationTask._WordTimestamp(
    +                start=chunk["timestamp"][0],
    +                end=chunk["timestamp"][1],
    +                text=chunk["text"],
    +            )
    +            for chunk in self._transcription_output["chunks"]
    +        ]
    +
    +        # Cast speech diarization to segments tuples:
    +        self._segments = [
    +            SpeechDiarizationTask._DiarizationSegment(*segment)
    +            for segment in self._speech_diarization
    +        ]
    +
    +        # Try to match the Whisper model predicted timestamps to the closest diarization segment (closest diarization
    +        # segment will be the most overlapping with the word, and if there is no overlap, the closest segment to the
    +        # word):
    +        speaker = self._segments[self._last_chosen_index].speaker
    +        text = f"{speaker}:"
    +        for word in words:
    +            # Get the next diarization segment:
    +            self._get_next_segment(word=word)
    +            # Check if the segment is of the same speaker:
    +            if self._segments[self._last_chosen_index].speaker == speaker:
    +                # Collect the word:
    +                text += word.text
    +            else:
    +                # Append a newline and update the new speaker:
    +                speaker = self._segments[self._last_chosen_index].speaker
    +                text += f"\n{speaker}:{word.text}"
    +
    +        # Update the transcription output with the new text to write it to file:
    +        self._transcription_output["text"] = text
    +        super()._do_task()
    +
    +    def _get_next_segment(
    +        self,
    +        word: _WordTimestamp,
    +    ):
    +        """
    +        Get the next diarization segment the given word falls into. The `self._last_chosen_index` will be updated
    +        accordingly.
    +
    +        :param word: The word timestamp to match to the next segment.
    +        """
    +        # If the last chosen segment is the last segment, return it:
    +        if self._last_chosen_index == len(self._segments) - 1:
    +            return
    +
    +        # Get the last chosen diarization segment:
    +        last_chosen = self._segments[self._last_chosen_index]
    +
    +        # None value may appear if the word is the last word in the audio file, or it was split during inference. In
    +        # that case, we'll set the last segment:
    +        if word.end is None:
    +            self._last_chosen_index = len(self._segments) - 1
    +            return
    +
    +        # If the word ends before the last chosen segment:
    +        if word.end <= last_chosen.start:
    +            # Then it is still the closest segment
    +            return
    +
    +        # We check if it ends inside the last chosen segment:
    +        if word.end < last_chosen.end:
    +            # Then it still is the closest segment
    +            return
    +
    +        # The word ends after the segment, we need to collect all next segments up until the word ends before them:
    +        possible_segments = [self._last_chosen_index]
    +        for i in range(self._last_chosen_index + 1, len(self._segments)):
    +            if word.end > self._segments[i].end:
    +                possible_segments.append(i)
    +                continue
    +            possible_segments.append(i)
    +            break
    +
    +        # Check for the most overlapping option:
    +        best_overlap = 0
    +        most_overlapping_segment_index = None
    +        for i in possible_segments:
    +            # If the word starts before segment:
    +            if word.start <= self._segments[i].start:
    +                # If it ends before the segment, there is an overlap from the start of the segment to the end of the
    +                # word:
    +                if word.end < self._segments[i].end:
    +                    overlap = word.end - self._segments[i].start
    +                else:
    +                    # The word is wrapping the segment, the overlap is the segment's length:
    +                    overlap = self._segments[i].end - self._segments[i].start
    +            # The word starts in segment, check if the word ends in it:
    +            elif word.end < self._segments[i].end:
    +                # The overlap is the word's length:
    +                overlap = word.end - word.start
    +            # The word start in segment but ends after it, the overlap is from the word's start to the segment's end:
    +            else:
    +                overlap = self._segments[i].end - word.start
    +            # Check for new best overlap:
    +            if overlap > best_overlap:
    +                best_overlap = overlap
    +                most_overlapping_segment_index = i
    +        if most_overlapping_segment_index is not None:
    +            self._last_chosen_index = most_overlapping_segment_index
    +            return
    +
    +        # If there is no overlapping segment, return the closest segment:
    +        best_distance = None
    +        closest_segment_index = None
    +        for i in possible_segments:
    +            distance = (
    +                word.start - self._segments[i].end
    +                if word.start > self._segments[i].end
    +                else self._segments[i].start - word.end
    +            )
    +            if best_distance is None or distance < best_distance:
    +                best_distance = distance
    +                closest_segment_index = i
    +        self._last_chosen_index = closest_segment_index
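    +
    +    # Worked example of the matching above (illustrative numbers): given the segments
    +    # [(0.0, 2.0, "Agent"), (2.0, 4.0, "Client")] and a word spanning (1.8, 3.0), the overlap
    +    # with the first segment is 2.0 - 1.8 = 0.2 and with the second is 3.0 - 2.0 = 1.0, so the
    +    # word is attributed to the "Client" segment.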
    +
    +
    +class SpeechDiarizationPerChannelTask(BaseTask):
    +    """
    +    A task to write the transcription to file with respect to a given speech diarization per channel.
    +    """
    +
    +    class _WordTimestamp(NamedTuple):
    +        """
    +        A word with its start and end timestamps and speaker label (channel the word was taken from).
    +        """
    +
    +        start: float
    +        end: float
    +        speaker: str
    +        text: str
    +
    +    def __init__(self, audio_file: Path, text_file: Path):
    +        """
    +        Initialize the task.
    +
    +        :param audio_file: Path to the audio file that was transcribed.
    +        :param text_file:  Path to the text file to write the transcription to.
    +        """
    +        super().__init__(
    +            audio_file=audio_file, transcription_output={}, text_file=text_file
    +        )
    +        self._transcription_output_channels: List[Tuple[str, dict]] = []
    +
    +    @property
    +    def transcription_output_channels(self) -> List[Tuple[str, dict]]:
    +        """
    +        Get the transcription output channels.
    +
    +        :returns: The transcription output channels.
    +        """
    +        return self._transcription_output_channels
    +
    +    def do_task(self):
    +        """
    +        Try to perform the task, storing an error if one occurred.
    +        """
    +        for _, channel_output in self._transcription_output_channels:
    +            if isinstance(channel_output, str):
    +                self._error = channel_output
    +                return
    +        super().do_task()
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        task_class, task_kwargs = super().to_tuple()
    +        task_kwargs.pop("transcription_output")
    +        return task_class, task_kwargs
    +
    +    def _do_task(self):
    +        """
    +        Perform the task - write the transcription to the stored file path with respect to the given speech diarization
    +        per channel.
    +        """
    +        # Cast the chunks to word timestamps tuples:
    +        words_per_channel = [
    +            [
    +                SpeechDiarizationPerChannelTask._WordTimestamp(
    +                    start=chunk["timestamp"][0],
    +                    end=chunk["timestamp"][1],
    +                    speaker=speaker,
    +                    text=chunk["text"],
    +                )
    +                for chunk in output["chunks"]
    +            ]
    +            for speaker, output in self._transcription_output_channels
    +        ]
    +
    +        # Merge and sort the words per channel by their start time:
    +        words = operator.add(*words_per_channel)
    +        words.sort()
    +
    +        # Write the transcription to file:
    +        current_speaker = words[0].speaker
    +        text = f"{current_speaker}:"
    +        for word in words:
    +            # Check if the word's speaker is different from the current one:
    +            if word.speaker != current_speaker:
    +                # Append a newline and update the new speaker:
    +                current_speaker = word.speaker
    +                text += f"\n{current_speaker}:"
    +            # Collect the word:
    +            text += word.text
    +
    +        # Update the transcription output with the new text to write it to file:
    +        self._transcription_output["text"] = text
    +        super()._do_task()
    +
    +
    +class BatchProcessor:
    +    """
    +    A batch processor to process batches of transcriptions. The batch processor creates tasks and is meant to work
    +    alongside the transcriber. It can be used with a multiprocessing queue or can run the tasks directly using the
    +    associated methods.
    +    """
    +
    +    def __init__(self, audio_files: List[Path], output_directory: Path):
    +        """
    +        Initialize the batch processor.
    +
    +        :param audio_files:      The list of all audio files to transcribe.
    +        :param output_directory: The output directory to write the transcriptions to.
    +        """
    +        # Store the parameters:
    +        self._audio_files = audio_files
    +        self._output_directory = output_directory
    +
    +        # Prepare the batching variables:
    +        self._current_file_index = 0
    +        self._tasks: List[BaseTask] = []
    +        self._results: List[Tuple[bool, Tuple[str, str]]] = []
    +
    +    def process_batch(self, batch: List[Union[dict, str]]):
    +        """
    +        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
    +        processor.
    +
    +        :param batch: The batch of transcriptions to process.
    +        """
    +        # Get the relevant files belonging to the given batch:
    +        current_files = self._get_current_files(batch_size=len(batch))
    +
    +        # Build the diarization tasks:
    +        self._tasks.extend(
    +            [
    +                BaseTask(
    +                    audio_file=file,
    +                    transcription_output=batch[i],
    +                    text_file=self._output_directory / f"{file.stem}.txt",
    +                )
    +                for i, file in enumerate(current_files)
    +            ]
    +        )
    +
    +    def get_tasks(self) -> List[BaseTask]:
    +        """
    +        Get the tasks to perform.
    +
    +        :returns: The tasks to perform.
    +        """
    +        tasks = self._tasks
    +        self._tasks = []
    +        return tasks
    +
    +    def do_tasks(self):
    +        """
    +        Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.
    +        """
    +        for task in self.get_tasks():
    +            task.do_task()
    +            self._results.append((task.is_failed(), task.get_result()))
    +
    +    def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]:
    +        """
    +        Get the results of the tasks. The stored results are then cleared.
    +
    +        :returns: The results of the tasks.
    +        """
    +        results = self._results
    +        self._results = []
    +        return results
    +
    +    def _get_current_files(self, batch_size: int) -> List[Path]:
    +        """
    +        Get the current files to process.
    +
    +        :param batch_size: The batch size to progress the current file index.
    +
    +        :returns: The current files to process.
    +        """
    +        end_index = (
    +            self._current_file_index + batch_size
    +            if self._current_file_index + batch_size < len(self._audio_files)
    +            else len(self._audio_files)
    +        )
    +        current_files = self._audio_files[self._current_file_index : end_index]
    +        self._current_file_index = end_index
    +        return current_files
    +
    +
    +class SpeechDiarizationBatchProcessor(BatchProcessor):
    +    """
    +    A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch
    +    processor creates tasks and is meant to work alongside the transcriber. It can be used with a multiprocessing
    +    queue or can run the tasks directly using the associated methods.
    +    """
    +
    +    def __init__(
    +        self, audio_files: List[Path], output_directory: Path, speech_diarization: dict
    +    ):
    +        """
    +        Initialize the batch processor.
    +
    +        :param audio_files:        The list of all audio files to transcribe.
    +        :param output_directory:   The output directory to write the transcriptions to.
    +        :param speech_diarization: A speech diarization dictionary to pass along with each processed batch.
    +        """
    +        super().__init__(audio_files=audio_files, output_directory=output_directory)
    +        self._speech_diarization = speech_diarization
    +        self._audio_files = audio_files
    +
    +    def process_batch(self, batch: List[dict]):
    +        """
    +        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
    +        processor.
    +
    +        :param batch: The batch of transcriptions to process.
    +        """
    +        # Get the relevant files belonging to the given batch:
    +        current_files = self._get_current_files(batch_size=len(batch))
    +
    +        # Build the diarization tasks:
    +        self._tasks.extend(
    +            [
    +                SpeechDiarizationTask(
    +                    audio_file=file,
    +                    transcription_output=batch[i],
    +                    text_file=self._output_directory / f"{file.stem}.txt",
    +                    speech_diarization=self._speech_diarization.get(file.name),
    +                )
    +                for i, file in enumerate(current_files)
    +            ]
    +        )
    +
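+# For example (hypothetical values), a processor constructed with
+# `speech_diarization={"a.wav": [(0.0, 2.0, "Agent"), (2.0, 4.0, "Client")]}` attaches that segment list to the task
+# created for "a.wav"; files missing from the dictionary get `None` and fall back to plain transcription writing.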
    +
    +class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor):
    +    """
+    A batch processor to process batches of transcriptions per channel. The batch processor creates tasks covering the
+    given number of channels and is designed to work alongside the transcriber. It can be used with a multiprocessing
+    queue, or the tasks can be run directly using the associated methods.
    +    """
    +
    +    def __init__(
    +        self,
    +        audio_files: List[Path],
    +        output_directory: Path,
    +        n_channels: int,
    +        speakers: List[str],
    +    ):
    +        """
    +        Initialize the batch processor.
    +
    +        :param audio_files:      The list of all audio files to transcribe.
    +        :param output_directory: The output directory to write the transcriptions to.
    +        :param n_channels:       The number of channels in each audio file to transcribe.
+        :param speakers:         The speaker labels to use for each channel.
    +        """
    +        super().__init__(audio_files=audio_files, output_directory=output_directory)
    +
    +        # Store the parameters:
    +        self._n_channels = n_channels
    +        self._speakers = speakers
    +
+        # Prepare a buffer to hold the task in process until all of its channels are collected:
    +        self._task_in_process: SpeechDiarizationPerChannelTask = None
    +
    +    def process_batch(self, batch: List[dict]):
    +        """
    +        Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch
    +        processor.
    +
    +        :param batch: The batch of transcriptions to process.
    +        """
    +        # Go over the batch and create the tasks:
    +        for output in batch:
    +            # Check if there is a task in process:
    +            if not self._task_in_process:
    +                # Create a new task:
    +                self._task_in_process = SpeechDiarizationPerChannelTask(
    +                    audio_file=self._audio_files[self._current_file_index],
    +                    text_file=self._output_directory
    +                    / f"{self._audio_files[self._current_file_index].stem}.txt",
    +                )
    +            # Get the channel's speaker:
    +            speaker = self._speakers[
    +                len(self._task_in_process.transcription_output_channels)
    +            ]
    +            # Collect the channel into the processed task:
    +            self._task_in_process.transcription_output_channels.append(
    +                (speaker, output)
    +            )
    +            # Check if the task is fully covered (all channels are collected):
    +            if (
    +                len(self._task_in_process.transcription_output_channels)
    +                == self._n_channels
    +            ):
    +                # Collect the task and reset the task in process:
    +                self._tasks.append(self._task_in_process)
    +                self._current_file_index += 1
    +                self._task_in_process = None
    +
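+# For example (hypothetical values), with `n_channels=2` and `speakers=["Agent", "Client"]`, every two consecutive
+# pipeline outputs are collected into a single `SpeechDiarizationPerChannelTask`: the first output of each pair is
+# labeled "Agent" and the second "Client". Only then is the task stored and the file index advanced.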
    +
    +class Transcriber:
    +    """
+    A transcription wrapper for Huggingface's ASR pipeline -
+    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline -
+    to use with OpenAI's Whisper models - https://huggingface.co/openai.
    +    """
    +
    +    def __init__(
    +        self,
    +        model_name: str,
    +        device: str = None,
    +        use_flash_attention_2: bool = None,
    +        use_better_transformers: bool = None,
    +        assistant_model: str = None,
    +        max_new_tokens: int = 128,
    +        chunk_length_s: int = 30,
    +        batch_size: int = 2,
    +        spoken_language: str = None,
    +        translate_to_english: bool = False,
    +        return_timestamps: Union[bool, Literal["word"]] = False,
    +        per_channel_transcription: int = 0,
    +    ):
    +        """
    +        Initialize the transcriber.
    +
+        :param model_name:                The model name to use. Should be a model from OpenAI's Whisper models for
    +                                          best results (for example "tiny", "base", "large", etc.).
    +        :param device:                    The device to use for inference. If not given, will use GPU if available.
    +        :param use_flash_attention_2:     Whether to use the Flash Attention 2 implementation. It can be used only with
    +                                          one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
    +                                          will be available soon.
    +
    +                                          Note: If both `use_flash_attention_2` and
    +                                          `use_better_transformers` are `None`, the optimization will be chosen
    +                                          automatically according to the available resources.
    +
    +        :param use_better_transformers:   Whether to use the Better Transformers library to further optimize the model.
    +                                          Should be used for all use cases that do not support flash attention 2.
    +
    +                                          Note: If both `use_flash_attention_2` and `use_better_transformers` are
    +                                          `None`, the optimization will be chosen automatically according to the
    +                                          available resources.
+        :param assistant_model:          The assistant model name to use for inference. Notice that the optimizations
    +                                          (flash attention 2 and better transformers) will be applied for the assistant
    +                                          as well. Should be a model from Huggingface's distil-whisper (see here for
    +                                          more information: https://github.com/huggingface/distil-whisper).
    +        :param max_new_tokens:            The maximum number of new tokens to generate. This is used to limit the
    +                                          generation length. Default is 128 tokens.
    +        :param chunk_length_s:            The audio chunk to split the audio to (in seconds). Default is 30 seconds.
    +        :param batch_size:                The batch size to use for inference. Default is 2.
+        :param spoken_language:           The language spoken in the audio, used to guide Whisper. If None, it will try
+                                          to detect the language for each chunk.
    +        :param translate_to_english:      Whether to translate the transcriptions to English. Default is False.
    +        :param return_timestamps:         Whether to return the timestamps of the words. If "word", will return the
+                                          timestamps of each word. If True, will return the timestamps of each chunk.
    +                                          Default is False. Aimed to be used for speech diarization.
    +        :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel
    +                                          transcription, pass the number of channels expected for each audio file here.
    +                                          0 means regular transcription (merge channels).
    +
+                                          Note: If `per_channel_transcription` is not 0, `batch_size` is treated as
+                                          the number of channels and not audio files. Aimed to be used for per
    +                                          channel speech diarization.
    +        """
    +        # Store loading parameters:
    +        self._model_name = model_name
    +        self._device = device
    +        self._use_flash_attention_2 = use_flash_attention_2
    +        self._use_better_transformers = use_better_transformers
    +        self._max_new_tokens = max_new_tokens
    +        self._chunk_length_s = chunk_length_s
    +        self._batch_size = batch_size
    +        self._return_timestamps = return_timestamps
    +        self._per_channel_transcription = per_channel_transcription
    +
    +        # Store generation parameters:
    +        self._assistant_model = assistant_model
    +        self._spoken_language = spoken_language
    +        self._translate_to_english = translate_to_english
    +
    +        # Prepare the transcription objects:
    +        self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None
    +        self._generate_kwargs: dict = None
    +
    +    def load(self):
    +        """
    +        Load the transcriber. Must be called before transcribing.
    +        """
    +        # Set the device and data type to use (prefer GPU if available):
+        device = torch.device(
+            self._device or ("cuda" if torch.cuda.is_available() else "cpu")
+        )
    +        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
    +
    +        # Choose the optimization to use (in case the user did not specify any):
    +        if (
    +            self._use_flash_attention_2 is None
    +            and self._use_better_transformers is None
    +        ):
    +            # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture
    +            # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla):
    +            if device.type == "cuda" and is_flash_attn_2_available():
    +                cuda_device_name = torch.cuda.get_device_properties(device).name
    +                if any(
    +                    cuda_device_name.startswith(gpu_name)
    +                    for gpu_name in [
    +                        "NVIDIA A",  # For Ampere architecture (e.g. A10, A30, A100)
    +                        "NVIDIA H",  # For Hopper architecture (e.g. H100)
    +                        "NVIDIA L",  # For Ada Lovelace architecture (e.g. L4, L40)
    +                        "NVIDIA RTX 30",  # For Ada Lovelace architecture (RTX 30 series)
    +                        "NVIDIA RTX 40",  # For Ada Lovelace architecture (RTX 40 series)
    +                        "NVIDIA RTX 50",  # For Ada Lovelace architecture (RTX 50 series)
    +                        # Will be supported soon according to FlashAttention GitHub repo:
    +                        # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
    +                        # "NVIDIA T4",  # For Turing architecture (only T4)
    +                        # "NVIDIA RTX 20",  # For Turing architecture (RTX 20 series)
    +                    ]
    +                ):
    +                    self._use_flash_attention_2 = True
    +                else:
    +                    self._use_better_transformers = True
    +            else:
    +                self._use_better_transformers = True
    +
    +        # Build the optimizations kwargs:
    +        model_kwargs = {
    +            "low_cpu_mem_usage": True,
    +            "use_safetensors": True,
    +        }
    +        if self._use_flash_attention_2:
    +            if _LOGGER:
    +                _LOGGER.info(
    +                    "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via "
    +                    "`pip install -U flash-attn --no-build-isolation`"
    +                )
    +            model_kwargs["attn_implementation"] = "flash_attention_2"
    +        elif self._use_better_transformers:
    +            if _LOGGER:
    +                _LOGGER.info(
    +                    "Using BetterTransformers optimization - make sure the `optimum` package is installed via "
    +                    "`pip install -U optimum`"
    +                )
    +            model_kwargs["attn_implementation"] = "sdpa"
    +
    +        # Initialize the speech recognition pipeline:
    +        self._transcription_pipeline = pipeline(
    +            task="automatic-speech-recognition",
    +            model=self._model_name,
    +            model_kwargs=model_kwargs.copy(),
    +            batch_size=self._batch_size,
    +            max_new_tokens=self._max_new_tokens,
    +            chunk_length_s=self._chunk_length_s,
    +            return_timestamps=self._return_timestamps,
    +            torch_dtype=torch_dtype,
    +            device=device,
    +        )
    +
    +        # Prepare the generation kwargs:
    +        self._generate_kwargs = {
    +            "language": self._spoken_language,
    +            "task": "translate" if self._translate_to_english else "transcribe",
    +        }
    +
    +        # Initialize the assistant model (if needed):
    +        if self._assistant_model:
    +            assistant_model = AutoModelForCausalLM.from_pretrained(
    +                self._assistant_model, torch_dtype=torch_dtype, **model_kwargs
    +            )
    +            assistant_model.to(device)
    +            self._generate_kwargs["assistant_model"] = assistant_model
    +
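+    # For example, on an "NVIDIA A100" GPU with the `flash-attn` package installed, `load()` selects
+    # `attn_implementation="flash_attention_2"` with `torch.float16`; on a CPU-only machine it falls back to
+    # `attn_implementation="sdpa"` (BetterTransformers) with `torch.float32`.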
    +    def transcribe(
    +        self,
    +        audio_files: List[Path],
    +        batch_processor: BatchProcessor = None,
    +        batches_queue: Queue = None,
    +        verbose: bool = False,
    +    ) -> Union[List[List[dict]], None]:
    +        """
    +        Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further
+        processing like writing to text files. If no queue or batch processor is given, the transcription outputs from
    +        the pipeline will be returned. Otherwise, `None` is returned.
    +
    +        :param audio_files:     The audio files to transcribe.
    +        :param batch_processor: A batch processor.
    +        :param batches_queue:   A multiprocessing queue to put the batches in.
    +        :param verbose:         Whether to show a progress bar. Default is False.
    +
+        :returns: The transcription outputs from the pipeline if no queue or batch processor is given, otherwise,
    +                  `None`.
    +        """
    +        # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with
    +        # Huggingface's pipelines as they preload each input while inference is running):
    +        def audio_iterator() -> Generator[Union[dict, str], None, None]:
    +            if self._per_channel_transcription:
    +                for audio_file in audio_files:
    +                    audio, sampling_rate = torchaudio.load(str(audio_file))
    +                    audio = audio.numpy()
    +                    for channel in audio:
    +                        yield {"raw": channel, "sampling_rate": sampling_rate}
    +            else:
    +                for audio_file in audio_files:
    +                    yield str(audio_file)
    +
    +        # Create a batch iterator:
    +        def batch_iterator() -> Generator[List[Union[dict, str]], None, None]:
    +            batch = []
    +            for audio in audio_iterator():
    +                batch.append(audio)
    +                if len(batch) == self._batch_size:
    +                    yield batch
    +                    batch = []
    +            if batch:
    +                yield batch
    +
+        # Prepare the list of output batches to return (used when no queue or batch processor is given):
    +        outputs = []
    +
    +        # Infer through the pipeline:
    +        for input_batch in tqdm(
    +            batch_iterator() if self._batch_size > 1 else audio_iterator(),
    +            desc="Transcribing",
    +            unit="channel" if self._per_channel_transcription else "audio file",
    +            total=(
    +                (
    +                    (len(audio_files) // self._batch_size)
    +                    + (len(audio_files) % self._batch_size != 0)
    +                )
    +                * (self._per_channel_transcription or 1)
    +            ),
    +            disable=not verbose,
    +        ):
    +            # Infer:
    +            try:
    +                output_batch = self._transcription_pipeline(
    +                    input_batch,
    +                    generate_kwargs=self._generate_kwargs,
    +                )
    +            except Exception as exception:
    +                # Collect the exception:
    +                output_batch = str(exception)
    +                # Align to batch size:
    +                output_batch = (
    +                    [output_batch] * len(input_batch)
    +                    if isinstance(input_batch, list)
    +                    else [output_batch]
    +                )
    +            # To align with batching, if batch size is 1, wrap the output with a list:
    +            if isinstance(output_batch, dict):
    +                output_batch = [output_batch]
    +            # If a batch processor is given, process the batch:
    +            if batch_processor:
    +                # Process it directly:
    +                batch_processor.process_batch(batch=output_batch)
    +                batch_processor.do_tasks()
    +            elif batches_queue:
    +                # Otherwise, queue the batch:
    +                batches_queue.put(output_batch)
    +            else:
    +                # Otherwise, collect the output as is without processing:
    +                outputs.append(output_batch)
    +
+        # If a multiprocessing queue was given, mark the end of the batches:
    +        if batches_queue:
    +            batches_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +        return outputs if not batch_processor else None
    +
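+# A minimal usage sketch (hypothetical file names, shown for illustration only):
+#
+#     transcriber = Transcriber(model_name="openai/whisper-tiny", batch_size=2)
+#     transcriber.load()
+#     outputs = transcriber.transcribe(audio_files=[Path("a.wav"), Path("b.wav")], verbose=True)
+#     # With no queue or batch processor given, `outputs` holds the raw pipeline outputs per batch, e.g.
+#     # [[{"text": "..."}, {"text": "..."}]]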
    +
    +#: The value to send into multiprocessing queues to stop the process:
    +_MULTIPROCESSING_STOP_MARK = "STOP"
    +
    +
    +def _multiprocessing_process_batches(
    +    batch_processor: BatchProcessor,
    +    batches_queue: Queue,
    +    tasks_queue: Queue,
    +    n_task_completers: int,
    +):
    +    """
+    Process the batches in the given batches queue and put the tasks in the given tasks queue. The function stops
+    when the stop mark is received from the batches queue. It is aimed to be run as a multiprocessing process.
    +
    +    :param batch_processor:   A batch processor to process the batches.
    +    :param batches_queue:     A queue to get the batches from.
    +    :param tasks_queue:       A queue to put the tasks in.
    +    :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks`
    +                              function). A stop mark will be sent to the tasks queue for each task completer.
    +    """
    +    while True:
    +        # Get the batch:
    +        batch: List[dict] = batches_queue.get()
    +        if batch == _MULTIPROCESSING_STOP_MARK:
    +            break
    +
    +        # Process the batch:
    +        batch_processor.process_batch(batch=batch)
    +
    +        # Get the tasks:
    +        tasks = batch_processor.get_tasks()
    +
    +        # Queue the tasks:
    +        for task in tasks:
    +            tasks_queue.put(task.to_tuple())
    +
    +    # Mark the end of the batches:
    +    for _ in range(n_task_completers):
    +        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +
    +def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue):
    +    """
+    Complete the tasks in the given queue and put the results in the given results queue. The function stops when
+    the stop mark is received from the tasks queue. It is aimed to be run as a multiprocessing process.
    +
    +    :param tasks_queue:   A queue to get the tasks from.
    +    :param results_queue: A queue to put the results in.
    +    """
    +    tasks_map = {
    +        BaseTask.__name__: BaseTask,
    +        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
    +        SpeechDiarizationPerChannelTask.__name__: SpeechDiarizationPerChannelTask,
    +    }
    +
    +    while True:
    +        # Get the task:
    +        task = tasks_queue.get()
    +        if task == _MULTIPROCESSING_STOP_MARK:
    +            break
    +
    +        # Reconstruct the task:
    +        task_class, task_kwargs = task
    +        task = tasks_map[task_class](**task_kwargs)
    +
    +        # Complete the task:
    +        task.do_task()
    +        results_queue.put((task.is_failed(), task.get_result()))
    +
    +    # Mark the end of the tasks:
    +    results_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    # Check if MLRun is available, set the global logger to MLRun's:
    +    if context:
    +        _LOGGER = context.logger
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, str):
    +                    input_argument = _get_audio_files(
    +                        data_path=Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
+                    input_argument = input_argument.iloc[chunk_start:chunk_end, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Save the output directory of this worker:
    +            output_directory = Path(output[0])
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +
    +            # Join the data from all workers:
    +            if rank == 0:
    +                context.logger.info("Collecting data from workers to root worker.")
    +
    +                # Check if there are different output directories:
    +                output_directories = set([Path(out_dir) for out_dir, _, _ in output])
    +                for r in range(1, size):
    +                    # True means the other workers should pass their files to the root worker (rank 0):
    +                    comm.send(len(output_directories) != 1, dest=r)
    +
    +                # If there are different output directories, listen to the other workers:
    +                if len(output_directories) != 1:
    +                    # Collect the files from the other workers:
    +                    files = []
    +                    for r in range(1, size):
    +                        files.extend(comm.recv(source=r))
    +                    # Write the files to the root worker's output directory:
    +                    for file_name, file_content in files:
    +                        with open(output_directory / file_name, "w") as f:
    +                            f.write(file_content)
    +
    +                # Concatenate the dataframes:
    +                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)
    +
    +                # Concatenate the errors dictionaries:
    +                errors_dictionary = reduce(
    +                    operator.ior, [err for _, _, err in output], {}
    +                )
    +
    +                return str(output_directory), dataframe, errors_dictionary
    +
+            # Listen to rank 0 to see if there are different output directories and this rank needs to send its files
+            # to it:
    +            if comm.recv(source=0):
    +                files = []
    +                for file in os.listdir(output_directory):
    +                    with open(output_directory / file, "r") as f:
    +                        files.append((file, f.read()))
    +                comm.send(files, dest=0)
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
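+# Chunking example for the handler above: with 10 audio files and 4 MPI workers, `even_chunk_size = 10 // 4 = 2`, so
+# ranks 0, 1 and 2 receive files [0:2], [2:4] and [4:6], and the last rank (3) receives the remainder, files [6:10].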
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def transcribe(
    +    # Input / Output kwargs:
    +    data_path: Union[str, Path, List[Union[str, Path]]],
    +    output_directory: str = None,
    +    # Model loading kwargs:
    +    model_name: str = "openai/whisper-tiny",
    +    device: str = None,
    +    use_flash_attention_2: bool = None,
    +    use_better_transformers: bool = None,
    +    # Generation kwargs:
    +    assistant_model: str = None,
    +    max_new_tokens: int = 128,
    +    chunk_length_s: int = 30,
    +    batch_size: int = 8,
    +    spoken_language: str = None,
    +    translate_to_english: bool = False,
    +    # Diarization kwargs:
    +    speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None,
    +    speech_diarize_per_channel: int = None,
    +    speaker_labels: List[str] = None,
    +    # Other kwargs:
    +    use_multiprocessing: Union[bool, int] = False,
    +    verbose: bool = False,
    +):
    +    """
    +    Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed
    +    text files and a dataframe containing the following columns:
    +
    +    * audio_file - The audio file path.
    +    * transcription_file - The transcribed text file name in the output directory.
    +
    +    The transcription is based on Huggingface's ASR pipeline -
    +    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and
    +    is tested with OpenAI's Whisper models - https://huggingface.co/openai.
    +
+    If one of the speaker diarization parameters is given (either `speech_diarization` or
+    `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker is
+    written on a separate line::
    +
    +        speaker_1: text
    +        speaker_2: text
    +        speaker_1: text
    +        ...
    +
    +    :param data_path:                  A directory of audio files or a single file or a list of files to transcribe.
    +    :param output_directory:           Path to a directory to save all transcribed audio files. If not given, will save
    +                                       the transcribed files in a temporary directory.
+    :param model_name:                 The model name to use. Should be a model from OpenAI's Whisper models for
    +                                       best results (for example "tiny", "base", "large", etc.). See here for more
    +                                       information: https://huggingface.co/openai?search_models=whisper.
    +    :param device:                     The device to use for inference. If not given, will use GPU if available.
    +    :param use_flash_attention_2:      Whether to use the Flash Attention 2 implementation. It can be used only with
    +                                       one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
    +                                       will be available soon.
    +
    +                                       Note: If both `use_flash_attention_2` and
    +                                       `use_better_transformers` are `None`, the optimization will be chosen
    +                                       automatically according to the available resources.
    +
    +    :param use_better_transformers:    Whether to use the Better Transformers library to further optimize the model.
    +                                       Should be used for all use cases that do not support flash attention 2.
    +
    +                                       Note: If both `use_flash_attention_2` and `use_better_transformers` are
    +                                       `None`, the optimization will be chosen automatically according to the
    +                                       available resources.
    +    :param assistant_model:            The assistant model name to use for inference. Notice that the optimizations
    +                                       (flash attention 2 and better transformers) will be applied for the assistant as
    +                                       well. Should be a model from Huggingface's distil-whisper (see here for more
    +                                       information: https://github.com/huggingface/distil-whisper).
    +
    +                                       Note: Currently an assistant model is only usable with batch size of 1.
    +    :param max_new_tokens:             The maximum number of new tokens to generate. This is used to limit the
    +                                       generation length. Default is 128 tokens.
    +    :param chunk_length_s:             The audio chunk to split the audio to (in seconds). Default is 30 seconds.
+    :param batch_size:                 The batch size to use for inference. Default is 8.
+    :param spoken_language:            The language spoken in the audio, used to guide Whisper. If None, it will try
+                                       to detect the language.
    +    :param translate_to_english:       Whether to translate the transcriptions to English.
    +    :param speech_diarization:         A speech diarization dictionary with the file names to transcribe as keys and
    +                                       their diarization as value. The diarization is a list of tuples:
    +                                       (start, end, speaker). An example
    +                                       for a diarization dictionary::
    +
+                                       {
+                                           "audio_file_name": [
+                                               (0.0, 2.0, "Agent"),
+                                               (2.0, 4.0, "Client"),
+                                               ...
+                                           ],
+                                           ...
+                                       }
    +
+                                       Note: The diarization must cover the entire duration of the audio file (as long
+                                       as Whisper is predicting words up until then).
    +    :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to
    +                                       a separate channel in the audio. Notice: This will make the transcription
+                                       slower as each channel will be transcribed separately. If a speech diarization
    +                                       is passed (via the `speech_diarization` parameter), this parameter is
    +                                       ignored.
    +    :param speaker_labels:             A list of speaker labels by channel order to use for writing the
    +                                       transcription with respect to per channel speech diarization. This won't be
    +                                       used together with a given speech diarization (via the `speech_diarization`
    +                                       parameter).
    +    :param use_multiprocessing:        Whether to use multiprocessing to transcribe the audio files. Can be either a
    +                                       boolean value or an integer. If `True`, will use the default amount of workers
    +                                       (3): 1 for transcription, 1 for batch processing and 1 for task completion (such
    +                                       as speech diarization and writing to files). To control the amount of tasks
    +                                       completion workers, an integer can be provided to specify the amount of workers.
+                                       If `False`, a single process will be used. Default is `False`.
    +    :param verbose:                    Whether to print the progress of the transcription. Default is `False`.
    +    """
    +    global _LOGGER
    +
    +    # Get the input audio files to transcribe:
    +    if verbose:
    +        _LOGGER.info("Collecting audio files.")
    +    audio_files = _get_audio_files(data_path=data_path)
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(audio_files)} audio files.")
    +
    +    # Get the output directory:
    +    if output_directory is None:
    +        if verbose:
    +            _LOGGER.info("No output directory given, using temporary directory.")
    +        output_directory = tempfile.mkdtemp()
    +    output_directory = Path(output_directory).absolute()
    +    output_directory.mkdir(exist_ok=True, parents=True)
    +    if verbose:
    +        _LOGGER.info(f"Transcriptions will be saved to: {output_directory}")
    +
    +    # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization,
    +    # speech diarization per channel):
    +    if speech_diarization:
    +        batch_processor = SpeechDiarizationBatchProcessor(
    +            audio_files=audio_files,
    +            output_directory=output_directory,
    +            speech_diarization=speech_diarization,
    +        )
    +    elif speech_diarize_per_channel:
    +        batch_processor = PerChannelSpeechDiarizationBatchProcessor(
    +            audio_files=audio_files,
    +            output_directory=output_directory,
    +            n_channels=speech_diarize_per_channel,
    +            speakers=speaker_labels,
    +        )
    +    else:
    +        batch_processor = BatchProcessor(
    +            audio_files=audio_files,
    +            output_directory=output_directory,
    +        )
    +
    +    # Initialize the transcription pipeline:
    +    transcriber = Transcriber(
    +        device=device,
    +        use_flash_attention_2=use_flash_attention_2,
    +        use_better_transformers=use_better_transformers,
    +        assistant_model=assistant_model,
    +        model_name=model_name,
    +        max_new_tokens=max_new_tokens,
    +        chunk_length_s=chunk_length_s,
    +        batch_size=batch_size,
    +        return_timestamps=(
    +            "word"
    +            if speech_diarization is not None or speech_diarize_per_channel is not None
    +            else False
    +        ),
    +        per_channel_transcription=speech_diarize_per_channel or 0,
    +        spoken_language=spoken_language,
    +        translate_to_english=translate_to_english,
    +    )
    +
    +    # Run the transcription:
    +    if use_multiprocessing:
    +        results = _parallel_run(
    +            n_workers=use_multiprocessing
    +            if isinstance(use_multiprocessing, int)
    +            else 1,
    +            audio_files=audio_files,
    +            batch_processor=batch_processor,
    +            transcriber=transcriber,
    +            verbose=verbose,
    +        )
    +    else:
    +        results = _run(
    +            audio_files=audio_files,
    +            batch_processor=batch_processor,
    +            transcriber=transcriber,
    +            verbose=verbose,
    +        )
    +
    +    # Process the results:
    +    if verbose:
    +        _LOGGER.info("Summarizing the results.")
    +    successes = []
    +    errors = {}
    +    for is_error, result in results:
    +        if is_error:
    +            errors[result[0]] = result[1]
    +        else:
    +            successes.append(result)
    +    successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"])
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(audio_files)})\n"
    +            f"Transcriptions summary:\n"
    +            f"{successes.head()}"
    +        )
    +
    +    return str(output_directory), successes, errors
    +
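+# A minimal local usage sketch (hypothetical paths; the MPI decorator is a no-op when running as a single process):
+#
+#     output_dir, transcriptions_df, errors = transcribe(
+#         data_path="./calls",
+#         output_directory="./transcriptions",
+#         model_name="openai/whisper-tiny",
+#         batch_size=2,
+#         verbose=True,
+#     )
+#     # `transcriptions_df` has the columns ["audio_file", "transcription_file"]; `errors` maps failed files to errors.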
    +
    +def _get_audio_files(
    +    data_path: Union[Path, str, list],
    +) -> List[Path]:
    +    """
    +    Get the audio files to transcribe. If a path to a directory is given, all files in the directory will be collected.
    +
    +    :param data_path: The data path to collect the audio files from.
    +
    +    :returns: The audio files list.
    +    """
    +    # Check if given a list of paths:
    +    if isinstance(data_path, list):
    +        audio_files = []
    +        for path in data_path:
    +            audio_files.extend(_get_audio_files(data_path=path))
    +        return audio_files
    +
    +    # Check if given a single string path to cast it to a `pathlib.Path`:
    +    if isinstance(data_path, str):
    +        data_path = Path(data_path).absolute()
    +
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        audio_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        audio_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
    +            f"file. Given: {str(data_path)} "
    +        )
    +
    +    return audio_files
    +
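+# For example (hypothetical layout), `_get_audio_files("calls/")` returns every file matching "*.*" directly inside
+# the "calls" directory, while `_get_audio_files(["a.wav", "b.wav"])` returns both paths as `pathlib.Path` objects.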
    +
    +def _run(
    +    audio_files: List[Path],
    +    batch_processor: BatchProcessor,
    +    transcriber: Transcriber,
    +    verbose: bool,
    +) -> List[Tuple[bool, Tuple[str, str]]]:
    +    """
    +    Run the transcription without multiprocessing.
    +
    +    :param audio_files:     The audio files to transcribe.
    +    :param batch_processor: The batch processor to use.
    +    :param transcriber:     The transcriber to use.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results.
    +    """
    +    # Load the transcription pipeline:
    +    if verbose:
    +        _LOGGER.info(f"Loading the transcription pipeline.")
    +    transcriber.load()
    +    if verbose:
    +        _LOGGER.info("Transcription pipeline loaded.")
    +
    +    # Transcribe the files:
    +    transcriber.transcribe(
    +        audio_files=audio_files,
    +        batch_processor=batch_processor,
    +        verbose=verbose,
    +    )
    +
    +    # Return the results:
    +    return batch_processor.get_results()
    +
    +
    +def _parallel_run(
    +    n_workers: int,
    +    audio_files: List[Path],
    +    batch_processor: BatchProcessor,
    +    transcriber: Transcriber,
    +    verbose: bool,
    +):
    +    """
    +    Run the transcription with multiprocessing.
    +
    +    :param n_workers:       The amount of workers to use as task completers.
    +    :param audio_files:     The audio files to transcribe.
    +    :param batch_processor: The batch processor to use.
    +    :param transcriber:     The transcriber to use.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results.
    +    """
    +    # Initialize the multiprocessing queues:
    +    batches_queue = Queue()
    +    tasks_queue = Queue()
    +    results_queue = Queue()
    +
    +    # Initialize the multiprocessing processes:
    +    batch_processing_process = Process(
    +        target=_multiprocessing_process_batches,
    +        kwargs={
    +            "batch_processor": batch_processor,
    +            "batches_queue": batches_queue,
    +            "tasks_queue": tasks_queue,
    +            "n_task_completers": n_workers,
    +        },
    +    )
    +    task_completion_processes = [
    +        Process(
    +            target=_multiprocessing_complete_tasks,
    +            kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue},
    +        )
    +        for _ in range(n_workers)
    +    ]
    +
    +    # Start the multiprocessing processes:
    +    batch_processing_process.start()
    +    for p in task_completion_processes:
    +        p.start()
    +
    +    # Load the transcription pipeline:
    +    if verbose:
    +        _LOGGER.info(f"Loading the transcription pipeline.")
    +    transcriber.load()
    +    if verbose:
    +        _LOGGER.info("Transcription pipeline loaded.")
    +
    +    # Transcribe the files:
    +    transcriber.transcribe(
    +        audio_files=audio_files, batches_queue=batches_queue, verbose=verbose
    +    )
    +
    +    # Collect the results:
    +    results = []
    +    stop_marks_counter = 0
    +    while True:
    +        # Get a result from the queue:
    +        result: Tuple[bool, Tuple[str, str]] = results_queue.get()
    +        if result == _MULTIPROCESSING_STOP_MARK:
    +            stop_marks_counter += 1
    +            if stop_marks_counter == n_workers:
    +                break
    +        else:
    +            # Collect the result:
    +            results.append(result)
    +
    +    # Wait for the processes to finish:
    +    results_queue.empty()
    +    batch_processing_process.join()
    +    for p in task_completion_processes:
    +        p.join()
    +
    +    return results
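+# Process topology note: with `use_multiprocessing=True` there are three processes in total (the main transcription
+# process, one batch processing process and one task completion worker); passing an integer N instead yields N task
+# completion workers, i.e. N + 2 processes in total.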
    +        
    +    
+ 
\ No newline at end of file
diff --git a/functions/master/transcribe/1.1.0/static/transcribe.html b/functions/master/transcribe/1.1.0/static/transcribe.html
new file mode 100644
index 00000000..afcbe790
--- /dev/null
+++ b/functions/master/transcribe/1.1.0/static/transcribe.html
@@ -0,0 +1,1604 @@
+Source code for transcribe.transcribe

    + +
    [docs] def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Get the results of the tasks. The stored results are then cleared. + + :returns: The results of the tasks. + """ + results = self._results + self._results = [] + return results
    + + def _get_current_files(self, batch_size: int) -> List[Path]: + """ + Get the current files to process. + + :param batch_size: The batch size to progress the current file index. + + :returns: The current files to process. + """ + end_index = ( + self._current_file_index + batch_size + if self._current_file_index + batch_size < len(self._audio_files) + else len(self._audio_files) + ) + current_files = self._audio_files[self._current_file_index : end_index] + self._current_file_index = end_index + return current_files
    + + +
    [docs]class SpeechDiarizationBatchProcessor(BatchProcessor): + """ + A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch + processor is creating tasks and is aimed to be working along the transcriber. It can be used with multiprocessing + queue or run the tasks directly using the associated methods. + """ + + def __init__( + self, audio_files: List[Path], output_directory: Path, speech_diarization: dict + ): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + :param speech_diarization: A speech diarization dictionary to pass along with each processed batch. + """ + super().__init__(audio_files=audio_files, output_directory=output_directory) + self._speech_diarization = speech_diarization + self._audio_files = audio_files + +
    [docs] def process_batch(self, batch: List[dict]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Get the relevant files belongs to the given batch: + current_files = self._get_current_files(batch_size=len(batch)) + + # Build the diarization tasks: + self._tasks.extend( + [ + SpeechDiarizationTask( + audio_file=file, + transcription_output=batch[i], + text_file=self._output_directory / f"{file.stem}.txt", + speech_diarization=self._speech_diarization.get(file.name), + ) + for i, file in enumerate(current_files) + ] + )
    + + +
    [docs]class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor): + """ + A batch processor to process batches of transcriptions per channel. The batch processor is creating tasks with the + selected amount of channels given and is aimed to be working along the transcriber. It can be used with + multiprocessing queue or run the tasks directly using the associated methods. + """ + + def __init__( + self, + audio_files: List[Path], + output_directory: Path, + n_channels: int, + speakers: List[str], + ): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + :param n_channels: The number of channels in each audio file to transcribe. + :param speakers: The speakers labels to use for each channel. + """ + super().__init__(audio_files=audio_files, output_directory=output_directory) + + # Store the parameters: + self._n_channels = n_channels + self._speakers = speakers + + # Prepare a channel buffer to store the channels until the current task created is fully covered: + self._task_in_process: SpeechDiarizationPerChannelTask = None + +
    [docs] def process_batch(self, batch: List[dict]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Go over the batch and create the tasks: + for output in batch: + # Check if there is a task in process: + if not self._task_in_process: + # Create a new task: + self._task_in_process = SpeechDiarizationPerChannelTask( + audio_file=self._audio_files[self._current_file_index], + text_file=self._output_directory + / f"{self._audio_files[self._current_file_index].stem}.txt", + ) + # Get the channel's speaker: + speaker = self._speakers[ + len(self._task_in_process.transcription_output_channels) + ] + # Collect the channel into the processed task: + self._task_in_process.transcription_output_channels.append( + (speaker, output) + ) + # Check if the task is fully covered (all channels are collected): + if ( + len(self._task_in_process.transcription_output_channels) + == self._n_channels + ): + # Collect the task and reset the task in process: + self._tasks.append(self._task_in_process) + self._current_file_index += 1 + self._task_in_process = None
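# --- Illustrative worked example (assumption, not part of the original module) ---
# With n_channels=2 and speakers=["Agent", "Client"], the transcriber yields one output per
# channel, so two consecutive outputs complete a single task: the first output is buffered as
# the "Agent" channel, the second as the "Client" channel, and only then is the task appended
# and the file index advanced to the next audio file.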
    + + +
    [docs]class Transcriber: + """ + A transcription wrapper for the Huggingface's ASR pipeline - + https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to + use with OpenAI's Whisper models - https://huggingface.co/openai. + """ + + def __init__( + self, + model_name: str, + device: str = None, + use_flash_attention_2: bool = None, + use_better_transformers: bool = None, + assistant_model: str = None, + max_new_tokens: int = 128, + chunk_length_s: int = 30, + batch_size: int = 2, + spoken_language: str = None, + translate_to_english: bool = False, + return_timestamps: Union[bool, Literal["word"]] = False, + per_channel_transcription: int = 0, + ): + """ + Initialize the transcriber. + + :param model_name: The model name to use. Should be a model from the OpenAI's Whisper models for + best results (for example "tiny", "base", "large", etc.). + :param device: The device to use for inference. If not given, will use GPU if available. + :param use_flash_attention_2: Whether to use the Flash Attention 2 implementation. It can be used only with + one of the following GPUs: Nvidia H series and Nvidia A series. T4 support + will be available soon. + + Note: If both `use_flash_attention_2` and + `use_better_transformers` are `None`, the optimization will be chosen + automatically according to the available resources. + + :param use_better_transformers: Whether to use the Better Transformers library to further optimize the model. + Should be used for all use cases that do not support flash attention 2. + + Note: If both `use_flash_attention_2` and `use_better_transformers` are + `None`, the optimization will be chosen automatically according to the + available resources. + :param assistant_model: The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant + as well. Should be a model from Huggingface's distil-whisper (see here for + more information: https://github.com/huggingface/distil-whisper). + :param max_new_tokens: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. + :param chunk_length_s: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + :param batch_size: The batch size to use for inference. Default is 2. + :param spoken_language: Aim whisper to know what language is spoken. If None, it will try to detect it + for each chunk. + :param translate_to_english: Whether to translate the transcriptions to English. Default is False. + :param return_timestamps: Whether to return the timestamps of the words. If "word", will return the + timestamps of each word. If True will return the timestamps of each chunk. + Default is False. Aimed to be used for speech diarization. + :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel + transcription, pass the number of channels expected for each audio file here. + 0 means regular transcription (merge channels). + + Note: If `per_channel_transcription` is not 0, `batch_size` must be treated to + be the number of channels and not audio files. Aimed to be used for per + channel speech diarization. 
+ """ + # Store loading parameters: + self._model_name = model_name + self._device = device + self._use_flash_attention_2 = use_flash_attention_2 + self._use_better_transformers = use_better_transformers + self._max_new_tokens = max_new_tokens + self._chunk_length_s = chunk_length_s + self._batch_size = batch_size + self._return_timestamps = return_timestamps + self._per_channel_transcription = per_channel_transcription + + # Store generation parameters: + self._assistant_model = assistant_model + self._spoken_language = spoken_language + self._translate_to_english = translate_to_english + + # Prepare the transcription objects: + self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None + self._generate_kwargs: dict = None + +
    [docs] def load(self): + """ + Load the transcriber. Must be called before transcribing. + """ + # Set the device and data type to use (prefer GPU if available): + device = torch.device( + self._device or "cuda" if torch.cuda.is_available() else "cpu" + ) + torch_dtype = torch.float16 if device.type == "cuda" else torch.float32 + + # Choose the optimization to use (in case the user did not specify any): + if ( + self._use_flash_attention_2 is None + and self._use_better_transformers is None + ): + # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture + # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla): + if device.type == "cuda" and is_flash_attn_2_available(): + cuda_device_name = torch.cuda.get_device_properties(device).name + if any( + cuda_device_name.startswith(gpu_name) + for gpu_name in [ + "NVIDIA A", # For Ampere architecture (e.g. A10, A30, A100) + "NVIDIA H", # For Hopper architecture (e.g. H100) + "NVIDIA L", # For Ada Lovelace architecture (e.g. L4, L40) + "NVIDIA RTX 30", # For Ada Lovelace architecture (RTX 30 series) + "NVIDIA RTX 40", # For Ada Lovelace architecture (RTX 40 series) + "NVIDIA RTX 50", # For Ada Lovelace architecture (RTX 50 series) + # Will be supported soon according to FlashAttention GitHub repo: + # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features + # "NVIDIA T4", # For Turing architecture (only T4) + # "NVIDIA RTX 20", # For Turing architecture (RTX 20 series) + ] + ): + self._use_flash_attention_2 = True + else: + self._use_better_transformers = True + else: + self._use_better_transformers = True + + # Build the optimizations kwargs: + model_kwargs = { + "low_cpu_mem_usage": True, + "use_safetensors": True, + } + if self._use_flash_attention_2: + if _LOGGER: + _LOGGER.info( + "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via " + "`pip install -U flash-attn --no-build-isolation`" + ) + model_kwargs["attn_implementation"] = "flash_attention_2" + elif self._use_better_transformers: + if _LOGGER: + _LOGGER.info( + "Using BetterTransformers optimization - make sure the `optimum` package is installed via " + "`pip install -U optimum`" + ) + model_kwargs["attn_implementation"] = "sdpa" + + # Initialize the speech recognition pipeline: + self._transcription_pipeline = pipeline( + task="automatic-speech-recognition", + model=self._model_name, + model_kwargs=model_kwargs.copy(), + batch_size=self._batch_size, + max_new_tokens=self._max_new_tokens, + chunk_length_s=self._chunk_length_s, + return_timestamps=self._return_timestamps, + torch_dtype=torch_dtype, + device=device, + ) + + # Prepare the generation kwargs: + self._generate_kwargs = { + "language": self._spoken_language, + "task": "translate" if self._translate_to_english else "transcribe", + } + + # Initialize the assistant model (if needed): + if self._assistant_model: + assistant_model = AutoModelForCausalLM.from_pretrained( + self._assistant_model, torch_dtype=torch_dtype, **model_kwargs + ) + assistant_model.to(device) + self._generate_kwargs["assistant_model"] = assistant_model
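# --- Illustrative note (assumption, not part of the original module) ---
# Example of the automatic optimization choice above: on an NVIDIA A100 with the `flash-attn`
# package installed, `attn_implementation` is set to "flash_attention_2"; on a CPU-only machine
# or an unsupported GPU (e.g. a T4) the code falls back to the BetterTransformers / SDPA path
# with `attn_implementation` set to "sdpa".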
    + +
    [docs] def transcribe( + self, + audio_files: List[Path], + batch_processor: BatchProcessor = None, + batches_queue: Queue = None, + verbose: bool = False, + ) -> Union[List[List[dict]], None]: + """ + Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further + processing like writing to text files. If no queue or batch processor is given, the transcriptions outputs from + the pipeline will be returned. Otherwise, `None` is returned. + + :param audio_files: The audio files to transcribe. + :param batch_processor: A batch processor. + :param batches_queue: A multiprocessing queue to put the batches in. + :param verbose: Whether to show a progress bar. Default is False. + + :returns: The transcriptions outputs from the pipeline if no queue or batch processor is given, otherwise, + `None`. + """ + # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with + # Huggingface's pipelines as they preload each input while inference is running): + def audio_iterator() -> Generator[Union[dict, str], None, None]: + if self._per_channel_transcription: + for audio_file in audio_files: + audio, sampling_rate = torchaudio.load(str(audio_file)) + audio = audio.numpy() + for channel in audio: + yield {"raw": channel, "sampling_rate": sampling_rate} + else: + for audio_file in audio_files: + yield str(audio_file) + + # Create a batch iterator: + def batch_iterator() -> Generator[List[Union[dict, str]], None, None]: + batch = [] + for audio in audio_iterator(): + batch.append(audio) + if len(batch) == self._batch_size: + yield batch + batch = [] + if batch: + yield batch + + # Prepare the successes dataframe and errors dictionary to be returned: + outputs = [] + + # Infer through the pipeline: + for input_batch in tqdm( + batch_iterator() if self._batch_size > 1 else audio_iterator(), + desc="Transcribing", + unit="channel" if self._per_channel_transcription else "audio file", + total=( + ( + (len(audio_files) // self._batch_size) + + (len(audio_files) % self._batch_size != 0) + ) + * (self._per_channel_transcription or 1) + ), + disable=not verbose, + ): + # Infer: + try: + output_batch = self._transcription_pipeline( + input_batch, + generate_kwargs=self._generate_kwargs, + ) + except Exception as exception: + # Collect the exception: + output_batch = str(exception) + # Align to batch size: + output_batch = ( + [output_batch] * len(input_batch) + if isinstance(input_batch, list) + else [output_batch] + ) + # To align with batching, if batch size is 1, wrap the output with a list: + if isinstance(output_batch, dict): + output_batch = [output_batch] + # If a batch processor is given, process the batch: + if batch_processor: + # Process it directly: + batch_processor.process_batch(batch=output_batch) + batch_processor.do_tasks() + elif batches_queue: + # Otherwise, queue the batch: + batches_queue.put(output_batch) + else: + # Otherwise, collect the output as is without processing: + outputs.append(output_batch) + + # Check if given a multiprocessing queue or a batch processor: + if batches_queue: + batches_queue.put(_MULTIPROCESSING_STOP_MARK) + + return outputs if not batch_processor else None
    + + +#: The value to send into multiprocessing queues to stop the process: +_MULTIPROCESSING_STOP_MARK = "STOP" + + +def _multiprocessing_process_batches( + batch_processor: BatchProcessor, + batches_queue: Queue, + tasks_queue: Queue, + n_task_completers: int, +): + """ + Process the batches in the given batches queue and put the tasks in the given tasks queue. The function will stop + when the given batches queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param batch_processor: A batch processor to process the batches. + :param batches_queue: A queue to get the batches from. + :param tasks_queue: A queue to put the tasks in. + :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks` + function). A stop mark will be sent to the tasks queue for each task completer. + """ + while True: + # Get the batch: + batch: List[dict] = batches_queue.get() + if batch == _MULTIPROCESSING_STOP_MARK: + break + + # Process the batch: + batch_processor.process_batch(batch=batch) + + # Get the tasks: + tasks = batch_processor.get_tasks() + + # Queue the tasks: + for task in tasks: + tasks_queue.put(task.to_tuple()) + + # Mark the end of the batches: + for _ in range(n_task_completers): + tasks_queue.put(_MULTIPROCESSING_STOP_MARK) + + +def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue): + """ + Complete the tasks in the given queue and put the results in the given results queue. The function will stop when + the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param tasks_queue: A queue to get the tasks from. + :param results_queue: A queue to put the results in. + """ + tasks_map = { + BaseTask.__name__: BaseTask, + SpeechDiarizationTask.__name__: SpeechDiarizationTask, + SpeechDiarizationPerChannelTask.__name__: SpeechDiarizationPerChannelTask, + } + + while True: + # Get the task: + task = tasks_queue.get() + if task == _MULTIPROCESSING_STOP_MARK: + break + + # Reconstruct the task: + task_class, task_kwargs = task + task = tasks_map[task_class](**task_kwargs) + + # Complete the task: + task.do_task() + results_queue.put((task.is_failed(), task.get_result())) + + # Mark the end of the tasks: + results_queue.put(_MULTIPROCESSING_STOP_MARK) + + +# Get the global logger: +_LOGGER = logging.getLogger() + + +
    [docs]def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." + ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Save the output directory of this worker: + output_directory = Path(output[0]) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + + # Join the data from all workers: + if rank == 0: + context.logger.info("Collecting data from workers to root worker.") + + # Check if there are different output directories: + output_directories = set([Path(out_dir) for out_dir, _, _ in output]) + for r in range(1, size): + # True means the other workers should pass their files to the root worker (rank 0): + comm.send(len(output_directories) != 1, dest=r) + + # If there are different output directories, listen to the other workers: + if len(output_directories) != 1: + # Collect the files from the other workers: + files = [] + for r in range(1, size): + files.extend(comm.recv(source=r)) + # Write the files to the root worker's output directory: + for file_name, file_content in files: + with open(output_directory / file_name, "w") as f: + f.write(file_content) + + # Concatenate the dataframes: + dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0) + + # Concatenate the errors dictionaries: + errors_dictionary = reduce( + operator.ior, [err for _, _, err in output], {} + ) + + return str(output_directory), dataframe, errors_dictionary + + # Listen to rank 0 to see if there are different output directories and this rank need to send its files to + # it: + if comm.recv(source=0): + files = [] + for file in os.listdir(output_directory): + with open(output_directory / file, "r") as f: + files.append((file, f.read())) + comm.send(files, dest=0) + return None + + return wrapper + + return decorator
    + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." + ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +
    [docs]@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def transcribe( + # Input / Output kwargs: + data_path: Union[str, Path, List[Union[str, Path]]], + output_directory: str = None, + # Model loading kwargs: + model_name: str = "openai/whisper-tiny", + device: str = None, + use_flash_attention_2: bool = None, + use_better_transformers: bool = None, + # Generation kwargs: + assistant_model: str = None, + max_new_tokens: int = 128, + chunk_length_s: int = 30, + batch_size: int = 8, + spoken_language: str = None, + translate_to_english: bool = False, + # Diarization kwargs: + speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None, + speech_diarize_per_channel: int = None, + speaker_labels: List[str] = None, + # Other kwargs: + use_multiprocessing: Union[bool, int] = False, + verbose: bool = False, +): + """ + Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed + text files and a dataframe containing the following columns: + + * audio_file - The audio file path. + * transcription_file - The transcribed text file name in the output directory. + + The transcription is based on Huggingface's ASR pipeline - + https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and + is tested with OpenAI's Whisper models - https://huggingface.co/openai. + + If one of the speaker diarization parameters are given (either `speech_diarization` or + `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker will + be written in a separate line:: + + speaker_1: text + speaker_2: text + speaker_1: text + ... + + :param data_path: A directory of audio files or a single file or a list of files to transcribe. + :param output_directory: Path to a directory to save all transcribed audio files. If not given, will save + the transcribed files in a temporary directory. + :param model_name: The model name to use. Should be a model from the OpenAI's Whisper models for + best results (for example "tiny", "base", "large", etc.). See here for more + information: https://huggingface.co/openai?search_models=whisper. + :param device: The device to use for inference. If not given, will use GPU if available. + :param use_flash_attention_2: Whether to use the Flash Attention 2 implementation. It can be used only with + one of the following GPUs: Nvidia H series and Nvidia A series. T4 support + will be available soon. + + Note: If both `use_flash_attention_2` and + `use_better_transformers` are `None`, the optimization will be chosen + automatically according to the available resources. + + :param use_better_transformers: Whether to use the Better Transformers library to further optimize the model. + Should be used for all use cases that do not support flash attention 2. + + Note: If both `use_flash_attention_2` and `use_better_transformers` are + `None`, the optimization will be chosen automatically according to the + available resources. + :param assistant_model: The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant as + well. Should be a model from Huggingface's distil-whisper (see here for more + information: https://github.com/huggingface/distil-whisper). + + Note: Currently an assistant model is only usable with batch size of 1. + :param max_new_tokens: The maximum number of new tokens to generate. 
This is used to limit the + generation length. Default is 128 tokens. + :param chunk_length_s: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + :param batch_size: The batch size to use for inference. Default is 2. + :param spoken_language: Aim whisper to know what language is spoken. If None, it will try to detect + it. + :param translate_to_english: Whether to translate the transcriptions to English. + :param speech_diarization: A speech diarization dictionary with the file names to transcribe as keys and + their diarization as value. The diarization is a list of tuples: + (start, end, speaker). An example + for a diarization dictionary:: + + { + "audio_file_name": [ + { + "start": 0.0, + "end": 2.0, + "speaker": "Agent", + }, + { + "start": 2.0, + "end": 4.0, + "speaker": "Client", + }, + ... + ], + ... + } + + Note: The diarization must be for the entire duration of the audio file (as long + as Whisper is predicting words up until then. + :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to + a separate channel in the audio. Notice: This will make the transcription + slower as each channel wil be transcribed separatly. If a speech diarization + is passed (via the `speech_diarization` parameter), this parameter is + ignored. + :param speaker_labels: A list of speaker labels by channel order to use for writing the + transcription with respect to per channel speech diarization. This won't be + used together with a given speech diarization (via the `speech_diarization` + parameter). + :param use_multiprocessing: Whether to use multiprocessing to transcribe the audio files. Can be either a + boolean value or an integer. If `True`, will use the default amount of workers + (3): 1 for transcription, 1 for batch processing and 1 for task completion (such + as speech diarization and writing to files). To control the amount of tasks + completion workers, an integer can be provided to specify the amount of workers. + `False`, will use a single process. Default is `False`. + :param verbose: Whether to print the progress of the transcription. Default is `False`. 
+ """ + global _LOGGER + + # Get the input audio files to transcribe: + if verbose: + _LOGGER.info("Collecting audio files.") + audio_files = _get_audio_files(data_path=data_path) + if verbose: + _LOGGER.info(f"Collected {len(audio_files)} audio files.") + + # Get the output directory: + if output_directory is None: + if verbose: + _LOGGER.info("No output directory given, using temporary directory.") + output_directory = tempfile.mkdtemp() + output_directory = Path(output_directory).absolute() + output_directory.mkdir(exist_ok=True, parents=True) + if verbose: + _LOGGER.info(f"Transcriptions will be saved to: {output_directory}") + + # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization, + # speech diarization per channel): + if speech_diarization: + batch_processor = SpeechDiarizationBatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + speech_diarization=speech_diarization, + ) + elif speech_diarize_per_channel: + batch_processor = PerChannelSpeechDiarizationBatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + n_channels=speech_diarize_per_channel, + speakers=speaker_labels, + ) + else: + batch_processor = BatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + ) + + # Initialize the transcription pipeline: + transcriber = Transcriber( + device=device, + use_flash_attention_2=use_flash_attention_2, + use_better_transformers=use_better_transformers, + assistant_model=assistant_model, + model_name=model_name, + max_new_tokens=max_new_tokens, + chunk_length_s=chunk_length_s, + batch_size=batch_size, + return_timestamps=( + "word" + if speech_diarization is not None or speech_diarize_per_channel is not None + else False + ), + per_channel_transcription=speech_diarize_per_channel or 0, + spoken_language=spoken_language, + translate_to_english=translate_to_english, + ) + + # Run the transcription: + if use_multiprocessing: + results = _parallel_run( + n_workers=use_multiprocessing + if isinstance(use_multiprocessing, int) + else 1, + audio_files=audio_files, + batch_processor=batch_processor, + transcriber=transcriber, + verbose=verbose, + ) + else: + results = _run( + audio_files=audio_files, + batch_processor=batch_processor, + transcriber=transcriber, + verbose=verbose, + ) + + # Process the results: + if verbose: + _LOGGER.info("Summarizing the results.") + successes = [] + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes.append(result) + successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"]) + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(audio_files)})\n" + f"Transcriptions summary:\n" + f"{successes.head()}" + ) + + return str(output_directory), successes, errors
    + + +def _get_audio_files( + data_path: Union[Path, str, list], +) -> List[Path]: + """ + Get the audio files to transcribe. If a path to a directory is given, all files in the directory will be collected. + + :param data_path: The data path to collect the audio files from. + + :returns: The audio files list. + """ + # Check if given a list of paths: + if isinstance(data_path, list): + audio_files = [] + for path in data_path: + audio_files.extend(_get_audio_files(data_path=path)) + return audio_files + + # Check if given a single string path to cast it to a `pathlib.Path`: + if isinstance(data_path, str): + data_path = Path(data_path).absolute() + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a " + f"file. Given: {str(data_path)} " + ) + + return audio_files + + +def _run( + audio_files: List[Path], + batch_processor: BatchProcessor, + transcriber: Transcriber, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run the transcription without multiprocessing. + + :param audio_files: The audio files to transcribe. + :param batch_processor: The batch processor to use. + :param transcriber: The transcriber to use. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Load the transcription pipeline: + if verbose: + _LOGGER.info(f"Loading the transcription pipeline.") + transcriber.load() + if verbose: + _LOGGER.info("Transcription pipeline loaded.") + + # Transcribe the files: + transcriber.transcribe( + audio_files=audio_files, + batch_processor=batch_processor, + verbose=verbose, + ) + + # Return the results: + return batch_processor.get_results() + + +def _parallel_run( + n_workers: int, + audio_files: List[Path], + batch_processor: BatchProcessor, + transcriber: Transcriber, + verbose: bool, +): + """ + Run the transcription with multiprocessing. + + :param n_workers: The amount of workers to use as task completers. + :param audio_files: The audio files to transcribe. + :param batch_processor: The batch processor to use. + :param transcriber: The transcriber to use. + :param verbose: Verbosity. + + :returns: The collected results. 
+ """ + # Initialize the multiprocessing queues: + batches_queue = Queue() + tasks_queue = Queue() + results_queue = Queue() + + # Initialize the multiprocessing processes: + batch_processing_process = Process( + target=_multiprocessing_process_batches, + kwargs={ + "batch_processor": batch_processor, + "batches_queue": batches_queue, + "tasks_queue": tasks_queue, + "n_task_completers": n_workers, + }, + ) + task_completion_processes = [ + Process( + target=_multiprocessing_complete_tasks, + kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue}, + ) + for _ in range(n_workers) + ] + + # Start the multiprocessing processes: + batch_processing_process.start() + for p in task_completion_processes: + p.start() + + # Load the transcription pipeline: + if verbose: + _LOGGER.info(f"Loading the transcription pipeline.") + transcriber.load() + if verbose: + _LOGGER.info("Transcription pipeline loaded.") + + # Transcribe the files: + transcriber.transcribe( + audio_files=audio_files, batches_queue=batches_queue, verbose=verbose + ) + + # Collect the results: + results = [] + stop_marks_counter = 0 + while True: + # Get a result from the queue: + result: Tuple[bool, Tuple[str, str]] = results_queue.get() + if result == _MULTIPROCESSING_STOP_MARK: + stop_marks_counter += 1 + if stop_marks_counter == n_workers: + break + else: + # Collect the result: + results.append(result) + + # Wait for the processes to finish: + results_queue.empty() + batch_processing_process.join() + for p in task_completion_processes: + p.join() + + return results +
    +
    +
    +
    + +
    +
    +
    +
    +
    + +
    +
    +
    + + + + \ No newline at end of file diff --git a/functions/master/transcribe/latest/src/function.yaml b/functions/master/transcribe/latest/src/function.yaml index 40dd2f0e..d72751ad 100644 --- a/functions/master/transcribe/latest/src/function.yaml +++ b/functions/master/transcribe/latest/src/function.yaml @@ -2,12 +2,14 @@ kind: job metadata: name: transcribe tag: '' - hash: 5cd620de67a936ee8a87cfc1f0b97e19730d0a69 + hash: 8810ac74045bd15cee15a2e4e89563e8e29908d3 project: '' labels: author: yonatans categories: - data-preparation + - genai + - huggingface - machine-learning spec: command: '' @@ -24,6 +26,7 @@ spec: - tqdm - torchaudio - torch + - accelerate entry_points: do_task: name: do_task diff --git a/functions/master/transcribe/latest/src/item.yaml b/functions/master/transcribe/latest/src/item.yaml index d53341ff..7fddcf95 100644 --- a/functions/master/transcribe/latest/src/item.yaml +++ b/functions/master/transcribe/latest/src/item.yaml @@ -1,6 +1,8 @@ apiVersion: v1 categories: - data-preparation +- genai +- huggingface - machine-learning description: Transcribe audio files into text files doc: '' @@ -27,4 +29,4 @@ spec: - torch - accelerate url: '' -version: 1.0.0 \ No newline at end of file +version: 1.1.0 \ No newline at end of file diff --git a/functions/master/transcribe/latest/static/function.html b/functions/master/transcribe/latest/static/function.html index 74602322..befb810a 100644 --- a/functions/master/transcribe/latest/static/function.html +++ b/functions/master/transcribe/latest/static/function.html @@ -19,12 +19,14 @@ metadata: name: transcribe tag: '' - hash: 5cd620de67a936ee8a87cfc1f0b97e19730d0a69 + hash: 8810ac74045bd15cee15a2e4e89563e8e29908d3 project: '' labels: author: yonatans categories: - data-preparation + - genai + - huggingface - machine-learning spec: command: '' @@ -41,6 +43,7 @@ - tqdm - torchaudio - torch + - accelerate entry_points: do_task: name: do_task diff --git a/functions/master/transcribe/latest/static/item.html b/functions/master/transcribe/latest/static/item.html index 2da1eccb..fddc2e3e 100644 --- a/functions/master/transcribe/latest/static/item.html +++ b/functions/master/transcribe/latest/static/item.html @@ -18,6 +18,8 @@ apiVersion: v1 categories: - data-preparation +- genai +- huggingface - machine-learning description: Transcribe audio files into text files doc: '' @@ -44,7 +46,7 @@ - torch - accelerate url: '' -version: 1.0.0 +version: 1.1.0 diff --git a/functions/master/translate/0.1.0/src/function.yaml b/functions/master/translate/0.1.0/src/function.yaml new file mode 100644 index 00000000..bb165610 --- /dev/null +++ b/functions/master/translate/0.1.0/src/function.yaml @@ -0,0 +1,135 @@ +kind: job +metadata: + name: translate + tag: '' + hash: 7eedf684bcebfbfd964e5503afbb56335c8f4097 + project: '' + labels: + author: guyl + categories: + - data-preparation + - huggingface + - machine-learning + - deep-learning + - NLP +spec: + command: '' + args: [] + image: '' + build: + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import operator
import pathlib
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import transformers
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    # Check if MLRun is available, set the global logger to MLRun's:
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, (str, pathlib.Path)):
                    input_argument = _get_text_files(
                        data_path=pathlib.Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)
            if rank == 0:
                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                output_directory = output[0][0]
                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)
                errors_dictionary = reduce(
                    operator.ior, [err for _, _, err in output], {}
                )
                return output_directory, dataframe, errors_dictionary
            return None

        return wrapper

    return decorator
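
# --- Illustrative example (assumption, not part of the original module) ---
# With 10 input files and 4 MPI workers, even_chunk_size = 10 // 4 = 2, so ranks 0-2 receive
# indexes [0:2], [2:4] and [4:6] and the last rank receives the remainder [6:10]. Rank 0 then
# gathers the per-worker outputs and merges them into a single dataframe and errors dictionary.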


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def translate(
    data_path: Union[str, List[str], pathlib.Path],
    output_directory: str,
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = 1,
    translation_kwargs: dict = None,
    verbose: bool = False,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Translate text files using a transformer model from Huggingface's hub according to the source and target languages
    given (or using the directly provided model name). The end result is a directory of translated text files and a
    dataframe containing the following columns:

    * text_file - The text file path.
    * translation_file - The translation text file name in the output directory.

    :param data_path:          A directory of text files or a single file or a list of files to translate.
    :param output_directory:   Directory where the translated files will be saved.
    :param model_name:         The name of a model to load. If None, the model name is constructed using the source and
                               target languages parameters.
    :param source_language:    The source language code (e.g., 'en' for English).
    :param target_language:    The target language code (e.g., 'en' for English).
    :param model_kwargs:       Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline`
                               function.
    :param device:             The device index for transformers. Default will prefer cuda if available.
    :param batch_size:         The batch size to use in translation (number of sentences per batch). The files are&#xD;
                               translated one by one, but the sentences within each file can be batched.&#xD;
    :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing
                               the translation inference. Notice the batch size here is being added automatically.
    :param verbose:            Whether to present progress logs, a progress bar and error messages. Default: False.&#xD;

    :returns: A tuple of:

              * Path to the output directory.
              * A dataframe dataset of the translated file names.
              * A dictionary of errored files that were not translated.
    """
    global _LOGGER

    # Get the input text files to translate:
    if verbose:
        _LOGGER.info("Collecting text files.")
    if isinstance(data_path, (str, pathlib.Path)):&#xD;
        data_path = pathlib.Path(data_path).absolute()
        text_files = _get_text_files(data_path=data_path)
    else:
        text_files = data_path
    if verbose:
        _LOGGER.info(f"Collected {len(text_files)} text files.")

    # Get the translation pipeline:
    if verbose:
        _LOGGER.info(f"Loading model - using device '{device}'.")
    translation_pipeline, model_name = _get_translation_pipeline(
        model_name=model_name,
        source_language=source_language,
        target_language=target_language,
        device=device,
        model_kwargs=model_kwargs,
        batch_size=batch_size if batch_size != 1 else None,
    )
    if verbose:
        _LOGGER.info(f"Model '{model_name}' was loaded successfully.")

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    output_directory = pathlib.Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Prepare the translation keyword arguments:
    translation_kwargs = translation_kwargs or {}

    # Go over the text files and translate:&#xD;
    for text_file in tqdm(
        text_files, desc="Translating", unit="file", disable=not verbose
    ):
        try:
            # Translate:
            translation = _translate(
                text_file=text_file,
                translation_pipeline=translation_pipeline,
                translation_kwargs=translation_kwargs,
            )
            # Write the translation to file:&#xD;
            translation_file = _save_to_file(
                translation=translation,
                file_name=text_file.stem,
                output_directory=output_directory,
            )
            # Note as a success in the list:
            successes.append(
                [
                    text_file.name,
                    translation_file.name,
                ]
            )
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            errors[str(text_file.name)] = str(exception)
            continue

    # Construct the translations dataframe:
    columns = [
        "text_file",
        "translation_file",
    ]
    successes = pd.DataFrame(
        successes,
        columns=columns,
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors
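
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# A minimal direct call of `translate`, assuming a hypothetical directory of Turkish text files:
#
#   output_dir, df, errors = translate(
#       data_path="./texts_tr",
#       output_directory="./texts_en",
#       source_language="tr",
#       target_language="en",
#       device="cpu",
#       verbose=True,
#   )
#   # df has the columns: text_file, translation_file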


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _get_translation_pipeline(
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = None,
) -> Tuple[transformers.Pipeline, str]:
    # Construct the model name - if a model name is provided (not None) we use it, otherwise we verify that both the&#xD;
    # source and target languages were provided and construct the model name from them:&#xD;
    if model_name is None and (source_language is None or target_language is None):
        raise ValueError(
            "No model name were given and missing source and / or target languages. In order to translate you must "
            "pass a `model_name` or both `source_language` and `target_language`."
        )
    elif model_name is None:
        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

    # Initialize the translation pipeline:
    try:
        translation_pipeline = transformers.pipeline(
            task="translation",
            model=model_name,
            tokenizer=model_name,
            device=device,
            model_kwargs=model_kwargs,
            batch_size=batch_size,
        )
    except OSError as load_exception:
        if (
            "is not a valid model identifier listed on 'https://huggingface.co/models'"
            in str(load_exception)
            and source_language
        ):
            raise ValueError(
                f"The model '{model_name}' is not a valid model identifier. "
                f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for "
                f"text to text generation, but the model created from the given languages does not exist. "
                f"You may check language identifiers at "
                f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one "
                f"or more language code might be with 3 letters and needs to be found online. "
                f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` "
                f"parameter."
            ) from load_exception
        raise load_exception

    return translation_pipeline, model_name
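
# --- Illustrative example (assumption, not part of the original module) ---
# With source_language="tr" and target_language="en" the constructed model name is
# "Helsinki-NLP/opus-mt-tr-en". Passing `model_name` explicitly skips this construction and
# loads that model from the Huggingface hub directly.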


def _translate(
    text_file: pathlib.Path,
    translation_pipeline: transformers.Pipeline,
    translation_kwargs: dict,
) -> str:
    # Read the text from file:
    with open(text_file, "r") as fp:
        text = fp.read()

    # Split to paragraphs and each paragraph to sentences:
    paragraphs = [paragraph.split(".") for paragraph in text.split("\n")]

    # Discover the newline indexes to restore the file to its structure post translation:
    newlines_indexes = []
    for paragraph in paragraphs[:-1]:
        if len(newlines_indexes) == 0:
            newlines_indexes.append(len(paragraph) - 1)
        else:
            newlines_indexes.append(newlines_indexes[-1] + len(paragraph))

    # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence
    # structure but also to avoid empty strings, which would ruin the translation:&#xD;
    sentences = [f"{line}." for paragraph in paragraphs for line in paragraph]

    # Translate the sentences:
    translations = translation_pipeline(sentences, **translation_kwargs)

    # Restructure the full text from the sentences:
    translated_text = []
    newline_index = newlines_indexes.pop(0) if newlines_indexes else None
    for i, translation in enumerate(translations):
        # Get the translation:
        text = translation["translation_text"]
        # Check whether it was an empty sentence before translation:&#xD;
        if text == ".":
            text = ""
        # Check if needed to insert a newline:
        if newline_index is not None and newline_index == i:&#xD;
            text += "\n"
            newline_index = newlines_indexes.pop(0) if newlines_indexes else None
        # Collect it:
        translated_text.append(text)
    translated_text = "".join(translated_text)

    return translated_text
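
# --- Illustrative worked example (assumption, not part of the original module) ---
# For the text "A. B.\nC." the paragraphs are [["A", " B", ""], ["C", ""]], so
# newlines_indexes == [2] and the batched sentences are ["A.", " B.", ".", "C.", "."].
# After translation (assuming the lone "." comes back unchanged) the sentence at index 2 is
# reduced back to "" and a "\n" is appended there, restoring the original paragraph break in
# the reassembled text.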


def _save_to_file(
    translation: str, file_name: str, output_directory: pathlib.Path
) -> pathlib.Path:
    # Prepare the file full path (checking for no duplications):
    translation_file = output_directory / f"{file_name}.txt"
    i = 1
    while translation_file.exists():
        i += 1
        translation_file = output_directory / f"{file_name}_{i}.txt"

    # Make sure all directories are created:
    translation_file.parent.mkdir(exist_ok=True, parents=True)

    # Write to file:
    with open(translation_file, "w") as fp:
        fp.write(translation)

    return translation_file
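
# --- Illustrative example (assumption, not part of the original module) ---
# If "report.txt" already exists in the output directory, the next translation with the same
# stem is written to "report_2.txt", the one after that to "report_3.txt", and so on.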
 + base_image: mlrun/mlrun + commands: [] + code_origin: '' + origin_filename: '' + requirements: + - transformers + - sentencepiece + - torch + - tqdm + entry_points: + open_mpi_handler: + name: open_mpi_handler + doc: '' + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + outputs: [] + lineno: 56 + has_varargs: false + has_kwargs: false + decorator: + name: decorator + doc: '' + parameters: + - name: handler + outputs: [] + lineno: 68 + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + parameters: [] + outputs: [] + lineno: 73 + has_varargs: false + has_kwargs: true + translate: + name: translate + doc: 'Translate text files using a transformer model from Huggingface''s hub + according to the source and target languages + + given (or using the directly provided model name). The end result is a directory + of translated text files and a + + dataframe containing the following columns: + + + * text_file - The text file path. + + * translation_file - The translation text file name in the output directory.' + parameters: + - name: data_path + type: Union[str, List[str], Path] + doc: A directory of text files or a single file or a list of files to translate. + - name: output_directory + type: str + doc: Directory where the translated files will be saved. + - name: model_name + type: str + doc: The name of a model to load. If None, the model name is constructed using + the source and target languages parameters. + default: null + - name: source_language + type: str + doc: The source language code (e.g., 'en' for English). + default: null + - name: target_language + type: str + doc: The target language code (e.g., 'en' for English). + default: null + - name: device + type: str + doc: The device index for transformers. Default will prefer cuda if available. + default: null + - name: model_kwargs + type: dict + doc: Keyword arguments to pass regarding the loading of the model in HuggingFace's + `pipeline` function. + default: null + - name: batch_size + type: int + doc: The number of batches to use in translation. The files are translated + one by one, but the sentences can be batched. + default: 1 + - name: translation_kwargs + type: dict + doc: Additional keyword arguments to pass to a `transformers.TranslationPipeline` + when doing the translation inference. Notice the batch size here is being + added automatically. + default: null + - name: verbose + type: bool + doc: 'Whether to present logs of a progress bar and errors. Default: True.' 
+ default: false + outputs: + - doc: 'A tuple of:' + type: Tuple[str, pd.DataFrame, dict] + lineno: 135 + has_varargs: false + has_kwargs: false + description: Translate text files from one language to another + default_handler: translate + disable_auto_mount: false + clone_target_dir: '' + env: [] + priority_class_name: '' + preemption_mode: prevent + affinity: null + tolerations: null + security_context: {} +verbose: false diff --git a/functions/master/translate/0.1.0/src/item.yaml b/functions/master/translate/0.1.0/src/item.yaml new file mode 100644 index 00000000..e6394734 --- /dev/null +++ b/functions/master/translate/0.1.0/src/item.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +categories: +- data-preparation +- huggingface +- machine-learning +- deep-learning +- NLP +description: Translate text files from one language to another +doc: '' +example: translate.ipynb +generationDate: 2023-12-05:17-20 +hidden: false +icon: '' +labels: + author: guyl +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.5.1 +name: translate +platformVersion: 3.5.3 +spec: + filename: translate.py + handler: translate + image: mlrun/mlrun + kind: job + requirements: + - transformers + - sentencepiece + - torch + - tqdm +url: '' +version: 0.1.0 +test_valid: True diff --git a/functions/master/translate/0.1.0/src/requirements.txt b/functions/master/translate/0.1.0/src/requirements.txt new file mode 100644 index 00000000..94e54846 --- /dev/null +++ b/functions/master/translate/0.1.0/src/requirements.txt @@ -0,0 +1,4 @@ +transformers +tqdm +torch +sentencepiece \ No newline at end of file diff --git a/functions/master/translate/0.1.0/src/test_translate.py b/functions/master/translate/0.1.0/src/test_translate.py new file mode 100644 index 00000000..a22dc899 --- /dev/null +++ b/functions/master/translate/0.1.0/src/test_translate.py @@ -0,0 +1,51 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os.path +import tempfile + +import mlrun + + +def test_translate(): + project = mlrun.new_project("test-translate") + translate_fn = project.set_function("translate.py", "translate", image="mlrun/mlrun") + input_text = "Ali her gece bir kitap okur." + expected_translation = "Ali reads a book every night." 
+ + with tempfile.TemporaryDirectory() as test_dir: + with tempfile.TemporaryDirectory() as data_dir: + with open(os.path.join(data_dir, "test_tr.txt"), "w") as f: + f.write(input_text) + translate_run = translate_fn.run( + handler="translate", + inputs={ + "data_path": data_dir, + }, + params={ + "model_name": "Helsinki-NLP/opus-mt-tr-en", + "device": "cpu", + "output_directory": test_dir, + }, + local=True, + returns=[ + "files: path", + "text_files_dataframe: dataset", + "errors: dict", + ], + artifact_path=test_dir, + ) + assert translate_run.status.state == "completed" + with open(os.path.join(test_dir, "test_tr.txt")) as f: + assert f.read() == expected_translation + diff --git a/functions/master/translate/0.1.0/src/translate.ipynb b/functions/master/translate/0.1.0/src/translate.ipynb new file mode 100644 index 00000000..5e14ee87 --- /dev/null +++ b/functions/master/translate/0.1.0/src/translate.ipynb @@ -0,0 +1,658 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6d3c20aa-7129-4905-beaa-7011943373f5", + "metadata": {}, + "source": [ + "# Translate tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "afe4a3ee-f886-461c-9830-0fd9a5b625c3", + "metadata": {}, + "source": [ + "## Short description and explenation" + ] + }, + { + "cell_type": "markdown", + "id": "313ed5c3-7416-4bbb-a7fb-aa37ab1f8445", + "metadata": {}, + "source": [ + "Machine translation has made huge strides in recent years thanks to advances in deep learning, our translte function makes it even easier to use.
    \n", + "Simply tell it where your file is and the languages you're working with (the one you're translating from and the one you want),
    \n", + "and this function takes care of the rest. It cleverly picks the right pre-trained model for your language pair, ensuring top-notch translations.
    \n", + "\n", + "No need to worry about finding the perfect model or dealing with complex setup – it's all handled behind the scenes.
    \n", + "\n", + "With this function, language translation becomes a breeze, making your documents accessible in any language without breaking a sweat." + ] + }, + { + "cell_type": "markdown", + "id": "9352f799-fe99-4ace-9b44-ca0e28bb1fb4", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "id": "6026a8bd-e2e7-454a-b325-9550561a587e", + "metadata": {}, + "source": [ + "The function takes two parameters: a model name or the source and target languages, and a path to one or more text files to translate.\n", + "\n", + "It first checks if a model name was passed. If so, it loads that Helsinki-NLP model.
    \n", + "If not, it looks at the source and target languages and loads the appropriate Helsinki-NLP translation model.\n", + "\n", + "It then reads in the text files and translates them using the loaded model.\n", + "\n", + "Finally, it writes the translated text out to new files and returns the filename or dir name.
    \n", + "\n", + "This allows the user to easily translate a text file to another language using Helsinki-NLP's pre-trained models by just passing the model name or language pair and source text file.
    \n", + "\n", + "This function auto-model selection is based on the great translation models offered by Helsinki. Check them out https://huggingface.co/Helsinki-NLP" + ] + }, + { + "cell_type": "markdown", + "id": "42ec9bc3-2b90-40f1-b10b-5493d9e2b75e", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "markdown", + "id": "6b756726-e750-4da4-b032-bf5385f85311", + "metadata": {}, + "source": [ + "`transformers`
    \n", + "`tqdm`
    " + ] + }, + { + "cell_type": "markdown", + "id": "212b8161-3e75-459e-98f3-a5b7c5a15efe", + "metadata": {}, + "source": [ + "## Documentation" + ] + }, + { + "cell_type": "markdown", + "id": "9b5fe561-4fbb-4471-91bb-532fa55559f9", + "metadata": {}, + "source": [ + "`data_path`: A directory of text files or a single text file or a list of files to translate.\n", + "\n", + "`output_directory`: Directory where the translated files will be saved.\n", + "\n", + "`model_name`: The name of a model to load. If None, the model name is constructed using the source and
    \n", + " target languages parameters from the \"Helsinki-NLP\" group.\n", + " \n", + "`source_language`: The source language code (e.g., 'en' for English).\n", + "\n", + "`target_language`: The target language code (e.g., 'en' for English).\n", + "\n", + "`model_kwargs`: Keyword arguments to pass regarding the loading of the model in HuggingFace's \"pipeline\"\n", + " function.\n", + " \n", + "`device`: The device index for transformers. Default will prefer cuda if available.\n", + "\n", + "`batch_size`: The number of batches to use in translation. The files are translated one by one, but the\n", + " sentences can be batched.\n", + " \n", + "`translation_kwargs`: Additional keyword arguments to pass to a \"transformers.TranslationPipeline\" when doing
    \n", + " the translation inference. Notice the batch size here is being added automatically.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2e6f44a6-d6ac-48ed-a7d1-936d25e7426c", + "metadata": {}, + "source": [ + "## Demo " + ] + }, + { + "cell_type": "markdown", + "id": "2b231e4c-0224-41a2-87cf-400a4680e2b9", + "metadata": {}, + "source": [ + "The following demo will show an example of translating a text file written in turkish to eanglish using the _tranlate_ function.
    \n", + "\n", + "### (1.) Import the function (import mlrun, set project and import function)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "797ef0d4-f435-485c-b705-e1d6115fb8fd", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "markdown", + "id": "1ff51127-dc54-44d2-bd13-0b81165b2033", + "metadata": {}, + "source": [ + "We want to translate the following turkish sentence into english, so we will write it to a text file." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f9517cc8-a0d6-4169-b746-cf4c265e6a3b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing data.txt\n" + ] + } + ], + "source": [ + "%%writefile data.txt\n", + "Ali her gece bir kitap okur. # which means: \"Ali reads a book every night.\"" + ] + }, + { + "cell_type": "markdown", + "id": "c24d71a7-9400-475a-9472-424658801914", + "metadata": {}, + "source": [ + "Setting a project and importing the translate function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e61184ea-44a3-4184-9a2f-9c45b90fdc0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:44:05,223 [info] Created and saved project: {'name': 'test-translate', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n" + ] + } + ], + "source": [ + "project = mlrun.new_project(\"test-translate\")\n", + "translate_fn = project.set_function(\"hub://translate\", \"translate\")" + ] + }, + { + "cell_type": "markdown", + "id": "558260ce-e453-4e05-a6a7-b2df39cff1b9", + "metadata": {}, + "source": [ + "## Usage" + ] + }, + { + "cell_type": "markdown", + "id": "5a1781ee-a210-4dc1-82de-0f4f5d191173", + "metadata": {}, + "source": [ + "### (2.1.) Manual model selection\n", + "Here we run our function that we've imported from the MLRun Function Hub.
    \n", + "We select the specific model, give the function a path to to the file and output directory and choose to run on the cpu." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9b3107fd-b78d-43de-b4a2-ad3863f72a03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:48:52,794 [info] Storing function: {'name': 'translate-translate', 'uid': '5768d0ddaf06469da053c85d47f61a47', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Recommended: pip install sacremoses.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:48:56,190 [warning] Skipping logging an object with the log hint '{'key': 'errors', 'artifact_type': 'dict'}' due to the following error:\n", + "An exception was raised during the packing of '{}': No packager was found for the combination of 'object_type=builtins.dict' and 'artifact_type=dict'.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    test-translate0Dec 06 14:48:52completedtranslate-translate
    v3io_user=yonis
    kind=local
    owner=yonis
    host=jupyter-yonis-7c9bdbfb4d-9g2p2
    data_path
    model_name=Helsinki-NLP/opus-mt-tr-en
    device=cpu
    output_directory=./
    files
    text_files_dataframe
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:48:56,409 [info] Run execution finished: {'status': 'completed', 'name': 'translate-translate'}\n" + ] + } + ], + "source": [ + "translate_run = translate_fn.run(\n", + " handler=\"translate\",\n", + " inputs={\"data_path\": \"data.txt\"},\n", + " params={\n", + " \"model_name\": \"Helsinki-NLP/opus-mt-tr-en\",\n", + " \"device\": \"cpu\",\n", + " \"output_directory\": \"./\",\n", + " },\n", + " local=True,\n", + " returns=[\n", + " \"files: path\",\n", + " \"text_files_dataframe: dataset\",\n", + " \"errors: dict\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8b2fcf2b-3893-4dda-85e2-4a2b9ed0d963", + "metadata": {}, + "source": [ + "### (2.1.) Auto model detectyion" + ] + }, + { + "cell_type": "markdown", + "id": "8c3d24ca-8df7-4204-8b0d-e7a08d53d8c9", + "metadata": {}, + "source": [ + "Here we run our function that we've imported from the MLRun Function Hub.
    \n", + "We select the languages to use for choosing the model, give the function a path to to the file and output directory and choose to run on the cpu." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbe10afd-5ede-4475-abc2-bb07dfdf33aa", + "metadata": {}, + "outputs": [], + "source": [ + "translate_run = translate_fn.run(\n", + " handler=\"translate\",\n", + " inputs={\"data_path\": \"data.txt\"},\n", + " params={\n", + " \"target_language\": \"en\",\n", + " \"source_language\": \"tr\",\n", + " \"device\": \"cpu\",\n", + " \"output_directory\": \"./\",\n", + " },\n", + " local=True,\n", + " returns=[\n", + " \"files: path\",\n", + " \"text_files_dataframe: dataset\",\n", + " \"errors: dict\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "40e4a666-9680-40d6-93ee-9466d31a9efc", + "metadata": {}, + "source": [ + "We can take alook at the file created" + ] + }, + { + "cell_type": "markdown", + "id": "89a1952c-f3c3-4a7b-bad4-b59c701a5af6", + "metadata": {}, + "source": [ + "### (3.) Review results" + ] + }, + { + "cell_type": "markdown", + "id": "9d583cf9-7e81-4d0d-982f-aba345d4cf9c", + "metadata": {}, + "source": [ + "We can look at the articat returned, the import " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c3dab6f8-6089-46c2-b4b9-899a2442403f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    text_filetranslation_file
    0data.txtdata_2.txt
    \n", + "
    " + ], + "text/plain": [ + " text_file translation_file\n", + "0 data.txt data_2.txt" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "translate_run.artifact(\"text_files_dataframe\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "580a20a2-4877-48b4-8f83-59cbfc2f3b83", + "metadata": {}, + "source": [ + "Checking that translation is correct, we print the text file created by function, and can see the sentence is as expected." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0157bcaf-8f2c-4995-a214-32f2710da4c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translated text:\n", + "Ali reads a book every night.\n", + "\n" + ] + } + ], + "source": [ + "with open(\"data_2.txt\", \"r\") as f:\n", + " print(f\"Translated text:\\n{f.read()}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/translate/0.1.0/src/translate.py b/functions/master/translate/0.1.0/src/translate.py new file mode 100644 index 00000000..360fa620 --- /dev/null +++ b/functions/master/translate/0.1.0/src/translate.py @@ -0,0 +1,396 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import operator +import pathlib +from functools import reduce, wraps +from typing import Any, Dict, List, Tuple, Union + +import pandas as pd +import transformers +from tqdm import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." 
+ ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, (str, pathlib.Path)): + input_argument = _get_text_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." + ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + output_directory = output[0][0] + dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0) + errors_dictionary = reduce( + operator.ior, [err for _, _, err in output], {} + ) + return output_directory, dataframe, errors_dictionary + return None + + return wrapper + + return decorator + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def translate( + data_path: Union[str, List[str], pathlib.Path], + output_directory: str, + model_name: str = None, + source_language: str = None, + target_language: str = None, + device: str = None, + model_kwargs: dict = None, + batch_size: int = 1, + translation_kwargs: dict = None, + verbose: bool = False, +) -> Tuple[str, pd.DataFrame, dict]: + """ + Translate text files using a transformer model from Huggingface's hub according to the source and target languages + given (or using the directly provided model name). The end result is a directory of translated text files and a + dataframe containing the following columns: + + * text_file - The text file path. + * translation_file - The translation text file name in the output directory. + + :param data_path: A directory of text files or a single file or a list of files to translate. + :param output_directory: Directory where the translated files will be saved. + :param model_name: The name of a model to load. 
If None, the model name is constructed using the source and + target languages parameters. + :param source_language: The source language code (e.g., 'en' for English). + :param target_language: The target language code (e.g., 'en' for English). + :param model_kwargs: Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline` + function. + :param device: The device index for transformers. Default will prefer cuda if available. + :param batch_size: The number of batches to use in translation. The files are translated one by one, but the + sentences can be batched. + :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing + the translation inference. Notice the batch size here is being added automatically. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + :returns: A tuple of: + + * Path to the output directory. + * A dataframe dataset of the translated file names. + * A dictionary of errored files that were not translated. + """ + global _LOGGER + + # Get the input text files to translate: + if verbose: + _LOGGER.info("Collecting text files.") + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + text_files = _get_text_files(data_path=data_path) + else: + text_files = data_path + if verbose: + _LOGGER.info(f"Collected {len(text_files)} text files.") + + # Get the translation pipeline: + if verbose: + _LOGGER.info(f"Loading model - using device '{device}'.") + translation_pipeline, model_name = _get_translation_pipeline( + model_name=model_name, + source_language=source_language, + target_language=target_language, + device=device, + model_kwargs=model_kwargs, + batch_size=batch_size if batch_size != 1 else None, + ) + if verbose: + _LOGGER.info(f"Model '{model_name}' was loaded successfully.") + + # Prepare the successes dataframe and errors dictionary to be returned: + successes = [] + errors = {} + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + output_directory.mkdir(parents=True, exist_ok=True) + + # Prepare the translation keyword arguments: + translation_kwargs = translation_kwargs or {} + + # Go over the audio files and transcribe: + for text_file in tqdm( + text_files, desc="Translating", unit="file", disable=not verbose + ): + try: + # Translate: + translation = _translate( + text_file=text_file, + translation_pipeline=translation_pipeline, + translation_kwargs=translation_kwargs, + ) + # Write the transcription to file: + translation_file = _save_to_file( + translation=translation, + file_name=text_file.stem, + output_directory=output_directory, + ) + # Note as a success in the list: + successes.append( + [ + text_file.name, + translation_file.name, + ] + ) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{text_file.name}'") + errors[str(text_file.name)] = str(exception) + continue + + # Construct the translations dataframe: + columns = [ + "text_file", + "translation_file", + ] + successes = pd.DataFrame( + successes, + columns=columns, + ) + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(text_files)})\n" + f"Translations summary:\n" + f"{successes.head()}" + ) + return str(output_directory), successes, errors + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory 
or a file: + if data_path.is_dir(): + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return text_files + + +def _get_translation_pipeline( + model_name: str = None, + source_language: str = None, + target_language: str = None, + device: str = None, + model_kwargs: dict = None, + batch_size: int = None, +) -> Tuple[transformers.Pipeline, str]: + # Construct the model name - if model name is provided (not None) then we take it, otherwise we check both source + # and target were provided to construct the model name: + if model_name is None and (source_language is None or target_language is None): + raise ValueError( + "No model name were given and missing source and / or target languages. In order to translate you must " + "pass a `model_name` or both `source_language` and `target_language`." + ) + elif model_name is None: + model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}" + + # Initialize the translation pipeline: + try: + translation_pipeline = transformers.pipeline( + task="translation", + model=model_name, + tokenizer=model_name, + device=device, + model_kwargs=model_kwargs, + batch_size=batch_size, + ) + except OSError as load_exception: + if ( + "is not a valid model identifier listed on 'https://huggingface.co/models'" + in str(load_exception) + and source_language + ): + raise ValueError( + f"The model '{model_name}' is not a valid model identifier. " + f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for " + f"text to text generation, but the model created from the given languages does not exist. " + f"You may check language identifiers at " + f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one " + f"or more language code might be with 3 letters and needs to be found online. " + f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` " + f"parameter." + ) from load_exception + raise load_exception + + return translation_pipeline, model_name + + +def _translate( + text_file: pathlib.Path, + translation_pipeline: transformers.Pipeline, + translation_kwargs: dict, +) -> str: + # Read the text from file: + with open(text_file, "r") as fp: + text = fp.read() + + # Split to paragraphs and each paragraph to sentences: + paragraphs = [paragraph.split(".") for paragraph in text.split("\n")] + + # Discover the newline indexes to restore the file to its structure post translation: + newlines_indexes = [] + for paragraph in paragraphs[:-1]: + if len(newlines_indexes) == 0: + newlines_indexes.append(len(paragraph) - 1) + else: + newlines_indexes.append(newlines_indexes[-1] + len(paragraph)) + + # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence + # structure but to ignore empty strings as it will ruin the translation: + sentences = [f"{line}." 
for paragraph in paragraphs for line in paragraph] + + # Translate the sentences: + translations = translation_pipeline(sentences, **translation_kwargs) + + # Restructure the full text from the sentences: + translated_text = [] + newline_index = newlines_indexes.pop(0) if newlines_indexes else None + for i, translation in enumerate(translations): + # Get the translation: + text = translation["translation_text"] + # Validate if it was an empty sentence before: + if text == ".": + text = "" + # Check if needed to insert a newline: + if newline_index and newline_index == i: + text += "\n" + newline_index = newlines_indexes.pop(0) if newlines_indexes else None + # Collect it: + translated_text.append(text) + translated_text = "".join(translated_text) + + return translated_text + + +def _save_to_file( + translation: str, file_name: str, output_directory: pathlib.Path +) -> pathlib.Path: + # Prepare the file full path (checking for no duplications): + translation_file = output_directory / f"{file_name}.txt" + i = 1 + while translation_file.exists(): + i += 1 + translation_file = output_directory / f"{file_name}_{i}.txt" + + # Make sure all directories are created: + translation_file.parent.mkdir(exist_ok=True, parents=True) + + # Write to file: + with open(translation_file, "w") as fp: + fp.write(translation) + + return translation_file diff --git a/functions/master/translate/0.1.0/static/documentation.html b/functions/master/translate/0.1.0/static/documentation.html new file mode 100644 index 00000000..a14e4045 --- /dev/null +++ b/functions/master/translate/0.1.0/static/documentation.html @@ -0,0 +1,266 @@ + + + + + + + +translate package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    translate package#

    +
    +

    Submodules#

    +
    +
    +

    translate.translate module#

    +
    +
    +translate.translate.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Optional[Dict[str, Any]] = None)[source]#
    +
    +
    +
    +translate.translate.translate(data_path: Union[str, List[str], pathlib.Path], output_directory: str, model_name: Optional[str] = None, source_language: Optional[str] = None, target_language: Optional[str] = None, device: Optional[str] = None, model_kwargs: Optional[dict] = None, batch_size: int = 1, translation_kwargs: Optional[dict] = None, verbose: bool = False)Tuple[str, pandas.core.frame.DataFrame, dict][source]#
    +

Translate text files using a transformer model from Huggingface’s hub according to the source and target languages given (or using the directly provided model name). The end result is a directory of translated text files and a dataframe containing the following columns:

• text_file - The text file path.
• translation_file - The translation text file name in the output directory.
    +
    Parameters
• data_path – A directory of text files or a single file or a list of files to translate.
• output_directory – Directory where the translated files will be saved.
• model_name – The name of a model to load. If None, the model name is constructed using the source and target languages parameters.
• source_language – The source language code (e.g., ‘en’ for English).
• target_language – The target language code (e.g., ‘en’ for English).
• model_kwargs – Keyword arguments to pass regarding the loading of the model in HuggingFace’s pipeline function.
• device – The device index for transformers. Default will prefer cuda if available.
• batch_size – The number of batches to use in translation. The files are translated one by one, but the sentences can be batched.
• translation_kwargs – Additional keyword arguments to pass to a transformers.TranslationPipeline when doing the translation inference. Notice the batch size here is being added automatically.
• verbose – Whether to present logs of a progress bar and errors. Default: True.
    +
    Returns
A tuple of:

• Path to the output directory.
• A dataframe dataset of the translated file names.
• A dictionary of errored files that were not translated.

    +
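
For orientation, here is a minimal sketch of calling the function directly in Python, assuming the package layout above (translate.translate) and that mlrun, transformers, torch, sentencepiece and tqdm are installed; the paths and language codes are illustrative:

from translate.translate import translate

# Translate a single Turkish text file to English on the CPU; the Helsinki-NLP
# model name is derived from the language codes as described above.
output_dir, translations_df, errors = translate(
    data_path="data.txt",
    output_directory="./translations",
    source_language="tr",
    target_language="en",
    device="cpu",
    verbose=True,
)
print(output_dir)
print(translations_df)
print(errors)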
    +
    +
    +
    +
    +

    Module contents#

    + + + + \ No newline at end of file diff --git a/functions/master/translate/0.1.0/static/example.html b/functions/master/translate/0.1.0/static/example.html new file mode 100644 index 00000000..0476b66a --- /dev/null +++ b/functions/master/translate/0.1.0/static/example.html @@ -0,0 +1,705 @@ + + + + + + + +Translate tutorial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Translate tutorial#

    +
    +

Short description and explanation#

    +

Machine translation has made huge strides in recent years thanks to advances in deep learning, and our translate function makes it even easier to use.
    +Simply tell it where your file is and the languages you’re working with (the one you’re translating from and the one you want),
    +and this function takes care of the rest. It cleverly picks the right pre-trained model for your language pair, ensuring top-notch translations.

    +

    No need to worry about finding the perfect model or dealing with complex setup – it’s all handled behind the scenes.

    +

    With this function, language translation becomes a breeze, making your documents accessible in any language without breaking a sweat.

    +
    +
    +

    Background#

    +

    The function takes two parameters: a model name or the source and target languages, and a path to one or more text files to translate.

    +

    It first checks if a model name was passed. If so, it loads that Helsinki-NLP model.
    +If not, it looks at the source and target languages and loads the appropriate Helsinki-NLP translation model.

    +

    It then reads in the text files and translates them using the loaded model.

    +

    Finally, it writes the translated text out to new files and returns the filename or dir name.

    +

    This allows the user to easily translate a text file to another language using Helsinki-NLP’s pre-trained models by just passing the model name or language pair and source text file.

    +

This function's auto-model selection is based on the great translation models offered by Helsinki-NLP. Check them out at https://huggingface.co/Helsinki-NLP
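
As a rough sketch of what happens under the hood (names taken from this release's translate.py; the example sentence is the one used in the demo below):

import transformers

source_language, target_language = "tr", "en"
model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

# The function builds a standard transformers translation pipeline from the
# constructed model name and runs it sentence by sentence.
translation_pipeline = transformers.pipeline(
    task="translation",
    model=model_name,
    tokenizer=model_name,
    device="cpu",
)
print(translation_pipeline(["Ali her gece bir kitap okur."])[0]["translation_text"])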

    +
    +
    +

    Requirements#

    +

    transformers
    +tqdm
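
The item.yaml and requirements.txt shipped with this version also list torch and sentencepiece, so a local install could look like (assuming pip):

pip install transformers tqdm torch sentencepiece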

    +
    +
    +

    Documentation#

    +

    data_path: A directory of text files or a single text file or a list of files to translate.

    +

    output_directory: Directory where the translated files will be saved.

    +

    model_name: The name of a model to load. If None, the model name is constructed using the source and
    +target languages parameters from the “Helsinki-NLP” group.

    +

    source_language: The source language code (e.g., ‘en’ for English).

    +

    target_language: The target language code (e.g., ‘en’ for English).

    +

model_kwargs: Keyword arguments to pass regarding the loading of the model in HuggingFace’s “pipeline” function.

    +

    device: The device index for transformers. Default will prefer cuda if available.

    +

batch_size: The number of batches to use in translation. The files are translated one by one, but the sentences can be batched.

    +

    translation_kwargs: Additional keyword arguments to pass to a “transformers.TranslationPipeline” when doing
    +the translation inference. Notice the batch size here is being added automatically.
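
As a sketch of how the keyword-argument dictionaries are passed through MLRun params (using translate_fn as imported in the demo below; max_length is an illustrative transformers generation argument, not a default of this function):

translate_run = translate_fn.run(
    handler="translate",
    inputs={"data_path": "data.txt"},
    params={
        "source_language": "tr",
        "target_language": "en",
        "device": "cpu",
        "output_directory": "./",
        "batch_size": 4,
        "translation_kwargs": {"max_length": 400},
    },
    local=True,
    returns=["files: path", "text_files_dataframe: dataset", "errors: dict"],
)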

    +
    +
    +

    Demo#

    +

The following demo shows an example of translating a text file written in Turkish to English using the translate function.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import mlrun
    +
    +
    +
    +
    +

We want to translate the following Turkish sentence into English, so we will write it to a text file.

    +
    +
    +
    %%writefile data.txt
    +Ali her gece bir kitap okur. # which means: "Ali reads a book every night."
    +
    +
    +
    +
    +
    Writing data.txt
    +
    +
    +
    +
    +

    Setting a project and importing the translate function

    +
    +
    +
    project = mlrun.new_project("test-translate")
    +translate_fn = project.set_function("hub://translate", "translate")
    +
    +
    +
    +
    +
    > 2023-12-06 14:44:05,223 [info] Created and saved project: {'name': 'test-translate', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}
    +
    +
    +
    +
    +
    +
    +
    +

    Usage#

    +
    +

    (2.1.) Manual model selection#

    +

    Here we run our function that we’ve imported from the MLRun Function Hub.
+We select the specific model, give the function a path to the file and an output directory, and choose to run on the CPU.

    +
    +
    +
    translate_run = translate_fn.run(
    +    handler="translate",
    +    inputs={"data_path": "data.txt"},
    +    params={
    +        "model_name": "Helsinki-NLP/opus-mt-tr-en",
    +        "device": "cpu",
    +        "output_directory": "./",
    +    },
    +    local=True,
    +    returns=[
    +        "files: path",
    +        "text_files_dataframe: dataset",
    +        "errors: dict",
    +    ],
    +)
    +
    +
    +
    +
    +
    > 2023-12-06 14:48:52,794 [info] Storing function: {'name': 'translate-translate', 'uid': '5768d0ddaf06469da053c85d47f61a47', 'db': 'http://mlrun-api:8080'}
    +
    +
    +
    Recommended: pip install sacremoses.
    +
    +
    +
    > 2023-12-06 14:48:56,190 [warning] Skipping logging an object with the log hint '{'key': 'errors', 'artifact_type': 'dict'}' due to the following error:
    +An exception was raised during the packing of '{}': No packager was found for the combination of 'object_type=builtins.dict' and 'artifact_type=dict'.
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
project: test-translate | iter: 0 | start: Dec 06 14:48:52 | state: completed | name: translate-translate
labels: v3io_user=yonis, kind=local, owner=yonis, host=jupyter-yonis-7c9bdbfb4d-9g2p2
inputs: data_path
parameters: model_name=Helsinki-NLP/opus-mt-tr-en, device=cpu, output_directory=./
artifacts: files, text_files_dataframe
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-12-06 14:48:56,409 [info] Run execution finished: {'status': 'completed', 'name': 'translate-translate'}
    +
    +
    +
    +
    +
    +
    +

(2.2.) Auto model detection#

    +

    Here we run our function that we’ve imported from the MLRun Function Hub.
+We select the languages used to choose the model, give the function a path to the file and an output directory, and choose to run on the CPU.

    +
    +
    +
    translate_run = translate_fn.run(
    +    handler="translate",
    +    inputs={"data_path": "data.txt"},
    +    params={
    +        "target_language": "en",
    +        "source_language": "tr",
    +        "device": "cpu",
    +        "output_directory": "./",
    +    },
    +    local=True,
    +    returns=[
    +        "files: path",
    +        "text_files_dataframe: dataset",
    +        "errors: dict",
    +    ],
    +)
    +
    +
    +
    +
    +

We can take a look at the file that was created.

    +
    +
    +

    (3.) Review results#

    +

We can look at the artifact returned, the text_files_dataframe dataset, which maps each input text file to its translation file.

    +
    +
    +
    translate_run.artifact("text_files_dataframe").show()
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + +
   text_file  translation_file
0  data.txt   data_2.txt
    +
    +
    +

Checking that the translation is correct, we print the text file created by the function (it was saved as data_2.txt because data.txt already exists in the output directory, so a numeric suffix was appended) and can see the sentence is as expected.

    +
    +
    +
    with open("data_2.txt", "r") as f:
    +    print(f"Translated text:\n{f.read()}")
    +
    +
    +
    +
    +
    Translated text:
    +Ali reads a book every night.
    +
    + + + + \ No newline at end of file diff --git a/functions/master/translate/0.1.0/static/function.html b/functions/master/translate/0.1.0/static/function.html new file mode 100644 index 00000000..9103bffc --- /dev/null +++ b/functions/master/translate/0.1.0/static/function.html @@ -0,0 +1,157 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  name: translate
    +  tag: ''
    +  hash: 7eedf684bcebfbfd964e5503afbb56335c8f4097
    +  project: ''
    +  labels:
    +    author: guyl
    +  categories:
    +  - data-preparation
    +  - huggingface
    +  - machine-learning
    +  - deep-learning
    +  - NLP
    +spec:
    +  command: ''
    +  args: []
    +  image: ''
    +  build:
    +    functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import operator
import pathlib
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import transformers
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    global _LOGGER

    # Check for MLRun and OpenMPI availability:
    context, comm = _check_mlrun_and_open_mpi()

    # Check if MLRun is available, set the global logger to MLRun's:
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # Get the open mpi environment properties:
            size = comm.Get_size()
            rank = comm.Get_rank()

            # Give the correct chunk of the workers inputs:
            for worker_input in worker_inputs:
                input_argument = kwargs[worker_input]
                if input_argument is None:
                    continue
                if isinstance(input_argument, (str, pathlib.Path)):
                    input_argument = _get_text_files(
                        data_path=pathlib.Path(input_argument).absolute()
                    )
                if len(input_argument) < size:
                    raise ValueError(
                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                even_chunk_size = len(input_argument) // size
                chunk_start = rank * even_chunk_size
                chunk_end = (
                    (rank + 1) * even_chunk_size
                    if rank + 1 < size
                    else len(input_argument)
                )
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(input_argument, list):
                    input_argument = input_argument[chunk_start:chunk_end]
                elif isinstance(input_argument, pd.DataFrame):
                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
                kwargs[worker_input] = input_argument

            # Set the root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run the worker:
            output = handler(**kwargs)

            # Send the output to the root rank (rank #0):
            output = comm.gather(output, root=0)
            if rank == 0:
                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                output_directory = output[0][0]
                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)
                errors_dictionary = reduce(
                    operator.ior, [err for _, _, err in output], {}
                )
                return output_directory, dataframe, errors_dictionary
            return None

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def translate(
    data_path: Union[str, List[str], pathlib.Path],
    output_directory: str,
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = 1,
    translation_kwargs: dict = None,
    verbose: bool = False,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Translate text files using a transformer model from Huggingface's hub according to the source and target languages
    given (or using the directly provided model name). The end result is a directory of translated text files and a
    dataframe containing the following columns:

    * text_file - The text file path.
    * translation_file - The translation text file name in the output directory.

    :param data_path:          A directory of text files or a single file or a list of files to translate.
    :param output_directory:   Directory where the translated files will be saved.
    :param model_name:         The name of a model to load. If None, the model name is constructed using the source and
                               target languages parameters.
    :param source_language:    The source language code (e.g., 'en' for English).
    :param target_language:    The target language code (e.g., 'en' for English).
    :param model_kwargs:       Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline`
                               function.
    :param device:             The device index for transformers. Default will prefer cuda if available.
    :param batch_size:         The number of batches to use in translation. The files are translated one by one, but the
                               sentences can be batched.
    :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing
                               the translation inference. Notice the batch size here is being added automatically.
    :param verbose:            Whether to present logs of a progress bar and errors. Default: True.

    :returns: A tuple of:

              * Path to the output directory.
              * A dataframe dataset of the translated file names.
              * A dictionary of errored files that were not translated.
    """
    global _LOGGER

    # Get the input text files to translate:
    if verbose:
        _LOGGER.info("Collecting text files.")
    if isinstance(data_path, str):
        data_path = pathlib.Path(data_path).absolute()
        text_files = _get_text_files(data_path=data_path)
    else:
        text_files = data_path
    if verbose:
        _LOGGER.info(f"Collected {len(text_files)} text files.")

    # Get the translation pipeline:
    if verbose:
        _LOGGER.info(f"Loading model - using device '{device}'.")
    translation_pipeline, model_name = _get_translation_pipeline(
        model_name=model_name,
        source_language=source_language,
        target_language=target_language,
        device=device,
        model_kwargs=model_kwargs,
        batch_size=batch_size if batch_size != 1 else None,
    )
    if verbose:
        _LOGGER.info(f"Model '{model_name}' was loaded successfully.")

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    output_directory = pathlib.Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Prepare the translation keyword arguments:
    translation_kwargs = translation_kwargs or {}

    # Go over the audio files and transcribe:
    for text_file in tqdm(
        text_files, desc="Translating", unit="file", disable=not verbose
    ):
        try:
            # Translate:
            translation = _translate(
                text_file=text_file,
                translation_pipeline=translation_pipeline,
                translation_kwargs=translation_kwargs,
            )
            # Write the transcription to file:
            translation_file = _save_to_file(
                translation=translation,
                file_name=text_file.stem,
                output_directory=output_directory,
            )
            # Note as a success in the list:
            successes.append(
                [
                    text_file.name,
                    translation_file.name,
                ]
            )
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            errors[str(text_file.name)] = str(exception)
            continue

    # Construct the translations dataframe:
    columns = [
        "text_file",
        "translation_file",
    ]
    successes = pd.DataFrame(
        successes,
        columns=columns,
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _get_translation_pipeline(
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = None,
) -> Tuple[transformers.Pipeline, str]:
    # Construct the model name - if model name is provided (not None) then we take it, otherwise we check both source
    # and target were provided to construct the model name:
    if model_name is None and (source_language is None or target_language is None):
        raise ValueError(
            "No model name were given and missing source and / or target languages. In order to translate you must "
            "pass a `model_name` or both `source_language` and `target_language`."
        )
    elif model_name is None:
        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

    # Initialize the translation pipeline:
    try:
        translation_pipeline = transformers.pipeline(
            task="translation",
            model=model_name,
            tokenizer=model_name,
            device=device,
            model_kwargs=model_kwargs,
            batch_size=batch_size,
        )
    except OSError as load_exception:
        if (
            "is not a valid model identifier listed on 'https://huggingface.co/models'"
            in str(load_exception)
            and source_language
        ):
            raise ValueError(
                f"The model '{model_name}' is not a valid model identifier. "
                f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for "
                f"text to text generation, but the model created from the given languages does not exist. "
                f"You may check language identifiers at "
                f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one "
                f"or more language code might be with 3 letters and needs to be found online. "
                f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` "
                f"parameter."
            ) from load_exception
        raise load_exception

    return translation_pipeline, model_name


def _translate(
    text_file: pathlib.Path,
    translation_pipeline: transformers.Pipeline,
    translation_kwargs: dict,
) -> str:
    # Read the text from file:
    with open(text_file, "r") as fp:
        text = fp.read()

    # Split to paragraphs and each paragraph to sentences:
    paragraphs = [paragraph.split(".") for paragraph in text.split("\n")]

    # Discover the newline indexes to restore the file to its structure post translation:
    newlines_indexes = []
    for paragraph in paragraphs[:-1]:
        if len(newlines_indexes) == 0:
            newlines_indexes.append(len(paragraph) - 1)
        else:
            newlines_indexes.append(newlines_indexes[-1] + len(paragraph))

    # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence
    # structure but to ignore empty strings as it will ruin the translation:
    sentences = [f"{line}." for paragraph in paragraphs for line in paragraph]

    # Translate the sentences:
    translations = translation_pipeline(sentences, **translation_kwargs)

    # Restructure the full text from the sentences:
    translated_text = []
    newline_index = newlines_indexes.pop(0) if newlines_indexes else None
    for i, translation in enumerate(translations):
        # Get the translation:
        text = translation["translation_text"]
        # Validate if it was an empty sentence before:
        if text == ".":
            text = ""
        # Check if needed to insert a newline:
        if newline_index and newline_index == i:
            text += "\n"
            newline_index = newlines_indexes.pop(0) if newlines_indexes else None
        # Collect it:
        translated_text.append(text)
    translated_text = "".join(translated_text)

    return translated_text


def _save_to_file(
    translation: str, file_name: str, output_directory: pathlib.Path
) -> pathlib.Path:
    # Prepare the file full path (checking for no duplications):
    translation_file = output_directory / f"{file_name}.txt"
    i = 1
    while translation_file.exists():
        i += 1
        translation_file = output_directory / f"{file_name}_{i}.txt"

    # Make sure all directories are created:
    translation_file.parent.mkdir(exist_ok=True, parents=True)

    # Write to file:
    with open(translation_file, "w") as fp:
        fp.write(translation)

    return translation_file
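

# A minimal local-run sketch (not part of the original module). The paths below are
# placeholders and the language pair assumes a matching Helsinki-NLP model exists;
# outside an MLRun "mpijob" the `open_mpi_handler` decorator is a no-op, so this is
# a plain function call.
if __name__ == "__main__":
    out_dir, summary, failures = translate(
        data_path="./docs_to_translate",
        output_directory="./translated_docs",
        source_language="en",
        target_language="de",
        verbose=True,
    )
    print(out_dir)
    print(summary)
    print(failures)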

    +    base_image: mlrun/mlrun
    +    commands: []
    +    code_origin: ''
    +    origin_filename: ''
    +    requirements:
    +    - transformers
    +    - sentencepiece
    +    - torch
    +    - tqdm
    +  entry_points:
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      doc: ''
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      outputs: []
    +      lineno: 56
    +      has_varargs: false
    +      has_kwargs: false
    +    decorator:
    +      name: decorator
    +      doc: ''
    +      parameters:
    +      - name: handler
    +      outputs: []
    +      lineno: 68
    +      has_varargs: false
    +      has_kwargs: false
    +    wrapper:
    +      name: wrapper
    +      doc: ''
    +      parameters: []
    +      outputs: []
    +      lineno: 73
    +      has_varargs: false
    +      has_kwargs: true
    +    translate:
    +      name: translate
    +      doc: 'Translate text files using a transformer model from Huggingface''s hub
    +        according to the source and target languages
    +
    +        given (or using the directly provided model name). The end result is a directory
    +        of translated text files and a
    +
    +        dataframe containing the following columns:
    +
    +
    +        * text_file - The text file path.
    +
    +        * translation_file - The translation text file name in the output directory.'
    +      parameters:
    +      - name: data_path
    +        type: Union[str, List[str], Path]
    +        doc: A directory of text files or a single file or a list of files to translate.
    +      - name: output_directory
    +        type: str
    +        doc: Directory where the translated files will be saved.
    +      - name: model_name
    +        type: str
    +        doc: The name of a model to load. If None, the model name is constructed using
    +          the source and target languages parameters.
    +        default: null
    +      - name: source_language
    +        type: str
    +        doc: The source language code (e.g., 'en' for English).
    +        default: null
    +      - name: target_language
    +        type: str
    +        doc: The target language code (e.g., 'en' for English).
    +        default: null
    +      - name: device
    +        type: str
    +        doc: The device index for transformers. Default will prefer cuda if available.
    +        default: null
    +      - name: model_kwargs
    +        type: dict
    +        doc: Keyword arguments to pass regarding the loading of the model in HuggingFace's
    +          `pipeline` function.
    +        default: null
    +      - name: batch_size
    +        type: int
    +        doc: The number of batches to use in translation. The files are translated
    +          one by one, but the sentences can be batched.
    +        default: 1
    +      - name: translation_kwargs
    +        type: dict
    +        doc: Additional keyword arguments to pass to a `transformers.TranslationPipeline`
    +          when doing the translation inference. Notice the batch size here is being
    +          added automatically.
    +        default: null
    +      - name: verbose
    +        type: bool
    +        doc: 'Whether to present logs of a progress bar and errors. Default: False.'
    +        default: false
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[str, pd.DataFrame, dict]
    +      lineno: 135
    +      has_varargs: false
    +      has_kwargs: false
    +  description: Translate text files from one language to another
    +  default_handler: translate
    +  disable_auto_mount: false
    +  clone_target_dir: ''
    +  env: []
    +  priority_class_name: ''
    +  preemption_mode: prevent
    +  affinity: null
    +  tolerations: null
    +  security_context: {}
    +verbose: false

diff --git a/functions/master/translate/0.1.0/static/item.html b/functions/master/translate/0.1.0/static/item.html
new file mode 100644
index 00000000..2f9e9d57
--- /dev/null
+++ b/functions/master/translate/0.1.0/static/item.html
@@ -0,0 +1,55 @@
    +apiVersion: v1
    +categories:
    +- data-preparation
    +- huggingface
    +- machine-learning
    +- deep-learning
    +- NLP
    +description: Translate text files from one language to another
    +doc: ''
    +example: translate.ipynb
    +generationDate: 2023-12-05:17-20
    +hidden: false
    +icon: ''
    +labels:
    +  author: guyl
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.5.1
    +name: translate
    +platformVersion: 3.5.3
    +spec:
    +  filename: translate.py
    +  handler: translate
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - transformers
    +    - sentencepiece
    +    - torch
    +    - tqdm
    +url: ''
    +version: 0.1.0
    +test_valid: True

diff --git a/functions/master/translate/0.1.0/static/source.html b/functions/master/translate/0.1.0/static/source.html
new file mode 100644
index 00000000..d50569d4
--- /dev/null
+++ b/functions/master/translate/0.1.0/static/source.html
@@ -0,0 +1,418 @@
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import operator
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import transformers
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
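    +
    +# An illustrative sketch (not part of the original file) of what the helper above
    +# returns: (context, comm) inside an MLRun "mpijob", (context, None) in a regular
    +# MLRun job, and (None, None) when MLRun is not installed.
    +def _example_runtime_detection() -> str:
    +    context, comm = _check_mlrun_and_open_mpi()
    +    if comm is not None:
    +        return f"distributed run, OpenMPI world size {comm.Get_size()}"
    +    if context is not None:
    +        return "single-worker MLRun job"
    +    return "running outside MLRun"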
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    # Check if MLRun is available, set the global logger to MLRun's:
    +    if context:
    +        _LOGGER = context.logger
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, (str, pathlib.Path)):
    +                    input_argument = _get_text_files(
    +                        data_path=pathlib.Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
    +                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +            if rank == 0:
    +                # Join the outputs:
    +                context.logger.info("Collecting data from workers to root worker.")
    +                output_directory = output[0][0]
    +                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)
    +                errors_dictionary = reduce(
    +                    operator.ior, [err for _, _, err in output], {}
    +                )
    +                return output_directory, dataframe, errors_dictionary
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
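    +
    +# An illustrative sketch (not part of the original file) of the chunking arithmetic
    +# used by `wrapper` above: with 10 inputs and 4 workers, ranks 0-2 each take 2 items
    +# and the last rank takes the remaining 4.
    +def _example_chunk_bounds(rank: int, size: int, n_inputs: int) -> tuple:
    +    even_chunk_size = n_inputs // size
    +    chunk_start = rank * even_chunk_size
    +    chunk_end = (rank + 1) * even_chunk_size if rank + 1 < size else n_inputs
    +    return chunk_start, chunk_end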
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def translate(
    +    data_path: Union[str, List[str], pathlib.Path],
    +    output_directory: str,
    +    model_name: str = None,
    +    source_language: str = None,
    +    target_language: str = None,
    +    device: str = None,
    +    model_kwargs: dict = None,
    +    batch_size: int = 1,
    +    translation_kwargs: dict = None,
    +    verbose: bool = False,
    +) -> Tuple[str, pd.DataFrame, dict]:
    +    """
    +    Translate text files using a transformer model from Huggingface's hub according to the source and target languages
    +    given (or using the directly provided model name). The end result is a directory of translated text files and a
    +    dataframe containing the following columns:
    +
    +    * text_file - The text file path.
    +    * translation_file - The translation text file name in the output directory.
    +
    +    :param data_path:          A directory of text files or a single file or a list of files to translate.
    +    :param output_directory:   Directory where the translated files will be saved.
    +    :param model_name:         The name of a model to load. If None, the model name is constructed using the source and
    +                               target languages parameters.
    +    :param source_language:    The source language code (e.g., 'en' for English).
    +    :param target_language:    The target language code (e.g., 'en' for English).
    +    :param model_kwargs:       Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline`
    +                               function.
    +    :param device:             The device index for transformers. Default will prefer cuda if available.
    +    :param batch_size:         The number of batches to use in translation. The files are translated one by one, but the
    +                               sentences can be batched.
    +    :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing
    +                               the translation inference. Notice the batch size here is being added automatically.
    +    :param verbose:            Whether to present logs of a progress bar and errors. Default: False.
    +
    +    :returns: A tuple of:
    +
    +              * Path to the output directory.
    +              * A dataframe dataset of the translated file names.
    +              * A dictionary of errored files that were not translated.
    +    """
    +    global _LOGGER
    +
    +    # Get the input text files to translate:
    +    if verbose:
    +        _LOGGER.info("Collecting text files.")
    +    if isinstance(data_path, str):
    +        data_path = pathlib.Path(data_path).absolute()
    +        text_files = _get_text_files(data_path=data_path)
    +    else:
    +        text_files = data_path
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(text_files)} text files.")
    +
    +    # Get the translation pipeline:
    +    if verbose:
    +        _LOGGER.info(f"Loading model - using device '{device}'.")
    +    translation_pipeline, model_name = _get_translation_pipeline(
    +        model_name=model_name,
    +        source_language=source_language,
    +        target_language=target_language,
    +        device=device,
    +        model_kwargs=model_kwargs,
    +        batch_size=batch_size if batch_size != 1 else None,
    +    )
    +    if verbose:
    +        _LOGGER.info(f"Model '{model_name}' was loaded successfully.")
    +
    +    # Prepare the successes dataframe and errors dictionary to be returned:
    +    successes = []
    +    errors = {}
    +
    +    # Create the output directory:
    +    output_directory = pathlib.Path(output_directory)
    +    output_directory.mkdir(parents=True, exist_ok=True)
    +
    +    # Prepare the translation keyword arguments:
    +    translation_kwargs = translation_kwargs or {}
    +
    +    # Go over the text files and translate:
    +    for text_file in tqdm(
    +        text_files, desc="Translating", unit="file", disable=not verbose
    +    ):
    +        try:
    +            # Translate:
    +            translation = _translate(
    +                text_file=text_file,
    +                translation_pipeline=translation_pipeline,
    +                translation_kwargs=translation_kwargs,
    +            )
    +            # Write the translation to file:
    +            translation_file = _save_to_file(
    +                translation=translation,
    +                file_name=text_file.stem,
    +                output_directory=output_directory,
    +            )
    +            # Note as a success in the list:
    +            successes.append(
    +                [
    +                    text_file.name,
    +                    translation_file.name,
    +                ]
    +            )
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            if verbose:
    +                _LOGGER.warning(f"Error in file: '{text_file.name}'")
    +            errors[str(text_file.name)] = str(exception)
    +            continue
    +
    +    # Construct the translations dataframe:
    +    columns = [
    +        "text_file",
    +        "translation_file",
    +    ]
    +    successes = pd.DataFrame(
    +        successes,
    +        columns=columns,
    +    )
    +
    +    # Print the head of the produced dataframe and return:
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(text_files)})\n"
    +            f"Translations summary:\n"
    +            f"{successes.head()}"
    +        )
    +    return str(output_directory), successes, errors
    +
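    +# A usage sketch (not part of the original file), assuming the function is published
    +# to the MLRun hub under the name `translate`; parameter values are placeholders and
    +# `local=True` runs the handler in the current environment instead of a remote pod.
    +def _example_run_from_hub():
    +    import mlrun
    +
    +    translate_fn = mlrun.import_function("hub://translate")
    +    return translate_fn.run(
    +        handler="translate",
    +        params={
    +            "data_path": "./docs",
    +            "output_directory": "./translated",
    +            "source_language": "en",
    +            "target_language": "de",
    +        },
    +        local=True,
    +    )
    +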
    +
    +def _get_text_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        text_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        text_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return text_files
    +
    +
    +def _get_translation_pipeline(
    +    model_name: str = None,
    +    source_language: str = None,
    +    target_language: str = None,
    +    device: str = None,
    +    model_kwargs: dict = None,
    +    batch_size: int = None,
    +) -> Tuple[transformers.Pipeline, str]:
    +    # Construct the model name: if a model name is provided (not None) it is used as is, otherwise both the source
    +    # and target languages must have been provided so a Helsinki-NLP model name can be constructed:
    +    if model_name is None and (source_language is None or target_language is None):
    +        raise ValueError(
    +            "No model name were given and missing source and / or target languages. In order to translate you must "
    +            "pass a `model_name` or both `source_language` and `target_language`."
    +        )
    +    elif model_name is None:
    +        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
    +
    +    # Initialize the translation pipeline:
    +    try:
    +        translation_pipeline = transformers.pipeline(
    +            task="translation",
    +            model=model_name,
    +            tokenizer=model_name,
    +            device=device,
    +            model_kwargs=model_kwargs,
    +            batch_size=batch_size,
    +        )
    +    except OSError as load_exception:
    +        if (
    +            "is not a valid model identifier listed on 'https://huggingface.co/models'"
    +            in str(load_exception)
    +            and source_language
    +        ):
    +            raise ValueError(
    +                f"The model '{model_name}' is not a valid model identifier. "
    +                f"The parameters `source_language` and `target_language` are used to construct a Helsinki-NLP "
    +                f"translation model name, but no model exists for the given language pair. "
    +                f"You can look up language identifiers at "
    +                f"https://developers.google.com/admin-sdk/directory/v1/languages; if the error persists, one or "
    +                f"more of the language codes may require its 3-letter form, which can be found online. "
    +                f"Remember, you can always choose a model directly from the Huggingface hub by using the "
    +                f"`model_name` parameter."
    +            ) from load_exception
    +        raise load_exception
    +
    +    return translation_pipeline, model_name
    +
    +
    +def _translate(
    +    text_file: pathlib.Path,
    +    translation_pipeline: transformers.Pipeline,
    +    translation_kwargs: dict,
    +) -> str:
    +    # Read the text from file:
    +    with open(text_file, "r") as fp:
    +        text = fp.read()
    +
    +    # Split into paragraphs and each paragraph into sentences:
    +    paragraphs = [paragraph.split(".") for paragraph in text.split("\n")]
    +
    +    # Discover the newline indexes to restore the file to its structure post translation:
    +    newlines_indexes = []
    +    for paragraph in paragraphs[:-1]:
    +        if len(newlines_indexes) == 0:
    +            newlines_indexes.append(len(paragraph) - 1)
    +        else:
    +            newlines_indexes.append(newlines_indexes[-1] + len(paragraph))
    +
    +    # Prepare the batches (each sentence from the paragraphs). Note: a dot is added not only to restore the sentence
    +    # structure but also to avoid empty strings, which would break the translation:
    +    sentences = [f"{line}." for paragraph in paragraphs for line in paragraph]
    +
    +    # Translate the sentences:
    +    translations = translation_pipeline(sentences, **translation_kwargs)
    +
    +    # Restructure the full text from the sentences:
    +    translated_text = []
    +    newline_index = newlines_indexes.pop(0) if newlines_indexes else None
    +    for i, translation in enumerate(translations):
    +        # Get the translation:
    +        text = translation["translation_text"]
    +        # Check whether it was originally an empty sentence:
    +        if text == ".":
    +            text = ""
    +        # Check whether a newline needs to be inserted here:
    +        if newline_index is not None and newline_index == i:
    +            text += "\n"
    +            newline_index = newlines_indexes.pop(0) if newlines_indexes else None
    +        # Collect it:
    +        translated_text.append(text)
    +    translated_text = "".join(translated_text)
    +
    +    return translated_text
    +
    +
    +def _save_to_file(
    +    translation: str, file_name: str, output_directory: pathlib.Path
    +) -> pathlib.Path:
    +    # Prepare the full file path (avoiding name collisions):
    +    translation_file = output_directory / f"{file_name}.txt"
    +    i = 1
    +    while translation_file.exists():
    +        i += 1
    +        translation_file = output_directory / f"{file_name}_{i}.txt"
    +
    +    # Make sure all directories are created:
    +    translation_file.parent.mkdir(exist_ok=True, parents=True)
    +
    +    # Write to file:
    +    with open(translation_file, "w") as fp:
    +        fp.write(translation)
    +
    +    return translation_file
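    +
    +
    +# An illustrative sketch (not part of the original file) of the collision handling
    +# above: repeated saves with the same stem yield "<name>.txt", "<name>_2.txt",
    +# "<name>_3.txt", and so on inside the given output directory.
    +def _example_output_names(output_directory: pathlib.Path) -> list:
    +    return [
    +        _save_to_file(
    +            translation="hola", file_name="greeting", output_directory=output_directory
    +        ).name
    +        for _ in range(3)
    +    ]  # e.g. ["greeting.txt", "greeting_2.txt", "greeting_3.txt"]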

diff --git a/functions/master/translate/0.1.0/static/translate.html b/functions/master/translate/0.1.0/static/translate.html
new file mode 100644
index 00000000..95d99310
--- /dev/null
+++ b/functions/master/translate/0.1.0/static/translate.html
@@ -0,0 +1,536 @@
+Source code for translate.translate
diff --git a/functions/master/translate/latest/src/function.yaml b/functions/master/translate/latest/src/function.yaml
index 1a3fd7a8..bb165610 100644
--- a/functions/master/translate/latest/src/function.yaml
+++ b/functions/master/translate/latest/src/function.yaml
@@ -2,13 +2,16 @@ kind: job
 metadata:
   name: translate
   tag: ''
-  hash: bc26313449cd13554a18106ed9893535fb79dd6e
+  hash: 7eedf684bcebfbfd964e5503afbb56335c8f4097
   project: ''
   labels:
     author: guyl
   categories:
   - data-preparation
+  - huggingface
   - machine-learning
+  - deep-learning
+  - NLP
 spec:
   command: ''
   args: []
@@ -34,24 +37,27 @@ spec:
       - name: root_worker_inputs
         type: Dict[str, Any]
         default: null
-      outputs:
-      - default: ''
+      outputs: []
       lineno: 56
+      has_varargs: false
+      has_kwargs: false
     decorator:
       name: decorator
       doc: ''
       parameters:
       - name: handler
-      outputs:
-      - default: ''
+      outputs: []
       lineno: 68
+      has_varargs: false
+      has_kwargs: false
     wrapper:
       name: wrapper
       doc: ''
       parameters: []
-      outputs:
-      - default: ''
+      outputs: []
       lineno: 73
+      has_varargs: false
+      has_kwargs: true
     translate:
       name: translate
       doc: 'Translate text files using a transformer model from Huggingface''s hub
@@ -112,8 +118,10 @@ spec:
         default: false
       outputs:
       - doc: 'A tuple of:'
-        default: ''
+        type: Tuple[str, pd.DataFrame, dict]
       lineno: 135
+      has_varargs: false
+      has_kwargs: false
   description: Translate text files from one language to another
   default_handler: translate
   disable_auto_mount: false
diff --git a/functions/master/translate/latest/src/item.yaml b/functions/master/translate/latest/src/item.yaml
index f85a5599..e6394734 100644
--- a/functions/master/translate/latest/src/item.yaml
+++ b/functions/master/translate/latest/src/item.yaml
@@ -1,6 +1,7 @@
 apiVersion: v1
 categories:
 - data-preparation
+- huggingface
 - machine-learning
 - deep-learning
 - NLP
@@ -28,5 +29,5 @@ spec:
     - torch
     - tqdm
 url: ''
-version: 0.0.2
+version: 0.1.0
 test_valid: True
diff --git a/functions/master/translate/latest/static/function.html b/functions/master/translate/latest/static/function.html
index a75a6baf..9103bffc 100644
--- a/functions/master/translate/latest/static/function.html
+++ b/functions/master/translate/latest/static/function.html
@@ -19,13 +19,16 @@
 metadata:
   name: translate
   tag: ''
-  hash: bc26313449cd13554a18106ed9893535fb79dd6e
+  hash: 7eedf684bcebfbfd964e5503afbb56335c8f4097
   project: ''
   labels:
     author: guyl
   categories:
   - data-preparation
+  - huggingface
   - machine-learning
+  - deep-learning
+  - NLP
 spec:
   command: ''
   args: []
@@ -51,24 +54,27 @@
       - name: root_worker_inputs
         type: Dict[str, Any]
         default: null
-      outputs:
-      - default: ''
+      outputs: []
       lineno: 56
+      has_varargs: false
+      has_kwargs: false
     decorator:
       name: decorator
       doc: ''
       parameters:
       - name: handler
-      outputs:
-      - default: ''
+      outputs: []
       lineno: 68
+      has_varargs: false
+      has_kwargs: false
     wrapper:
       name: wrapper
       doc: ''
       parameters: []
-      outputs:
-      - default: ''
+      outputs: []
       lineno: 73
+      has_varargs: false
+      has_kwargs: true
     translate:
       name: translate
       doc: 'Translate text files using a transformer model from Huggingface''s hub
@@ -129,8 +135,10 @@
         default: false
       outputs:
       - doc: 'A tuple of:'
-        default: ''
+        type: Tuple[str, pd.DataFrame, dict]
       lineno: 135
+      has_varargs: false
+      has_kwargs: false
   description: Translate text files from one language to another
   default_handler: translate
   disable_auto_mount: false
diff --git a/functions/master/translate/latest/static/item.html b/functions/master/translate/latest/static/item.html
index eff27700..2f9e9d57
--- a/functions/master/translate/latest/static/item.html
+++ b/functions/master/translate/latest/static/item.html
@@ -18,6 +18,7 @@
 apiVersion: v1
 categories:
 - data-preparation
+- huggingface
 - machine-learning
 - deep-learning
 - NLP
@@ -45,7 +46,7 @@
     - torch
     - tqdm
 url: ''
-version: 0.0.2
+version: 0.1.0
 test_valid: True