Commit 1296b24: Finish Annotated Type lecture, update schemas
1 parent: 5dbc95f
Showing 3 changed files with 91 additions and 82 deletions.
@@ -31,20 +31,21 @@ | |
"import joblib\n", | ||
"import mlflow\n", | ||
"from mlflow.models import infer_signature\n", | ||
"import nbdev #; nbdev.nbdev_export()\n", | ||
"import nbdev # ; nbdev.nbdev_export()\n", | ||
"from nbdev.showdoc import *\n", | ||
"import pandas as pd\n", | ||
"import re\n", | ||
"from sklearn.feature_extraction.text import (\n", | ||
" CountVectorizer\n", | ||
" , TfidfTransformer\n", | ||
" , TfidfVectorizer\n", | ||
" , \n", | ||
" CountVectorizer,\n", | ||
" TfidfTransformer,\n", | ||
" TfidfVectorizer,\n", | ||
")\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.pipeline import make_pipeline\n", | ||
"from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer\n", | ||
"from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import CustomSKLearnWrapper\n", | ||
"from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import (\n", | ||
" CustomSKLearnWrapper,\n", | ||
")\n", | ||
"import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp\n", | ||
"import stanza\n", | ||
"from tqdm import tqdm" | ||
@@ -77,19 +78,18 @@ | |
} | ||
], | ||
"source": [ | ||
"#@markdown Enter the username of your DAGsHub account:\n", | ||
"DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n", | ||
"# @markdown Enter the username of your DAGsHub account:\n", | ||
"DAGSHUB_USER_NAME = \"AaronWChen\" # @param {type:\"string\"}\n", | ||
"\n", | ||
"#@markdown Enter the email for your DAGsHub account:\n", | ||
"DAGSHUB_EMAIL = \"[email protected]\" #@param {type:\"string\"}\n", | ||
"# @markdown Enter the email for your DAGsHub account:\n", | ||
"DAGSHUB_EMAIL = \"[email protected]\" # @param {type:\"string\"}\n", | ||
"\n", | ||
"#@markdown Enter the repo name \n", | ||
"# @markdown Enter the repo name\n", | ||
"DAGSHUB_REPO_NAME = \"MeaLeon\"\n", | ||
"\n", | ||
"#@markdown Enter the name of the branch you are working on \n", | ||
"# @markdown Enter the name of the branch you are working on\n", | ||
"BRANCH = \"init_mealeon_to_notebook_refactor\"\n", | ||
"dagshub.init(repo_name=DAGSHUB_REPO_NAME\n", | ||
" , repo_owner=DAGSHUB_USER_NAME)" | ||
"dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)" | ||
] | ||
}, | ||
{ | ||
@@ -187,7 +187,7 @@ | |
"outputs": [], | ||
"source": [ | ||
"# ETL work (currently, data cleaning/prep)\n", | ||
"# how the prep works is via dataframe_preprocessor \n", | ||
"# how the prep works is via dataframe_preprocessor\n", | ||
"cleaned_df = rdpp.preprocess_dataframe(raw_df)\n", | ||
"cleaned_df.to_parquet(\"../data/processed/cleaned_df.parquet.gzip\", compression=\"gzip\")" | ||
] | ||
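The hunk above persists the cleaned frame as gzip-compressed Parquet. A minimal sanity-check sketch, not part of the commit (`cleaned_check` is a hypothetical name; `cleaned_df` comes from the cell above):

```python
import pandas as pd

# Read the cleaned frame back from the gzip-compressed Parquet written above
# and confirm the round trip preserved its shape (illustrative check only).
cleaned_check = pd.read_parquet("../data/processed/cleaned_df.parquet.gzip")
assert cleaned_check.shape == cleaned_df.shape
```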
@@ -260,14 +260,14 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#| export\n", | ||
"# | export\n", | ||
"# this is a custom function to be used with MLflow to get or create experiments (is from the MLflow team)\n", | ||
"def get_mlflow_experiment_id(name):\n", | ||
" # this function allows us to get the experiment ID from an experiment name\n", | ||
" exp = mlflow.get_experiment_by_name(name)\n", | ||
" if exp is None:\n", | ||
" exp_id = mlflow.create_experiment(name)\n", | ||
" return exp_id\n", | ||
" exp_id = mlflow.create_experiment(name)\n", | ||
" return exp_id\n", | ||
" return exp.experiment_id" | ||
] | ||
}, | ||
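With the indentation fixed, the helper creates the experiment only when it does not already exist and otherwise returns the existing ID. A minimal usage sketch (the experiment name and logged parameter are illustrative, assuming an MLflow tracking server is reachable):

```python
# Get or create the experiment, then start a run under it.
exp_id = get_mlflow_experiment_id("DVC-MLflow-integration-test")
with mlflow.start_run(experiment_id=exp_id):
    mlflow.log_param("demo_param", 1)
```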
@@ -284,15 +284,17 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", | ||
"mlflow.set_tracking_uri(f\"https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow\")\n", | ||
"\n", | ||
"# starter idea for making an experiment name, can be the git branch, but need more specificity\n", | ||
"experiment_name = f\"{DAGSHUB_EMAIL}/DVC-MLflow-integration-test\"\n", | ||
"mlflow_exp_id = get_mlflow_experiment_id(experiment_name)\n", | ||
"\n", | ||
"# define processed data location and data to be added to DVC\n", | ||
"processed_data_base = \"../data/processed\"\n", | ||
"transformed_recipes_parquet_path = processed_data_base + \"/transformed_recipes.parquet.gzip\"\n", | ||
"transformed_recipes_parquet_path = (\n", | ||
" processed_data_base + \"/transformed_recipes.parquet.gzip\"\n", | ||
")\n", | ||
"combined_df_path = processed_data_base + \"/combined_df.parquet.gzip\"\n", | ||
"\n", | ||
"\n", | ||
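The comment in this hunk suggests the experiment name could come from the git branch. A hedged sketch of that idea, not in the commit, assuming `git` is available on PATH and the repo has a checked-out branch:

```python
import subprocess

# Derive the experiment name from the current git branch (sketch only).
branch = subprocess.run(
    ["git", "rev-parse", "--abbrev-ref", "HEAD"],
    capture_output=True, text=True, check=True,
).stdout.strip()
experiment_name = f"{DAGSHUB_EMAIL}/{branch}"
mlflow_exp_id = get_mlflow_experiment_id(experiment_name)
```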
@@ -305,13 +307,13 @@ | |
"# transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n", | ||
"combined_df_sample_path = model_directory + \"/combined_df_sample.parquet\"\n", | ||
"\n", | ||
"artifacts = {'sklearn_model': sklearn_model_path,\n", | ||
" 'sklearn_transformer': sklearn_transformer_path,\n", | ||
" # 'transformed_recipes': transformed_recipes_path,\n", | ||
" # 'combined_data': combined_df_path,\n", | ||
" 'combined_data_sample': combined_df_sample_path\n", | ||
" }\n", | ||
"\n" | ||
"artifacts = {\n", | ||
" \"sklearn_model\": sklearn_model_path,\n", | ||
" \"sklearn_transformer\": sklearn_transformer_path,\n", | ||
" # 'transformed_recipes': transformed_recipes_path,\n", | ||
" # 'combined_data': combined_df_path,\n", | ||
" \"combined_data_sample\": combined_df_sample_path,\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
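The `artifacts` dict maps names to local paths that `mlflow.pyfunc.log_model` copies alongside the model. A minimal sketch of how a pyfunc wrapper typically consumes those keys; the project's actual `CustomSKLearnWrapper` lives in `src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py` and may differ:

```python
import pickle
import mlflow.pyfunc

class SketchWrapper(mlflow.pyfunc.PythonModel):
    # Illustrative stand-in, not the project's CustomSKLearnWrapper.
    def load_context(self, context):
        # MLflow exposes each logged artifact's local path under the same key.
        with open(context.artifacts["sklearn_transformer"], "rb") as f:
            self.transformer = pickle.load(f)

    def predict(self, context, model_input):
        # Re-apply the fitted transformer to new text.
        return self.transformer.transform(model_input)
```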
@@ -811,105 +813,105 @@ | |
"source": [ | ||
"# load from MLflow\n", | ||
"mlflow_client = mlflow.tracking.MlflowClient(\n", | ||
" tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", | ||
" tracking_uri=f\"https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow\"\n", | ||
")\n", | ||
"\n", | ||
"# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", | ||
"sklearn_transformer_params = { \n", | ||
" 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n", | ||
"sklearn_transformer_params = {\n", | ||
" \"analyzer\": CustomSKLearnAnalyzer().ngram_maker(\n", | ||
" min_ngram_length=1,\n", | ||
" max_ngram_length=4,\n", | ||
" ),\n", | ||
" 'min_df':3,\n", | ||
" 'binary':False\n", | ||
" ),\n", | ||
" \"min_df\": 3,\n", | ||
" \"binary\": False,\n", | ||
"}\n", | ||
"\n", | ||
"# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", | ||
"pipeline_params = {\n", | ||
" 'stanza_model': 'en',\n", | ||
" 'sklearn-transformer': 'TFIDF'\n", | ||
"}\n", | ||
"pipeline_params = {\"stanza_model\": \"en\", \"sklearn-transformer\": \"TFIDF\"}\n", | ||
"\n", | ||
"# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", | ||
"pipeline_params.update(sklearn_transformer_params)\n", | ||
"\n", | ||
"with mlflow.start_run(experiment_id=mlflow_exp_id): \n", | ||
"with mlflow.start_run(experiment_id=mlflow_exp_id):\n", | ||
" # LOG PARAMETERS\n", | ||
" mlflow.log_params(pipeline_params)\n", | ||
"\n", | ||
" # LOG INPUTS (QUERIES) AND OUTPUTS\n", | ||
" # MLflow example uses a list of strings or a list of str->str dicts\n", | ||
" # Will be useful in STAGING/Evaluation\n", | ||
" \n", | ||
"\n", | ||
" # LOG MODEL\n", | ||
" # Instantiate sklearn TFIDFVectorizer\n", | ||
" sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('sklearn fit transform on ingredients:')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"sklearn fit transform on ingredients:\")\n", | ||
"\n", | ||
" model_input = whole_nlp_df['ingredients_lemmafied']\n", | ||
" model_input = whole_nlp_df[\"ingredients_lemmafied\"]\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Input Data: ')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Input Data: \")\n", | ||
" print(model_input)\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Input Data Shape: ')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Input Data Shape: \")\n", | ||
" print(model_input.shape)\n", | ||
"\n", | ||
" random_sample = model_input.sample(3, random_state=200)\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Random 3 Records from Input Data: ')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Random 3 Records from Input Data: \")\n", | ||
" print(random_sample)\n", | ||
"\n", | ||
" # Do fit transform on data\n", | ||
" response = sklearn_transformer.fit_transform(tqdm(model_input)) \n", | ||
" \n", | ||
" response = sklearn_transformer.fit_transform(tqdm(model_input))\n", | ||
"\n", | ||
" transformed_recipe = pd.DataFrame(\n", | ||
" response.toarray(),\n", | ||
" columns=sklearn_transformer.get_feature_names_out(),\n", | ||
" index=model_input.index\n", | ||
" response.toarray(),\n", | ||
" columns=sklearn_transformer.get_feature_names_out(),\n", | ||
" index=model_input.index,\n", | ||
" )\n", | ||
"\n", | ||
" signature = infer_signature(model_input=model_input,\n", | ||
" model_output=transformed_recipe\n", | ||
" )\n", | ||
" signature = infer_signature(\n", | ||
" model_input=model_input, model_output=transformed_recipe\n", | ||
" )\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Transformed Data:')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Transformed Data:\")\n", | ||
" print(transformed_recipe.head())\n", | ||
" \n", | ||
" combined_df = transformed_recipe.join(model_input, how='inner')\n", | ||
" combined_df_sample = transformed_recipe.join(random_sample, how='inner')\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Random Sample of Combined Data:')\n", | ||
" combined_df = transformed_recipe.join(model_input, how=\"inner\")\n", | ||
" combined_df_sample = transformed_recipe.join(random_sample, how=\"inner\")\n", | ||
"\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Random Sample of Combined Data:\")\n", | ||
" print(combined_df_sample.head())\n", | ||
"\n", | ||
" with open(sklearn_transformer_path, \"wb\") as fo:\n", | ||
" pickle.dump(sklearn_transformer, fo)\n", | ||
"\n", | ||
" transformed_recipe.to_parquet(path=transformed_recipes_parquet_path, compression=\"gzip\")\n", | ||
" transformed_recipe.to_parquet(\n", | ||
" path=transformed_recipes_parquet_path, compression=\"gzip\"\n", | ||
" )\n", | ||
"\n", | ||
" combined_df.to_parquet(path=combined_df_path, compression=\"gzip\")\n", | ||
" \n", | ||
"\n", | ||
" combined_df_sample.to_parquet(path=combined_df_sample_path)\n", | ||
"\n", | ||
" model_info = mlflow.pyfunc.log_model( \n", | ||
" model_info = mlflow.pyfunc.log_model(\n", | ||
" code_path=[\"../src/backend/\"],\n", | ||
" python_model=CustomSKLearnWrapper(),\n", | ||
" input_example=whole_nlp_df['ingredients_lemmafied'][0],\n", | ||
" signature=signature, \n", | ||
" input_example=whole_nlp_df[\"ingredients_lemmafied\"][0],\n", | ||
" signature=signature,\n", | ||
" artifact_path=\"sklearn_model\",\n", | ||
" artifacts=artifacts\n", | ||
" ) \n", | ||
" artifacts=artifacts,\n", | ||
" )\n", | ||
"\n", | ||
" # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel" | ||
] | ||
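Once `log_model` returns, the logged pyfunc model can be loaded back by URI. A hedged sketch, not part of the commit; the example input is illustrative and assumes the wrapper accepts a pandas Series of ingredient text and returns a feature matrix:

```python
# Load the just-logged model and embed a new document (illustrative input).
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
new_docs = pd.Series(["2 cups flour, 1 tsp kosher salt, 3 large eggs"])
embeddings = loaded_model.predict(new_docs)
print(embeddings.shape)
```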