Commit

Finish Annotated Type lecture, update schemas
AaronWChen committed Sep 12, 2024
1 parent 5dbc95f commit 1296b24
Showing 3 changed files with 91 additions and 82 deletions.
17 changes: 12 additions & 5 deletions main_example.py
@@ -1,5 +1,6 @@
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, Path, Query
from schemas_example import GenreURLChoices, BandBase, BandCreate, BandWithID
from typing import Annotated

# set the --port argument; can't use 8000, the uvicorn default
# use localhost:{port} in browser
@@ -24,20 +25,26 @@

@app.get("/bands")
async def bands(
genre: GenreURLChoices | None = None, has_albums: bool = False
genre: GenreURLChoices | None = None,
# has_albums: bool = False,
name_query: Annotated[str | None, Query(max_length=10)] = None,
) -> list[BandWithID]:
band_list = [BandWithID(**b) for b in BANDS]

if genre:
band_list = [b for b in band_list if b.genre.value.lower() == genre.value]

if has_albums:
band_list = [b for b in band_list if len(b.albums) > 0]
# if has_albums:
# band_list = [b for b in band_list if len(b.albums) > 0]

if name_query:
band_list = [b for b in band_list if name_query.lower() in b.name.lower()]

return band_list


@app.get("/bands/{band_id}")
async def band(band_id: int) -> BandWithID:
async def band(band_id: Annotated[int, Path(title="The band ID")]) -> BandWithID:
band = next((BandWithID(**b) for b in BANDS if b["id"] == band_id), None)
# Aaron: I'm a little confused, could we use `get` instead?

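For reference, here is a minimal sketch (not part of this commit) of what the new Annotated parameters enforce, assuming main_example.py is importable and BANDS is populated as in the repo:

# quick check of the Annotated Query/Path validation above
from fastapi.testclient import TestClient

from main_example import app

client = TestClient(app)

# Query(max_length=10): a short name_query passes validation (200, possibly an empty list)...
assert client.get("/bands", params={"name_query": "metal"}).status_code == 200

# ...while anything longer than 10 characters is rejected with a 422
assert client.get("/bands", params={"name_query": "x" * 11}).status_code == 422

# Path(title="The band ID") still coerces the path segment to int,
# so a non-integer ID is a 422 rather than a failed lookup
assert client.get("/bands/not-an-int").status_code == 422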
154 changes: 78 additions & 76 deletions nbs/16_notebook_refactor.ipynb
@@ -31,20 +31,21 @@
"import joblib\n",
"import mlflow\n",
"from mlflow.models import infer_signature\n",
"import nbdev #; nbdev.nbdev_export()\n",
"import nbdev # ; nbdev.nbdev_export()\n",
"from nbdev.showdoc import *\n",
"import pandas as pd\n",
"import re\n",
"from sklearn.feature_extraction.text import (\n",
" CountVectorizer\n",
" , TfidfTransformer\n",
" , TfidfVectorizer\n",
" , \n",
" CountVectorizer,\n",
" TfidfTransformer,\n",
" TfidfVectorizer,\n",
")\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer\n",
"from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import CustomSKLearnWrapper\n",
"from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import (\n",
" CustomSKLearnWrapper,\n",
")\n",
"import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp\n",
"import stanza\n",
"from tqdm import tqdm"
@@ -77,19 +78,18 @@
}
],
"source": [
"#@markdown Enter the username of your DAGsHub account:\n",
"DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n",
"# @markdown Enter the username of your DAGsHub account:\n",
"DAGSHUB_USER_NAME = \"AaronWChen\" # @param {type:\"string\"}\n",
"\n",
"#@markdown Enter the email for your DAGsHub account:\n",
"DAGSHUB_EMAIL = \"[email protected]\" #@param {type:\"string\"}\n",
"# @markdown Enter the email for your DAGsHub account:\n",
"DAGSHUB_EMAIL = \"[email protected]\" # @param {type:\"string\"}\n",
"\n",
"#@markdown Enter the repo name \n",
"# @markdown Enter the repo name\n",
"DAGSHUB_REPO_NAME = \"MeaLeon\"\n",
"\n",
"#@markdown Enter the name of the branch you are working on \n",
"# @markdown Enter the name of the branch you are working on\n",
"BRANCH = \"init_mealeon_to_notebook_refactor\"\n",
"dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
" , repo_owner=DAGSHUB_USER_NAME)"
"dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)"
]
},
{
@@ -187,7 +187,7 @@
"outputs": [],
"source": [
"# ETL work (currently, data cleaning/prep)\n",
"# how the prep works is via dataframe_preprocessor \n",
"# how the prep works is via dataframe_preprocessor\n",
"cleaned_df = rdpp.preprocess_dataframe(raw_df)\n",
"cleaned_df.to_parquet(\"../data/processed/cleaned_df.parquet.gzip\", compression=\"gzip\")"
]
@@ -260,14 +260,14 @@
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"# | export\n",
"# this is a custom function to be used with MLflow to get or create experiments (is from the MLflow team)\n",
"def get_mlflow_experiment_id(name):\n",
" # this function allows us to get the experiment ID from an experiment name\n",
" exp = mlflow.get_experiment_by_name(name)\n",
" if exp is None:\n",
" exp_id = mlflow.create_experiment(name)\n",
" return exp_id\n",
" exp_id = mlflow.create_experiment(name)\n",
" return exp_id\n",
" return exp.experiment_id"
]
},
@@ -284,15 +284,17 @@
"metadata": {},
"outputs": [],
"source": [
"mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
"mlflow.set_tracking_uri(f\"https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow\")\n",
"\n",
"# starter idea for making an experiment name, can be the git branch, but need more specificity\n",
"experiment_name = f\"{DAGSHUB_EMAIL}/DVC-MLflow-integration-test\"\n",
"mlflow_exp_id = get_mlflow_experiment_id(experiment_name)\n",
"\n",
"# define processed data location and data to be added to DVC\n",
"processed_data_base = \"../data/processed\"\n",
"transformed_recipes_parquet_path = processed_data_base + \"/transformed_recipes.parquet.gzip\"\n",
"transformed_recipes_parquet_path = (\n",
" processed_data_base + \"/transformed_recipes.parquet.gzip\"\n",
")\n",
"combined_df_path = processed_data_base + \"/combined_df.parquet.gzip\"\n",
"\n",
"\n",
@@ -305,13 +307,13 @@
"# transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n",
"combined_df_sample_path = model_directory + \"/combined_df_sample.parquet\"\n",
"\n",
"artifacts = {'sklearn_model': sklearn_model_path,\n",
" 'sklearn_transformer': sklearn_transformer_path,\n",
" # 'transformed_recipes': transformed_recipes_path,\n",
" # 'combined_data': combined_df_path,\n",
" 'combined_data_sample': combined_df_sample_path\n",
" }\n",
"\n"
"artifacts = {\n",
" \"sklearn_model\": sklearn_model_path,\n",
" \"sklearn_transformer\": sklearn_transformer_path,\n",
" # 'transformed_recipes': transformed_recipes_path,\n",
" # 'combined_data': combined_df_path,\n",
" \"combined_data_sample\": combined_df_sample_path,\n",
"}"
]
},
{
@@ -811,105 +813,105 @@
"source": [
"# load from MLflow\n",
"mlflow_client = mlflow.tracking.MlflowClient(\n",
" tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
" tracking_uri=f\"https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow\"\n",
")\n",
"\n",
"# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
"sklearn_transformer_params = { \n",
" 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n",
"sklearn_transformer_params = {\n",
" \"analyzer\": CustomSKLearnAnalyzer().ngram_maker(\n",
" min_ngram_length=1,\n",
" max_ngram_length=4,\n",
" ),\n",
" 'min_df':3,\n",
" 'binary':False\n",
" ),\n",
" \"min_df\": 3,\n",
" \"binary\": False,\n",
"}\n",
"\n",
"# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n",
"pipeline_params = {\n",
" 'stanza_model': 'en',\n",
" 'sklearn-transformer': 'TFIDF'\n",
"}\n",
"pipeline_params = {\"stanza_model\": \"en\", \"sklearn-transformer\": \"TFIDF\"}\n",
"\n",
"# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n",
"pipeline_params.update(sklearn_transformer_params)\n",
"\n",
"with mlflow.start_run(experiment_id=mlflow_exp_id): \n",
"with mlflow.start_run(experiment_id=mlflow_exp_id):\n",
" # LOG PARAMETERS\n",
" mlflow.log_params(pipeline_params)\n",
"\n",
" # LOG INPUTS (QUERIES) AND OUTPUTS\n",
" # MLflow example uses a list of strings or a list of str->str dicts\n",
" # Will be useful in STAGING/Evaluation\n",
" \n",
"\n",
" # LOG MODEL\n",
" # Instantiate sklearn TFIDFVectorizer\n",
" sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n",
"\n",
" print('\\n')\n",
" print('-' * 80)\n",
" print('sklearn fit transform on ingredients:')\n",
" print(\"\\n\")\n",
" print(\"-\" * 80)\n",
" print(\"sklearn fit transform on ingredients:\")\n",
"\n",
" model_input = whole_nlp_df['ingredients_lemmafied']\n",
" model_input = whole_nlp_df[\"ingredients_lemmafied\"]\n",
"\n",
" print('\\n')\n",
" print('-' * 80)\n",
" print('Input Data: ')\n",
" print(\"\\n\")\n",
" print(\"-\" * 80)\n",
" print(\"Input Data: \")\n",
" print(model_input)\n",
"\n",
" print('\\n')\n",
" print('-' * 80)\n",
" print('Input Data Shape: ')\n",
" print(\"\\n\")\n",
" print(\"-\" * 80)\n",
" print(\"Input Data Shape: \")\n",
" print(model_input.shape)\n",
"\n",
" random_sample = model_input.sample(3, random_state=200)\n",
"\n",
" print('\\n')\n",
" print('-' * 80)\n",
" print('Random 3 Records from Input Data: ')\n",
" print(\"\\n\")\n",
" print(\"-\" * 80)\n",
" print(\"Random 3 Records from Input Data: \")\n",
" print(random_sample)\n",
"\n",
" # Do fit transform on data\n",
" response = sklearn_transformer.fit_transform(tqdm(model_input)) \n",
" \n",
" response = sklearn_transformer.fit_transform(tqdm(model_input))\n",
"\n",
" transformed_recipe = pd.DataFrame(\n",
" response.toarray(),\n",
" columns=sklearn_transformer.get_feature_names_out(),\n",
" index=model_input.index\n",
" response.toarray(),\n",
" columns=sklearn_transformer.get_feature_names_out(),\n",
" index=model_input.index,\n",
" )\n",
"\n",
" signature = infer_signature(model_input=model_input,\n",
" model_output=transformed_recipe\n",
" )\n",
" signature = infer_signature(\n",
" model_input=model_input, model_output=transformed_recipe\n",
" )\n",
"\n",
" print('\\n')\n",
" print('-' * 80)\n",
" print('Transformed Data:')\n",
" print(\"\\n\")\n",
" print(\"-\" * 80)\n",
" print(\"Transformed Data:\")\n",
" print(transformed_recipe.head())\n",
" \n",
" combined_df = transformed_recipe.join(model_input, how='inner')\n",
" combined_df_sample = transformed_recipe.join(random_sample, how='inner')\n",
"\n",
" print('\\n')\n",
" print('-' * 80)\n",
" print('Random Sample of Combined Data:')\n",
" combined_df = transformed_recipe.join(model_input, how=\"inner\")\n",
" combined_df_sample = transformed_recipe.join(random_sample, how=\"inner\")\n",
"\n",
" print(\"\\n\")\n",
" print(\"-\" * 80)\n",
" print(\"Random Sample of Combined Data:\")\n",
" print(combined_df_sample.head())\n",
"\n",
" with open(sklearn_transformer_path, \"wb\") as fo:\n",
" pickle.dump(sklearn_transformer, fo)\n",
"\n",
" transformed_recipe.to_parquet(path=transformed_recipes_parquet_path, compression=\"gzip\")\n",
" transformed_recipe.to_parquet(\n",
" path=transformed_recipes_parquet_path, compression=\"gzip\"\n",
" )\n",
"\n",
" combined_df.to_parquet(path=combined_df_path, compression=\"gzip\")\n",
" \n",
"\n",
" combined_df_sample.to_parquet(path=combined_df_sample_path)\n",
"\n",
" model_info = mlflow.pyfunc.log_model( \n",
" model_info = mlflow.pyfunc.log_model(\n",
" code_path=[\"../src/backend/\"],\n",
" python_model=CustomSKLearnWrapper(),\n",
" input_example=whole_nlp_df['ingredients_lemmafied'][0],\n",
" signature=signature, \n",
" input_example=whole_nlp_df[\"ingredients_lemmafied\"][0],\n",
" signature=signature,\n",
" artifact_path=\"sklearn_model\",\n",
" artifacts=artifacts\n",
" ) \n",
" artifacts=artifacts,\n",
" )\n",
"\n",
" # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel"
]
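To make the closing comment concrete: because the fitted TF-IDF transformer depends on a custom Stanza analyzer, it is wrapped in an mlflow.pyfunc.PythonModel instead of being logged with the built-in sklearn flavor. A rough sketch of such a wrapper (an illustration only, not the actual CustomSKLearnWrapper from src.backend):

import pickle

import mlflow


class SKLearnTransformerWrapper(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        # runs when the logged model is loaded back; the key matches the
        # "sklearn_transformer" entry in the artifacts dict defined earlier
        with open(context.artifacts["sklearn_transformer"], "rb") as f:
            self.transformer = pickle.load(f)

    def predict(self, context, model_input):
        # model_input is expected to be the lemmatized ingredient strings
        return self.transformer.transform(model_input)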
2 changes: 1 addition & 1 deletion schemas.py
@@ -183,7 +183,7 @@ class Edamam_API_Response(BaseModel):


# QUERY
class RecipeQuery(BaseModel):
class RecipeQuery(BaseModel): # maybe RecipeSearchString?
recipe_title: str


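For context, RecipeQuery is a Pydantic request-body model with a single recipe_title field; a minimal sketch of how it would be consumed (the route and handler below are hypothetical, not defined in this commit):

from fastapi import FastAPI
from schemas import RecipeQuery

app = FastAPI()


@app.post("/recipes/search")  # hypothetical endpoint for illustration
async def search_recipes(query: RecipeQuery) -> list[str]:
    # FastAPI validates the JSON body, e.g. {"recipe_title": "pad thai"},
    # against the Pydantic model before the handler runs
    return [query.recipe_title]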
