Commit 1296b24: Finish Annotated Type lecture, update schemas
1 parent: 5dbc95f
Showing 3 changed files with 91 additions and 82 deletions.
@@ -31,20 +31,21 @@ | |
"import joblib\n", | ||
"import mlflow\n", | ||
"from mlflow.models import infer_signature\n", | ||
"import nbdev #; nbdev.nbdev_export()\n", | ||
"import nbdev # ; nbdev.nbdev_export()\n", | ||
"from nbdev.showdoc import *\n", | ||
"import pandas as pd\n", | ||
"import re\n", | ||
"from sklearn.feature_extraction.text import (\n", | ||
" CountVectorizer\n", | ||
" , TfidfTransformer\n", | ||
" , TfidfVectorizer\n", | ||
" , \n", | ||
" CountVectorizer,\n", | ||
" TfidfTransformer,\n", | ||
" TfidfVectorizer,\n", | ||
")\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.pipeline import make_pipeline\n", | ||
"from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer\n", | ||
"from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import CustomSKLearnWrapper\n", | ||
"from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import (\n", | ||
" CustomSKLearnWrapper,\n", | ||
")\n", | ||
"import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp\n", | ||
"import stanza\n", | ||
"from tqdm import tqdm" | ||
@@ -77,19 +78,18 @@ | |
} | ||
], | ||
"source": [ | ||
"#@markdown Enter the username of your DAGsHub account:\n", | ||
"DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n", | ||
"# @markdown Enter the username of your DAGsHub account:\n", | ||
"DAGSHUB_USER_NAME = \"AaronWChen\" # @param {type:\"string\"}\n", | ||
"\n", | ||
"#@markdown Enter the email for your DAGsHub account:\n", | ||
"DAGSHUB_EMAIL = \"[email protected]\" #@param {type:\"string\"}\n", | ||
"# @markdown Enter the email for your DAGsHub account:\n", | ||
"DAGSHUB_EMAIL = \"[email protected]\" # @param {type:\"string\"}\n", | ||
"\n", | ||
"#@markdown Enter the repo name \n", | ||
"# @markdown Enter the repo name\n", | ||
"DAGSHUB_REPO_NAME = \"MeaLeon\"\n", | ||
"\n", | ||
"#@markdown Enter the name of the branch you are working on \n", | ||
"# @markdown Enter the name of the branch you are working on\n", | ||
"BRANCH = \"init_mealeon_to_notebook_refactor\"\n", | ||
"dagshub.init(repo_name=DAGSHUB_REPO_NAME\n", | ||
" , repo_owner=DAGSHUB_USER_NAME)" | ||
"dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)" | ||
] | ||
}, | ||
{ | ||
@@ -187,7 +187,7 @@ | |
"outputs": [], | ||
"source": [ | ||
"# ETL work (currently, data cleaning/prep)\n", | ||
"# how the prep works is via dataframe_preprocessor \n", | ||
"# how the prep works is via dataframe_preprocessor\n", | ||
"cleaned_df = rdpp.preprocess_dataframe(raw_df)\n", | ||
"cleaned_df.to_parquet(\"../data/processed/cleaned_df.parquet.gzip\", compression=\"gzip\")" | ||
] | ||
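The hunk above persists the cleaned frame as gzip-compressed Parquet. A minimal sanity-check sketch, not part of the commit (`cleaned_check` is a hypothetical name; `cleaned_df` comes from the cell above):

```python
import pandas as pd

# Read the cleaned frame back from the gzip-compressed Parquet written above
# and confirm the round trip preserved its shape (illustrative check only).
cleaned_check = pd.read_parquet("../data/processed/cleaned_df.parquet.gzip")
assert cleaned_check.shape == cleaned_df.shape
```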
@@ -260,14 +260,14 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#| export\n", | ||
"# | export\n", | ||
"# this is a custom function to be used with MLflow to get or create experiments (is from the MLflow team)\n", | ||
"def get_mlflow_experiment_id(name):\n", | ||
" # this function allows us to get the experiment ID from an experiment name\n", | ||
" exp = mlflow.get_experiment_by_name(name)\n", | ||
" if exp is None:\n", | ||
" exp_id = mlflow.create_experiment(name)\n", | ||
" return exp_id\n", | ||
" exp_id = mlflow.create_experiment(name)\n", | ||
" return exp_id\n", | ||
" return exp.experiment_id" | ||
] | ||
}, | ||
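With the indentation fixed, the helper creates the experiment only when it does not already exist and otherwise returns the existing ID. A minimal usage sketch (the experiment name and logged parameter are illustrative, assuming an MLflow tracking server is reachable):

```python
# Get or create the experiment, then start a run under it.
exp_id = get_mlflow_experiment_id("DVC-MLflow-integration-test")
with mlflow.start_run(experiment_id=exp_id):
    mlflow.log_param("demo_param", 1)
```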
@@ -284,15 +284,17 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", | ||
"mlflow.set_tracking_uri(f\"https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow\")\n", | ||
"\n", | ||
"# starter idea for making an experiment name, can be the git branch, but need more specificity\n", | ||
"experiment_name = f\"{DAGSHUB_EMAIL}/DVC-MLflow-integration-test\"\n", | ||
"mlflow_exp_id = get_mlflow_experiment_id(experiment_name)\n", | ||
"\n", | ||
"# define processed data location and data to be added to DVC\n", | ||
"processed_data_base = \"../data/processed\"\n", | ||
"transformed_recipes_parquet_path = processed_data_base + \"/transformed_recipes.parquet.gzip\"\n", | ||
"transformed_recipes_parquet_path = (\n", | ||
" processed_data_base + \"/transformed_recipes.parquet.gzip\"\n", | ||
")\n", | ||
"combined_df_path = processed_data_base + \"/combined_df.parquet.gzip\"\n", | ||
"\n", | ||
"\n", | ||
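The comment in this hunk suggests the experiment name could come from the git branch. A hedged sketch of that idea, not in the commit, assuming `git` is available on PATH and the repo has a checked-out branch:

```python
import subprocess

# Derive the experiment name from the current git branch (sketch only).
branch = subprocess.run(
    ["git", "rev-parse", "--abbrev-ref", "HEAD"],
    capture_output=True, text=True, check=True,
).stdout.strip()
experiment_name = f"{DAGSHUB_EMAIL}/{branch}"
mlflow_exp_id = get_mlflow_experiment_id(experiment_name)
```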
@@ -305,13 +307,13 @@ | |
"# transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n", | ||
"combined_df_sample_path = model_directory + \"/combined_df_sample.parquet\"\n", | ||
"\n", | ||
"artifacts = {'sklearn_model': sklearn_model_path,\n", | ||
" 'sklearn_transformer': sklearn_transformer_path,\n", | ||
" # 'transformed_recipes': transformed_recipes_path,\n", | ||
" # 'combined_data': combined_df_path,\n", | ||
" 'combined_data_sample': combined_df_sample_path\n", | ||
" }\n", | ||
"\n" | ||
"artifacts = {\n", | ||
" \"sklearn_model\": sklearn_model_path,\n", | ||
" \"sklearn_transformer\": sklearn_transformer_path,\n", | ||
" # 'transformed_recipes': transformed_recipes_path,\n", | ||
" # 'combined_data': combined_df_path,\n", | ||
" \"combined_data_sample\": combined_df_sample_path,\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
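The `artifacts` dict maps names to local paths that `mlflow.pyfunc.log_model` copies alongside the model. A minimal sketch of how a pyfunc wrapper typically consumes those keys; the project's actual `CustomSKLearnWrapper` lives in `src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py` and may differ:

```python
import pickle
import mlflow.pyfunc

class SketchWrapper(mlflow.pyfunc.PythonModel):
    # Illustrative stand-in, not the project's CustomSKLearnWrapper.
    def load_context(self, context):
        # MLflow exposes each logged artifact's local path under the same key.
        with open(context.artifacts["sklearn_transformer"], "rb") as f:
            self.transformer = pickle.load(f)

    def predict(self, context, model_input):
        # Re-apply the fitted transformer to new text.
        return self.transformer.transform(model_input)
```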
@@ -811,105 +813,105 @@ | |
"source": [ | ||
"# load from MLflow\n", | ||
"mlflow_client = mlflow.tracking.MlflowClient(\n", | ||
" tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", | ||
" tracking_uri=f\"https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow\"\n", | ||
")\n", | ||
"\n", | ||
"# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", | ||
"sklearn_transformer_params = { \n", | ||
" 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n", | ||
"sklearn_transformer_params = {\n", | ||
" \"analyzer\": CustomSKLearnAnalyzer().ngram_maker(\n", | ||
" min_ngram_length=1,\n", | ||
" max_ngram_length=4,\n", | ||
" ),\n", | ||
" 'min_df':3,\n", | ||
" 'binary':False\n", | ||
" ),\n", | ||
" \"min_df\": 3,\n", | ||
" \"binary\": False,\n", | ||
"}\n", | ||
"\n", | ||
"# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", | ||
"pipeline_params = {\n", | ||
" 'stanza_model': 'en',\n", | ||
" 'sklearn-transformer': 'TFIDF'\n", | ||
"}\n", | ||
"pipeline_params = {\"stanza_model\": \"en\", \"sklearn-transformer\": \"TFIDF\"}\n", | ||
"\n", | ||
"# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", | ||
"pipeline_params.update(sklearn_transformer_params)\n", | ||
"\n", | ||
"with mlflow.start_run(experiment_id=mlflow_exp_id): \n", | ||
"with mlflow.start_run(experiment_id=mlflow_exp_id):\n", | ||
" # LOG PARAMETERS\n", | ||
" mlflow.log_params(pipeline_params)\n", | ||
"\n", | ||
" # LOG INPUTS (QUERIES) AND OUTPUTS\n", | ||
" # MLflow example uses a list of strings or a list of str->str dicts\n", | ||
" # Will be useful in STAGING/Evaluation\n", | ||
" \n", | ||
"\n", | ||
" # LOG MODEL\n", | ||
" # Instantiate sklearn TFIDFVectorizer\n", | ||
" sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('sklearn fit transform on ingredients:')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"sklearn fit transform on ingredients:\")\n", | ||
"\n", | ||
" model_input = whole_nlp_df['ingredients_lemmafied']\n", | ||
" model_input = whole_nlp_df[\"ingredients_lemmafied\"]\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Input Data: ')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Input Data: \")\n", | ||
" print(model_input)\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Input Data Shape: ')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Input Data Shape: \")\n", | ||
" print(model_input.shape)\n", | ||
"\n", | ||
" random_sample = model_input.sample(3, random_state=200)\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Random 3 Records from Input Data: ')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Random 3 Records from Input Data: \")\n", | ||
" print(random_sample)\n", | ||
"\n", | ||
" # Do fit transform on data\n", | ||
" response = sklearn_transformer.fit_transform(tqdm(model_input)) \n", | ||
" \n", | ||
" response = sklearn_transformer.fit_transform(tqdm(model_input))\n", | ||
"\n", | ||
" transformed_recipe = pd.DataFrame(\n", | ||
" response.toarray(),\n", | ||
" columns=sklearn_transformer.get_feature_names_out(),\n", | ||
" index=model_input.index\n", | ||
" response.toarray(),\n", | ||
" columns=sklearn_transformer.get_feature_names_out(),\n", | ||
" index=model_input.index,\n", | ||
" )\n", | ||
"\n", | ||
" signature = infer_signature(model_input=model_input,\n", | ||
" model_output=transformed_recipe\n", | ||
" )\n", | ||
" signature = infer_signature(\n", | ||
" model_input=model_input, model_output=transformed_recipe\n", | ||
" )\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Transformed Data:')\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Transformed Data:\")\n", | ||
" print(transformed_recipe.head())\n", | ||
" \n", | ||
" combined_df = transformed_recipe.join(model_input, how='inner')\n", | ||
" combined_df_sample = transformed_recipe.join(random_sample, how='inner')\n", | ||
"\n", | ||
" print('\\n')\n", | ||
" print('-' * 80)\n", | ||
" print('Random Sample of Combined Data:')\n", | ||
" combined_df = transformed_recipe.join(model_input, how=\"inner\")\n", | ||
" combined_df_sample = transformed_recipe.join(random_sample, how=\"inner\")\n", | ||
"\n", | ||
" print(\"\\n\")\n", | ||
" print(\"-\" * 80)\n", | ||
" print(\"Random Sample of Combined Data:\")\n", | ||
" print(combined_df_sample.head())\n", | ||
"\n", | ||
" with open(sklearn_transformer_path, \"wb\") as fo:\n", | ||
" pickle.dump(sklearn_transformer, fo)\n", | ||
"\n", | ||
" transformed_recipe.to_parquet(path=transformed_recipes_parquet_path, compression=\"gzip\")\n", | ||
" transformed_recipe.to_parquet(\n", | ||
" path=transformed_recipes_parquet_path, compression=\"gzip\"\n", | ||
" )\n", | ||
"\n", | ||
" combined_df.to_parquet(path=combined_df_path, compression=\"gzip\")\n", | ||
" \n", | ||
"\n", | ||
" combined_df_sample.to_parquet(path=combined_df_sample_path)\n", | ||
"\n", | ||
" model_info = mlflow.pyfunc.log_model( \n", | ||
" model_info = mlflow.pyfunc.log_model(\n", | ||
" code_path=[\"../src/backend/\"],\n", | ||
" python_model=CustomSKLearnWrapper(),\n", | ||
" input_example=whole_nlp_df['ingredients_lemmafied'][0],\n", | ||
" signature=signature, \n", | ||
" input_example=whole_nlp_df[\"ingredients_lemmafied\"][0],\n", | ||
" signature=signature,\n", | ||
" artifact_path=\"sklearn_model\",\n", | ||
" artifacts=artifacts\n", | ||
" ) \n", | ||
" artifacts=artifacts,\n", | ||
" )\n", | ||
"\n", | ||
" # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel" | ||
] | ||
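Once `log_model` returns, the logged pyfunc model can be loaded back by URI. A hedged sketch, not part of the commit; the example input is illustrative and assumes the wrapper accepts a pandas Series of ingredient text and returns a feature matrix:

```python
# Load the just-logged model and embed a new document (illustrative input).
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
new_docs = pd.Series(["2 cups flour, 1 tsp kosher salt, 3 large eggs"])
embeddings = loaded_model.predict(new_docs)
print(embeddings.shape)
```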