
Stanza 1/refactor nltk stanza #88

Merged · 11 commits · Apr 4, 2024
10 changes: 5 additions & 5 deletions .github/workflows/test.yaml
@@ -7,7 +7,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: [ubuntu, macos]
+        os: [ubuntu]
         py_version: ["3.10"]
     runs-on: ${{ matrix.os }}-latest

@@ -43,6 +43,10 @@ jobs:
         run: python -m poetry install
         if: steps.cache.outputs.cache-hit != 'true'
 
+      - name: Log in to DagsHub
+        run: |
+          poetry run dagshub login --token ${{ secrets.DAGSHUB_TOKEN }}
+
       - name: Download pretrained spacy libraries
         run: poetry run spacy download en_core_web_sm

@@ -69,10 +73,6 @@ jobs:
           poetry run dvc remote modify origin --local password ${{ secrets.DAGSHUB_TOKEN }}
           poetry run dvc pull
 
-      - name: Log in to DagsHub
-        run: |
-          poetry run dagshub login --token ${{ secrets.DAGSHUB_TOKEN }}
-
       - name: Test notebooks batch ${{matrix.nb_dec}}${{matrix.nb_unit}}
         run: python -m poetry run nbdev_test --flags '' --n_workers 3 --pause 1.0 --file_re "${{matrix.nb_dec}}${{matrix.nb_unit}}.*"

153 changes: 151 additions & 2 deletions nbs/00_template.ipynb
@@ -27,6 +27,9 @@
"outputs": [],
"source": [
"# | hide\n",
"import dagshub\n",
"import mlflow\n",
"import nbdev\n",
"from nbdev.showdoc import *"
]
},
@@ -46,10 +49,156 @@
"outputs": [],
"source": [
"# | hide\n",
"import nbdev\n",
"\n",
"# this function allows us to get the experiment ID from an experiment name\n",
"def get_experiment_id(name):\n",
" exp = mlflow.get_experiment_by_name(name)\n",
" if exp is None:\n",
" exp_id = mlflow.create_experiment(name)\n",
" return exp_id\n",
" return exp.experiment_id"
]
},
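{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# | hide\n",
"# hedged usage sketch (not in the original notebook): get_experiment_id is\n",
"# get-or-create, so repeated calls with the same name return the same ID\n",
"# ('scratch-experiment' is a placeholder name, not a real experiment)\n",
"scratch_id = get_experiment_id('scratch-experiment')\n",
"assert scratch_id == get_experiment_id('scratch-experiment')"
]
},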
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# | hide\n",
"nbdev.nbdev_export()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# | Below this are blocks to use DagsHub with MLflow"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#@markdown Enter the username of your DAGsHub account:\n",
"DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n",
"\n",
"#@markdown Enter the email for your DAGsHub account:\n",
"DAGSHUB_EMAIL = \"[email protected]\" #@param {type:\"string\"}\n",
"\n",
"#@markdown Enter the repo name \n",
"DAGSHUB_REPO_NAME=\"MeaLeon\"\n",
"\n",
"#@markdown Enter the name of the branch you are working on \n",
"BRANCH=\"STANZA-1/refactor-nltk-stanza\"\n",
"dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
" , repo_owner=DAGSHUB_USER_NAME)\n"
]
},
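{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sanity check (not in the original notebook): dagshub.init should have\n",
"# pointed MLflow at the repo's tracking server, which the next cell also sets explicitly\n",
"print(mlflow.get_tracking_uri())"
]
},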
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
"\n",
"# starter idea for making an experiment name can be the git branch, but need more specificity\n",
"experiment_name = f\"{DAGSHUB_EMAIL}/stanza_quadgrams_small_set_v1\"\n",
"mlflow_exp_id = get_experiment_id(experiment_name)"
]
},
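{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch of the 'more specificity' idea above (names are illustrative only):\n",
"# derive a candidate experiment name from the current git branch plus a UTC timestamp\n",
"import subprocess\n",
"from datetime import datetime, timezone\n",
"\n",
"branch = subprocess.check_output(\n",
"    ['git', 'rev-parse', '--abbrev-ref', 'HEAD'], text=True\n",
").strip()\n",
"stamp = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M')\n",
"candidate_experiment_name = f'{DAGSHUB_EMAIL}/{branch}-{stamp}'"
]
},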
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create pipelines relevant to library used\n",
"# MLflow example uses HuggingFace\n",
"# below is example for MeaLeon with Stanza and sklearn NLP pipeline\n",
"\n",
"# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
"cv_params = {\n",
" 'strip_accents':\"unicode\",\n",
" 'lowercase':True,\n",
" 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n",
" 'min_df':10,\n",
"}\n",
"\n",
"# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n",
"pipeline_params = {\n",
" 'stanza_model': 'en',\n",
" 'language': 'english',\n",
" 'sklearn-transformer': 'TfidfVectorizer'\n",
"}\n",
"\n",
"# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n",
"pipeline_params.update(cv_params)\n",
"\n",
"with mlflow.start_run(experiment_id=mlflow_exp_id):\n",
" # LOG PARAMETERS\n",
" mlflow.log_params(pipeline_params)\n",
"\n",
" # LOG INPUTS (QUERIES) AND OUTPUTS\n",
" # MLflow example uses a list of strings or a list of str->str dicts\n",
" \n",
" # import necessary libraries to handle raw data\n",
" import dill as pickle\n",
" import dvc.api\n",
" import pandas as pd\n",
" from sklearn.feature_extraction.text import (\n",
" CountVectorizer\n",
" , TfidfTransformer\n",
" , TfidfVectorizer\n",
" ,\n",
" )\n",
" from src.custom_stanza_mlflow import StanzaWrapper\n",
" import src.dataframe_preprocessor as dfpp\n",
" import tqdm\n",
" \n",
" # load raw data and preprocess/clean\n",
" data = dvc.api.read(\n",
" path='../data/raw/recipes-en-201706/epicurious-recipes_m2.json'\n",
" mode='r')\n",
" raw_df = pd.read_json(data)\n",
"\n",
" # pre_proc_df is cleaned dataframe\n",
" pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n",
"\n",
" # create subset for dev purposes\n",
" to_nlp_df = pre_proc_df[0:50]\n",
"\n",
" # save and log preprocessed dataframe(s)\n",
" pre_proc_df.to_json('../data/processed/preprocessed_dataframe.json')\n",
" mlflow.log_artifact('../data/processed/preprocessed_dataframe.json', artifact_path=\"preprocessed_dataframes\")\n",
" \n",
" to_nlp_df.to_json('../data/processed/preprocessed_subset_dataframe.json')\n",
" mlflow.log_artifact('../data/processed/preprocessed_subset_dataframe.json', artifact_path=\"preprocessed_dataframes\")\n",
" \n",
" # LOG MODEL\n",
" # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n",
" # Instantiate sklearn TFIDFVectorizer\n",
" tfidf_vectorizer_model = TfidfVectorizer(**cv_params)\n",
"\n",
" # Do fit transform on data\n",
" test_tfidf_transform = tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"ingredients\"]))\n",
"\n",
" word_matrix = pd.DataFrame(\n",
" test_tfidf_transform.toarray()\n",
" , columns=tfidf_vectorizer_model.get_feature_names_out()\n",
" , index=to_nlp_df.index\n",
" )\n",
"\n",
" with open(\"../joblib/tfidf_transformer_small_test.pkl\", \"wb\") as fo:\n",
" pickle.dump(tfidf_vectorizer_model, fo)\n",
" mlflow.log_artifact(\"../joblib/tfidf_transformer_small_test.pkl\", artifact_path=\"sklearn_dill_pkls\")\n",
"\n",
" with open(\"../joblib/database_word_matrix_small_test.pkl\", \"wb\") as fo:\n",
" pickle.dump(word_matrix, fo)\n",
" mlflow.log_artifact(\"../joblib/database_word_matrix_small_test.pkl\", artifact_path=\"sklearn_dill_pkls\")\n"
]
}
],
"metadata": {},