
Stanza 1/refactor nltk stanza #88

Merged · 11 commits · Apr 4, 2024
10 changes: 5 additions & 5 deletions .github/workflows/test.yaml
@@ -7,7 +7,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: [ubuntu, macos]
+        os: [ubuntu]
         py_version: ["3.10"]
     runs-on: ${{ matrix.os }}-latest

@@ -43,6 +43,10 @@ jobs:
         run: python -m poetry install
         if: steps.cache.outputs.cache-hit != 'true'
 
+      - name: Log in to DagsHub
+        run: |
+          poetry run dagshub login --token ${{ secrets.DAGSHUB_TOKEN }}
+
       - name: Download pretrained spacy libraries
         run: poetry run spacy download en_core_web_sm

@@ -69,10 +73,6 @@ jobs:
           poetry run dvc remote modify origin --local password ${{ secrets.DAGSHUB_TOKEN }}
           poetry run dvc pull
 
-      - name: Log in to DagsHub
-        run: |
-          poetry run dagshub login --token ${{ secrets.DAGSHUB_TOKEN }}
-
       - name: Test notebooks batch ${{matrix.nb_dec}}${{matrix.nb_unit}}
         run: python -m poetry run nbdev_test --flags '' --n_workers 3 --pause 1.0 --file_re "${{matrix.nb_dec}}${{matrix.nb_unit}}.*"

153 changes: 151 additions & 2 deletions nbs/00_template.ipynb
@@ -27,6 +27,9 @@
"outputs": [],
"source": [
"# | hide\n",
"import dagshub\n",
"import mlflow\n",
"import nbdev\n",
"from nbdev.showdoc import *"
]
},
@@ -46,10 +49,156 @@
"outputs": [],
"source": [
"# | hide\n",
"import nbdev\n",
"\n",
"# this function allows us to get the experiment ID from an experiment name\n",
"def get_experiment_id(name):\n",
" exp = mlflow.get_experiment_by_name(name)\n",
" if exp is None:\n",
" exp_id = mlflow.create_experiment(name)\n",
" return exp_id\n",
" return exp.experiment_id"
]
},
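{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# | hide\n",
"# hedged usage sketch (not in the original notebook): get_experiment_id is\n",
"# get-or-create, so repeated calls with the same name return the same ID\n",
"# ('scratch-experiment' is a placeholder name, not a real experiment)\n",
"scratch_id = get_experiment_id('scratch-experiment')\n",
"assert scratch_id == get_experiment_id('scratch-experiment')"
]
},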
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# | hide\n",
"nbdev.nbdev_export()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# | Below this are blocks to use DagsHub with MLflow"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#@markdown Enter the username of your DAGsHub account:\n",
"DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n",
"\n",
"#@markdown Enter the email for your DAGsHub account:\n",
"DAGSHUB_EMAIL = \"[email protected]\" #@param {type:\"string\"}\n",
"\n",
"#@markdown Enter the repo name \n",
"DAGSHUB_REPO_NAME=\"MeaLeon\"\n",
"\n",
"#@markdown Enter the name of the branch you are working on \n",
"BRANCH=\"STANZA-1/refactor-nltk-stanza\"\n",
"dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
" , repo_owner=DAGSHUB_USER_NAME)\n"
]
},
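{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sanity check (not in the original notebook): dagshub.init should have\n",
"# pointed MLflow at the repo's tracking server, which the next cell also sets explicitly\n",
"print(mlflow.get_tracking_uri())"
]
},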
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
"\n",
"# starter idea for making an experiment name can be the git branch, but need more specificity\n",
"experiment_name = f\"{DAGSHUB_EMAIL}/stanza_quadgrams_small_set_v1\"\n",
"mlflow_exp_id = get_experiment_id(experiment_name)"
]
},
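{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch of the 'more specificity' idea above (names are illustrative only):\n",
"# derive a candidate experiment name from the current git branch plus a UTC timestamp\n",
"import subprocess\n",
"from datetime import datetime, timezone\n",
"\n",
"branch = subprocess.check_output(\n",
"    ['git', 'rev-parse', '--abbrev-ref', 'HEAD'], text=True\n",
").strip()\n",
"stamp = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M')\n",
"candidate_experiment_name = f'{DAGSHUB_EMAIL}/{branch}-{stamp}'"
]
},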
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create pipelines relevant to library used\n",
"# MLflow example uses HuggingFace\n",
"# below is example for MeaLeon with Stanza and sklearn NLP pipeline\n",
"\n",
"# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
"cv_params = {\n",
" 'strip_accents':\"unicode\",\n",
" 'lowercase':True,\n",
" 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n",
" 'min_df':10,\n",
"}\n",
"\n",
"# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n",
"pipeline_params = {\n",
" 'stanza_model': 'en',\n",
" 'language': 'english',\n",
" 'sklearn-transformer': 'TfidfVectorizer'\n",
"}\n",
"\n",
"# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n",
"pipeline_params.update(cv_params)\n",
"\n",
"with mlflow.start_run(experiment_id=mlflow_exp_id):\n",
" # LOG PARAMETERS\n",
" mlflow.log_params(pipeline_params)\n",
"\n",
" # LOG INPUTS (QUERIES) AND OUTPUTS\n",
" # MLflow example uses a list of strings or a list of str->str dicts\n",
" \n",
" # import necessary libraries to handle raw data\n",
" import dill as pickle\n",
" import dvc.api\n",
" import pandas as pd\n",
" from sklearn.feature_extraction.text import (\n",
" CountVectorizer\n",
" , TfidfTransformer\n",
" , TfidfVectorizer\n",
" ,\n",
" )\n",
" from src.custom_stanza_mlflow import StanzaWrapper\n",
" import src.dataframe_preprocessor as dfpp\n",
" import tqdm\n",
" \n",
" # load raw data and preprocess/clean\n",
" data = dvc.api.read(\n",
" path='../data/raw/recipes-en-201706/epicurious-recipes_m2.json'\n",
" mode='r')\n",
" raw_df = pd.read_json(data)\n",
"\n",
" # pre_proc_df is cleaned dataframe\n",
" pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n",
"\n",
" # create subset for dev purposes\n",
" to_nlp_df = pre_proc_df[0:50]\n",
"\n",
" # save and log preprocessed dataframe(s)\n",
" pre_proc_df.to_json('../data/processed/preprocessed_dataframe.json')\n",
" mlflow.log_artifact('../data/processed/preprocessed_dataframe.json', artifact_path=\"preprocessed_dataframes\")\n",
" \n",
" to_nlp_df.to_json('../data/processed/preprocessed_subset_dataframe.json')\n",
" mlflow.log_artifact('../data/processed/preprocessed_subset_dataframe.json', artifact_path=\"preprocessed_dataframes\")\n",
" \n",
" # LOG MODEL\n",
" # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n",
" # Instantiate sklearn TFIDFVectorizer\n",
" tfidf_vectorizer_model = TfidfVectorizer(**cv_params)\n",
"\n",
" # Do fit transform on data\n",
" test_tfidf_transform = tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"ingredients\"]))\n",
"\n",
" word_matrix = pd.DataFrame(\n",
" test_tfidf_transform.toarray()\n",
" , columns=tfidf_vectorizer_model.get_feature_names_out()\n",
" , index=to_nlp_df.index\n",
" )\n",
"\n",
" with open(\"../joblib/tfidf_transformer_small_test.pkl\", \"wb\") as fo:\n",
" pickle.dump(tfidf_vectorizer_model, fo)\n",
" mlflow.log_artifact(\"../joblib/tfidf_transformer_small_test.pkl\", artifact_path=\"sklearn_dill_pkls\")\n",
"\n",
" with open(\"../joblib/database_word_matrix_small_test.pkl\", \"wb\") as fo:\n",
" pickle.dump(word_matrix, fo)\n",
" mlflow.log_artifact(\"../joblib/database_word_matrix_small_test.pkl\", artifact_path=\"sklearn_dill_pkls\")\n"
]
}
],
"metadata": {},