diff --git a/examples/pzmm_generate_complete_mode_card.ipynb b/examples/pzmm_generate_complete_model_card.ipynb similarity index 93% rename from examples/pzmm_generate_complete_mode_card.ipynb rename to examples/pzmm_generate_complete_model_card.ipynb index fb8373f7..7ebf0858 100644 --- a/examples/pzmm_generate_complete_mode_card.ipynb +++ b/examples/pzmm_generate_complete_model_card.ipynb @@ -493,7 +493,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The data is looking better, but the education statuses are a bit too granular. Lets combine some of them to make the job easier for our model." + "The data is looking better, but the marital status, education, and work class categories are a bit too granular. Let's combine some of them to make the job easier for our model." ] }, { @@ -549,6 +549,53 @@ "df.columns" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we want the automatically generated score code to leverage these steps when scoring new data, we can put them in a preprocessing function and pass it into our import_model function call. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_function(df):\n", + " cat_vals = df[[\"WorkClass\", \"Education\", \"MartialStatus\", \"Relationship\", \"Race\", \"Sex\"]]\n", + " df = pd.get_dummies(df, columns=[\"WorkClass\", \"Education\", \"MartialStatus\", \"Relationship\", \"Race\", \"Sex\"])\n", + " df.columns = df.columns.str.replace(' ', '')\n", + " df.columns = df.columns.str.replace('-', '_')\n", + " df = df.drop(['Sex_Male'], axis=1)\n", + " df = pd.concat([df, cat_vals], axis=1).drop('index', axis=1)\n", + " # For the model to score correctly, all OHE columns must exist\n", + " input_cols = [\n", + " \"Education_9th\", \"Education_10th\", \"Education_11th\", \"Education_12th\", \"Education_Assoc_voc\", \"Education_Assoc_acdm\", \"Education_Masters\", \"Education_Prof_school\",\n", + " \"Education_Doctorate\", \"Education_Preschool\", \"Education_1st_4th\", \"Education_5th_6th\", \"Education_7th_8th\", \"WorkClass_Self_emp_inc\", \"WorkClass_Self_emp_not_inc\",\n", + " \"WorkClass_Federal_gov\", \"WorkClass_Local_gov\", \"WorkClass_State_gov\", \"WorkClass_Without_pay\", \"WorkClass_Never_worked\", \"MartialStatus_Married_spouse_absent\",\n", + " \"MartialStatus_Married_AF_spouse\", 'MartialStatus_Married_civ_spouse', 'MartialStatus_Never_married', 'MartialStatus_Divorced', 'MartialStatus_Separated', \n", + " 'MartialStatus_Widowed', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander', 'Race_Amer_Indian_Eskimo', 'Race_Other', 'Relationship_Husband', \n", + " 'Relationship_Not_in_family', 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', 'Relationship_Other_relative', 'WorkClass_Private',\n", + " 'Education_Bachelors'\n", + " ]\n", + " for col in input_cols:\n", + " if col not in df.columns:\n", + " df[col] = 0\n", + " df[\"Education_Some_HS\"] = df[\"Education_9th\"] | df[\"Education_10th\"] | df[\"Education_11th\"] | df[\"Education_12th\"]\n", + 
" df[\"Education_Assoc\"] = df[\"Education_Assoc_voc\"] | df[\"Education_Assoc_acdm\"]\n", + " df[\"Education_Adv_Degree\"] = df[\"Education_Masters\"] | df[\"Education_Prof_school\"] | df[\"Education_Doctorate\"]\n", + " df[\"Education_No_HS\"] = df[\"Education_Preschool\"] | df[\"Education_1st_4th\"] | df[\"Education_5th_6th\"] | df[\"Education_7th_8th\"]\n", + "\n", + " df[\"WorkClass_Self\"] = df[\"WorkClass_Self_emp_inc\"] | df[\"WorkClass_Self_emp_not_inc\"]\n", + " df[\"WorkClass_Gov\"] = df[\"WorkClass_Federal_gov\"] | df[\"WorkClass_Local_gov\"] | df[\"WorkClass_State_gov\"]\n", + " df[\"WorkClass_Other\"] = df[\"WorkClass_Without_pay\"] | df[\"WorkClass_Never_worked\"]\n", + "\n", + " df[\"MartialStatus_Other\"] = df[\"MartialStatus_Married_spouse_absent\"] | df[\"MartialStatus_Married_AF_spouse\"]\n", + "\n", + " return df" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -663,9 +710,8 @@ "1. Write model performance statistics to a file. \n", "1. Assess model bias and fairness (if a potentially sensitive variable is available for assessment). \n", "1. Generate the last few model card files. \n", - "1. Write model score code (Writing the score code yourself is not necessary if you are using the import_model function to automatically generate the score code. Since we've heavily processed the data, I want to write this score code manually to include the data preprocessing code. ). \n", "1. Generate requirements file. \n", - "1. Import model to SAS Model Manager. \n", + "1. Import model to SAS Model Manager and automatically generate the score code. \n", "1. Open the model in SAS Model Manager and begin managing the model lifecycle there. \n", "\n", "So first, be sure that the variables in the block below match your use case." 
@@ -1608,194 +1654,6 @@ ")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "I'm going to write the score code myself because I want to include all the data preprocessing we did earlier, but that is not a requirement. If you prefer that this score code be automatically written, I'll provide example code before the conclusion you can use instead. " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 13: Write score code\n", - "sn = 'score_' + model_prefix + \".py\"\n", - "sc = Path.cwd() / output_path / sn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile $sc\n", - "\n", - "import math\n", - "import pickle\n", - "import pandas as pd\n", - "import numpy as np\n", - "from pathlib import Path\n", - "\n", - "import settings\n", - "\n", - "with open(Path(settings.pickle_path) / \"dtc.pickle\", \"rb\") as pickle_model:\n", - " model = pickle.load(pickle_model)\n", - "\n", - "def score(Age, WorkClass, Education, MartialStatus, Relationship, Race, Sex, HoursPerWeek): \n", - " \"Output: EM_EVENTPROBABILITY, EM_CLASSIFICATION\" \n", - "\n", - " # Check if pickle file is loaded, load if unavailable\n", - " try:\n", - " global model\n", - " except NameError:\n", - " with open(settings.pickle_path + 'dtc.pickle', 'rb') as _pFile:\n", - " model = pickle.load(_pFile)\n", - "\n", - " # Encode WorkClass\n", - " WorkClass_Private = 0\n", - " WorkClass_Self = 0\n", - " WorkClass_Gov = 0 \n", - " WorkClass_Other = 0\n", - " if \"Private\" in WorkClass: WorkClass_Private = 1\n", - " elif \"Self-emp-not-inc\" in WorkClass: WorkClass_Self = 1\n", - " elif \"Self-emp-inc\" in WorkClass: WorkClass_Self = 1\n", - " elif \"Local-gov\" in WorkClass: WorkClass_Gov = 1\n", - " elif \"State-gov\" in WorkClass: WorkClass_Gov = 1\n", - " elif \"Federal-gov\" in WorkClass: WorkClass_Gov = 1\n", - " else: WorkClass_Other = 1\n", - 
"\n", - "\n", - " # Encode Education\n", - " Education_HS_grad = 0\n", - " Education_Some_HS = 0\n", - " Education_Assoc = 0\n", - " Education_Some_college = 0\n", - " Education_Bachelors = 0\n", - " Education_Adv_Degree = 0\n", - " Education_No_HS = 0\n", - "\n", - " if \"HS-grad\" in Education: Education_HS_grad = 1\n", - " elif \"11th\" in Education: Education_Some_HS = 1\n", - " elif \"10th\" in Education: Education_Some_HS = 1\n", - " elif \"9th\" in Education: Education_Some_HS = 1\n", - " elif \"12th\" in Education: Education_Some_HS = 1\n", - " elif \"Assoc-voc\" in Education: Education_Assoc = 1\n", - " elif \"Assoc-acdm\" in Education: Education_Assoc = 1\n", - " elif \"Some-college\" in Education: Education_Some_college = 1\n", - " elif \"Bachelors\" in Education: Education_Bachelors = 1\n", - " elif \"Masters\" in Education: Education_Adv_Degree = 1\n", - " elif \"Prof-school\" in Education: Education_Adv_Degree = 1\n", - " elif \"Doctorate\" in Education: Education_Adv_Degree = 1\n", - " else:Education_No_HS = 1\n", - " \n", - " # Encode MaritalStatus\n", - " MartialStatus_Married_civ_spouse = 0\n", - " MartialStatus_Never_married = 0\n", - " MartialStatus_Divorced = 0\n", - " MartialStatus_Separated = 0 \n", - " MartialStatus_Widowed = 0 \n", - " MartialStatus_Other = 0\n", - " if \"Married-civ-spouse\" in MartialStatus: MartialStatus_Married_civ_spouse = 1\n", - " elif \"Never-married\" in MartialStatus: MartialStatus_Never_married = 1\n", - " elif \"Divorced\" in MartialStatus: MartialStatus_Divorced = 1\n", - " elif \"Separated\" in MartialStatus: MartialStatus_Separated = 1\n", - " elif \"Widowed\" in MartialStatus: MartialStatus_Widowed = 1\n", - " else: MartialStatus_Other = 1\n", - " \n", - " # Encode Relationship\n", - " Relationship_Husband = 0\n", - " Relationship_Not_in_family = 0\n", - " Relationship_Own_child= 0\n", - " Relationship_Unmarried = 0 \n", - " Relationship_Wife = 0\n", - " Relationship_Other_relative = 0\n", - " if \"Husband\" 
in Relationship: Relationship_Husband = 1\n", - " elif \"Not-in-family\" in Relationship: Relationship_Not_in_family = 1\n", - " elif \"Own-child\" in Relationship: Relationship_Own_child = 1\n", - " elif \"Unmarried\" in Relationship: Relationship_Unmarried = 1\n", - " elif \"Wife\" in Relationship: Relationship_Wife = 1\n", - " else: Relationship_Other_relative = 1\n", - "\n", - " # Encode Race\n", - " Race_White = 0\n", - " Race_Black = 0\n", - " Race_Asian_Pac_Islander = 0\n", - " Race_Amer_Indian_Eskimo = 0\n", - " Race_Other = 0\n", - " if \"White\" in Race: Race_White = 1\n", - " elif \"Black\" in Race: Race_Black = 1\n", - " elif \"Asian-Pac-Islander\" in Race: Race_Asian_Pac_Islander = 1\n", - " elif \"Amer-Indian-Eskimo\" in Race: Race_Amer_Indian_Eskimo = 1\n", - " else: Race_Other = 1\n", - "\n", - " # Encode Sex\n", - " Sex_Female = 0\n", - " if \"Female\" in Sex: Sex_Female = 1\n", - "\n", - " try: \n", - " input_array = pd.DataFrame([[Age, HoursPerWeek, WorkClass_Private, WorkClass_Self, WorkClass_Gov, WorkClass_Other, \n", - " Education_HS_grad, Education_Some_HS, Education_Assoc, Education_Some_college, \n", - " Education_Bachelors, Education_Adv_Degree, Education_No_HS, \n", - " MartialStatus_Married_civ_spouse, MartialStatus_Never_married, \n", - " MartialStatus_Divorced, MartialStatus_Separated, MartialStatus_Widowed, \n", - " MartialStatus_Other, Relationship_Husband, Relationship_Not_in_family, \n", - " Relationship_Own_child, Relationship_Unmarried, Relationship_Wife, \n", - " Relationship_Other_relative, Race_White, Race_Black, Race_Asian_Pac_Islander, \n", - " Race_Amer_Indian_Eskimo, Race_Other, Sex_Female]], \n", - " columns = ['Age', 'HoursPerWeek', 'WorkClass_Private', 'WorkClass_Self', 'WorkClass_Gov', 'WorkClass_Other', \n", - " 'Education_HS_grad', 'Education_Some_HS', 'Education_Assoc', 'Education_Some_college', \n", - " 'Education_Bachelors','Education_Adv_Degree', 'Education_No_HS', \n", - " 'MartialStatus_Married_civ_spouse', 
'MartialStatus_Never_married', \n", - " 'MartialStatus_Divorced', 'MartialStatus_Separated', 'MartialStatus_Widowed', \n", - " 'MartialStatus_Other', 'Relationship_Husband', 'Relationship_Not_in_family', \n", - " 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', \n", - " 'Relationship_Other_relative', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander', \n", - " 'Race_Amer_Indian_Eskimo', 'Race_Other','Sex_Female'], \n", - " dtype = float)\n", - " except ValueError:\n", - " input_array = pd.DataFrame([[1.0, Age, HoursPerWeek, WorkClass_Private, WorkClass_Self, WorkClass_Gov, WorkClass_Other, \n", - " Education_HS_grad, Education_Some_HS, Education_Assoc, Education_Some_college, \n", - " Education_Bachelors, Education_Adv_Degree, Education_No_HS, \n", - " MartialStatus_Married_civ_spouse, MartialStatus_Never_married, \n", - " MartialStatus_Divorced, MartialStatus_Separated, MartialStatus_Widowed, \n", - " MartialStatus_Other, Relationship_Husband, Relationship_Not_in_family, \n", - " Relationship_Own_child, Relationship_Unmarried, Relationship_Wife, \n", - " Relationship_Other_relative, Race_White, Race_Black, Race_Asian_Pac_Islander, \n", - " Race_Amer_Indian_Eskimo, Race_Other, Sex_Female]], \n", - " columns = ['const', 'Age', 'HoursPerWeek', 'WorkClass_Private', 'WorkClass_Self', 'WorkClass_Gov', 'WorkClass_Other', \n", - " 'Education_HS_grad', 'Education_Some_HS', 'Education_Assoc', 'Education_Some_college', \n", - " 'Education_Bachelors', 'Education_Adv_Degree', 'Education_No_HS', \n", - " 'MartialStatus_Married_civ_spouse', 'MartialStatus_Never_married', \n", - " 'MartialStatus_Divorced', 'MartialStatus_Separated', 'MartialStatus_Widowed', \n", - " 'MartialStatus_Other', 'Relationship_Husband','Relationship_Not_in_family', \n", - " 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', \n", - " 'Relationship_Other_relative', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander', \n", - " 'Race_Amer_Indian_Eskimo', 
'Race_Other','Sex_Female'], \n", - " dtype = float)\n", - "\n", - " prediction = model.predict_proba(input_array).tolist()\n", - "\n", - " # Check for numpy values and convert to a CAS readable representation\n", - " if isinstance(prediction, np.ndarray):\n", - " prediction = prediction.tolist()\n", - "\n", - " if input_array.shape[0] == 1:\n", - " if prediction[0][1] > 0.5:\n", - " EM_CLASSIFICATION = \"1\"\n", - " else:\n", - " EM_CLASSIFICATION = \"0\"\n", - " return EM_CLASSIFICATION, prediction[0][1]\n", - " else:\n", - " df = pd.DataFrame(prediction)\n", - " proba = df[1]\n", - " classifications = np.where(df[1] > 0.5, '1', '0')\n", - " return pd.DataFrame({'EM_CLASSIFICATION': classifications, 'EM_EVENTPROBABILITY': proba})" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1844,7 +1702,7 @@ } ], "source": [ - "# Step 14: Generate requirements files\n", + "# Step 13: Generate requirements files\n", "requirements_json = pzmm.JSONFiles.create_requirements_json(output_path)\n", "\n", "import json\n", @@ -1865,7 +1723,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Alright, we should have all our files now, so let's import our model and its files into SAS Model Manager! " + "Alright, we should have all our files now, so let's import our model and its files into SAS Model Manager! With the following function, we can automatically generate the score code and import the model into SAS Model Manager in one step." 
] }, { @@ -1874,30 +1732,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Step 15: Import model into SAS Model Manager \n", - "zipIOFile = pzmm.ZipModel.zip_files(model_files=output_path, model_prefix=model_prefix, is_viya4=True)\n", - "\n", - "mr.create_project(mm_project, 'Public')\n", - "\n", - "mr.import_model_from_zip(model_prefix, mm_project, zipIOFile)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now your model is available in SAS Model Manager where you can run the scoring test, update the model usage properties, add Key Performance Indicators (KPIs) thresholds, and run performance monitoring against the model. Managing the rest of the model lifecycle in SAS Model Manager will complete the rest of the model card.\n", - "\n", - "Instead of writing the score code manually and importing the model, you can run the following function to automatically generate the score code and import the model into SAS Model Manager in one step. Automatically generating the score code makes things easy, but manually writing allows greater customization and control. Only run the block below if you don't want to run steps 13 and 15. 
\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Alternate to Step 13 and Step 15: Automatically generate score code and import model \n", + "# Step 14: Automatically generate score code and import model \n", "pzmm.ImportModel.import_model(\n", " model_files=output_path, # Where are the model files?\n", " model_prefix=model_prefix, # What is the model name?\n", @@ -1910,12 +1745,20 @@ " target_values=[\"0\", \"1\"], # What are the expected values of the target variable?\n", " target_index=1, # What is the index of the target value in target_values?\n", " model_file_name=model_prefix + \".pickle\", # How was the model file serialized?\n", - " missing_values=False # Does the data include missing values?\n", + " missing_values=False, # Does the data include missing values?\n", + " preprocess_function=preprocess_function # What do we want to do to the data before we score it?\n", " )\n", " # Reinitialize the score_code variable when writing more than one model's score code\n", "pzmm.ScoreCode.score_code = \"\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now your model is available in SAS Model Manager where you can run the scoring test, update the model usage properties, add Key Performance Indicators (KPIs) thresholds, and run performance monitoring against the model. Managing the rest of the model lifecycle in SAS Model Manager will complete the rest of the model card." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/sasctl/pzmm/import_model.py b/src/sasctl/pzmm/import_model.py index cc28f7e5..3b2b9218 100644 --- a/src/sasctl/pzmm/import_model.py +++ b/src/sasctl/pzmm/import_model.py @@ -213,6 +213,7 @@ def import_model( target_values: Optional[List[str]] = None, overwrite_project_properties: Optional[bool] = False, target_index: Optional[int] = None, + preprocess_function: Optional[Callable[[DataFrame], DataFrame]] = None, **kwargs, ) -> Tuple[RestObj, Union[dict, str, Path]]: """ @@ -371,6 +372,7 @@ def import_model( missing_values=missing_values, score_cas=score_cas, target_index=target_index, + preprocess_function=preprocess_function, **kwargs, ) if score_code_dict: @@ -471,6 +473,7 @@ def import_model( missing_values=missing_values, score_cas=score_cas, target_index=target_index, + preprocess_function=preprocess_function, **kwargs, ) if score_code_dict: diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py index 6b8a5546..b29d1a1f 100644 --- a/src/sasctl/pzmm/write_score_code.py +++ b/src/sasctl/pzmm/write_score_code.py @@ -36,6 +36,7 @@ def write_score_code( score_cas: Optional[bool] = True, score_code_path: Union[Path, str, None] = None, target_index: Optional[int] = None, + preprocess_function: Optional[Callable[[DataFrame], DataFrame]] = None, **kwargs, ) -> Union[dict, None]: """ @@ -249,6 +250,7 @@ def score(var1, var2, var3, var4): input_var_list, missing_values=missing_values, dtype_list=input_dtypes_list, + preprocess_function=preprocess_function, ) self._predictions_to_metrics( score_metrics, @@ -265,6 +267,7 @@ def score(var1, var2, var3, var4): missing_values=missing_values, statsmodels_model="statsmodels_model" in kwargs, tf_model="tf_keras_model" in kwargs or "tf_core_model" in kwargs, + preprocess_function=preprocess_function, ) # Include check for numpy values and a conversion operation as needed self.score_code += ( @@ -292,6 +295,9 @@ def score(var1, 
var2, var3, var4): if missing_values: self._impute_missing_values(input_data, missing_values) + if preprocess_function: + self._add_preprocess_code(preprocess_function) + # SAS Viya 3.5 model if model_id: mas_code, cas_code = self._viya35_score_code_import( @@ -762,6 +768,7 @@ def _predict_method( missing_values: Optional[Any] = None, statsmodels_model: Optional[bool] = False, tf_model: Optional[bool] = False, + preprocess_function: Optional[Callable[[DataFrame], DataFrame]] = None, ) -> None: """ Write the model prediction section of the score code. @@ -812,10 +819,15 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" + if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}column_types = {column_types}\n" f"{'':4}h2o_array = h2o.H2OFrame(input_array, " @@ -858,6 +870,10 @@ def _predict_method( self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}prediction = model.{method.__name__}(input_array)\n" ) @@ -879,6 +895,10 @@ def _predict_method( self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}prediction = model.{method.__name__}(input_array)\n\n" f"{'':4} # Check if model returns logits or probabilities\n" @@ -911,6 +931,10 @@ def _predict_method( self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + 
f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}prediction = model.{method.__name__}(input_array).tolist()\n" ) @@ -2241,3 +2265,30 @@ def _viya35_score_code_import( model["scoreCodeType"] = "ds2MultiType" mr.update_model(model) return mas_code, cas_code + + def _add_preprocess_code( + self, preprocess_function: Callable[[DataFrame], DataFrame] + ): + """ + Places the given preprocess function, which must both take a DataFrame as an argument + and return a DataFrame, into the score code. If the preprocess function does not + return anything, an error is thrown. + + Parameters + ---------- + preprocess_function: function + The preprocess function to be added to the score code. + """ + import inspect + + preprocess_code = inspect.getsource(preprocess_function) + if not "return" in preprocess_code: + raise ValueError( + "The given score code does not return a value. " + + "To allow for the score code to work correctly, please ensure the preprocessed " + + "data is returned." 
+ ) + if self.score_code[-1] == "\n": + self.score_code += preprocess_code + else: + self.score_code += "\n" + preprocess_code diff --git a/tests/unit/test_write_score_code.py b/tests/unit/test_write_score_code.py index f4f857ac..25eb1aaf 100644 --- a/tests/unit/test_write_score_code.py +++ b/tests/unit/test_write_score_code.py @@ -189,6 +189,36 @@ def test_impute_missing_values(): assert "'c': 1" in sc.score_code or "'c': np.int64(1)" in sc.score_code +def test_preprocess_function(): + """ + Test Cases: + - function + - function with no return + """ + test_df = pd.DataFrame( + data=[[0, "a", 1], [2, "b", 0]], columns=["num", "char", "bin"] + ) + sc = ScoreCode() + sc.score_code = " " + + def preprocess_function_one(data: pd.DataFrame): + print("preprocessing happens here") + return data + + sc._add_preprocess_code(preprocess_function_one) + assert "preprocessing happens here" in sc.score_code + assert "preprocess_function_one" in sc.score_code + + sc = ScoreCode() + sc.score_code = " " + + def preprocess_function_two(data: pd.DataFrame): + print("preprocessing happens here?") + + with pytest.raises(ValueError): + sc._add_preprocess_code(preprocess_function_two) + + def test_predict_method(): """ Test Cases: