From 5e8d8fd0be588d45dfacb9038abef4ae4e7b25c1 Mon Sep 17 00:00:00 2001 From: Scott Lindauer Date: Tue, 26 Sep 2023 09:03:36 -0400 Subject: [PATCH] Deprecate older functions to generate dmcas_fitstat/ROC/Lift json files --- src/sasctl/pzmm/write_json_files.py | 441 ---------------------------- 1 file changed, 441 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index d088aaab..1f3282d1 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -1466,447 +1466,6 @@ def apply_dataframe_to_json( ) return json_dict - # noinspection PyCallingNonCallable, PyNestedDecorators - @deprecated( - "Please use the calculate_model_statistics method instead.", - version="1.9", - removed_in="1.10", - ) - @classmethod - def calculateFitStat( - cls, validateData=None, trainData=None, testData=None, jPath=Path.cwd() - ): - """ - Calculates fit statistics from user data and predictions and then writes to a - JSON file for importing into the common model repository. - - Note that if no data set is provided (validate, train, or test), - this function raises an error and does not create a JSON file. - - Datasets can be provided in the following forms: - * pandas dataframe; the actual and predicted values are their own columns - * numpy array; the actual and predicted values are their own columns or rows and - ordered such that the actual values come first and the predicted second - * list; the actual and predicted values are their own indexed entry - - This function outputs a JSON file named "dmcas_fitstat.json". - - Parameters - ---------- - validateData : pandas dataframe, numpy array, or list, optional - Dataframe, array, or list of the validation data set, including both - the actual and predicted values. The default value is None. - trainData : pandas dataframe, numpy array, or list, optional - Dataframe, array, or list of the train data set, including both - the actual and predicted values. The default value is None. - testData : pandas dataframe, numpy array, or list, optional - Dataframe, array, or list of the test data set, including both - the actual and predicted values. The default value is None. - jPath : string, optional - Location for the output JSON file. The default value is the current - working directory. - """ - # If numpy inputs are supplied, then assume numpy is installed - try: - import numpy as np - except ImportError: - np = None - - try: - from sklearn import metrics - except ImportError: - raise RuntimeError( - "The 'scikit-learn' package is required to use the calculateFitStat " - "function. " - ) - - nullJSONPath = ( - Path(__file__).resolve().parent / "template_files/dmcas_fitstat.json" - ) - nullJSONDict = cls.read_json_file(nullJSONPath) - - dataSets = [[[None], [None]], [[None], [None]], [[None], [None]]] - - dataPartitionExists = [] - for i, data in enumerate([validateData, trainData, testData]): - if data is not None: - dataPartitionExists.append(i) - if type(data) is pd.core.frame.DataFrame: - dataSets[i] = data.transpose().values.tolist() - elif type(data) is list: - dataSets[i] = data - elif type(data) is np.ndarray: - dataSets[i] = data.tolist() - - if len(dataPartitionExists) == 0: - raise ValueError( - "No data was provided. Please provide the actual and predicted values " - "for at least one of the partitions (VALIDATE, TRAIN, or TEST)." - ) - - for j in dataPartitionExists: - fitStats = nullJSONDict["data"][j]["dataMap"] - - fitStats["_PartInd_"] = j - - # If the data provided is Predicted | Actual instead of Actual | - # Predicted, catch the error and flip the columns - try: - fpr, tpr, _ = metrics.roc_curve(dataSets[j][0], dataSets[j][1]) - except ValueError: - tempSet = dataSets[j] - dataSets[j][0] = tempSet[1] - dataSets[j][1] = tempSet[0] - fpr, tpr, _ = metrics.roc_curve(dataSets[j][0], dataSets[j][1]) - - RASE = math.sqrt(metrics.mean_squared_error(dataSets[j][0], dataSets[j][1])) - fitStats["_RASE_"] = RASE - - NObs = len(dataSets[j][0]) - fitStats["_NObs_"] = NObs - - auc = metrics.roc_auc_score(dataSets[j][0], dataSets[j][1]) - GINI = (2 * auc) - 1 - fitStats["_GINI_"] = GINI - - try: - from scipy.stats import gamma - - _, _, scale = gamma.fit(dataSets[j][1]) - fitStats["_GAMMA_"] = 1 / scale - except ImportError: - warnings.warn( - "scipy was not installed, so the gamma calculation could" - "not be computed." - ) - fitStats["_GAMMA_"] = None - - intPredict = [round(x) for x in dataSets[j][1]] - MCE = 1 - metrics.accuracy_score(dataSets[j][0], intPredict) - fitStats["_MCE_"] = MCE - - ASE = metrics.mean_squared_error(dataSets[j][0], dataSets[j][1]) - fitStats["_ASE_"] = ASE - - MCLL = metrics.log_loss(dataSets[j][0], dataSets[j][1]) - fitStats["_MCLL_"] = MCLL - - KS = max(abs(fpr - tpr)) - fitStats["_KS_"] = KS - - KSPostCutoff = None - fitStats["_KSPostCutoff_"] = KSPostCutoff - - DIV = len(dataSets[j][0]) - fitStats["_DIV_"] = DIV - - TAU = pd.Series(dataSets[j][0]).corr( - pd.Series(dataSets[j][1]), method="kendall" - ) - fitStats["_TAU_"] = TAU - - KSCut = None - fitStats["_KSCut_"] = KSCut - - C = metrics.auc(fpr, tpr) - fitStats["_C_"] = C - - nullJSONDict["data"][j]["dataMap"] = fitStats - - with open(Path(jPath) / FITSTAT, "w") as jFile: - json.dump(nullJSONDict, jFile, indent=4) - if cls.notebook_output: - print( - f"{FITSTAT} was successfully written and saved to " - f"{Path(jPath) / FITSTAT}" - ) - - # noinspection PyCallingNonCallable,PyNestedDecorators - @deprecated( - "Please use the calculate_model_statistics method instead.", - version="1.9", - removed_in="1.10", - ) - @classmethod - def generateROCLiftStat( - cls, - targetName, - targetValue, - swatConn, - validateData=None, - trainData=None, - testData=None, - jPath=Path.cwd(), - ): - """ - Calculates the ROC and Lift curves from user data and model predictions and - the writes it to JSON files for importing in to the common model repository. - - ROC and Lift calculations are completed by CAS through a SWAT call. Note that - if no data set is provided (validate, train, or test), this function raises - an error and does not create any JSON files. - - This function outputs a pair of JSON files named "dmcas_lift.json" and - "dmcas_roc.json". - - Parameters - --------------- - targetName: str - Target variable name to be predicted. - targetValue: int or float - Value of target variable that indicates an event. - swatConn: SWAT connection to CAS - Connection object to CAS service in SAS Model Manager through SWAT - authentication. - validateData : pandas dataframe, numpy array, or list, optional - Dataframe, array, or list of the validation data set, including both the - actual values and the calculated probabilities. The default value is None. - trainData : pandas dataframe, numpy array, or list, optional - Dataframe, array, or list of the train data set, including both the actual - values and the calculated probabilities. The default value is None. - testData : pandas dataframe, numpy array, or list, optional - Dataframe, array, or list of the test data set, including both the actual - values and the calculated probabilities. The default value is None. - jPath : string, optional - Location for the output JSON file. The default value is the current working - directory. - """ - # If numpy inputs are supplied, then assume numpy is installed - try: - # noinspection PyPackageRequirements - import numpy as np - except ImportError: - np = None - try: - import swat - except ImportError: - raise RuntimeError( - "The 'swat' package is required to generate ROC and Lift charts with " - "this function. " - ) - - nullJSONROCPath = ( - Path(__file__).resolve().parent / "template_files/dmcas_roc.json" - ) - nullJSONROCDict = cls.read_json_file(nullJSONROCPath) - - nullJSONLiftPath = ( - Path(__file__).resolve().parent / "template_files/dmcas_lift.json" - ) - nullJSONLiftDict = cls.read_json_file(nullJSONLiftPath) - - dataSets = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame()] - columns = ["actual", "predict"] - - dataPartitionExists = [] - # Check if a data partition exists, then convert to a pandas dataframe - for i, data in enumerate([validateData, trainData, testData]): - if data is not None: - dataPartitionExists.append(i) - if type(data) is list: - dataSets[i][columns] = list(zip(*data)) - elif type(data) is pd.core.frame.DataFrame: - try: - dataSets[i][columns[0]] = data.iloc[:, 0] - dataSets[i][columns[1]] = data.iloc[:, 1] - except NameError: - dataSets[i] = pd.DataFrame(data=data.iloc[:, 0]).rename( - columns={data.columns[0]: columns[0]} - ) - dataSets[i][columns[1]] = data.iloc[:, 1] - elif type(data) is np.ndarray: - try: - dataSets[i][columns] = data - except ValueError: - dataSets[i][columns] = data.transpose() - - if len(dataPartitionExists) == 0: - raise ValueError( - "No data was provided. Please provide the actual and predicted values " - "for at least one of the partitions (VALIDATE, TRAIN, or TEST)" - ) - - nullLiftRow = list(range(1, 64)) - nullROCRow = list(range(1, 301)) - - swatConn.loadactionset("percentile") - - for i in dataPartitionExists: - swatConn.read_frame( - dataSets[i][columns], casout=dict(name="SCOREDVALUES", replace=True) - ) - swatConn.percentile.assess( - table="SCOREDVALUES", - inputs=[columns[1]], - casout=dict(name="SCOREASSESS", replace=True), - response=columns[0], - event=str(targetValue), - ) - assessROC = swatConn.CASTable("SCOREASSESS_ROC").to_frame() - assessLift = swatConn.CASTable("SCOREASSESS").to_frame() - - for j in range(100): - rowNumber = (i * 100) + j - nullROCRow.remove(rowNumber + 1) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_Event_"] = targetValue - nullJSONROCDict["data"][rowNumber]["dataMap"][ - "_TargetName_" - ] = targetName - nullJSONROCDict["data"][rowNumber]["dataMap"]["_Cutoff_"] = str( - assessROC["_Cutoff_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_TP_"] = str( - assessROC["_TP_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_FP_"] = str( - assessROC["_FP_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_FN_"] = str( - assessROC["_FN_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_TN_"] = str( - assessROC["_TN_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_Sensitivity_"] = str( - assessROC["_Sensitivity_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_Specificity_"] = str( - assessROC["_Specificity_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_KS_"] = str( - assessROC["_KS_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_KS2_"] = str( - assessROC["_KS2_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_FHALF_"] = str( - assessROC["_FHALF_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_FPR_"] = str( - assessROC["_FPR_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_ACC_"] = str( - assessROC["_ACC_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_FDR_"] = str( - assessROC["_FDR_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_F1_"] = str( - assessROC["_F1_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_C_"] = str( - assessROC["_C_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_GINI_"] = str( - assessROC["_GINI_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_GAMMA_"] = str( - assessROC["_GAMMA_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_TAU_"] = str( - assessROC["_TAU_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"]["_MiscEvent_"] = str( - assessROC["_MiscEvent_"][j] - ) - nullJSONROCDict["data"][rowNumber]["dataMap"][ - "_OneMinusSpecificity_" - ] = str(1 - assessROC["_Specificity_"][j]) - - for j in range(21): - rowNumber = (i * 21) + j - nullLiftRow.remove(rowNumber + 1) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_Event_"] = str( - targetValue - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_TargetName_" - ] = targetName - if j != 0: - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_Depth_"] = str( - assessLift["_Depth_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_Value_"] = str( - assessLift["_Value_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_NObs_"] = str( - assessLift["_NObs_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_NEvents_"] = str( - assessLift["_NEvents_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_NEventsBest_" - ] = str(assessLift["_NEventsBest_"][j - 1]) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_Resp_"] = str( - assessLift["_Resp_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_RespBest_"] = str( - assessLift["_RespBest_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_Lift_"] = str( - assessLift["_Lift_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_LiftBest_"] = str( - assessLift["_LiftBest_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_CumResp_"] = str( - assessLift["_CumResp_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_CumRespBest_" - ] = str(assessLift["_CumRespBest_"][j - 1]) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_CumLift_"] = str( - assessLift["_CumLift_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_CumLiftBest_" - ] = str(assessLift["_CumLiftBest_"][j - 1]) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_PctResp_"] = str( - assessLift["_PctResp_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_PctRespBest_" - ] = str(assessLift["_PctRespBest_"][j - 1]) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_CumPctResp_" - ] = str(assessLift["_CumPctResp_"][j - 1]) - nullJSONLiftDict["data"][rowNumber]["dataMap"][ - "_CumPctRespBest_" - ] = str(assessLift["_CumPctRespBest_"][j - 1]) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_Gain_"] = str( - assessLift["_Gain_"][j - 1] - ) - nullJSONLiftDict["data"][rowNumber]["dataMap"]["_GainBest_"] = str( - assessLift["_GainBest_"][j - 1] - ) - - # If not all partitions are present, clean up the dicts for compliant formatting - if len(dataPartitionExists) < 3: - # Remove missing partitions from ROC and Lift dicts - for index, row in reversed(list(enumerate(nullJSONLiftDict["data"]))): - if int(row["rowNumber"]) in nullLiftRow: - nullJSONLiftDict["data"].pop(index) - for index, row in reversed(list(enumerate(nullJSONROCDict["data"]))): - if int(row["rowNumber"]) in nullROCRow: - nullJSONROCDict["data"].pop(index) - - # Reassign the row number values to match what is left in each dict - for i, _ in enumerate(nullJSONLiftDict["data"]): - nullJSONLiftDict["data"][i]["rowNumber"] = i + 1 - for i, _ in enumerate(nullJSONROCDict["data"]): - nullJSONROCDict["data"][i]["rowNumber"] = i + 1 - - with open(Path(jPath) / ROC, "w") as jFile: - json.dump(nullJSONROCDict, jFile, indent=4) - if cls.notebook_output: - print(f"{ROC} was successfully written and saved to {Path(jPath) / ROC}") - - with open(Path(jPath) / LIFT, "w") as jFile: - json.dump(nullJSONLiftDict, jFile, indent=4) - if cls.notebook_output: - print(f"{LIFT} was successfully written and saved to {Path(jPath) / LIFT}") - @staticmethod def read_json_file(path: Union[str, Path]) -> Any: """