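"""Frequency-severity GLM baseline for claim cost modelling.

A Poisson GLM (with a log-exposure offset) models claim frequency and a Gamma
GLM models claim severity; their product gives an expected claim cost per policy.
"""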
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
import pandas as pd
import torch
from data_load import load_data


def preprocess_data_glm(data_full):
    """
    Preprocess the data for the GLM model.
    :param data_full: the full dataset
    :return: the preprocessed dataset
    """
    # Map Area to continuous values
    area_mapping = {"'A'": 1, "'B'": 2, "'C'": 3, "'D'": 4, "'E'": 5, "'F'": 6}
    data_full['Area'] = data_full['Area'].map(area_mapping)
    # Merge VehPower groups
    # data_full['VehPower'] = data_full['VehPower'].apply(lambda x: x if x < 9 else 9)
    # Create categorical classes for VehAge
    # data_full['VehAge'] = pd.cut(data_full['VehAge'], bins=[np.min(data_full['VehAge'])-1, 1, 10, np.inf],
    #                              labels=[0, 1, 2])
    # Create categorical classes for DrivAge
    # data_full['DrivAge'] = pd.cut(data_full['DrivAge'], bins=[np.min(data_full['DrivAge'])-1, 21, 26, 31, 41, 51, 71, np.inf],
    #                               labels=[0, 1, 2, 3, 4, 5, 6])
    # Standardize numerical variables to zero mean and unit variance
    data_full['VehPower'] = (data_full['VehPower'] - data_full['VehPower'].mean()) / data_full['VehPower'].std()
    data_full['VehAge'] = (data_full['VehAge'] - data_full['VehAge'].mean()) / data_full['VehAge'].std()
    data_full['DrivAge'] = (data_full['DrivAge'] - data_full['DrivAge'].mean()) / data_full['DrivAge'].std()
    # Cap BonusMalus at 150
    data_full['BonusMalus'] = data_full['BonusMalus'].clip(upper=150)
    # Log-transform Density to reduce its skewness
    data_full['Density'] = np.log1p(data_full['Density'])
    return data_full
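
# Illustrative example: a raw row with Area="'C'", BonusMalus=200 and Density=1000
# comes out with Area=3, BonusMalus capped at 150, Density = log1p(1000) ≈ 6.91,
# and VehPower/VehAge/DrivAge standardized.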


def GLM_model(data_):
    """
    Fit GLM models to the data.
    Frequency and severity are modelled separately to handle the zero inflation
    in the claim amounts.
    :param data_: the full dataset
    :return: predictions per policy, the frequency model, the severity model
    """
    data_full = data_.copy()
    # Drop rows with non-positive exposure so the log-offset is defined
    data_full = data_full[data_full['Exposure'] > 0]
    # For frequency, we use a Poisson GLM, as the target variable is a count;
    # log(Exposure) enters as an offset so the model fits claims per unit exposure
    model_freq = smf.glm(
        'ClaimNb ~ DrivAge + VehPower + VehAge + BonusMalus + VehBrand + VehGas + Region + Area + Density',
        # '+ (DrivAge^2) + (DrivAge^3) + (VehPower^2) + (VehPower^3) + (VehAge^2) + (VehAge^3)',
        data=data_full, family=sm.families.Poisson(),
        offset=np.log(data_full['Exposure'])
    ).fit()
    print(model_freq.summary())
    # Normalize the claim amount by the exposure and log-transform it to counteract the skewness
    data_full['NormalizedClaimAmount'] = data_full['ClaimAmount'] / data_full['Exposure']
    data_full['LogClaimAmount'] = np.log1p(data_full['NormalizedClaimAmount'])
    # We only model the severity for the rows with ClaimAmount > 0
    severity_data = data_full[data_full['ClaimAmount'] > 0]
    # For severity, we use a Gamma GLM, as the target variable is continuous and positive
    model_sev = smf.glm(
        'LogClaimAmount ~ DrivAge + VehPower + VehAge + BonusMalus + VehBrand + VehGas + Region + Area + Density',
        data=severity_data, family=sm.families.Gamma()
    ).fit()
    print(model_sev.summary())
    # Frequency prediction for the entire dataset; passing the data explicitly
    # uses a zero offset, so this is the claim frequency per unit exposure
    freq_pred = model_freq.predict(data_full)
    # Severity prediction for all rows, mapped back to the original scale
    sev_pred = np.expm1(model_sev.predict(data_full))
    # Expected claim cost = frequency per unit exposure * severity * exposure
    data_full['ExpectedClaimCost'] = freq_pred * sev_pred * data_full['Exposure']
    return data_full[['IDpol', 'NormalizedClaimAmount', 'ExpectedClaimCost']], model_freq, model_sev


def evaluate_glm(glm_freq, glm_sev, test_data):
    """
    Evaluate the model on the test data.
    :param glm_freq: the frequency model
    :param glm_sev: the severity model
    :param test_data: the test data
    :return: metric values
    """
    # Expected claim cost per policy: frequency * severity * exposure
    prediction = glm_freq.predict(test_data) * np.expm1(glm_sev.predict(test_data)) * test_data['Exposure']
    pred = torch.Tensor(np.array(prediction))
    target = torch.Tensor(np.array(test_data['ClaimAmount']))
    MAE = torch.mean(torch.abs(pred - target)).item()
    RMSE = torch.sqrt(torch.mean((pred - target) ** 2)).item()
    print(f"GLM MAE: {MAE}")
    print(f"GLM RMSE: {RMSE}")
    return {'MAE': MAE, 'RMSE': RMSE}
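
# Note: MAE and RMSE are computed on the raw ClaimAmount scale, so a handful of
# very large claims can dominate both metrics, RMSE especially.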


def train_glm():
    """
    Load and preprocess the data, then fit the GLMs on a train split.
    :return: predictions, frequency model, severity model, test data
    """
    data = load_data()
    data = preprocess_data_glm(data)
    # Split data into 80% train / 20% test
    data_train = data.sample(frac=0.8, random_state=0)
    data_test = data.drop(data_train.index)
    glm_pred, glm_freq, glm_sev = GLM_model(data_train)
    return glm_pred, glm_freq, glm_sev, data_test


if __name__ == '__main__':
    glm_pred, glm_freq, glm_sev, data_test = train_glm()
    metrics_glm = evaluate_glm(glm_freq, glm_sev, data_test)
    print(metrics_glm)
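
    # Illustrative sketch of scoring a single policy with the fitted models
    # (any new row must carry the same preprocessed columns and category levels
    # seen in training; here we reuse a test row rather than invent values):
    # new_policy = data_test.head(1)
    # pure_premium = (glm_freq.predict(new_policy)
    #                 * np.expm1(glm_sev.predict(new_policy))
    #                 * new_policy['Exposure'])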