## single_gvkey_model_outcomes_compiler.py ##
## import packages ##
import warnings
import sys
if not sys.warnoptions:
    warnings.filterwarnings("once")
## graphing packages ##
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
## date working / printing packages
import datetime
from datetime import date, timedelta, datetime
from tabulate import tabulate
## stats/math/econometrics packages
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats, integrate, optimize
from statsmodels.tsa.vector_ar import vecm
from statsmodels.tsa.stattools import adfuller, zivot_andrews
import statsmodels.formula.api as smf #VAR package contained within
import statsmodels.tsa.api as smt
from arch.unitroot import DFGLS, ADF, ZivotAndrews, VarianceRatio
## econometrics warning utilities
from statsmodels.tools.sm_exceptions import ValueWarning
from arch.utility.exceptions import (
    InfeasibleTestException,
    InvalidLengthWarning,
    invalid_length_doc)
warnings.filterwarnings("once", category=ValueWarning)
class color:
    # a simplified way to easily call different style elements for displaying outputs
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    GREEN = '\033[92m'
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    END = '\033[0m'
def single_series_model_outcomes_and_summary_statistics(gvkey_selection, dataframe, data_series, graphs='yes',
                                                        unit_root_tests='yes', vr_ratio='yes',
                                                        za_test='yes', vecm_model='yes', **kwargs):
    """
    Returns all the key statistical tests and regression models conducted on the entire data sample in the main
    project, re-run with identical test specifications but restricted to a single specified GVKEY.

    Parameters
    ----------
    gvkey_selection : int
        The selected GVKEY (unique group identifier).
    dataframe : pandas.DataFrame
        The DataFrame containing the relevant GVKEY's CDS and PECDS data series.
    data_series : str
        The column name of the primary data series to display summary statistics for.
    graphs : str {'yes', 'no'}, optional
        If 'yes', displays the data series with known breakpoints in graphical form.
    unit_root_tests : str {'yes', 'no'}, optional
        If 'yes', returns the ADF and DFGLS unit root test results on the selected [data_series].
    vr_ratio : str {'yes', 'no'}, optional
        If 'yes', returns the Variance Ratio test on the selected [data_series].
    vr_ratio_increment : float, optional
        The size of the Variance Ratio increments to conduct the joint variance tests on. Default is 2.5%.
    za_test : str {'yes', 'no'}, optional
        If 'yes', returns the results of the Zivot-Andrews breakpoint test.
    vecm_model : str {'yes', 'no'}, optional
        If 'yes', returns the results of the Vector Error Correction Model and the Granger causality tests within
        the VECM framework. Associated price discovery leadership measures (GG and Hasbrouck) are also included.
    """
    data_series = str(data_series)
    ### group/gvkey general identifiers
    gvkey_group_df = dataframe.loc[dataframe.gvkey==gvkey_selection]
    print(color.BOLD + '\n', "--"*38, '\n', "--"*38, '\n', color.OKBLUE + "//"*10,
          "<< SUMMARY MODEL OUTCOMES VIEWER >> ", "//"*9, color.END, '\n', "--"*38, '\n', "--"*38, '\n')
    ## print general overview items for the selected GVKEY
    print(tabulate([['COMPANY NAME', gvkey_group_df.company_name.iloc[0]],
                    ['COMPANY SECTOR', gvkey_group_df.sector.iloc[0]],
                    ['BOND ID', gvkey_group_df.bond_sym_id.iloc[0]]],
                   headers=[color.BOLD + 'SELECTED GVKEY: ', str(gvkey_selection) + color.END],
                   tablefmt='pretty', numalign='center'))
    ### extract just the relevant date elements from the trade_date timestamp
    time_min_string = str(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection]['trade_date'].min())
    time_max_string = str(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection]['trade_date'].max())
    time_min_str_strip = datetime.strptime(time_min_string, '%Y-%m-%d %H:%M:%S')
    time_max_str_strip = datetime.strptime(time_max_string, '%Y-%m-%d %H:%M:%S')
    date_ymd_min = time_min_str_strip.strftime('%Y-%m-%d')
    date_ymd_max = time_max_str_strip.strftime('%Y-%m-%d')
    print(tabulate([['TOTAL OBS', gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].count()],
                    ['FIRST OBS', date_ymd_min],
                    ['LAST OBS', date_ymd_max],
                    ['TIME HORIZON (DAYS)', (gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection]['trade_date'].max() -
                                             gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection]['trade_date'].min()).days]],
                   headers=[color.BOLD + 'SELECTED SERIES: ', data_series.upper() + color.END],
                   tablefmt='pretty', numalign='center'))
    print(color.BOLD, "// SUMMARY SERIES STATISTICS // ", color.END)
    #### print general descriptive statistics pertaining to the selected series on the GVKEY
    ## 25%/75% quartiles are computed on the selected data_series
    quartiles = np.quantile(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series], [0.25, 0.75])
    print(tabulate([['MEAN', np.round(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].mean(), 2)],
                    ['MIN VALUE', gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].min()],
                    ['MAX VALUE', gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].max()],
                    ['VALUE RANGE', np.ptp(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series])],
                    ['25% QUANTILE', quartiles[0]],
                    ['75% QUANTILE', quartiles[1]],
                    ['INT-Q RANGE', (quartiles[1] - quartiles[0])],
                    ['STD DEVIATION', np.round(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].std(), 2)],
                    ['NORMALISED STDDEV', np.round(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].std() /
                                                   gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].mean(), 2)],
                    ['SKEW', np.round(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].skew(), 2)],
                    ['KURTOSIS', np.round(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series].kurtosis(), 2)]],
                   headers=[color.BOLD + 'DATA ITEM', 'VALUE' + color.END],
                   tablefmt='pretty', numalign='center'))
    ## overlay skew/kurtosis of the selected series with a KDE and a fitted normal density
    if graphs=='yes':
        try:
            fig_cds_skew, ax_cds_skew = plt.subplots(figsize=(6, 4))
            sns.distplot(gvkey_group_df[data_series], kde=True, fit=stats.norm, ax=ax_cds_skew)
            plt.title("{} SKEW AND KURTOSIS OVERLAID BY ~N(0,1) KDE".format(
                data_series.upper()), loc='center', fontsize=10)
            plt.show()
        except Exception:
            print('\n', color.BOLD + color.RED, "Error Fitting KDE graph to CDS series", color.END, '\n')
    ##### UNIT ROOT TESTS #####
    ##### ADF TEST
    if unit_root_tests=='yes':
        try:
            print('\n', '--'*38, '\n', color.BOLD + "UNIT ROOT TESTS" + color.END)
            ADF_model = ADF(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series], lags=10)
            dfgls_model = DFGLS(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series],
                                max_lags=10, method='BIC')
            if ADF_model.stat <= ADF_model.critical_values['5%']:
                adf_model_outcome = ADF_model.alternative_hypothesis
            else:
                adf_model_outcome = ADF_model.null_hypothesis
            if dfgls_model.stat <= dfgls_model.critical_values['5%']:
                dfgls_model_outcome = dfgls_model.alternative_hypothesis
            else:
                dfgls_model_outcome = dfgls_model.null_hypothesis
            print(tabulate([['ADF TEST', np.round(ADF_model.stat, 4),
                             np.round(ADF_model.critical_values['5%'], 4), adf_model_outcome],
                            ['DFGLS TEST', np.round(dfgls_model.stat, 4),
                             np.round(dfgls_model.critical_values['5%'], 4), dfgls_model_outcome]],
                           headers=[color.BOLD + 'TEST TYPE', 'TEST STAT', 'TEST 5% CV', 'TEST OUTCOME' + color.END],
                           tablefmt='pretty', numalign='center'))
        except IndexError as unitroot_ie:
            print("DFGLS/ADF MODEL ERROR //", unitroot_ie.args)
    else:
        print("UNIT ROOT TESTING NOT SELECTED")
    ## VARIANCE RATIO
    if vr_ratio=='yes':
        try:
            print('\n', color.BOLD + "--"*38 + '\n', "VARIANCE RATIO" + color.END)
            ## check if the optional parameter is passed in for the VR ratio increment
            vr_ratio_increment = kwargs.get('vr_ratio_increment')
            ## if no optional argument is passed in for the Variance Ratio test, default to 2.5% (40 equal increments)
            if vr_ratio_increment is None:
                lag_length_vr_test = 0.025
            else:
                lag_length_vr_test = vr_ratio_increment
            ## take the floor (largest integer no greater than the increment/lag-size length) & run the model instance
            variance_ratio_increment_size = np.floor(
                lag_length_vr_test * len(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series]))
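            ## illustrative arithmetic: with, say, ~400 daily observations and the 2.5% default,
            ## the lag becomes floor(0.025 * 400) = 10 trading days; a larger vr_ratio_increment lengthens this horizon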
            variance_ratio_model = VarianceRatio(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series],
                                                 trend='c', robust=True, lags=int(variance_ratio_increment_size))
            ## tabulate results of the variance ratio from the above specified test parameters:
            print(tabulate([['VARIANCE RATIO', np.round(variance_ratio_model.vr, 2)],
                            ['VR TEST STAT', np.round(variance_ratio_model.stat, 2)],
                            ['VR P-VALUE', np.round(variance_ratio_model.pvalue, 4)],
                            ['VR LAGS USED', np.round(variance_ratio_model.lags, 2)]],
                           tablefmt='pretty', numalign='center'))
            ## the VR test statistic is two-sided, so conclude from the p-value at the 5% level
            if variance_ratio_model.pvalue <= 0.05:
                print(color.BOLD + "VR TEST-STATISTIC CONCLUSION:", color.END + "{}".format(
                    variance_ratio_model.alternative_hypothesis), '\n', "--"*38)
            else:
                print(color.BOLD + "VR TEST-STATISTIC CONCLUSION: ", color.END + "{}".format(
                    variance_ratio_model.null_hypothesis), '\n', "--"*38)
        except RuntimeWarning as vr_rtw:
            print(vr_rtw.args)
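        ## interpretation sketch (general Lo-MacKinlay reading, not specific to this sample): a variance ratio
        ## near 1 is consistent with a random walk, below 1 with mean reversion, and above 1 with positive
        ## serial correlation / trending behaviour in the selected series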
    #### ZA BREAKPOINT TEST ####
    if za_test=='yes':
        print('\n', color.BOLD + "--"*38 + '\n', "ZIVOT ANDREWS BREAKPOINT TEST" + color.END)
        try:
            statsmodels_za = zivot_andrews(gvkey_group_df.loc[gvkey_group_df.gvkey==gvkey_selection][data_series],
                                           trim=0.2, maxlag=10, regression='c', autolag='t-stat')
            ## zivot_andrews returns (zastat, pvalue, cvdict, baselag, bpidx)
            ## assign an outcome to the ZA test based off the test statistic
            if statsmodels_za[0] <= statsmodels_za[2]['5%']:
                za_outcome_str = 'process is trend/break stationary'
            else:
                za_outcome_str = 'process is a unit root (ZA null not rejected)'
            print(tabulate([['ZA TEST STAT', np.round(statsmodels_za[0], 2)],
                            ['5% ZA CRIT VALUE ', np.round(statsmodels_za[2]['5%'], 4)],
                            ['ZA PVALUE ', np.round(statsmodels_za[1], 4)],
                            ['ZA OUTCOME ', za_outcome_str]],
                           headers=[color.BOLD + 'ZA TEST ITEM', 'VALUE' + color.END],
                           tablefmt='pretty', numalign='center'))
            #### determine the real date associated with the breakpoint ####
            #### [this occurs where the ADF unit root t-stat is minimised (most negative)]
            ## define the date series from the original DF:
            date_series = gvkey_group_df['trade_date']
            ## find the corresponding date to the ZA breakpoint
            index_location_za_bpoint = statsmodels_za[4]
            date_location_za = str(date_series.iloc[index_location_za_bpoint])
            date_location_za_strp = datetime.strptime(date_location_za, '%Y-%m-%d %H:%M:%S')
            date_location_za_clean = date_location_za_strp.strftime('%Y-%m-%d')
            print(color.BOLD + "CALCULATED BREAKPOINT DATE: " + color.END + "{}".format(date_location_za_clean), '\n', "--"*38, '\n')
        except (InfeasibleTestException, UnboundLocalError) as za_model_errors:
            print("Unable to calculate Zivot-Andrews Breakpoint Test - Reason: {}".format(za_model_errors.args))
    else:
        print('\n', "ZIVOT ANDREWS BREAKPOINT NOT CALCULATED. THIS MAY AFFECT PROPER CONSTRUCTION OF LINE-GRAPHS", '\n')
    ##### VECM ITEMS & PRICE DISCOVERY MEASURES #####
    if vecm_model=='yes':
        try:
            print('\n', "--"*38, '\n', color.BOLD + "VECM MODEL & PRICE DISCOVERY MEASURES", color.END)
            ## VECM MODEL CONSTRUCTION -- only valid on the two series: CDS + PECDS
            vecm_data_df = pd.DataFrame(data=(gvkey_group_df['market_cds_spread'], gvkey_group_df['pecds'])).T
            optimal_bic_lags = vecm.select_order(vecm_data_df, maxlags=10)
            vecm_cointegration_rank = vecm.select_coint_rank(vecm_data_df, det_order=1, k_ar_diff=optimal_bic_lags.bic,
                                                             method='trace', signif=0.05)
            ## limit max cointegrating relationships to one as it's a two-variable system
            if vecm_cointegration_rank.rank >= 1:
                cointegrating_rels = int(1)
            else:
                cointegrating_rels = int(0)
            print(color.BOLD + " COINTEGRATION RANK: " + color.END, "{}".format(cointegrating_rels),
                  '\n', '--'*38)
            vecm_model_fitted = vecm.VECM(vecm_data_df, k_ar_diff=optimal_bic_lags.bic, coint_rank=cointegrating_rels,
                                          deterministic='co').fit()
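            ## reference for the price discovery measures computed below (standard two-market forms, stated here
            ## for context; see Gonzalo-Granger and Hasbrouck for the underlying derivations):
            ##   - GG component share for the CDS market: alpha_2 / (alpha_2 - alpha_1), where alpha_1 and alpha_2
            ##     are the error-correction (adjustment) coefficients of EQ(1) (CDS) and EQ(2) (PECDS)
            ##   - the Hasbrouck information shares use the Cholesky factor of the VECM residual covariance
            ##     (sigma_u), producing the lower/upper bounds labelled HAS LOWER / HAS UPPER in the table below,
            ##     with HAS MID reported as their midpoint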
            cholesky_decomp = np.linalg.cholesky(vecm_model_fitted.sigma_u)
            sig_1 = cholesky_decomp[0, 0]
            sig_12 = cholesky_decomp[1, 0]
            sig_2 = cholesky_decomp[1, 1]
            x_1 = vecm_model_fitted.alpha[0, 0]
            x_2 = vecm_model_fitted.alpha[1, 0]
            ## Gonzalo-Granger component share for the CDS market: alpha_2 / (alpha_2 - alpha_1)
            gg_val = np.round(x_2 / (x_2 - x_1), 2)
            HAS_1 = np.round((x_2**2 * (sig_1**2 - sig_12**2 / sig_2**2)) / (x_2**2 * sig_1**2 - 2*x_1*x_2*sig_12 + x_1**2 * sig_2**2), 4)
            HAS_2 = np.round(((x_2 * sig_1 - x_1 * sig_12 / sig_1)**2) / (x_2**2 * sig_1**2 - 2*x_1*x_2*sig_12 + x_1**2 * sig_2**2), 4)
            HAS_mid = np.round((HAS_2 + HAS_1) / 2, 4)
            print(tabulate([['GG VALUE', gg_val],
                            ['HAS LOWER', HAS_1],
                            ['HAS MID', HAS_mid],
                            ['HAS UPPER', HAS_2]],
                           headers=[color.BOLD + 'PRICE DISCOVERY', 'VALUE' + color.END], tablefmt='pretty', numalign='center'),
                  '\n', color.GREEN, "[ VALUES > 0.5 FAVOUR CDS |&| VALUES < 0.5 FAVOUR BONDS ]", color.END)
            print(tabulate([['EQ(1) | Alpha(1) ', np.round(vecm_model_fitted.alpha[0, 0], 5), np.round(vecm_model_fitted.pvalues_alpha[0, 0], 4)],
                            ['EQ(2) | Alpha(1)', np.round(vecm_model_fitted.alpha[1, 0], 5), np.round(vecm_model_fitted.pvalues_alpha[1, 0], 4)]],
                           headers=[color.BOLD + 'VECM MODEL', 'TEST STAT', 'P-VALUE' + color.END],
                           tablefmt='pretty', numalign='center'))
            print('\n', color.BOLD + "// GRANGER CAUSALITY TESTS // " + color.END)
            granger_pecds_to_cds = vecm_model_fitted.test_granger_causality(caused=0)
            granger_cds_to_pecds = vecm_model_fitted.test_granger_causality(caused=1)
            print(tabulate([['PECDS CAUSES CDS', np.round(granger_pecds_to_cds.test_statistic, 4),
                             granger_pecds_to_cds.conclusion_str[11:]],
                            ['CDS CAUSES PECDS', np.round(granger_cds_to_pecds.test_statistic, 4),
                             granger_cds_to_pecds.conclusion_str[11:]]],
                           headers=[color.BOLD + 'GRANGER TEST', 'TEST STAT',
                                    'TEST CONCLUSION' + color.END], tablefmt='pretty', numalign='center'))
        except ValueError as vecm_ve:
            print(vecm_ve.args)
        except InfeasibleTestException as vecm_ite:
            print(vecm_ite.args)
        except IndexError as vecm_ie:
            print('\n', "Most likely no significant cointegrating relationship was found", '\n', vecm_ie.args)
    ## <OPTIONAL> show line graphs for the CDS and PECDS series
    print("--"*38, '\n')
    if graphs=='yes':
        print(color.BOLD + "SELECTED VISUAL REPRESENTATIONS OF GVKEY [{}] |&| FOCUSED SERIES: [{}]".format(
            gvkey_selection, data_series.upper()) + color.END)
        ## define a new DF with trade_date as the index for plotting the ZA breakdate
        ## note: this graph calls the ZA breakdate from the DF (not from the above calculation)
        ## this allows the graphs to display the stored ZA breakdate while keeping the printed ZA results optional
        gvkey_group_df_timeind = gvkey_group_df.set_index(['trade_date'])
        za_bp_date_raw = str(gvkey_group_df.ZA_BREAKDATE.iloc[0])
        za_bp_date_strp = datetime.strptime(za_bp_date_raw, '%Y-%m-%d %H:%M:%S')
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        fig.suptitle('GVKEY [{}] CDS & PECDS LINE GRAPHS WITH ZA-BREAKPOINT'.format(gvkey_selection))
        ## PLOT CDS
        sns.lineplot(x=gvkey_group_df_timeind.index,
                     y='market_cds_spread',
                     data=gvkey_group_df_timeind,
                     palette='winter',
                     markers=True,
                     dashes=False,
                     legend='full',
                     ax=ax1)
        ## AFFIX THE ZIVOT-ANDREWS BREAKPOINT
        ax1.set_ylabel("DAILY CDS VALUES", fontsize=12)
        ax1.axvline(za_bp_date_strp, color='r', linestyle='--', label='ZA BREAKPOINT')
        ax1_labels = ['ZA BREAKPOINT']
        handles_1, _ = ax1.get_legend_handles_labels()
        ax1.legend(handles=handles_1, labels=ax1_labels)
        ## PLOT PECDS
        sns.lineplot(x=gvkey_group_df_timeind.index,
                     y=gvkey_group_df_timeind['pecds'],
                     data=gvkey_group_df_timeind,
                     palette='RdGy_r',
                     legend='full',
                     ax=ax2)
        ## AFFIX THE ZIVOT-ANDREWS BREAKPOINT
        ax2.set_ylabel("DAILY PECDS VALUES", fontsize=12)
        ax2.axvline(za_bp_date_strp, color='r', linestyle='--', label='ZA BREAKPOINT')
        ax2_labels = ['ZA BREAKPOINT']
        handles_2, _ = ax2.get_legend_handles_labels()
        ax2.legend(handles=handles_2, labels=ax2_labels)
        plt.show()
    else:
        print('\n', "GRAPHS SET TO NO-SHOW", '\n', "--"*38)
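

## illustrative usage sketch: how the compiler might be invoked for a single GVKEY. The file path and GVKEY value
## below are placeholders, and the input frame is assumed to already contain the columns referenced by the function
## (gvkey, trade_date, company_name, sector, bond_sym_id, market_cds_spread, pecds, ZA_BREAKDATE), with trade_date
## parsed to datetime.
if __name__ == "__main__":
    ## load the merged CDS/PECDS panel (placeholder path)
    merged_panel_df = pd.read_csv("merged_cds_pecds_panel.csv", parse_dates=['trade_date'])
    ## run the full battery of summary statistics, unit root / VR / ZA tests and the VECM for one GVKEY
    single_series_model_outcomes_and_summary_statistics(gvkey_selection=12345,
                                                        dataframe=merged_panel_df,
                                                        data_series='market_cds_spread',
                                                        graphs='yes',
                                                        unit_root_tests='yes',
                                                        vr_ratio='yes',
                                                        vr_ratio_increment=0.025,
                                                        za_test='yes',
                                                        vecm_model='yes')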