-
Notifications
You must be signed in to change notification settings - Fork 4
/
modeling_forecast.py
115 lines (93 loc) · 4.34 KB
/
modeling_forecast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# modeling_forecast.py
from __future__ import print_function
import datetime
import numpy as np
import pandas as pd
import sklearn
from pandas_datareader import data
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
def create_lagged_series(symbol, start_date, end_date, lags=5):
    """
    Build a DataFrame of daily percentage returns plus lagged returns.

    Adjusted closing prices and volume for `symbol` are requested through
    pandas-datareader's 'yahoo' source. The download starts 365 days
    before `start_date` so that the lagged columns are already populated
    for the first row that is kept.

    Parameters:
        symbol: ticker passed straight to data.DataReader (e.g. '^GSPC').
        start_date: datetime; rows with an index earlier than this are
            dropped from the result.
        end_date: datetime; last date requested from the data source.
        lags: number of lagged-return columns to create (default 5).

    Returns:
        A DataFrame indexed like the downloaded data (restricted to
        index >= start_date) with the columns, in order:
        'Volume', 'Today', 'Lag1' .. 'Lag<lags>', 'Direction'.
        'Today' is the percentage change of 'Adj Close', 'LagN' is the
        same return shifted back N rows, and 'Direction' is the sign of
        'Today'.
    """
    # Obtain stock information from Yahoo Finance.
    # NOTE(review): whether the 'yahoo' reader still works depends on the
    # installed pandas-datareader release - confirm before relying on it.
    ts = data.DataReader(
        symbol,
        'yahoo',
        start_date - datetime.timedelta(days=365),
        end_date,
    )
    adj_close = ts['Adj Close']

    # Create the returns DataFrame
    tsret = pd.DataFrame(index=ts.index)
    tsret['Volume'] = ts['Volume']
    tsret['Today'] = adj_close.pct_change() * 100.0

    # If any of the values of percentage returns are (almost) zero, set
    # them to a small number (stops issues with the QDA model in
    # Scikit-Learn). A single .loc assignment is used instead of
    # `tsret['Today'][i] = ...`: the chained form writes through a
    # temporary object and is not guaranteed to update `tsret`.
    # NaN returns compare False and are left untouched.
    near_zero = tsret['Today'].abs() < 0.0001
    tsret.loc[near_zero, 'Today'] = 0.0001

    # Create the lagged percentage returns columns: the return of the
    # price series shifted back by 1..lags rows. Only 'Today' is clamped
    # above; the lag columns keep their raw values.
    for lag in range(1, lags + 1):
        tsret['Lag%d' % lag] = adj_close.shift(lag).pct_change() * 100.0

    # Create the "Direction" column (+1 or -1) indicating an up/down day
    tsret['Direction'] = np.sign(tsret['Today'])

    # Drop the extra year of history that was only needed for the lags
    return tsret[tsret.index >= start_date]
if __name__ == "__main__":
    # Script entry point: fit several classifiers on lagged S&P500
    # returns and print each model's hit rate and confusion matrix on
    # the held-out 2021 data.

    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
        '^GSPC',
        datetime.datetime(2011, 1, 1),
        datetime.datetime(2021, 12, 31),
        lags=5,
    )

    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[['Lag1', 'Lag2']]
    y = snpret['Direction']

    # The data is split into two parts: before and after 1st Jan 2021.
    start_test = datetime.datetime(2021, 1, 1)

    # Create training and test sets
    in_train = X.index < start_test
    X_train, X_test = X[in_train], X[~in_train]
    y_train, y_test = y[in_train], y[~in_train]

    # Create the (parametrised) models
    print('Hit Rates/Confusion Matrices:\n')
    models = [
        ('LR', LogisticRegression()),
        ('LDA', LDA()),
        ('QDA', QDA()),
        ('LSVC', LinearSVC()),
        ('RSVM', SVC(C=1000000.0,
                     cache_size=200,
                     class_weight=None,
                     coef0=0.0, degree=3,
                     gamma=0.0001, kernel='rbf',
                     max_iter=-1, probability=False,
                     random_state=None,
                     shrinking=True, tol=0.001, verbose=False)),
        # max_features='sqrt' replaces the former 'auto' spelling, which
        # meant the same thing for classifiers but is no longer accepted
        # by current scikit-learn releases.
        ('RF', RandomForestClassifier(n_estimators=1000, criterion='gini',
                                      max_depth=None, min_samples_split=2,
                                      min_samples_leaf=1,
                                      max_features='sqrt',
                                      bootstrap=True, oob_score=False,
                                      n_jobs=1, random_state=None,
                                      verbose=0)),
    ]

    # Iterate through the models
    for name, model in models:
        # Train each of the models on the training set
        model.fit(X_train, y_train)

        # Make an array of predictions on the test set
        pred = model.predict(X_test)

        # Hit rate = fraction of correct predictions. Computed from
        # `pred` rather than model.score() so the test set is not
        # predicted a second time.
        hit_rate = np.mean(pred == y_test.values)

        # Output the hit-rate and the confusion matrix for each model.
        # confusion_matrix expects (y_true, y_pred): rows are the actual
        # directions, columns the predicted ones.
        print('%s:\n%0.3f' % (name, hit_rate))
        print('%s\n' % confusion_matrix(y_test, pred))