1 模型特征
特征的选择是非常个性化的过程
因子的选择取决于你对问题的理解
常见基础指标列举: 基础指标 常见经济指标 常见股指 常见汇率
常见技术指标列举(包含python实现)
2 模型建立
探究了决策树和随机森林之间的关系,并引出了对集成算法的讨论,内容较为简略,深度不够,不再赘述。
之后简单提及了一些模型和概念 逻辑回归 1_study/algorithm/支持向量机 SVM]] [[基础神经网络 损失函数
3 示例代码1:常用模型对比
# forecast.py
from __future__ import print_function
import datetime
import numpy as np
import pandas as pd
import sklearn
from pandas.io.data import DataReader
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import LDA
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import QDA
from sklearn.svm import LinearSVC, SVC
def create_lagged_series(symbol, start_date, end_date, lags=5):
"""
This creates a pandas DataFrame that stores the
percentage returns of the adjusted closing value of
a stock obtained from Yahoo Finance, along with a
number of lagged returns from the prior trading days
(lags defaults to 5 days). Trading volume, as well as
the Direction from the previous day, are also included.
"""
# Obtain stock information from Yahoo Finance
ts = DataReader(
symbol, "yahoo",
start_date-datetime.timedelta(days=365),
end_date
)
# Create the new lagged DataFrame
tslag = pd.DataFrame(index=ts.index)
tslag["Today"] = ts["Adj Close"]
tslag["Volume"] = ts["Volume"]
# Create the shifted lag series of prior trading period close values
for i in range(0, lags):
tslag["Lag%s" % str(i+1)] = ts["Adj Close"].shift(i+1)
# Create the returns DataFrame
tsret = pd.DataFrame(index=tslag.index)
tsret["Volume"] = tslag["Volume"]
tsret["Today"] = tslag["Today"].pct_change()*100.0
# If any of the values of percentage returns equal zero, set them to
# a small number (stops issues with QDA model in scikit-learn)
for i,x in enumerate(tsret["Today"]):
if (abs(x) < 0.0001):
tsret["Today"][i] = 0.0001
# Create the lagged percentage returns columns
for i in range(0, lags):
tsret["Lag%s" % str(i+1)] = \
tslag["Lag%s" % str(i+1)].pct_change()*100.0
# Create the "Direction" column (+1 or -1) indicating an up/down day
tsret["Direction"] = np.sign(tsret["Today"])
tsret = tsret[tsret.index >= start_date]
return tsret
if __name__ == "__main__":
# Create a lagged series of the S&P500 US stock market index
snpret = create_lagged_series(
"^GSPC", datetime.datetime(2001,1,10),
datetime.datetime(2005,12,31), lags=5
)
# Use the prior two days of returns as predictor
# values, with direction as the response
X = snpret[["Lag1","Lag2"]]
y = snpret["Direction"]
# The test data is split into two parts: Before and after 1st Jan 2005.
start_test = datetime.datetime(2005,1,1)
# Create training and test sets
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]
# Create the (parametrised) models
print("Hit Rates/Confusion Matrices:\n")
models = [("LR", LogisticRegression()),
("LDA", LDA()),
("QDA", QDA()),
("LSVC", LinearSVC()),
("RSVM", SVC(
C=1000000.0, cache_size=200, class_weight=None,
coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
),
("RF", RandomForestClassifier(
n_estimators=1000, criterion='gini',
max_depth=None, min_samples_split=2,
min_samples_leaf=1, max_features='auto',
bootstrap=True, oob_score=False, n_jobs=1,
random_state=None, verbose=0)
)]
# Iterate through the models
for m in models:
# Train each of the models on the training set
m[1].fit(X_train, y_train)
# Make an array of predictions on the test set
pred = m[1].predict(X_test)
# Output the hit-rate and the confusion matrix for each model
print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
print("%s\n" % confusion_matrix(pred, y_test))
4 示例代码2:grid_search调参
# grid_search.py
from __future__ import print_function
import datetime
import sklearn
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from forecast import create_lagged_series
if __name__ == "__main__":
# Create a lagged series of the S&P500 US stock market index
snpret = create_lagged_series(
"^GSPC", datetime.datetime(2001,1,10),
datetime.datetime(2005,12,31), lags=5
)
# Use the prior two days of returns as predictor
# values, with direction as the response
X = snpret[["Lag1","Lag2"]]
y = snpret["Direction"]
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.5, random_state=42
)
# Set the parameters by cross-validation
tuned_parameters = [
{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
]
# Perform the grid search on the tuned parameters
model = GridSearchCV(SVC(C=1), tuned_parameters, cv=10)
model.fit(X_train, y_train)
print("Optimised parameters found on training set:")
print(model.best_estimator_, "\n")
print("Grid scores calculated on training set:")
for params, mean_score, scores in model.grid_scores_:
print("%0.3f for %r" % (mean_score, params))