基本思想:將序列分解成季節(jié)、趨勢(shì)和殘差,然后對(duì)殘差進(jìn)行EMD分解,再將這些成分作為特征輸入XGBoost模型,并對(duì)各個(gè)成分進(jìn)行預(yù)測(cè),最后相加得到預(yù)測(cè)值。
emd代碼:經(jīng)驗(yàn)?zāi)B(tài)分解(EMD)實(shí)現(xiàn)
GLM:[文獻(xiàn)復(fù)現(xiàn)]中長(zhǎng)期神經(jīng)網(wǎng)絡(luò)概率預(yù)測(cè)模型
import warnings # 忽略警告
warnings.filterwarnings('ignore')
import datetime as dt
from EMD import emd
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
class GLM:
def __init__(self, df):
self.df = df
def getXY(self):
self.df['dayofyear'] = self.df.index.dayofyear
self.x = self.df.dayofyear.values.reshape(len(self.df), 1)
self.y = self.df.y.values.reshape(len(self.df), 1)
def getSX(self):
self.getXY()
sat = self.df['sat'].values.reshape(len(self.df), 1)
sun = self.df['sun'].values.reshape(len(self.df), 1)
w = 2 * np.pi / 365
x = self.x
x = np.hstack([x, np.sin(w * x), np.sin(2 * w * x), np.cos(w * x), np.cos(2 * w * x), sat, sun])
self.sx = x
def Trend(self, season):
'''trend = \beta_0+\beta_1*t'''
model = LinearRegression()
model.fit(self.x, self.y - season)
return model
def seasonality(self):
'''season = \sum_{k=1}^{2}[\beta_{1+k}sin(kwt)+\beta_{2+k}sin(kwt)]+
\beta_6D_{sat}(t)+\beta_7D_{sun}(t)+\beta_8D_{Hol}(t)'''
model = LinearRegression()
model.fit(self.sx, self.y)
return model
xgboost_params = {
'colsample_bynode': 0.8,
'learning_rate': 0.01,
'max_depth': 4,
'objective': 'reg:squarederror',
'subsample': 0.8,
'reg_alpha': 0.005,
'n_estimators': 200
}
class XGB:
def fit(self, X_train, y_train, n=None):
pipeline = [('SimpleImputer', SimpleImputer(missing_values=np.nan, strategy='mean'))
# ('scaler', StandardScaler()),
# ('pca', PCA(n_components=n))
]
multi_regressorr = MultiOutputRegressor(XGBRegressor(params=xgboost_params), 20)
pipeline.append(('regressor', multi_regressorr))
estimator = Pipeline(pipeline)
self.xgb = estimator.fit(X_train, y_train)
def predict(self, X_test):
y_pred = self.xgb.predict(X_test)
return y_pred
def error(self, y_test, y_pred):
error = mean_squared_error(y_test.values.flatten(), y_pred.flatten())
return error
class SaveAndLoadModel:
def __init__(self, path):
self.path = path
if not os.path.exists(self.path):
os.makedirs(self.path)
def save_model(self, model):
filename = 'model.pkl'
save_name = os.path.join(self.path, filename)
print('saving: -> exporting to %s' % save_name)
with open(save_name, 'wb') as handle:
pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
def load_model(self):
filename = 'model.pkl'
save_name = os.path.join(self.path, filename)
print('loading: -> exporting to %s' % save_name)
with open(save_name, 'rb') as handle:
model = pickle.load(handle)
return model
def main(df):
columns = ['y', 'season', 'trend', 'emd0', 'emd1', 'emd2', 'emd3', 'emd4']
df_train = df[(df.index > dt.datetime(2015, 1, 1)) & (df.index <= dt.datetime(2017, 1, 1))]
df_test = df[(df.index > dt.datetime(2017, 1, 1)) & (df.index <= dt.datetime(2018, 8, 1))]
df_y_train = df_train[columns].shift(-24)[:-24]
df_y_test = df_test[columns].shift(-24)[:-24]
X_train, X_test, = df_train.values[:-24], df_test.values[:-24]
y_train, y_test = df_y_train.values, df_y_test.values
xgb = XGB()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
error1 = xgb.error(df_y_test.y, y_pred[:, 0])
y_pred2 = y_pred[:, 1:].sum(axis=1)
error2 = xgb.error(df_y_test.y, y_pred2)
plt.figure()
plt.plot(y_pred[:, 0])
plt.plot(y_pred2)
plt.plot(df_y_test.y.values)
plt.legend(['y', 'emd', 'true'])
print(error1, error2)
# 807037.8726142674 2487790.09234746
# 直接預(yù)測(cè)比emd的誤差小
if __name__ == '__main__':
df = pd.read_csv('./AEP_hourly.csv', index_col=[0])
df.columns = ['y']
df.index = pd.to_datetime(df.index)
df['weekday'] = df.index.weekday
df['hour'] = df.index.hour
df['sat'] = (df['weekday'] == 5) * 1
df['sun'] = (df['weekday'] == 6) * 1
glm = GLM(df)
glm.getSX()
glm.getXY()
season_model = glm.seasonality()
season = season_model.predict(glm.sx)
trend_model = glm.Trend(season)
trend = trend_model.predict(glm.x)
residual = glm.y - season - trend
df['season'] = season
df['trend'] = trend
df['residual'] = residual
x = (glm.y - season - trend).flatten()
t = np.arange(0, len(x) * .001, .001)
imfs = emd(x, nIMF=5)
for i in range(len(imfs)):
df[f'emd{i}'] = imfs[i]
for col in ['season', 'trend', 'emd0', 'emd1', 'emd2', 'emd3', 'emd4']: # 'season',
for i in range(24):
df[f'{col}_lag{i}'] = df[col].shift(i)
df.dropna(inplace=True)
main(df)
結(jié)果如下所示:

