RollingRegression(滚动回归分析)之Python实现
2021-07-16 04:06
标签:取数 plt gen 转化 save parser sea gre rect # -*- coding: utf-8 -*- @author: acadsoc import pandas as pd plt.style.use('ggplot') # 设置ggplot2画图风格 # 根据不同平台设定工作目录 # 定义滚动多元回归分析类 # 从起始日开始做回归 # 按字典格式保存系数、pvalue、R2 # 系数字典转化为数据框,并按日期升序排序 # 系数pvalue转化为数据框,并按日期升序排序 # R2转化为数据框,并按日期升序排序 # 定义日期转换函数 for t, pvalue in zip(self.coef_pvalue_.index, self.coef_pvalue_[feature]): # p值图 #plt.xlabel('日期') line = Line('R2') # R2图 for i, feature in enumerate(self.coef_.columns): ol.add(line) self.page_.add(charts) ''' ============================================================================================================''' fs = featureSelection() fs.randomForestRandomSearch(rr.df_) fs.stepwise(rr.df_, response='续单数', criterion='aic', intercept=True, val_enter=0.0, rr.fit(fs.stepwise_feat_selected) RollingRegression(滚动回归分析)之Python实现 标签:取数 plt gen 转化 save parser sea gre rect 原文地址:https://www.cnblogs.com/lantingg/p/9535019.html
"""
Created on Sat Aug 18 11:08:38 2018
"""
import copy
import os
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from dateutil import parser
from pyecharts import Bar, Line, Page, Overlap
from sklearn.preprocessing import StandardScaler
# import pymssql

from featureSelection import featureSelection
# Use the ggplot look for every matplotlib figure. This statement survives only
# in the scraped page summary near the top of the file and is restored here.
plt.style.use('ggplot')

# Pick the Chinese font file per platform; it is used for the subplot titles,
# which are (Chinese) column names.
if sys.platform == 'linux':
    zh_font = matplotlib.font_manager.FontProperties(
        fname='path/anaconda3/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf/STZHONGS.TTF')
else:
    # Raw string: '\W', '\F' and '\S' are not valid escape sequences, so the
    # plain literal only worked by accident (and warns on recent Pythons).
    zh_font = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\STZHONGS.ttf')

# Set the working directory per platform; the Excel outputs and the rendered
# HTML page are written relative to it.
# NOTE(review): `path` is not defined anywhere in the visible file (the author
# appears to have redacted it), so both branches raise NameError until a real
# directory is assigned to `path` above this point.
if sys.platform == 'linux':
    os.chdir(path)  # Linux path
else:
    os.chdir(path)  # Windows path
class rollingRegression():
    """Rolling-window multiple linear regression on a date-indexed table.

    Typical use: ``getData`` loads and cleans the data into ``df_``; ``fit``
    runs one OLS regression per window of ``rolling_days`` consecutive rows and
    stores the coefficients, their p-values and the R2 values in ``coef_``,
    ``coef_pvalue_`` and ``r2_``; ``coefPlots`` / ``coefEcharts`` draw them.

    NOTE(review): this class was reconstructed from a scraped copy of the
    script in which indentation was lost, string quotes were replaced by
    typographic quotes and every ``<`` comparison was cut off. The restored
    comparisons are marked where they occur and should be confirmed against
    the author's original.
    """

    def __init__(self, target='新单数', date_begin='2018-01-01', date_end='2018-07-31', rolling_days=30,
                 const=False, p_value_threshold=.1, normalize=False):
        """Store the analysis settings and validate the date range.

        Parameters
        ----------
        target : name of the dependent-variable column.
        date_begin, date_end : date strings bounding the rows used by
            ``rollingOLS``.
        rolling_days : number of consecutive rows in each regression window.
        const : forwarded to ``sm.OLS`` as ``hasconst``; forced to ``False``
            when ``normalize`` is true.
        p_value_threshold : coefficients whose p-value is below this value are
            highlighted in the plots.
        normalize : standardize all columns before regressing.

        Raises
        ------
        IOError
            If the span between ``date_begin`` and ``date_end`` is shorter than
            ``rolling_days`` days. (The exception type is kept as in the
            original so existing callers that catch it keep working.)
        """
        self.target = target
        self.date_begin = date_begin
        self.date_end = date_end
        self.rolling_days = rolling_days
        self.const = const
        self.p_value_threshold = p_value_threshold
        self.normalize = normalize
        if self.normalize:
            # Standardized data has zero mean, so the constant is forced off.
            self.const = False
        # The date span must be at least as long as the rolling window.
        if (parser.parse(self.date_end) - parser.parse(self.date_begin)).days < self.rolling_days:
            raise IOError('起始日期间隔必须大于等于滚动天数,请重新选择起始日期或者调整滚动日期。')

    def getData(self, file='业绩相关数据2018-8-1.xlsx', variabls_in=None, variables_out=None):
        """Load the Excel sheet and store the cleaned regression table in ``df_``.

        The first sheet column becomes the datetime index. The kept columns are
        the target followed by every column from position 6 onwards of the
        remaining frame. Missing values are replaced by 0 and everything is
        cast to float. With ``normalize=True`` all columns, the target
        included, are standardized with ``StandardScaler``.

        Parameters
        ----------
        file : path of the workbook, passed to ``pandas.read_excel``.
        variabls_in, variables_out : accepted but unused; kept (with the
            original spelling) so existing keyword calls do not break.

        Returns
        -------
        self, so calls can be chained; the table is in ``self.df_``.
        """
        df = pd.read_excel(file)
        df.index = df.iloc[:, 0]  # first column holds the dates
        df = df.iloc[:, 1:]
        # NOTE(review): the hard-coded offset 6 depends on the layout of the
        # author's workbook; if the target column sits at position >= 6 it
        # appears twice in the result.
        df = pd.concat([df[self.target], df.iloc[:, 6:]], axis=1)
        df = df.fillna(0)
        df = df.astype(float)  # object columns -> float
        df.index = pd.DatetimeIndex(df.index)
        if self.normalize:
            df_std = StandardScaler().fit_transform(df)
            self.df_ = pd.DataFrame(df_std, index=df.index, columns=df.columns)
        else:
            self.df_ = df
        return self

    def rollingOLS(self, df):
        """Fit one OLS model per rolling window and collect the results.

        ``df`` must be date-indexed with the dependent variable in column 0 and
        the regressors in the remaining columns. Window ``i`` covers rows
        ``i .. i + rolling_days - 1`` (after date filtering and sorting) and its
        results are labelled with the date of row ``i + rolling_days``, i.e. the
        first row after the window.

        Returns
        -------
        (coef, coef_pvalue, r2) : three DataFrames indexed by date in ascending
            order. ``r2`` has the columns ``R_squred`` and ``R_squred_adj``
            (spelling kept because the names are written to the Excel output).
            All three are empty when fewer than ``rolling_days + 1`` rows fall
            inside the date range.
        """
        # NOTE(review): the upper-bound comparison was truncated in the
        # recovered source; an inclusive end date is assumed here.
        df = df.loc[(df.index >= self.date_begin) & (df.index <= self.date_end), :]
        df = df.sort_index(ascending=True)

        coef = {}
        coef_pvalue = {}
        r2 = {}
        for i in range(df.shape[0] - self.rolling_days):
            date = df.index[i + self.rolling_days]
            data = df.iloc[i:i + self.rolling_days, :]
            X = data.iloc[:, 1:]
            y = data.iloc[:, 0]
            # NOTE(review): per the statsmodels docs `hasconst` only declares
            # whether X already contains a constant column; it does not add
            # one. With const=True no intercept is estimated unless the data
            # itself carries a constant column. Left unchanged on purpose.
            model = sm.OLS(y, X, hasconst=self.const)
            lr = model.fit()
            coef[date] = lr.params
            coef_pvalue[date] = lr.pvalues
            r2[date] = [lr.rsquared, lr.rsquared_adj]

        coef = pd.DataFrame.from_dict(coef, orient='index')
        coef = coef.sort_index(ascending=True)
        coef_pvalue = pd.DataFrame.from_dict(coef_pvalue, orient='index')
        coef_pvalue = coef_pvalue.sort_index(ascending=True)
        # Passing the names to from_dict (instead of assigning `.columns`
        # afterwards) keeps this working when no window was fitted.
        r2 = pd.DataFrame.from_dict(r2, orient='index', columns=['R_squred', 'R_squred_adj'])
        r2 = r2.sort_index(ascending=True)
        return coef, coef_pvalue, r2

    def _dateTransfer(self, date):
        """Parse a date string and return it formatted as ``YYYY-MM-DD``."""
        return parser.parse(date).strftime('%Y-%m-%d')

    def fit(self, feat_selected=None):
        """Run the rolling regression and save the result tables.

        Parameters
        ----------
        feat_selected : optional list of column names; when given, only the
            first column of ``df_`` (the dependent variable) plus these columns
            are used, otherwise all of ``df_``.

        Side effects: sets ``coef_``, ``coef_pvalue_`` and ``r2_`` and writes
        them to ``coef.xlsx``, ``coef_pvalue.xlsx`` and ``r2.xlsx`` in the
        current working directory.

        Returns
        -------
        self
        """
        if feat_selected is not None:
            df = pd.concat([self.df_.iloc[:, 0], self.df_[feat_selected]], axis=1)
        else:
            df = self.df_
        self.coef_, self.coef_pvalue_, self.r2_ = self.rollingOLS(df)
        self.coef_.to_excel('coef.xlsx')
        self.coef_pvalue_.to_excel('coef_pvalue.xlsx')
        self.r2_.to_excel('r2.xlsx')
        return self

    def coefPlots(self, width_subplot=12, height_subplot=5, columns_subplots=3):
        """Draw R2 and one coefficient subplot per regressor with matplotlib.

        Subplot 1 shows both R2 series; each following subplot shows one
        coefficient over time, with green vertical bands on the dates whose
        p-value is below ``p_value_threshold``. Requires ``fit`` to have run
        and uses the module-level ``zh_font`` for the titles.

        Parameters
        ----------
        width_subplot, height_subplot : size of a single subplot; the figure
            size is these values times the grid dimensions.
        columns_subplots : number of subplot columns in the grid.

        Returns
        -------
        self
        """
        num_subplots = self.coef_.shape[1] + 1  # one per regressor plus R2
        # Ceiling division: a partially filled last row still needs a row.
        rows_subplots, remainder = divmod(num_subplots, columns_subplots)
        if remainder:
            rows_subplots += 1
        width_figure = columns_subplots * width_subplot
        height_figure = rows_subplots * height_subplot

        plt.figure(figsize=(width_figure, height_figure))
        plt.subplot(rows_subplots, columns_subplots, 1)
        plt.plot(self.r2_['R_squred'], color='r', lw=3, label='R_squred')
        plt.plot(self.r2_['R_squred_adj'], color='g', lw=3, label='R_squred_adj')
        plt.title('R2')
        plt.legend()

        for i, feature in enumerate(self.coef_.columns):
            plt.subplot(rows_subplots, columns_subplots, i + 2)
            plt.plot(self.coef_[feature], color='red', lw=3, label='Beta')
            # NOTE(review): the comparison operator was truncated in the
            # recovered source; strict '<' is assumed.
            pvalues = self.coef_pvalue_[feature]
            significant_dates = pvalues.index[(pvalues < self.p_value_threshold).values]
            if len(significant_dates) > 0:
                # One call for all dates; the band spans the coefficient range.
                plt.vlines(significant_dates, ymin=np.min(self.coef_[feature]),
                           ymax=np.max(self.coef_[feature]),
                           color='green', alpha=.3, lw=5, label='p_value')
            # Label the y axis on the left-most column only. This is subplot
            # i + 2, so it is in column 0 when (i + 1) is a multiple of the
            # column count. The original test combined the remainder with
            # `& (i > 0)`, which labelled the wrong subplots for 3 columns.
            if (i + 1) % columns_subplots == 0:
                plt.ylabel('coef')
            plt.title(feature, fontproperties=zh_font)
        plt.show()
        return self

    def coefEcharts(self):
        """Render the same charts as an HTML page with pyecharts.

        Builds one ``Line`` chart for R2 and, per regressor, an ``Overlap`` of
        the coefficient line and bars marking the dates whose p-value is below
        ``p_value_threshold``. The page is stored in ``page_`` and rendered
        with ``Page.render()`` using its default output location.

        The original added two full-length bar series per significant date,
        which made the HTML very large (as its own comment noted). All
        significant dates of a feature now share one pair of series: the bars
        reach the coefficient minimum and maximum on those dates and are 0
        elsewhere.

        Returns
        -------
        self
        """
        self.page_ = Page(self.target + '回归分析')
        charts = []

        line = Line('R2')
        line.add('R_squred', self.r2_.index, self.r2_['R_squred'], is_more_utils=True)
        line.add('R_squred_adj', self.r2_.index, self.r2_['R_squred_adj'], is_more_utils=True)
        charts.append(line)

        for feature in self.coef_.columns:
            min_num = np.min(self.coef_[feature])
            max_num = np.max(self.coef_[feature])
            line = Line(feature)
            bar = Bar()
            ol = Overlap()
            line.add('coef', self.coef_.index, self.coef_[feature], is_more_utils=True)
            # Align the p-values with the coefficient index before masking.
            # NOTE(review): the comparison operator was truncated in the
            # recovered source; strict '<' is assumed.
            pvalues = self.coef_pvalue_[feature].reindex(self.coef_.index)
            significant = (pvalues < self.p_value_threshold).values
            if significant.any():
                bar.add('p-value', self.coef_.index, np.where(significant, min_num, 0.0))
                bar.add('p-value', self.coef_.index, np.where(significant, max_num, 0.0))
            ol.add(line)
            ol.add(bar)
            charts.append(ol)

        self.page_.add(charts)
        self.page_.render()
        return self
# Usage example. The bare attribute expressions below only display a value
# when the script is run cell by cell in an interactive session.
# NOTE(review): several lines of this section survive only in the scraped page
# summary near the top of the file (`fs = featureSelection()`, the
# `randomForestRandomSearch` call, the first half of the `stepwise` call and
# the second `rr.fit`); they are restored here in the order the summary lists
# them and should be confirmed against the author's original.
rr = rollingRegression(target='续单数')
rr.getData(file='D:/Matlab/achivement2018-8-1.xlsx')

fs = featureSelection()

# --- Feature selection with elastic net ---
fs.elasticNetFeatureSelectPlot(df=rr.df_, l1_ratio=.08,
                               plot_width=16, plot_height=8, xlim_exp=[-2, 2], ylim=[-.1, .1])
fs.elasticNetRandomSearch(df=rr.df_)
fs.elasticnet_rs_best
fs.elasticNet(rr.df_, alpha=.7, normalize=True)
fs.elasticnet_coef_
fs.elasticnet_R2_
fs.eln.coef_
fs.featureBarhPlot(fs.elasticnet_coef_)
fs.elasticnet_coef_selected_

# --- Feature selection with a random forest ---
fs.randomForestRandomSearch(rr.df_)
fs.rf_rs_best
fs.randomForest(rr.df_, n_estimators=139, max_features=6, impo_cum_threshold=.8)
fs.featureBarhPlot(fs.rf_feat_impo_)
fs.rf_feat_selected_

# Rolling regression on the random-forest selection.
rr.fit(fs.rf_feat_selected_)
rr.coefPlots(columns_subplots=2)

# --- Feature selection with stepwise regression ---
fs.stepwise(rr.df_, response='续单数', criterion='aic', intercept=True, val_enter=0.0,
            p_value_enter=.05, direction='both', show_step=True)

# Rolling regression on the stepwise selection.
# NOTE(review): the attribute name is taken verbatim from the page summary;
# the other result attributes end in an underscore, so this one may be
# `stepwise_feat_selected_` in featureSelection — verify.
rr.fit(fs.stepwise_feat_selected)
rr.coefPlots(columns_subplots=2)
文章标题:RollingRegression(滚动回归分析)之Python实现
文章链接:http://soscw.com/index.php/essay/105877.html