RollingRegression(滚动回归分析)之Python实现
2021-07-16 04:06
                         标签:取数   plt   gen   转化   save   parser   sea   gre   rect    # -*- coding: utf-8 -*- @author: acadsoc import pandas as pd plt.style.use(‘ggplot‘) # 设置ggplot2画图风格 # 根据不同平台设定工作目录 # 定义滚动多元回归分析类         # 从起始日开始做回归             # 按字典格式保存系数、pvalue、R2         # 系数字典转化为数据框,并按日期升序排序         # 系数pvalue转化为数据框,并按日期升序排序         # R2转化为数据框,并按日期升序排序     # 定义日期转换函数             for t, pvalue in zip(self.coef_pvalue_.index, self.coef_pvalue_[feature]):  # p值图             #plt.xlabel(‘日期‘)         line = Line(‘R2‘)  # R2图         for i, feature in enumerate(self.coef_.columns):               ol.add(line)         self.page_.add(charts) ‘‘‘  ============================================================================================================‘‘‘ fs = featureSelection() fs.randomForestRandomSearch(rr.df_) fs.stepwise(rr.df_, response=‘续单数‘, criterion=‘aic‘, intercept=True, val_enter=0.0, rr.fit(fs.stepwise_feat_selected) RollingRegression(滚动回归分析)之Python实现 标签:取数   plt   gen   转化   save   parser   sea   gre   rect    原文地址:https://www.cnblogs.com/lantingg/p/9535019.html
"""
Created on Sat Aug 18 11:08:38 2018
"""
import copy
import os
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from dateutil import parser
# import pymssql
from pyecharts import Bar, Line, Page, Overlap
from sklearn.preprocessing import StandardScaler

from featureSelection import featureSelection
# Use the ggplot plotting style for every matplotlib figure.
plt.style.use('ggplot')

# Choose the Chinese font file according to the platform.
if sys.platform == 'linux':
    zh_font = matplotlib.font_manager.FontProperties(
        fname='path/anaconda3/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf/STZHONGS.TTF')
else:
    # Raw string: the backslashes are path separators, not escape sequences.
    zh_font = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\STZHONGS.ttf')  # Chinese font

# Set the working directory according to the platform.
# NOTE(review): `path` is not defined anywhere in the visible part of this
# file; it must be assigned (per platform) before these lines run.
if sys.platform == 'linux':
    os.chdir(path)  # Linux path
else:
    os.chdir(path)  # Windows path
class rollingRegression():
    """Rolling-window multiple linear regression on a date-indexed table.

    Workflow: ``getData`` loads the table into ``self.df_`` (target in the
    first column), ``fit`` runs an OLS regression over every window of
    ``rolling_days`` consecutive rows and stores the coefficients, p-values
    and R2 values, and ``coefPlots`` / ``coefEcharts`` draw the results.
    """

    def __init__(self, target='新单数', date_begin='2018-01-01', date_end='2018-07-31', rolling_days=30,
                 const=False, p_value_threshold=.1, normalize=False):
        """Store the settings and validate the date range.

        Args:
            target: Name of the dependent-variable column.
            date_begin: First date of the analysis period.
            date_end: Last date of the analysis period.
            rolling_days: Number of rows in each regression window.
            const: Whether the regression includes an intercept term.
            p_value_threshold: Coefficients whose p-value is below this value
                are highlighted in the plots.
            normalize: Whether the data is standardized before regression.

        Raises:
            IOError: If the period between ``date_begin`` and ``date_end`` is
                shorter than ``rolling_days`` days.
        """
        self.target = target  # dependent variable
        self.date_begin = date_begin  # start date
        self.date_end = date_end  # end date
        self.rolling_days = rolling_days  # window length
        self.const = const  # include an intercept?
        self.p_value_threshold = p_value_threshold  # p-value display threshold
        self.normalize = normalize  # standardize data before regression?
        if self.normalize:  # standardized data: the intercept is forced off
            self.const = False
        # The period must span at least `rolling_days` days. The dates are
        # parsed with pandas, which is also what compares them in rollingOLS.
        span_days = (pd.Timestamp(self.date_end) - pd.Timestamp(self.date_begin)).days
        if span_days < self.rolling_days:
            raise IOError('起始日期间隔必须大于等于滚动天数,请重新选择起始日期或者调整滚动日期。')

    # Load data
    def getData(self, file='业绩相关数据2018-8-1.xlsx', variabls_in=None, variables_out=None):
        """Read the Excel file and store the prepared table in ``self.df_``.

        The first column of the sheet becomes a ``DatetimeIndex``. The kept
        columns are the target column followed by every column from position
        6 onwards (counted after the date column is removed). Missing values
        are replaced by 0 and everything is cast to float. When
        ``self.normalize`` is true the table is standardized with
        ``StandardScaler``.

        Args:
            file: Path of the Excel file to read.
            variabls_in: Unused; kept for interface compatibility.
            variables_out: Unused; kept for interface compatibility.
        """
        df = pd.read_excel(file)
        df.index = df.iloc[:, 0]  # the first column holds the dates
        df = df.iloc[:, 1:]
        df = pd.concat([df[self.target], df.iloc[:, 6:]], axis=1)  # keep the useful columns
        df = df.fillna(0)  # fill missing values
        df = df.astype(float)  # object columns -> float
        df.index = pd.DatetimeIndex(df.index)  # index as datetime

        if self.normalize:  # standardize the data
            df_std = StandardScaler().fit_transform(df)
            self.df_ = pd.DataFrame(df_std, index=df.index, columns=df.columns)
        else:
            self.df_ = df

    # Rolling-date multiple linear model
    def rollingOLS(self, df):
        """Fit one OLS model per rolling window.

        Rows between ``date_begin`` and ``date_end`` are sorted by date. For
        each position ``i`` the model is fitted on rows
        ``i .. i + rolling_days - 1`` and the result is keyed by the date of
        row ``i + rolling_days`` (the row right after the window).

        Args:
            df: Date-indexed frame with the dependent variable in the first
                column and the regressors in the remaining columns.

        Returns:
            Tuple ``(coef, coef_pvalue, r2)`` of date-indexed DataFrames,
            sorted by date. ``r2`` has the columns ``R_squred`` and
            ``R_squred_adj``.

        Raises:
            ValueError: If the selected period has no more than
                ``rolling_days`` rows, so that no window can be fitted.
        """
        df = df.loc[(df.index >= self.date_begin) & (df.index <= self.date_end), :]
        df = df.sort_index(ascending=True)  # ascending by date

        n_windows = df.shape[0] - self.rolling_days
        if n_windows <= 0:
            raise ValueError(
                'not enough rows between date_begin and date_end for a window of '
                '%d rows (got %d rows)' % (self.rolling_days, df.shape[0]))

        coef = {}
        coef_pvalue = {}
        r2 = {}
        for i in range(n_windows):
            date = df.index[i + self.rolling_days]
            data = df.iloc[i:i + self.rolling_days, :]
            X = data.iloc[:, 1:]
            y = data.iloc[:, 0]
            if self.const:
                # `hasconst` only tells statsmodels that a constant column is
                # present; the column itself has to be added explicitly.
                X = sm.add_constant(X, has_constant='add')
            # Fit the linear regression model
            model = sm.OLS(y, X, hasconst=self.const)
            lr = model.fit()
            # Keep coefficients, p-values and R2 keyed by date
            coef[date] = lr.params
            coef_pvalue[date] = lr.pvalues
            r2[date] = [lr.rsquared, lr.rsquared_adj]

        # Coefficients -> DataFrame, ascending by date
        coef = pd.DataFrame.from_dict(coef, orient='index')
        coef = coef.sort_index(ascending=True)
        # p-values -> DataFrame, ascending by date
        coef_pvalue = pd.DataFrame.from_dict(coef_pvalue, orient='index')
        coef_pvalue = coef_pvalue.sort_index(ascending=True)
        # R2 -> DataFrame, ascending by date
        r2 = pd.DataFrame.from_dict(r2, orient='index')
        r2.columns = ['R_squred', 'R_squred_adj']
        r2 = r2.sort_index(ascending=True)
        return coef, coef_pvalue, r2

    # Date conversion helper
    def _dateTransfer(self, date):
        """Parse ``date`` with dateutil and return it as a ``YYYY-MM-DD`` string."""
        return parser.parse(date).strftime('%Y-%m-%d')

    # Run the regression analysis and save the results
    def fit(self, feat_selected=None):
        """Run the rolling regression and write the result tables to Excel.

        Sets ``self.coef_``, ``self.coef_pvalue_`` and ``self.r2_`` and writes
        them to ``coef.xlsx``, ``coef_pvalue.xlsx`` and ``r2.xlsx`` in the
        current working directory.

        Args:
            feat_selected: Optional column selection; when given, only these
                columns of ``self.df_`` are used as regressors (the first
                column of ``self.df_`` is always the dependent variable).

        Returns:
            self
        """
        if feat_selected is not None:
            df = pd.concat([self.df_.iloc[:, 0], self.df_[feat_selected]], axis=1)
        else:
            df = self.df_
        # Rolling regression analysis
        self.coef_, self.coef_pvalue_, self.r2_ = self.rollingOLS(df)
        # Store the result tables
        self.coef_.to_excel('coef.xlsx')
        self.coef_pvalue_.to_excel('coef_pvalue.xlsx')
        self.r2_.to_excel('r2.xlsx')
        return self

    # Plotting
    def coefPlots(self, width_subplot=12, height_subplot=5, columns_subplots=3):
        """Plot R2 and every rolling coefficient with matplotlib.

        The first subplot shows R2 and adjusted R2; each following subplot
        shows one coefficient over time, with a vertical green band at every
        date whose p-value is below ``self.p_value_threshold``.

        Args:
            width_subplot: Width of one subplot.
            height_subplot: Height of one subplot.
            columns_subplots: Number of subplot columns in the figure.

        Returns:
            self
        """
        num_subplots = self.coef_.shape[1] + 1  # one per coefficient plus the R2 plot
        # Number of subplot rows (ceiling division)
        rows_subplots, remainder = divmod(num_subplots, columns_subplots)
        if remainder:
            rows_subplots += 1
        # Figure width and height
        width_figure = columns_subplots * width_subplot
        height_figure = rows_subplots * height_subplot

        # Rolling-regression R2 plot
        plt.figure(figsize=(width_figure, height_figure))
        plt.subplot(rows_subplots, columns_subplots, 1)
        plt.plot(self.r2_['R_squred'], color='r', lw=3, label='R_squred')
        plt.plot(self.r2_['R_squred_adj'], color='g', lw=3, label='R_squred_adj')
        plt.title('R2')
        plt.legend()
        # Rolling coefficients and p-value markers, one subplot per feature
        for i, feature in enumerate(self.coef_.columns):
            plt.subplot(rows_subplots, columns_subplots, i + 2)
            plt.plot(self.coef_[feature], color='red', lw=3, label='Beta')  # coefficient curve
            coef_min = np.min(self.coef_[feature])
            coef_max = np.max(self.coef_[feature])
            for t, pvalue in zip(self.coef_pvalue_.index, self.coef_pvalue_[feature]):  # p-value markers
                if pvalue < self.p_value_threshold:
                    plt.vlines(t, ymin=coef_min, ymax=coef_max,
                               color='green', alpha=.3, lw=5, label='p_value')
            # NOTE(review): `&` binds tighter than `==`, so this evaluates
            # `(remainder & (i > 0)) == 0`: true for i == 0 and otherwise
            # whenever the remainder is even. That looks unintended for three
            # or more columns; behaviour is kept as in the original.
            if (((i + columns_subplots + 1) % columns_subplots) & (i > 0)) == 0:
                plt.ylabel('coef')
            plt.title(feature, fontproperties=zh_font)
        # plt.savefig('rollingRegression.jpeg')  # save the figure
        plt.show()
        return self

    # Plot with Echarts. Note: there is no vline method, so the file produced
    # with echarts is very large and opens slowly in a browser.
    def coefEcharts(self):
        """Render R2 and every rolling coefficient as a pyecharts HTML page.

        The page is stored in ``self.page_`` and rendered with
        ``Page.render()``. Dates whose p-value is below
        ``self.p_value_threshold`` are marked with bars reaching the minimum
        and maximum of the coefficient series.

        Returns:
            self
        """
        self.page_ = Page(self.target + '回归分析')
        charts = []
        zeros = np.zeros(self.coef_.shape[0])

        line = Line('R2')  # R2 chart
        line.add('R_squred', self.r2_.index, self.r2_['R_squred'], is_more_utils=True)
        line.add('R_squred_adj', self.r2_.index, self.r2_['R_squred_adj'], is_more_utils=True)
        charts.append(line)

        for feature in self.coef_.columns:
            min_num = np.min(self.coef_[feature])
            max_num = np.max(self.coef_[feature])
            line = Line(feature)
            bar = Bar()
            ol = Overlap()
            line.add('coef', self.coef_.index, self.coef_[feature], is_more_utils=True)  # coefficient curve
            for t, pvalue in zip(self.coef_pvalue_.index, self.coef_pvalue_[feature]):  # p-value markers
                if pvalue < self.p_value_threshold:
                    # All-zero series except at date t, where the bars reach
                    # the series minimum and maximum.
                    min_array, max_array = zeros.copy(), zeros.copy()
                    min_array[self.coef_.index == t] = min_num
                    max_array[self.coef_.index == t] = max_num
                    bar.add('p-value', self.coef_.index, min_array)
                    bar.add('p-value', self.coef_.index, max_array)
            ol.add(line)
            ol.add(bar)
            charts.append(ol)

        self.page_.add(charts)
        self.page_.render()  # saved as HTML in the globally configured path
        return self
    
# Usage example (interactive-style walkthrough; the bare attribute
# expressions are there to inspect results in a console or notebook).
rr = rollingRegression(target='续单数')
rr.getData(file='D:/Matlab/achivement2018-8-1.xlsx')

fs = featureSelection()

# Feature selection with elastic net
fs.elasticNetFeatureSelectPlot(df=rr.df_, l1_ratio=.08,
                               plot_width=16, plot_height=8, xlim_exp=[-2, 2], ylim=[-.1, .1])
fs.elasticNetRandomSearch(df=rr.df_)
fs.elasticnet_rs_best
fs.elasticNet(rr.df_, alpha=.7, normalize=True)
fs.elasticnet_coef_
fs.elasticnet_R2_
fs.eln.coef_
fs.featureBarhPlot(fs.elasticnet_coef_)
fs.elasticnet_coef_selected_

# Feature selection with random forest, then rolling regression on the result
fs.randomForestRandomSearch(rr.df_)
fs.rf_rs_best
fs.randomForest(rr.df_, n_estimators=139, max_features=6, impo_cum_threshold=.8)
fs.featureBarhPlot(fs.rf_feat_impo_)
fs.rf_feat_selected_
rr.fit(fs.rf_feat_selected_)
rr.coefPlots(columns_subplots=2)

# Stepwise feature selection, then rolling regression on the result
fs.stepwise(rr.df_, response='续单数', criterion='aic', intercept=True, val_enter=0.0,
            p_value_enter=.05, direction='both', show_step=True)
rr.fit(fs.stepwise_feat_selected)
rr.coefPlots(columns_subplots=2)
文章标题:RollingRegression(滚动回归分析)之Python实现
文章链接:http://soscw.com/essay/105877.html