code:
[python]
- <span style="font-size:18px;">import numpy as np
- from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
- from sklearn.preprocessing import PolynomialFeatures
- import matplotlib.pyplot as plt
- from sklearn.pipeline import Pipeline
- import matplotlib as mpl
- import warnings
- # 计算统计参数TSS RSS R
# Compute TSS / RSS / ESS and the R^2 statistic for a set of predictions.
def xss(y, y_hat):
    """Return R^2 and the correlation coefficient between ``y`` and ``y_hat``.

    Args:
        y: Observed target values; flattened to 1-D before use.
        y_hat: Predicted values; flattened to 1-D before use.

    Returns:
        A tuple ``(r2, corr_coef)`` where ``r2`` is ``1 - RSS / TSS`` and
        ``corr_coef`` is the off-diagonal entry of ``np.corrcoef(y, y_hat)``.

    Side effects:
        Prints RSS, ESS and RSS + ESS, and appends TSS, RSS, ESS and
        RSS + ESS to the module-level lists ``tss_list``, ``rss_list``,
        ``ess_list`` and ``ess_rss_list``. Those lists are created by the
        ``__main__`` script, so they must exist before this is called.
    """
    # Flatten (not transpose) both inputs so the element-wise maths lines up.
    y = y.ravel()
    y_hat = y_hat.ravel()
    y_mean = np.average(y)

    # Each quantity is a plain sum of squared differences.
    tss = ((y - y_mean) ** 2).sum()      # total: observations vs. their mean
    rss = ((y_hat - y) ** 2).sum()       # residual: predictions vs. observations
    ess = ((y_hat - y_mean) ** 2).sum()  # explained: predictions vs. the mean

    # Coefficient of determination.
    r2 = 1 - rss / tss

    print('Rss:', rss)
    print('Ess:', ess)
    print('Rss + Ess:', rss + ess)

    # Record this trial so the caller can plot TSS against ESS + RSS later.
    tss_list.append(tss)
    rss_list.append(rss)
    ess_list.append(ess)
    ess_rss_list.append(rss + ess)

    # Correlation coefficient between y and y_hat.
    corr_coef = np.corrcoef(y, y_hat)[0, 1]
    return r2, corr_coef
if __name__ == '__main__':
    # Fit polynomials of degree 1..N-1 to noisy quadratic data with four
    # linear models, plot every fit, then plot TSS against ESS + RSS for
    # each trial recorded by xss().
    warnings.filterwarnings("ignore")
    np.random.seed(0)
    np.set_printoptions(linewidth=1000, suppress=True)

    # Noisy samples of y = x^2 - 4x - 3, stored as column vectors.
    N = 9
    x = np.linspace(0, 6, N) + np.random.randn(N)
    x = np.sort(x)
    y = x**2 - 4*x - 3 + np.random.randn(N)
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)

    # One pipeline per model: plain least squares, RidgeCV, LassoCV and
    # ElasticNetCV. The intercept is left to the polynomial bias column.
    alphas = np.logspace(-3, 2, 50)
    regressors = [
        LinearRegression(fit_intercept=False),
        RidgeCV(alphas=alphas, fit_intercept=False),
        LassoCV(alphas=alphas, fit_intercept=False),
        ElasticNetCV(alphas=alphas, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], fit_intercept=False),
    ]
    models = [Pipeline([('poly', PolynomialFeatures()), ('linear', reg)]) for reg in regressors]

    plt.figure(figsize=(15, 15), facecolor='w')
    d_pool = np.arange(1, N, 1)
    m = d_pool.size

    # One colour per degree: linearly spaced packed-RGB integers running
    # from 0xff0000 (red) to 0x0000ff (blue).
    clrs = ['#%06x' % int(c) for c in np.linspace(16711680, 255, m)]
    line_width = np.linspace(5, 2, m)
    titles = 'linear regression', 'Ridge regression', 'Lasso', 'ElasticNet'

    # Filled in by xss(); they must be module-level names for it to see them.
    tss_list = []
    rss_list = []
    ess_list = []
    ess_rss_list = []

    # Dense grid used to draw each fitted curve (the same for every fit).
    x_hat = np.linspace(x.min(), x.max(), num=100).reshape(-1, 1)

    for t, (model, title) in enumerate(zip(models, titles)):
        plt.subplot(2, 2, t + 1)
        plt.plot(x, y, 'ro', ms=10, zorder=N)
        for i, d in enumerate(d_pool):
            model.set_params(poly__degree=d)
            model.fit(x, y.ravel())
            lin = model.named_steps['linear']

            # The CV estimators expose the selected hyper-parameters after
            # fitting; report them ahead of the coefficients.
            extras = ''
            if hasattr(lin, 'alpha_'):
                extras += 'alpha = %.6f, ' % lin.alpha_
            # l1_ratio_ is the value cross-validation picked from the
            # l1_ratio list given to the estimator.
            if hasattr(lin, 'l1_ratio_'):
                extras += 'l1_ratio = %.6f, ' % lin.l1_ratio_
            output = '%s:%d level, %sparameters:' % (title, d, extras)
            print("output:\n", output)
            print("lin.coef_.ravel():\n", lin.coef_.ravel())

            y_hat = model.predict(x_hat)
            s = model.score(x, y)
            r2, corr_coef = xss(y, model.predict(x))
            print("R2 and corrlated params:", r2, corr_coef)
            print('R2:', s, '\n')

            # Draw the degree-2 curve above the other curves (the data
            # points, at zorder=N, still sit on top).
            z = N - 1 if d == 2 else 0
            label = '%d level, $R^2 $=%.3f' % (d, s)
            if hasattr(lin, 'l1_ratio_'):
                label += ', L1 ratio=%.2f' % lin.l1_ratio_
            plt.plot(x_hat, y_hat, color=clrs[i], lw=line_width[i], alpha=0.75, label=label, zorder=z)
        plt.legend(loc='upper left')
        plt.grid(True)
        plt.title(title, fontsize=18)
        plt.xlabel("X", fontsize=15)
        plt.ylabel("Y", fontsize=15)
    # Leave the top 5% of the figure free for the super-title.
    plt.tight_layout(pad=2.5, w_pad=0.5, rect=(0, 0, 1, 0.95))
    plt.suptitle('multiply curve fitness compare', fontsize=22)
    plt.show()

    # Second figure: the sums of squares recorded for every trial.
    y_max = max(max(tss_list), max(ess_rss_list)) * 1.05
    plt.figure(figsize=(15, 15), facecolor='w')
    trials = np.arange(len(tss_list))
    plt.plot(trials, tss_list, 'ro-', lw=2, label='Tss(Total Sum of Squares)')
    plt.plot(trials, ess_list, 'mo-', lw=1, label='Ess(Explained Sum of Squares)')
    plt.plot(trials, rss_list, 'bo-', lw=1, label='Rss(Residual Sum of Squares)')
    plt.plot(trials, ess_rss_list, 'go-', lw=2, label='ESS + RSS')
    plt.ylim((0, y_max))
    plt.legend(loc='center right')
    plt.xlabel('trial:linear regression/RIdge?Lasso?ElasticNet', fontsize=15)
    plt.ylabel('XSS value', fontsize=15)
    plt.title('Total Sum Of Tss = ?', fontsize=18)
    plt.grid(True)
    plt.show()
- </span>
第一张图中,四种回归算法的区别只在于损失函数(loss function)不同,但从拟合效果来看,后两种(Lasso 和 ElasticNet)明显要好。
接着,我们使用决策树以及bagging的决策树进行拟合看看效果:(bagging的决策树是对多棵决策树的预测结果取平均,思想与随机森林类似;注意它并不是基于boosting的GBDT。原理部分本文不再赘述)
[python]
- <span style="font-size:18px;">import numpy as np
- import matplotlib.pyplot as plt
- import matplotlib as mpl
- from sklearn.linear_model import RidgeCV
- from sklearn.ensemble import BaggingRegressor
- from sklearn.tree import DecisionTreeRegressor
- from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import PolynomialFeatures
def f(x):
    """Return the target curve: three Gaussian-shaped bumps at -3, 0 and 3.

    The bump at 0 has height 1 and the two side bumps have height 0.5.
    ``x`` may be a scalar or a NumPy array; the result has the same shape.
    """
    left = 0.5 * np.exp(-(x + 3) ** 2)
    centre = np.exp(-x ** 2)
    right = 0.5 * np.exp(-(x - 3) ** 2)
    return left + centre + right
if __name__ == '__main__':
    # Compare a decision tree, a polynomial ridge regression and bagged
    # versions of both on noisy samples of f(x).
    np.random.seed(0)
    N = 500
    # Draw N points uniformly from [-5, 5) and sort them.
    x = np.random.rand(N) * 10 - 5
    x = np.sort(x)
    y = f(x) + 0.05*np.random.randn(N)
    x = x.reshape(-1, 1)

    # Polynomial degree, shared by the pipeline and the legend labels so
    # the labels cannot drift from what is actually fitted.
    degree = 10
    ridge = RidgeCV(alphas=np.logspace(-3, 2, 10), fit_intercept=False)
    ridged = Pipeline([('poly', PolynomialFeatures(degree=degree)), ('Ridge', ridge)])
    bagging_ridged = BaggingRegressor(ridged, n_estimators=100, max_samples=0.3)
    dtr = DecisionTreeRegressor(max_depth=6)
    regs = [
        ('DecisionTree Regressor', dtr),
        ('Ridge Regressor(%d Degree)' % degree, ridged),
        ('Bagging Ridge(%d Degree)' % degree, bagging_ridged),
        ('Bagging DecisionTree Regressor', BaggingRegressor(dtr, n_estimators=100, max_samples=0.3)),
    ]

    # Evaluate slightly beyond the training range (10% wider on each side).
    x_test = np.linspace(1.1*x.min(), 1.1*x.max(), 1000)
    plt.figure(figsize=(12, 8), facecolor='w')
    plt.plot(x, y, 'ro', label='train datas')
    plt.plot(x_test, f(x_test), color='k', lw=4, label='real datas')
    clrs = 'bmyg'
    for i, (name, reg) in enumerate(regs):
        reg.fit(x, y)
        y_test = reg.predict(x_test.reshape(-1, 1))
        # Later models get thicker lines but are drawn underneath earlier ones.
        plt.plot(x_test, y_test.ravel(), color=clrs[i], lw=i+1, label=name, zorder=6-i)
    plt.legend(loc='upper left')
    plt.xlabel('x', fontsize=15)
    plt.ylabel('y', fontsize=15)
    plt.title('regression curve fittness', fontsize=20)
    plt.ylim((-0.2, 1.2))
    # pad must be passed by keyword; the positional form is rejected by
    # current matplotlib.
    plt.tight_layout(pad=1)
    plt.grid(True)
    plt.show()
显然,使用线性(多项式岭回归)拟合,对训练数据的拟合效果是很好的,大部分样本都能落在拟合曲线附近;而单棵决策树的拟合曲线呈锯齿状,拟合效果也不及线性拟合,这也是为什么在这个例子中,线性拟合可以看作强学习器,而决策树更像弱学习器。加入bagging后,决策树的拟合效果改善很大,不过其效果也达不到线性拟合那么“完美”。
登录 | 立即注册