scikit-learn linaerRegression 1.1.3 LASSO

xiaoxiao2021-02-28 127

Lasso 是一种估计稀疏线性模型的方法.由于它倾向具有少量参数值的情况，对于给定解决方案是相关情况下，有效的减少了变量数量。因此，Lasso及其变种是压缩感知(压缩采样)的基础。在约束条件下，它可以回复一组非零精确的权重系数(参考Compressive sensing: tomography reconstruction with L1 prior (Lasso)).

用数学形式表达，Lasso 包含一个使用先验作为正则化因子的线性模型。其目标函数是最小化:

lasso 解决带罚项的最小平方和，其中是一个常量，是参数向量的 -norm

Lasso 类实现使用了坐标下降法(一种非梯度优化算法) 来拟合系数.参考另一种实现 Least Angle Regression最小角回归

>>> from sklearn import linear_model >>> clf = linear_model.Lasso(alpha = 0.1) >>> clf.fit([[0, 0], [1, 1]], [0, 1]) Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) >>> clf.predict([[1, 1]]) array([ 0.8])

函数 lasso_path 对于lower-level任务非常有用。它能够通过搜索所有可能的路径上的值来计算系数.

基于加成噪声的手工生成稀疏信号的LASSO和弹性网回归模型估计。估计系数与真实情况进行比较。

Script output:

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) r^2 on test data : 0.384710 ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.7, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) r^2 on test data : 0.240176

Python source code: plot_lasso_and_elasticnet.py

print(__doc__) import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import r2_score ############################################################################### # generate some sparse data to play with np.random.seed(42) n_samples, n_features = 50, 200 X = np.random.randn(n_samples, n_features) coef = 3 * np.random.randn(n_features) inds = np.arange(n_features) np.random.shuffle(inds) coef[inds[10:]] = 0 # sparsify coef y = np.dot(X, coef) # add noise y += 0.01 * np.random.normal((n_samples,)) # Split data in train set and test set n_samples = X.shape[0] X_train, y_train = X[:n_samples / 2], y[:n_samples / 2] X_test, y_test = X[n_samples / 2:], y[n_samples / 2:] ############################################################################### # Lasso from sklearn.linear_model import Lasso alpha = 0.1 lasso = Lasso(alpha=alpha) y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test) r2_score_lasso = r2_score(y_test, y_pred_lasso) print(lasso) print("r^2 on test data : %f" % r2_score_lasso) ############################################################################### # ElasticNet from sklearn.linear_model import ElasticNet enet = ElasticNet(alpha=alpha, l1_ratio=0.7) y_pred_enet = enet.fit(X_train, y_train).predict(X_test) r2_score_enet = r2_score(y_test, y_pred_enet) print(enet) print("r^2 on test data : %f" % r2_score_enet) plt.plot(enet.coef_, label='Elastic net coefficients') plt.plot(lasso.coef_, label='Lasso coefficients') plt.plot(coef, '--', label='original coefficients') plt.legend(loc='best') plt.title("Lasso R^2: %f, Elastic Net R^2: %f" % (r2_score_lasso, r2_score_enet)) plt.show()

例子2：

这个例子显示了从一组平行投影中重建图像的过程，这些投影是沿着不同的角度获得的。这样的数据集是在计算机断层扫描中获得的。

如果没有关于样本的任何先验信息，重建图像所需的投影数是图像的线性尺寸L的顺序（以像素为单位）。为了简单起见，我们在这里考虑一个稀疏图像，其中只有边界上的像素具有非零值。这样的数据可以对应于蜂窝材料。但是请注意，大多数图像在不同的基础上是稀疏的，如Haar小波。只有L／7预测获得，因此有必要使用样本上的先验信息（其稀疏性）：这是压缩感知的一个例子

层析投影操作是一种线性变换。除了与线性回归相对应的数据保真项，我们还惩罚图像的L1范数，以解释其稀疏性。由此产生的优化问题称为套索。我们使用类sklearn.linear_model.lasso，使用坐标下降算法。重要的是，这种实现在稀疏矩阵上比在这里使用的投影算子计算效率更高。

L1惩罚的重建提供一个结果零误差（所有像素都成功标记0或1），即使加入了噪声预测。相比之下，L2惩罚（sklearn。linear_model。脊）产生大量的像素标记错误。重要文物的重建图像上观察到，与L1惩罚。注意，特别是圆形的伪影，将角点中的像素分割出来，这导致了比中央磁盘更少的投影。

print(__doc__) # Author: Emmanuelle Gouillart <emmanuelle.gouillart@nsup.org> # License: BSD 3 clause import numpy as np from scipy import sparse from scipy import ndimage from sklearn.linear_model import Lasso from sklearn.linear_model import Ridge import matplotlib.pyplot as plt def _weights(x, dx=1, orig=0): x = np.ravel(x) floor_x = np.floor((x - orig) / dx) alpha = (x - orig - floor_x * dx) / dx return np.hstack((floor_x, floor_x + 1)), np.hstack((1 - alpha, alpha)) def _generate_center_coordinates(l_x): X, Y = np.mgrid[:l_x, :l_x].astype(np.float64) center = l_x / 2. X += 0.5 - center Y += 0.5 - center return X, Y def build_projection_operator(l_x, n_dir): """ Compute the tomography design matrix. Parameters ---------- l_x : int linear size of image array n_dir : int number of angles at which projections are acquired. Returns ------- p : sparse matrix of shape (n_dir l_x, l_x**2) """ X, Y = _generate_center_coordinates(l_x) angles = np.linspace(0, np.pi, n_dir, endpoint=False) data_inds, weights, camera_inds = [], [], [] data_unravel_indices = np.arange(l_x ** 2) data_unravel_indices = np.hstack((data_unravel_indices, data_unravel_indices)) for i, angle in enumerate(angles): Xrot = np.cos(angle) * X - np.sin(angle) * Y inds, w = _weights(Xrot, dx=1, orig=X.min()) mask = np.logical_and(inds >= 0, inds < l_x) weights += list(w[mask]) camera_inds += list(inds[mask] + i * l_x) data_inds += list(data_unravel_indices[mask]) proj_operator = sparse.coo_matrix((weights, (camera_inds, data_inds))) return proj_operator def generate_synthetic_data(): """ Synthetic binary data """ rs = np.random.RandomState(0) n_pts = 36. x, y = np.ogrid[0:l, 0:l] mask_outer = (x - l / 2) ** 2 + (y - l / 2) ** 2 < (l / 2) ** 2 mask = np.zeros((l, l)) points = l * rs.rand(2, n_pts) mask[(points[0]).astype(np.int), (points[1]).astype(np.int)] = 1 mask = ndimage.gaussian_filter(mask, sigma=l / n_pts) res = np.logical_and(mask > mask.mean(), mask_outer) return res - ndimage.binary_erosion(res) # Generate synthetic images, and projections l = 128 proj_operator = build_projection_operator(l, l / 7.) data = generate_synthetic_data() proj = proj_operator * data.ravel()[:, np.newaxis] proj += 0.15 * np.random.randn(*proj.shape) # Reconstruction with L2 (Ridge) penalization rgr_ridge = Ridge(alpha=0.2) rgr_ridge.fit(proj_operator, proj.ravel()) rec_l2 = rgr_ridge.coef_.reshape(l, l) # Reconstruction with L1 (Lasso) penalization # the best value of alpha was determined using cross validation # with LassoCV rgr_lasso = Lasso(alpha=0.001) rgr_lasso.fit(proj_operator, proj.ravel()) rec_l1 = rgr_lasso.coef_.reshape(l, l) plt.figure(figsize=(8, 3.3)) plt.subplot(131) plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest') plt.axis('off') plt.title('original image') plt.subplot(132) plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation='nearest') plt.title('L2 penalization') plt.axis('off') plt.subplot(133) plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation='nearest') plt.title('L1 penalization') plt.axis('off') plt.subplots_adjust(hspace=0.01, wspace=0.01, top=1, bottom=0, left=0, right=1) plt.show()

1.1.3.1. 设置正则化参数

alpha 参数控制估计的系数的稀疏程度。

1.1.3.1.1. 使用交叉验证

scikit-learn 暴露以下两个类 LassoCV 和 LassoLarsCV 可以设置 Lasso alpha 参数. LassoCV 基于下面解释的算法 Least Angle Regression最小角回归

对于含有很多共线性的高维的数据集，LassoCV 是最合适不过了。然而，LassoLarsCV 在寻找 alpha 参数更相关的值时更具有优势，并且如果样本相比于观测的数量时，通常比 LassoCV 更快.

1.1.3.1.2. 基于模型选择的信息约束

LassoLarsIC 建议使用Akaike information criterion (AIC) 和 Bayes Information criterion (BIC)。

由于在计算:math:alpha 过程中，当使用k-折交叉验证的时候，正则化路径只计算1次而不是k+1次，所以在计算上代价非常小。

然而，这种约束需要一个合适的对于解的自由度的估计（可参考矩阵的解的自由度）,这可以从大量的样本（渐进结果）导出并且假设模型是正确的。

例如，数据实际上是有该模型产生的，但是当问题是病态条件时这种数据可能会有问题(参考病态矩阵，条件数等概念)，

比如特征维数大于样本数.（小样本问题）

使用Akaike信息准则（AIC），贝叶斯信息准则（BIC）和选择的LASSO估计正则化参数α的最优值的交叉验证得到的结果是基于lassolarsic AIC、BIC准则。基于信息标准的模型选择是非常快速的，但它依赖于对自由度的适当估计，它来自于大样本（渐近结果），假设模型是正确的，即数据实际上是由这个模型生成的。当问题被严重限制（比样本更多的特性）时，它们也会中断。交叉验证，我们用20倍2的算法来计算套索路径：坐标下降，由lassocv类实现的，和拉尔斯（最小角回归）的lassolarscv类实现。这两种算法给出的结果大致相同。它们在执行速度和数值误差来源方面有所不同。拉尔斯只计算路径中每个扭结的路径解。因此，当只有很少的扭结时，它是非常有效的，如果很少有特征或样本，则是如此。此外，它可以在不设置任何元参数的情况下计算完整路径。与之相反，坐标下降计算预先指定的网格上的路径点（在这里我们使用默认值）。因此，如果网格点的数量小于路径中的扭结数，则效率更高。如果特征数量非常大，并且有足够的样本选择大量，这样的策略可能会很有意思。在数值误差方面，对于重相关变量，拉尔斯会积累更多的误差，而坐标下降算法只对网格上的路径进行采样。注意alpha的最佳值是如何变化的。这说明了当尝试通过交叉验证选择参数的方法的性能时，嵌套的交叉验证是必要的：这个参数的选择对于看不见的数据可能不是最优的。

parameter may not be optimal for unseen data.

Script output:

Computing regularization path using the coordinate descent lasso... Computing regularization path using the Lars lasso...

Python source code: plot_lasso_model_selection.py

print(__doc__) # Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort # License: BSD 3 clause import time import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC from sklearn import datasets diabetes = datasets.load_diabetes() X = diabetes.data y = diabetes.target rng = np.random.RandomState(42) X = np.c_[X, rng.randn(X.shape[0], 14)] # add some bad features # normalize data as done by Lars to allow for comparison X /= np.sqrt(np.sum(X ** 2, axis=0)) ############################################################################## # LassoLarsIC: least angle regression with BIC/AIC criterion model_bic = LassoLarsIC(criterion='bic') t1 = time.time() model_bic.fit(X, y) t_bic = time.time() - t1 alpha_bic_ = model_bic.alpha_ model_aic = LassoLarsIC(criterion='aic') model_aic.fit(X, y) alpha_aic_ = model_aic.alpha_ def plot_ic_criterion(model, name, color): alpha_ = model.alpha_ alphas_ = model.alphas_ criterion_ = model.criterion_ plt.plot(-np.log10(alphas_), criterion_, '--', color=color, linewidth=3, label='%s criterion' % name) plt.axvline(-np.log10(alpha_), color=color, linewidth=3, label='alpha: %s estimate' % name) plt.xlabel('-log(alpha)') plt.ylabel('criterion') plt.figure() plot_ic_criterion(model_aic, 'AIC', 'b') plot_ic_criterion(model_bic, 'BIC', 'r') plt.legend() plt.title('Information-criterion for model selection (training time %.3fs)' % t_bic) ############################################################################## # LassoCV: coordinate descent # Compute paths print("Computing regularization path using the coordinate descent lasso...") t1 = time.time() model = LassoCV(cv=20).fit(X, y) t_lasso_cv = time.time() - t1 # Display results m_log_alphas = -np.log10(model.alphas_) plt.figure() ymin, ymax = 2300, 3800 plt.plot(m_log_alphas, model.mse_path_, ':') plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha: CV estimate') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv) plt.axis('tight') plt.ylim(ymin, ymax) ############################################################################## # LassoLarsCV: least angle regression # Compute paths print("Computing regularization path using the Lars lasso...") t1 = time.time() model = LassoLarsCV(cv=20).fit(X, y) t_lasso_lars_cv = time.time() - t1 # Display results m_log_alphas = -np.log10(model.cv_alphas_) plt.figure() plt.plot(m_log_alphas, model.cv_mse_path_, ':') plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: Lars (train time: %.2fs)' % t_lasso_lars_cv) plt.axis('tight') plt.ylim(ymin, ymax) plt.show()

转载请注明原文地址: https://www.6miu.com/read-40181.html

技术

最新回复(0)