Univariate Linear Regression via Gradient Descent, with Plotting

2023-12-13 07:51:59
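The script below fits a one-feature linear model y = theta0 + theta1 * x by batch gradient descent. The loss is the halved mean squared error, J(theta) = 1/(2m) * sum_i (x_i . theta - y_i)^2, and each iteration applies the vectorized update theta := theta - (alpha / m) * X^T (X theta - y), where m is the sample count and alpha the learning rate. The feature is z-score standardized and a bias column of ones is prepended before training, which is what preHandle() below does.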
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class LinearRegression:

    def __init__(self, dataX, dataY):
        # Keep the raw (unstandardized) features for plotting
        self.dataOX = dataX
        # Feature matrix (standardized, with a bias column prepended) and label column
        self.dataX = LinearRegression.preHandle(dataX)
        self.dataY = dataY
        # Number of training samples
        self.dataSize = self.dataX.shape[0]
        # Parameters theta to solve for, initialized to all zeros
        self.theta = np.zeros((self.dataX.shape[1], 1))
        # Learning rate and number of iterations
        self.alpha = 0.01
        self.numIterations = 1000
        # History of the loss and theta values across iterations
        self.lossHistory = []
        # 2 columns = bias + one feature; this class targets the univariate case
        self.thetaHistory = np.empty((self.numIterations, 2))
        # Plot setup: SimHei renders CJK labels, unicode_minus fixes the minus sign
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        self.fig = plt.figure()
        self.fig.set_size_inches(10, 10)
    
    # Compute the loss for the current theta. train() does not call this
    # method, because it would redo work the training loop already does.
    def loss(self):
        # Predictions for the whole training set
        prediction = np.dot(self.dataX, self.theta)
        # Residuals
        delta = prediction - self.dataY
        # Halved MSE: sum of squared residuals / (2 * sample count)
        return np.dot(delta.T, delta) / (self.dataSize * 2)

    def train(self):
        for index in range(self.numIterations):
            prediction = np.dot(self.dataX, self.theta)
            delta = prediction - self.dataY
            # Vectorized update: theta = theta - alpha / m * X^T (X theta - y)
            self.theta = self.theta - self.alpha / self.dataSize * np.dot(self.dataX.T, delta)
            # Current loss: halved MSE
            mse = (np.dot(delta.T, delta) / (2 * self.dataSize))[0][0]
            # Record this iteration's loss and theta values
            self.lossHistory.append(mse)
            self.thetaHistory[index] = self.theta.reshape(2,)
        print("loss:", self.lossHistory[-1], " theta:", self.theta)

    # Plotting (single-feature case only)
    def draw(self):
        self.drawOriginalScatter()
        self.drawNormalScatter()
        self.drawGD()
        self.drawRegression()
        plt.show()

    # Scatter plot of the raw data
    def drawOriginalScatter(self):
        ax = self.fig.add_subplot(221)
        ax.set_title("Original Data Scatter")
        ax.scatter(self.dataOX, self.dataY, label='Train Dataset', s=5)
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.legend()

    # Scatter plot of the standardized data alongside the raw data
    def drawNormalScatter(self):
        ax = self.fig.add_subplot(222)
        ax.set_title("Normal Data Scatter")
        ax.scatter(self.dataOX, self.dataY, label='Original Train Dataset', s=5)
        ax.scatter(self.dataX[:, 1], self.dataY, label='Normal Train Dataset', s=5)
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.legend()

    # Loss value (and theta components) versus iteration count
    def drawGD(self):
        ax = self.fig.add_subplot(223)
        ax.set_title("NumIterations loss")
        iterations = np.arange(self.numIterations)
        ax.plot(iterations, self.lossHistory, label='loss')
        ax.plot(iterations, self.thetaHistory[:, 0], label='theta0')
        ax.plot(iterations, self.thetaHistory[:, 1], label='theta1')
        ax.set_xlabel("numIterations")
        ax.set_ylabel("loss")
        ax.legend()

    # Draw the regression line
    def drawRegression(self):
        ax = self.fig.add_subplot(224)
        ax.set_title("Regression Line")
        ax.scatter(self.dataOX, self.dataY, label='Original Train Dataset', s=5)
        # The comparison shows that the fitted line belongs to the standardized
        # data, not to the raw data
        ax.scatter(self.dataX[:, 1], self.dataY, label='Normal Train Dataset', s=5)
        # Evaluate the line on a grid spanning the standardized feature range;
        # build the design matrix directly (bias column of ones plus the grid),
        # rather than re-standardizing the grid with its own mean and std
        xGrid = np.linspace(self.dataX[:, 1].min(), self.dataX[:, 1].max(), self.dataSize).reshape(self.dataSize, 1)
        xx = np.hstack((np.ones((self.dataSize, 1)), xGrid))
        yy = np.dot(xx, self.theta)
        ax.plot(xx[:, -1], yy, label="Regression Line")
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.legend()
    """
    预处理数据,零均值归一化处理
    """
    @staticmethod
    def preHandle(data):
        normalData = np.copy(data)
        # 平均值
        mean = np.mean(normalData)
        # 方差
        std = np.std(normalData)
        normalData = (normalData - mean) / std
        normalData = np.hstack((np.ones((normalData.shape[0],1)),normalData))
        return normalData


df = pd.read_csv("./data/world-happiness-report-2017.csv")
xName = "Economy..GDP.per.Capita."
yName = "Happiness.Score"
trainDataX = df[[xName]].values
trainDataY = df[[yName]].values

linearRegression = LinearRegression(trainDataX,trainDataY)
linearRegression.train()
linearRegression.draw()
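
As a quick sanity check (not part of the original post), the gradient-descent solution can be compared with the closed-form least-squares fit on the same standardized design matrix; after the 1000 iterations above the two should agree to several decimal places. A minimal sketch, assuming the script above has just run:

# Closed-form least-squares solution on the same standardized design matrix
X = LinearRegression.preHandle(trainDataX)
thetaClosedForm, *_ = np.linalg.lstsq(X, trainDataY, rcond=None)
print("closed-form theta:", thetaClosedForm.ravel())
print("gradient-descent theta:", linearRegression.theta.ravel())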

Source: https://blog.csdn.net/qq_37293230/article/details/134556087