package gotensor

import "math"

// Optimizer is the interface implemented by all parameter optimizers.
type Optimizer interface {
	Step()     // update parameters using their gradients
	ZeroGrad() // clear all gradients
}

// SGD is a stochastic gradient descent optimizer.
type SGD struct {
	Parameters []*Tensor
	LR         float64 // learning rate
}

// NewSGD creates a new SGD optimizer over the given parameters.
func NewSGD(parameters []*Tensor, lr float64) *SGD {
	return &SGD{
		Parameters: parameters,
		LR:         lr,
	}
}

// Step updates each parameter in place: param = param - lr * grad.
// Only 1-D and 2-D tensors are supported; other ranks are silently skipped.
func (s *SGD) Step() {
	for _, param := range s.Parameters {
		grad := param.Grad
		shape := param.Data.Shape()
		if len(shape) == 1 {
			for i := 0; i < shape[0]; i++ {
				paramVal, _ := param.Data.Get(i)
				gradVal, _ := grad.Get(i)
				param.Data.Set(paramVal-s.LR*gradVal, i)
			}
		} else if len(shape) == 2 {
			rows, cols := shape[0], shape[1]
			for i := 0; i < rows; i++ {
				for j := 0; j < cols; j++ {
					paramVal, _ := param.Data.Get(i, j)
					gradVal, _ := grad.Get(i, j)
					param.Data.Set(paramVal-s.LR*gradVal, i, j)
				}
			}
		}
	}
}

// ZeroGrad clears the gradients of all parameters.
func (s *SGD) ZeroGrad() {
	for _, param := range s.Parameters {
		param.ZeroGrad()
	}
}
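
// A minimal SGD usage sketch. The forward/backward calls below are
// hypothetical placeholders (this package's actual loss/autograd API may
// differ); they only show where ZeroGrad and Step fit in a training loop:
//
//	w, _ := NewTensor([]float64{0.5, -0.3}, []int{2})
//	opt := NewSGD([]*Tensor{w}, 0.01)
//	for epoch := 0; epoch < 100; epoch++ {
//		opt.ZeroGrad()     // clear gradients from the previous step
//		loss := forward(w) // hypothetical forward pass
//		loss.Backward()    // hypothetical; must populate w.Grad
//		opt.Step()         // w = w - 0.01 * w.Grad
//	}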

// Adam is the Adam optimizer (adaptive moment estimation).
type Adam struct {
	Parameters []*Tensor
	LR         float64 // learning rate
	Beta1      float64 // exponential decay rate for the first-moment estimates
	Beta2      float64 // exponential decay rate for the second-moment estimates
	Epsilon    float64 // small constant that guards against division by zero
	T          int     // current step count

	M []*Tensor // first-moment estimates, one per parameter
	V []*Tensor // second-moment estimates, one per parameter
}

// NewAdam creates a new Adam optimizer over the given parameters.
func NewAdam(parameters []*Tensor, lr, beta1, beta2, epsilon float64) *Adam {
	adam := &Adam{
		Parameters: parameters,
		LR:         lr,
		Beta1:      beta1,
		Beta2:      beta2,
		Epsilon:    epsilon,
		T:          0,
		M:          make([]*Tensor, len(parameters)),
		V:          make([]*Tensor, len(parameters)),
	}
	// Initialize M and V as zero tensors with the same shape as each parameter.
	for i := range parameters {
		shape := parameters[i].Shape()
		m, _ := NewZeros(shape)
		v, _ := NewZeros(shape)
		adam.M[i] = m
		adam.V[i] = v
	}
	return adam
}

// Step applies one Adam update to every parameter:
//
//	m    = beta1*m + (1-beta1)*grad
//	v    = beta2*v + (1-beta2)*grad^2
//	mHat = m / (1 - beta1^t)
//	vHat = v / (1 - beta2^t)
//	param = param - lr * mHat / (sqrt(vHat) + epsilon)
//
// Only 1-D and 2-D tensors are supported; other ranks are silently skipped.
func (a *Adam) Step() {
	a.T++
	// The bias-correction factors depend only on the step count, so compute
	// them once per step instead of once per element.
	bc1 := 1 - math.Pow(a.Beta1, float64(a.T))
	bc2 := 1 - math.Pow(a.Beta2, float64(a.T))

	for i, param := range a.Parameters {
		grad := param.Grad
		shape := param.Data.Shape()

		// Update the first-moment estimate: m = beta1*m + (1-beta1)*grad.
		m := a.M[i]
		newMData := make([]float64, param.Size())
		if len(shape) == 1 {
			for idx := 0; idx < shape[0]; idx++ {
				mVal, _ := m.Data.Get(idx)
				gradVal, _ := grad.Get(idx)
				newMData[idx] = a.Beta1*mVal + (1-a.Beta1)*gradVal
			}
		} else if len(shape) == 2 {
			rows, cols := shape[0], shape[1]
			for r := 0; r < rows; r++ {
				for c := 0; c < cols; c++ {
					mVal, _ := m.Data.Get(r, c)
					gradVal, _ := grad.Get(r, c)
					newMData[r*cols+c] = a.Beta1*mVal + (1-a.Beta1)*gradVal
				}
			}
		}
		newM, _ := NewTensor(newMData, shape)
		a.M[i] = newM

		// Update the second-moment estimate: v = beta2*v + (1-beta2)*grad^2.
		v := a.V[i]
		newVData := make([]float64, param.Size())
		if len(shape) == 1 {
			for idx := 0; idx < shape[0]; idx++ {
				vVal, _ := v.Data.Get(idx)
				gradVal, _ := grad.Get(idx)
				newVData[idx] = a.Beta2*vVal + (1-a.Beta2)*gradVal*gradVal
			}
		} else if len(shape) == 2 {
			rows, cols := shape[0], shape[1]
			for r := 0; r < rows; r++ {
				for c := 0; c < cols; c++ {
					vVal, _ := v.Data.Get(r, c)
					gradVal, _ := grad.Get(r, c)
					newVData[r*cols+c] = a.Beta2*vVal + (1-a.Beta2)*gradVal*gradVal
				}
			}
		}
		newV, _ := NewTensor(newVData, shape)
		a.V[i] = newV

		// Apply the bias-corrected update in place:
		// param = param - lr * (m/bc1) / (sqrt(v/bc2) + epsilon).
		if len(shape) == 1 {
			for idx := 0; idx < shape[0]; idx++ {
				paramVal, _ := param.Data.Get(idx)
				mVal, _ := newM.Data.Get(idx)
				vVal, _ := newV.Data.Get(idx)
				mHat := mVal / bc1
				vHat := vVal / bc2
				param.Data.Set(paramVal-a.LR*mHat/(math.Sqrt(vHat)+a.Epsilon), idx)
			}
		} else if len(shape) == 2 {
			rows, cols := shape[0], shape[1]
			for r := 0; r < rows; r++ {
				for c := 0; c < cols; c++ {
					paramVal, _ := param.Data.Get(r, c)
					mVal, _ := newM.Data.Get(r, c)
					vVal, _ := newV.Data.Get(r, c)
					mHat := mVal / bc1
					vHat := vVal / bc2
					param.Data.Set(paramVal-a.LR*mHat/(math.Sqrt(vHat)+a.Epsilon), r, c)
				}
			}
		}
	}
}

// ZeroGrad clears the gradients of all parameters.
func (a *Adam) ZeroGrad() {
	for _, param := range a.Parameters {
		param.ZeroGrad()
	}
}
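
// A minimal Adam usage sketch using the conventional defaults from the Adam
// paper (lr = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8). As above,
// the forward/backward calls are hypothetical placeholders:
//
//	opt := NewAdam([]*Tensor{w1, w2}, 0.001, 0.9, 0.999, 1e-8)
//	for epoch := 0; epoch < numEpochs; epoch++ {
//		opt.ZeroGrad()
//		loss := forward(w1, w2) // hypothetical forward pass
//		loss.Backward()         // hypothetical; must populate each param.Grad
//		opt.Step()              // applies the bias-corrected Adam update
//	}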