golearn/optimisation/gradient_descent.go

package optimisation

import "github.com/gonum/matrix/mat64"
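
// In effect, both optimisers below perform the gradient descent update
//
//	theta = theta - (alpha / m) * X^T * (X*theta - y)
//
// where m is the number of training examples and alpha is the learning rate.
// BatchGradientDescent applies it over the whole dataset each epoch, while
// StochasticGradientDescent applies it one row at a time.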

// BatchGradientDescent finds the local minimum of a function by batch gradient
// descent, recomputing the gradient over the full dataset every epoch.
// See http://en.wikipedia.org/wiki/Gradient_descent for more details.
func BatchGradientDescent(x, y, theta *mat64.Dense, alpha float64, epoch int) *mat64.Dense {
	// m is the number of training examples.
	m, _ := y.Dims()
	for i := 0; i < epoch; i++ {
		// xFlat holds the transpose of x.
		xFlat := mat64.DenseCopyOf(x)
		xFlat.TCopy(xFlat)
		temp := mat64.DenseCopyOf(x)
		// Calculate our best prediction, given theta
		temp.Mul(temp, theta)
		// Calculate our error from the real values
		temp.Sub(temp, y)
		xFlat.Mul(xFlat, temp)
		// Temporary hack to get around the fact there is no scalar division in
		// mat64: scale every element by alpha/m to form the gradient.
		xFlatRow, _ := xFlat.Dims()
		gradient := make([]float64, 0)
		for k := 0; k < xFlatRow; k++ {
			row := xFlat.RowView(k)
			for v := range row {
				divd := row[v] / float64(m) * alpha
				gradient = append(gradient, divd)
			}
		}
		grows := len(gradient)
		grad := mat64.NewDense(grows, 1, gradient)
		theta.Sub(theta, grad)
	}
	return theta
}
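
// exampleBatchGradientDescent is an illustrative sketch, not part of the
// library API: it fits a toy linear model y = 2x whose design matrix carries a
// leading bias column of ones. The data, learning rate and epoch count are
// arbitrary values chosen for demonstration.
func exampleBatchGradientDescent() *mat64.Dense {
	// Three training examples, two features: bias term and x.
	x := mat64.NewDense(3, 2, []float64{
		1, 1,
		1, 2,
		1, 3,
	})
	// Targets follow y = 2x.
	y := mat64.NewDense(3, 1, []float64{2, 4, 6})
	// Start from theta = 0; BatchGradientDescent mutates and returns theta.
	theta := mat64.NewDense(2, 1, []float64{0, 0})
	return BatchGradientDescent(x, y, theta, 0.01, 1000)
}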

// StochasticGradientDescent updates theta one row of the dataset at a time
// rather than computing the cost over the entire dataset on every step, which
// makes each update much cheaper. The trade-off is accuracy: this is mitigated
// by running multiple SGD processes in parallel (the number of goroutines
// spawned is specified by procs, which must be at least two) and averaging the
// resulting theta values.
func StochasticGradientDescent(x, y, theta *mat64.Dense, alpha float64, epoch, procs int) *mat64.Dense {
	m, _ := y.Dims()
	resultPipe := make(chan *mat64.Dense)
	results := make([]*mat64.Dense, 0)
	for p := 0; p < procs; p++ {
		go func() {
			// DenseCopyOf copies the underlying data, so each goroutine works
			// on its own copy of theta rather than a shared pointer.
			thetaCopy := mat64.DenseCopyOf(theta)
			for i := 0; i < epoch; i++ {
				for k := 0; k < m; k++ {
					datXtemp := x.RowView(k)
					datYtemp := y.RowView(k)
					datX := mat64.NewDense(1, len(datXtemp), datXtemp)
					datY := mat64.NewDense(1, 1, datYtemp)
					datXFlat := mat64.DenseCopyOf(datX)
					datXFlat.TCopy(datXFlat)
					datX.Mul(datX, thetaCopy)
					datX.Sub(datX, datY)
					datXFlat.Mul(datXFlat, datX)
					// Horrible hack to get around the fact there is no scalar
					// division in mat64.
					xFlatRow, _ := datXFlat.Dims()
					gradient := make([]float64, 0)
					for j := 0; j < xFlatRow; j++ {
						row := datXFlat.RowView(j)
						for v := range row {
							divd := row[v] / float64(m) * alpha
							gradient = append(gradient, divd)
						}
					}
					grows := len(gradient)
					grad := mat64.NewDense(grows, 1, gradient)
					thetaCopy.Sub(thetaCopy, grad)
				}
			}
			resultPipe <- thetaCopy
		}()
	}
	// Collect one result per goroutine, then average them.
	for {
		d := <-resultPipe
		results = append(results, d)
		if len(results) == procs {
			return averageTheta(results)
		}
	}
}
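
// exampleStochasticGradientDescent is an illustrative sketch, not part of the
// library API: it runs the same toy fit as the batch example above, using two
// parallel SGD goroutines whose resulting parameters are averaged. All values
// are arbitrary.
func exampleStochasticGradientDescent() *mat64.Dense {
	x := mat64.NewDense(3, 2, []float64{
		1, 1,
		1, 2,
		1, 3,
	})
	y := mat64.NewDense(3, 1, []float64{2, 4, 6})
	theta := mat64.NewDense(2, 1, []float64{0, 0})
	// procs is 2 here; averageTheta panics if given fewer than two results.
	return StochasticGradientDescent(x, y, theta, 0.01, 1000, 2)
}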

// averageTheta returns the element-wise mean of the supplied parameter
// matrices as a column vector. It accumulates into matrices[0] and panics if
// fewer than two matrices are provided.
func averageTheta(matrices []*mat64.Dense) *mat64.Dense {
	if len(matrices) < 2 {
		panic("Must provide at least two matrices to average")
	}
	base := matrices[0]
	rows, _ := base.Dims()
	// Sum all matrices into base.
	for i := 1; i < len(matrices); i++ {
		base.Add(base, matrices[i])
	}
	// Divide each element of the sum by the number of matrices.
	averaged := make([]float64, 0)
	for i := 0; i < rows; i++ {
		row := base.RowView(i)
		for j := range row {
			divd := row[j] / float64(len(matrices))
			averaged = append(averaged, divd)
		}
	}
	baseDense := mat64.NewDense(rows, 1, averaged)
	return baseDense
}