package optimisation

import "github.com/gonum/matrix/mat64"

// BatchGradientDescent finds the local minimum of a function by computing the
// gradient over the entire dataset on every iteration.
// See http://en.wikipedia.org/wiki/Gradient_descent for more details.
func BatchGradientDescent(x, y, theta *mat64.Dense, alpha float64, epoch int) *mat64.Dense {
    m, _ := y.Dims()
    for i := 0; i < epoch; i++ {
        xFlat := mat64.DenseCopyOf(x)
        xFlat.TCopy(xFlat)
        temp := mat64.DenseCopyOf(x)

        // Calculate our best prediction, given theta.
        temp.Mul(temp, theta)

        // Calculate our error from the real values.
        temp.Sub(temp, y)
        xFlat.Mul(xFlat, temp)

        // Temporary hack to get around the fact that there is no scalar
        // division in mat64: scale each element of the gradient by alpha/m
        // manually.
        xFlatRow, _ := xFlat.Dims()
        gradient := make([]float64, 0)
        for k := 0; k < xFlatRow; k++ {
            row := xFlat.RowView(k)
            for v := range row {
                divd := row[v] / float64(m) * alpha
                gradient = append(gradient, divd)
            }
        }
        grows := len(gradient)
        grad := mat64.NewDense(grows, 1, gradient)
        theta.Sub(theta, grad)
    }
    return theta
}

// StochasticGradientDescent updates the parameters of theta on a random row
// selection from the matrix. It is faster than batch gradient descent because
// it does not compute the cost function over the entire dataset every time;
// instead, it calculates the error parameters over only one row of the dataset
// at a time. In return, there is a trade-off in accuracy, which is minimised by
// running multiple SGD processes in parallel (the number of goroutines spawned
// is specified by the procs variable) and taking an average of the results.
func StochasticGradientDescent(x, y, theta *mat64.Dense, alpha float64, epoch, procs int) *mat64.Dense {
    m, _ := y.Dims()
    resultPipe := make(chan *mat64.Dense)
    results := make([]*mat64.Dense, 0)

    for p := 0; p < procs; p++ {
        go func() {
            // DenseCopyOf allocates a new matrix, so each goroutine updates
            // its own copy of theta rather than sharing a pointer to it.
            thetaCopy := mat64.DenseCopyOf(theta)
            for i := 0; i < epoch; i++ {
                for k := 0; k < m; k++ {
                    datXtemp := x.RowView(k)
                    datYtemp := y.RowView(k)
                    datX := mat64.NewDense(1, len(datXtemp), datXtemp)
                    datY := mat64.NewDense(1, 1, datYtemp)
                    datXFlat := mat64.DenseCopyOf(datX)
                    datXFlat.TCopy(datXFlat)
                    datX.Mul(datX, thetaCopy)
                    datX.Sub(datX, datY)
                    datXFlat.Mul(datXFlat, datX)

                    // Horrible hack to get around the fact that there is no
                    // elementwise division in mat64: scale each element of
                    // the gradient by alpha/m manually.
                    xFlatRow, _ := datXFlat.Dims()
                    gradient := make([]float64, 0)
                    for r := 0; r < xFlatRow; r++ {
                        row := datXFlat.RowView(r)
                        for c := range row {
                            divd := row[c] / float64(m) * alpha
                            gradient = append(gradient, divd)
                        }
                    }
                    grows := len(gradient)
                    grad := mat64.NewDense(grows, 1, gradient)
                    thetaCopy.Sub(thetaCopy, grad)
                }
            }
            resultPipe <- thetaCopy
        }()
    }

    for {
        select {
        case d := <-resultPipe:
            results = append(results, d)
            if len(results) == procs {
                return averageTheta(results)
            }
        }
    }
}

// averageTheta sums the given theta matrices element-wise and divides each
// element by the number of matrices, returning the averaged column vector.
func averageTheta(matrices []*mat64.Dense) *mat64.Dense {
    if len(matrices) < 2 {
        panic("Must provide at least two matrices to average")
    }
    base := matrices[0]
    rows, _ := base.Dims()
    for i := 1; i < len(matrices); i++ {
        base.Add(base, matrices[i])
    }

    averaged := make([]float64, 0)
    for i := 0; i < rows; i++ {
        row := base.RowView(i)
        for j := range row {
            divd := row[j] / float64(len(matrices))
            averaged = append(averaged, divd)
        }
    }

    baseDense := mat64.NewDense(rows, 1, averaged)
    return baseDense
}
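
// The sketch below is an illustrative usage example rather than part of the
// package; the data values, learning rate, epoch count and goroutine count
// are arbitrary assumptions. It fits a two-parameter linear model (intercept
// and slope) to four hypothetical points. Note that BatchGradientDescent
// mutates the theta it is given, so each call here is passed its own freshly
// allocated theta.
//
//    x := mat64.NewDense(4, 2, []float64{
//        1, 1,
//        1, 2,
//        1, 3,
//        1, 4,
//    })
//    y := mat64.NewDense(4, 1, []float64{6, 5, 7, 10})
//
//    // Batch gradient descent: the full dataset is used on every epoch.
//    thetaBatch := BatchGradientDescent(x, y, mat64.NewDense(2, 1, []float64{0, 0}), 0.01, 1000)
//
//    // Stochastic gradient descent: one row at a time, averaged over 4 goroutines.
//    thetaSGD := StochasticGradientDescent(x, y, mat64.NewDense(2, 1, []float64{0, 0}), 0.01, 1000, 4)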