2015-03-12 02:51:31 +05:00
|
|
|
//Implementation of Principal Component Analysis(PCA) with SVD
|
|
|
|
package pca
|
|
|
|
|
2017-04-18 20:14:12 +01:00
|
|
|
import (
|
2018-03-23 23:39:55 +00:00
|
|
|
"gonum.org/v1/gonum/mat"
|
2015-03-12 02:51:31 +05:00
|
|
|
)
|
|
|
|
|
|
|
|
type PCA struct {
|
|
|
|
Num_components int
|
2018-03-23 23:39:55 +00:00
|
|
|
svd *mat.SVD
|
2015-03-12 02:51:31 +05:00
|
|
|
}
|
|
|
|
|
|
|
|
// Number of components. 0 - by default, use number of features as number of components
|
2017-04-18 20:14:12 +01:00
|
|
|
func NewPCA(num_components int) *PCA {
|
|
|
|
return &PCA{Num_components: num_components}
|
2015-03-12 02:51:31 +05:00
|
|
|
}
|
|
|
|
|
2018-02-27 11:39:57 +02:00
|
|
|
// Fit PCA model and transform data
|
|
|
|
// Need return is base.FixedDataGrid
|
2018-03-23 23:39:55 +00:00
|
|
|
func (pca *PCA) FitTransform(X *mat.Dense) *mat.Dense {
|
2018-02-27 11:39:57 +02:00
|
|
|
return pca.Fit(X).Transform(X)
|
|
|
|
}
|
2015-03-12 02:51:31 +05:00
|
|
|
|
2018-02-27 11:39:57 +02:00
|
|
|
// Fit PCA model
|
2018-03-23 23:39:55 +00:00
|
|
|
func (pca *PCA) Fit(X *mat.Dense) *PCA {
|
2018-02-27 11:39:57 +02:00
|
|
|
// Mean to input data
|
2015-03-12 02:51:31 +05:00
|
|
|
M := mean(X)
|
|
|
|
X = matrixSubVector(X, M)
|
|
|
|
|
2018-02-27 11:39:57 +02:00
|
|
|
// Get SVD decomposition from data
|
2018-03-23 23:39:55 +00:00
|
|
|
pca.svd = &mat.SVD{}
|
|
|
|
ok := pca.svd.Factorize(X, mat.SVDThin)
|
2017-04-18 20:14:12 +01:00
|
|
|
if !ok {
|
|
|
|
panic("Unable to factorize")
|
|
|
|
}
|
2015-03-12 02:51:31 +05:00
|
|
|
if pca.Num_components < 0 {
|
|
|
|
panic("Number of components can't be less than zero")
|
|
|
|
}
|
|
|
|
|
2018-02-27 11:39:57 +02:00
|
|
|
return pca
|
|
|
|
}
|
|
|
|
|
|
|
|
// Need return is base.FixedDataGrid
|
2018-03-23 23:39:55 +00:00
|
|
|
func (pca *PCA) Transform(X *mat.Dense) *mat.Dense {
|
2018-02-27 11:39:57 +02:00
|
|
|
if pca.svd == nil {
|
|
|
|
panic("You should to fit PCA model first")
|
|
|
|
}
|
|
|
|
|
|
|
|
num_samples, num_features := X.Dims()
|
|
|
|
|
2018-03-23 23:39:55 +00:00
|
|
|
vTemp := new(mat.Dense)
|
|
|
|
pca.svd.VTo(vTemp)
|
2015-03-12 02:51:31 +05:00
|
|
|
//Compute to full data
|
2017-04-18 20:14:12 +01:00
|
|
|
if pca.Num_components == 0 || pca.Num_components > num_features {
|
|
|
|
return compute(X, vTemp)
|
2015-03-12 02:51:31 +05:00
|
|
|
}
|
|
|
|
|
2017-04-18 20:14:12 +01:00
|
|
|
X = compute(X, vTemp)
|
2018-03-23 23:39:55 +00:00
|
|
|
result := mat.NewDense(num_samples, pca.Num_components, nil)
|
|
|
|
result.Copy(X)
|
2015-03-12 02:51:31 +05:00
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
//Helpful private functions
|
|
|
|
|
|
|
|
//Compute mean of the columns of input matrix
|
2018-03-23 23:39:55 +00:00
|
|
|
func mean(matrix *mat.Dense) *mat.Dense {
|
2015-03-12 02:51:31 +05:00
|
|
|
rows, cols := matrix.Dims()
|
|
|
|
meanVector := make([]float64, cols)
|
|
|
|
for i := 0; i < cols; i++ {
|
2018-03-23 23:39:55 +00:00
|
|
|
sum := mat.Sum(matrix.ColView(i))
|
2017-04-18 20:14:12 +01:00
|
|
|
meanVector[i] = sum / float64(rows)
|
2015-03-12 02:51:31 +05:00
|
|
|
}
|
2018-03-23 23:39:55 +00:00
|
|
|
return mat.NewDense(1, cols, meanVector)
|
2015-03-12 02:51:31 +05:00
|
|
|
}
|
|
|
|
|
|
|
|
// After computing of mean, compute: X(input matrix) - X(mean vector)
|
2018-03-23 23:39:55 +00:00
|
|
|
func matrixSubVector(mat, vec *mat.Dense) *mat.Dense {
|
2015-03-12 02:51:31 +05:00
|
|
|
rowsm, colsm := mat.Dims()
|
|
|
|
_, colsv := vec.Dims()
|
|
|
|
if colsv != colsm {
|
|
|
|
panic("Error in dimension")
|
|
|
|
}
|
|
|
|
for i := 0; i < rowsm; i++ {
|
|
|
|
for j := 0; j < colsm; j++ {
|
2017-04-18 20:14:12 +01:00
|
|
|
mat.Set(i, j, (mat.At(i, j) - vec.At(0, j)))
|
2015-03-12 02:51:31 +05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return mat
|
|
|
|
}
|
|
|
|
|
|
|
|
//Multiplication of X(input data) and V(from SVD)
|
2018-03-23 23:39:55 +00:00
|
|
|
func compute(X, Y mat.Matrix) *mat.Dense {
|
|
|
|
var ret mat.Dense
|
2017-04-18 20:14:12 +01:00
|
|
|
ret.Mul(X, Y)
|
|
|
|
return &ret
|
|
|
|
}
|