1
0
mirror of https://github.com/sjwhitworth/golearn.git synced 2025-04-28 13:48:56 +08:00

Remove unused cross_validation package

This commit is contained in:
Amit Kumar Gupta 2014-08-22 09:34:32 +00:00
parent 529b3bcaa5
commit f14020f78c
2 changed files with 0 additions and 157 deletions

View File

@ -1,96 +0,0 @@
package cross_validation
import (
"fmt"
"github.com/gonum/matrix/mat64"
"math/rand"
"sync"
"time"
)
// shuffleMatrix copies dataset, shuffles the rows of the copy using a
// generator seeded with seed, and stores the resulting split in
// returnDatasets: index 0 receives the first rowCount-testSize shuffled rows
// and index 1 receives the remaining testSize rows. Both result matrices are
// built on slices of the shuffled copy's raw data. wg.Done is called after
// both entries have been stored.
func shuffleMatrix(returnDatasets []*mat64.Dense, dataset mat64.Matrix, testSize int, seed int64, wg *sync.WaitGroup) {
	// Work on a copy so the caller's matrix is left untouched.
	shuffled := mat64.DenseCopyOf(dataset)
	rows, cols := shuffled.Dims()

	rng := rand.New(rand.NewSource(seed))
	scratch := make([]float64, cols)

	// Fisher-Yates shuffle, swapping whole rows.
	for i := 0; i < rows; i++ {
		j := rng.Intn(i + 1)
		if j == i {
			continue
		}
		// Copy row i into the scratch buffer before it is overwritten.
		copy(scratch, shuffled.RowView(i))
		shuffled.SetRow(i, shuffled.RowView(j))
		shuffled.SetRow(j, scratch)
	}

	// The first trainRows rows form the training set, the rest the test set.
	trainRows := rows - testSize
	boundary := trainRows * cols
	raw := shuffled.RawMatrix().Data
	returnDatasets[0] = mat64.NewDense(trainRows, cols, raw[:boundary])
	returnDatasets[1] = mat64.NewDense(testSize, cols, raw[boundary:])
	wg.Done()
}
// TrainTestSplit shuffles the rows of one or two matrices and splits each of
// them into a training part and a test part.
//
// size selects the number of test rows: an int is used as the row count
// directly, a float64 is treated as the fraction of rows to put in the test
// set (rounded to the nearest integer, e.g. 0.1 of 20 rows gives 2).
//
// randomState selects the shuffle seed: an int gives a reproducible shuffle,
// any other value (typically nil) seeds the shuffle from the current time.
//
// datasets must contain a data matrix and may contain a second, single-column
// label matrix with the same number of rows. Every matrix is shuffled with the
// same seed, so rows of equally sized matrices are permuted identically.
//
// The result holds, for each input matrix in order, its training rows followed
// by its test rows: [train, test] for one input and
// [trainData, testData, trainLabels, testLabels] for two. A non-nil error is
// returned when the arguments are invalid.
func TrainTestSplit(size interface{}, randomState interface{}, datasets ...*mat64.Dense) ([]*mat64.Dense, error) {
	// Input should be one or two matrices. Check this before touching
	// datasets[0] so that an empty argument list is an error, not a panic.
	dataCount := len(datasets)
	if dataCount < 1 || dataCount > 2 {
		return nil, fmt.Errorf("expected 1 or 2 datasets, got %d", dataCount)
	}

	// Get number of instances (rows).
	instanceCount, _ := datasets[0].Dims()

	if dataCount == 2 {
		// Test for consistency between data and labels.
		labelCount, labelFeatures := datasets[1].Dims()
		if labelCount != instanceCount {
			return nil, fmt.Errorf("data and labels must have the same number of instances")
		}
		if labelFeatures != 1 {
			return nil, fmt.Errorf("label matrix must have single feature")
		}
	}

	var testSize int
	switch size := size.(type) {
	case int:
		// An integer is the test data instance count.
		testSize = size
	case float64:
		// A float is the fraction of instances allocated to the test set;
		// adding 0.5 before truncation rounds to the nearest integer.
		testSize = int(float64(instanceCount)*size + 0.5)
	default:
		return nil, fmt.Errorf("expected a test instance count (int) or percentage (float64)")
	}
	// A test size outside [0, instanceCount] would make the slicing in
	// shuffleMatrix panic inside a goroutine; report it as an error instead.
	if testSize < 0 || testSize > instanceCount {
		return nil, fmt.Errorf("test size %d is out of range for %d instances", testSize, instanceCount)
	}

	// Create a deterministic shuffle, or a "random" one based on current time.
	var randSeed int64
	if seed, ok := randomState.(int); ok {
		randSeed = int64(seed)
	} else {
		// Nanosecond resolution keeps calls made within the same second from
		// sharing a seed.
		randSeed = time.Now().UnixNano()
	}

	// Wait group for goroutine synchronization.
	wg := new(sync.WaitGroup)
	wg.Add(dataCount)

	// Return slice holds a (train, test) pair per input matrix.
	returnDatasets := make([]*mat64.Dense, 2*dataCount)
	for i, dataset := range datasets {
		// Each goroutine gets its own non-overlapping two-element window, so
		// the return order is fixed and no two goroutines write the same slot.
		go shuffleMatrix(returnDatasets[2*i:2*i+2], dataset, testSize, randSeed, wg)
	}
	wg.Wait()

	return returnDatasets, nil
}

View File

@ -1,61 +0,0 @@
package cross_validation
import (
//. "github.com/smartystreets/goconvey/convey"
"github.com/gonum/matrix/mat64"
"math/rand"
"testing"
"time"
)
// Shared fixtures for the tests in this file; they are populated by init.
var (
	// Flat backing slices handed to mat64.NewDense.
	flatValues []float64
	flatLabels []float64

	// Matrices built on top of the flat slices above.
	values *mat64.Dense
	labels *mat64.Dense
)
// init fills the shared fixtures: values holds 1..80 laid out as a 20x4
// matrix and labels holds one random 0/1 entry per row as a 20x1 matrix.
func init() {
	const (
		rows = 20
		cols = 4
	)

	flatValues = make([]float64, rows*cols)
	flatLabels = make([]float64, rows)

	for r := 0; r < rows; r++ {
		for c := 0; c < cols; c++ {
			k := r*cols + c
			flatValues[k] = float64(k + 1)
			// A label is drawn for every cell, so each row's label is
			// overwritten cols times and only the last draw is kept.
			flatLabels[r] = float64(rand.Intn(2))
		}
	}

	values = mat64.NewDense(rows, cols, flatValues)
	labels = mat64.NewDense(rows, 1, flatLabels)
}
// TestTrainTrainTestSplit checks that TrainTestSplit produces different
// shuffles for time-based seeds taken a second apart, and identical shuffles
// when the same explicit seed is used, both without and with a label matrix.
func TestTrainTrainTestSplit(t *testing.T) {
	nolab1, err := TrainTestSplit(4, nil, values)
	if err != nil {
		// Fatal, not Error: the result is nil on failure and is indexed below.
		t.Fatal(err)
	}
	// 20 rows with a test size of 4 leave 16 training rows.
	if r, c := nolab1[0].Dims(); r != 16 || c != 4 {
		t.Errorf("Expected 16x4 training matrix, got %dx%d", r, c)
	}
	if r, c := nolab1[1].Dims(); r != 4 || c != 4 {
		t.Errorf("Expected 4x4 test matrix, got %dx%d", r, c)
	}

	// A nil random state yields a time-based seed; wait so that the second
	// call is guaranteed to get a different one.
	time.Sleep(time.Second)
	nolab2, err := TrainTestSplit(4, nil, values)
	if err != nil {
		t.Fatal(err)
	}
	if nolab1[0].Equals(nolab2[0]) {
		t.Errorf("Shuffle with different seed returned same matrix")
	}

	// The same explicit seed must reproduce the same training matrix.
	nolab1, err = TrainTestSplit(4, 1, values)
	if err != nil {
		t.Fatal(err)
	}
	nolab2, err = TrainTestSplit(4, 1, values)
	if err != nil {
		t.Fatal(err)
	}
	if !nolab1[0].Equals(nolab2[0]) {
		t.Errorf("Shuffle with same seed returned different matrix")
	}

	// Same thing for data with labels.
	lab1, err := TrainTestSplit(0.1, 10, values, labels)
	if err != nil {
		t.Fatal(err)
	}
	lab2, err := TrainTestSplit(0.1, 10, values, labels)
	if err != nil {
		t.Fatal(err)
	}
	if !lab1[0].Equals(lab2[0]) {
		t.Errorf("Shuffle with same seed returned different matrix")
	}
}