1
0
mirror of https://github.com/sjwhitworth/golearn.git synced 2025-04-26 13:49:14 +08:00

Adding Isolation + Fixing previous import issue

This commit is contained in:
Ayush 2020-08-24 14:26:45 +05:30
parent 6aa37aca00
commit 452acbabb3
3 changed files with 220 additions and 3 deletions

View File

@ -7,7 +7,6 @@ import (
"github.com/sjwhitworth/golearn/base"
"github.com/sjwhitworth/golearn/trees"
)
func main() {
@ -45,7 +44,7 @@ func main() {
// Create New Classification Tree
// Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels
decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
decTree := trees.NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
// Train Tree
err = decTree.Fit(trainData)
@ -72,7 +71,7 @@ func main() {
trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5)
// Hyperparameters - Loss function, max Depth (-1 will split until pure)
regTree := NewDecisionTreeRegressor("mse", -1)
regTree := trees.NewDecisionTreeRegressor("mse", -1)
// Train Tree
err = regTree.Fit(trainRegData)

218
trees/isolation.go Normal file
View File

@ -0,0 +1,218 @@
package trees
import (
"math"
"math/rand"
"time"
"github.com/sjwhitworth/golearn/base"
)
type IsolationForest struct {
nTrees int
maxDepth int
subSpace int
trees []regressorNode
}
func selectFeature(data [][]float64) int64 {
rand.Seed(time.Now().UnixNano())
return int64(rand.Intn(len(data[0])))
}
func minMax(feature int64, data [][]float64) (float64, float64) {
var min, max float64
min = math.Inf(1)
max = math.Inf(-1)
for _, instance := range data {
if instance[feature] > max {
max = instance[feature]
}
if instance[feature] < min {
min = instance[feature]
}
}
return min, max
}
func selectValue(min, max float64) float64 {
rand.Seed(time.Now().UnixNano())
val := min + (rand.Float64() * (max - min))
if val == min {
val += 0.000001
} else if val == max {
val -= 0.000001
}
return val
}
func splitData(val float64, feature int64, data [][]float64) ([][]float64, [][]float64) {
var leftData, rightData [][]float64
for _, instance := range data {
if instance[feature] <= val {
leftData = append(leftData, instance)
} else {
rightData = append(rightData, instance)
}
}
return leftData, rightData
}
func checkData(data [][]float64) bool {
for _, instance := range data {
for i, val := range instance {
if val != data[0][i] {
return true
}
}
}
return false
}
func buildTree(data [][]float64, upperNode regressorNode, depth int, maxDepth int) regressorNode {
depth++
upperNode.isNodeNeeded = true
if (depth > maxDepth) || (len(data) <= 1) || !checkData(data) {
upperNode.isNodeNeeded = false
return upperNode
}
var featureToSplit int64
var splitVal float64
var min, max float64
min, max = 0.0, 0.0
for min == max {
featureToSplit = selectFeature(data)
min, max = minMax(featureToSplit, data)
splitVal = selectValue(min, max)
}
leftData, rightData := splitData(splitVal, featureToSplit, data)
upperNode.Feature = featureToSplit
upperNode.Threshold = splitVal
upperNode.LeftPred = float64(len(leftData))
upperNode.RightPred = float64(len(rightData))
var leftNode, rightNode regressorNode
leftNode = buildTree(leftData, leftNode, depth, maxDepth)
rightNode = buildTree(rightData, rightNode, depth, maxDepth)
if leftNode.isNodeNeeded {
upperNode.Left = &leftNode
}
if rightNode.isNodeNeeded {
upperNode.Right = &rightNode
}
return upperNode
}
func getRandomData(data [][]float64, subSpace int) [][]float64 {
var randomData [][]float64
rand.Seed(time.Now().UnixNano())
for i := 0; i < subSpace; i++ {
randomData = append(randomData, data[rand.Intn(len(data))])
}
return randomData
}
func NewIsolationForest(nTrees int, maxDepth int, subSpace int) IsolationForest {
var iForest IsolationForest
iForest.nTrees = nTrees
iForest.maxDepth = maxDepth
iForest.subSpace = subSpace
return iForest
}
func (iForest *IsolationForest) Fit(X base.FixedDataGrid) {
data := preprocessData(X)
nTrees := iForest.nTrees
subSpace := iForest.subSpace
maxDepth := iForest.maxDepth
var forest []regressorNode
for i := 0; i < nTrees; i++ {
subData := getRandomData(data, subSpace)
var tree regressorNode
tree = buildTree(subData, tree, 0, maxDepth)
forest = append(forest, tree)
}
iForest.trees = forest
}
func pathLength(tree regressorNode, instance []float64, path float64) float64 {
path++
if instance[tree.Feature] <= tree.Threshold {
if tree.Left == nil {
if tree.LeftPred <= 1 {
return path
} else {
return path + cFactor(int(tree.LeftPred))
}
}
path = pathLength(*tree.Left, instance, path)
} else {
if tree.Right == nil {
if tree.RightPred <= 1 {
return path
} else {
return path + cFactor(int(tree.RightPred))
}
}
path = pathLength(*tree.Right, instance, path)
}
return path
}
func evaluateInstance(instance []float64, forest []regressorNode) []float64 {
var paths []float64
for _, tree := range forest {
paths = append(paths, pathLength(tree, instance, 0))
}
return paths
}
func cFactor(n int) float64 {
return 2.0*(math.Log(float64(n-1))+0.5772156649) - (float64(2.0*(n-1)) / float64(n))
}
func anomalyScore(instance []float64, forest []regressorNode, n int) float64 {
paths := evaluateInstance(instance, forest)
E := 0.0
for _, path := range paths {
E += path
}
E /= float64(len(paths))
c := cFactor(n)
return math.Pow(2, (-1 * E / c))
}
func (iForest *IsolationForest) Predict(X base.FixedDataGrid) []float64 {
data := preprocessData(X)
var preds []float64
for _, instance := range data {
score := anomalyScore(instance, iForest.trees, iForest.subSpace)
preds = append(preds, score)
}
return preds
}
func preprocessData(X base.FixedDataGrid) [][]float64 {
data := convertInstancesToProblemVec(X)
class, err := regressorConvertInstancesToLabelVec(X)
if err != nil {
panic(err)
}
for i, point := range class {
data[i] = append(data[i], point)
}
return data
}