diff --git a/examples/trees/cart.go b/examples/trees/cart/cart.go
similarity index 95%
rename from examples/trees/cart.go
rename to examples/trees/cart/cart.go
index 5a19cab..15d5888 100644
--- a/examples/trees/cart.go
+++ b/examples/trees/cart/cart.go
@@ -7,7 +7,6 @@ import (
 
 	"github.com/sjwhitworth/golearn/base"
 	"github.com/sjwhitworth/golearn/trees"
-
 )
 
 func main() {
@@ -45,7 +44,7 @@ func main() {
 
 	// Create New Classification Tree
 	// Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels
-	decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
+	decTree := trees.NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
 
 	// Train Tree
 	err = decTree.Fit(trainData)
@@ -72,7 +71,7 @@ func main() {
 	trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5)
 
 	// Hyperparameters - Loss function, max Depth (-1 will split until pure)
-	regTree := NewDecisionTreeRegressor("mse", -1)
+	regTree := trees.NewDecisionTreeRegressor("mse", -1)
 
 	// Train Tree
 	err = regTree.Fit(trainRegData)
diff --git a/examples/trees/trees.go b/examples/trees/id3/trees.go
similarity index 100%
rename from examples/trees/trees.go
rename to examples/trees/id3/trees.go
diff --git a/trees/isolation.go b/trees/isolation.go
new file mode 100644
index 0000000..bfd9f03
--- /dev/null
+++ b/trees/isolation.go
@@ -0,0 +1,241 @@
+package trees
+
+import (
+	"math"
+	"math/rand"
+	"time"
+
+	"github.com/sjwhitworth/golearn/base"
+)
+
+// IsolationForest is an ensemble of randomly grown trees used for
+// anomaly detection: anomalous points are isolated by fewer random
+// splits, so shorter average path lengths yield higher scores.
+type IsolationForest struct {
+	nTrees   int
+	maxDepth int
+	subSpace int
+	trees    []regressorNode
+}
+
+// selectFeature picks a feature index uniformly at random. The shared
+// RNG is seeded once in NewIsolationForest, not per call.
+func selectFeature(data [][]float64) int64 {
+	return int64(rand.Intn(len(data[0])))
+}
+
+// minMax returns the smallest and largest value of the given feature
+// across data.
+func minMax(feature int64, data [][]float64) (float64, float64) {
+	min, max := math.Inf(1), math.Inf(-1)
+	for _, instance := range data {
+		if instance[feature] > max {
+			max = instance[feature]
+		}
+		if instance[feature] < min {
+			min = instance[feature]
+		}
+	}
+	return min, max
+}
+
+// selectValue draws a split threshold uniformly from (min, max),
+// nudged off the endpoints so neither side of the split is empty.
+func selectValue(min, max float64) float64 {
+	val := min + (rand.Float64() * (max - min))
+	if val == min {
+		val += 0.000001
+	} else if val == max {
+		val -= 0.000001
+	}
+	return val
+}
+
+// splitData partitions data into rows with feature <= val and the rest.
+func splitData(val float64, feature int64, data [][]float64) ([][]float64, [][]float64) {
+	var leftData, rightData [][]float64
+	for _, instance := range data {
+		if instance[feature] <= val {
+			leftData = append(leftData, instance)
+		} else {
+			rightData = append(rightData, instance)
+		}
+	}
+	return leftData, rightData
+}
+
+// checkData reports whether data contains at least two distinct rows;
+// a set of identical rows can never be separated by a split.
+func checkData(data [][]float64) bool {
+	for _, instance := range data {
+		for i, val := range instance {
+			if val != data[0][i] {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// buildTree grows an isolation tree by picking a random feature and a
+// random threshold at every node. Growth stops past maxDepth or once a
+// partition holds at most one distinct row.
+func buildTree(data [][]float64, upperNode regressorNode, depth int, maxDepth int) regressorNode {
+	depth++
+
+	upperNode.isNodeNeeded = true
+	if (depth > maxDepth) || (len(data) <= 1) || !checkData(data) {
+		upperNode.isNodeNeeded = false
+		return upperNode
+	}
+
+	// Retry until a feature with a non-degenerate value range is drawn.
+	var featureToSplit int64
+	var splitVal float64
+	min, max := 0.0, 0.0
+	for min == max {
+		featureToSplit = selectFeature(data)
+		min, max = minMax(featureToSplit, data)
+		splitVal = selectValue(min, max)
+	}
+
+	leftData, rightData := splitData(splitVal, featureToSplit, data)
+
+	upperNode.Feature = featureToSplit
+	upperNode.Threshold = splitVal
+	// Child sizes stand in for unbuilt subtree sizes at external nodes.
+	upperNode.LeftPred = float64(len(leftData))
+	upperNode.RightPred = float64(len(rightData))
+
+	var leftNode, rightNode regressorNode
+	leftNode = buildTree(leftData, leftNode, depth, maxDepth)
+	rightNode = buildTree(rightData, rightNode, depth, maxDepth)
+
+	if leftNode.isNodeNeeded {
+		upperNode.Left = &leftNode
+	}
+	if rightNode.isNodeNeeded {
+		upperNode.Right = &rightNode
+	}
+
+	return upperNode
+}
+
+// getRandomData samples subSpace rows from data with replacement.
+func getRandomData(data [][]float64, subSpace int) [][]float64 {
+	randomData := make([][]float64, 0, subSpace)
+	for i := 0; i < subSpace; i++ {
+		randomData = append(randomData, data[rand.Intn(len(data))])
+	}
+	return randomData
+}
+
+// NewIsolationForest returns an unfitted forest with nTrees trees of
+// at most maxDepth levels (-1 splits until pure), each built on a
+// sub-sample of subSpace rows. The RNG is seeded once here instead of
+// on every draw, which could repeat values within one clock tick.
+func NewIsolationForest(nTrees int, maxDepth int, subSpace int) IsolationForest {
+	rand.Seed(time.Now().UnixNano())
+	var iForest IsolationForest
+	iForest.nTrees = nTrees
+	iForest.maxDepth = maxDepth
+	iForest.subSpace = subSpace
+	return iForest
+}
+
+// Fit builds nTrees isolation trees, each from a fresh random
+// sub-sample of X.
+func (iForest *IsolationForest) Fit(X base.FixedDataGrid) {
+	data := preprocessData(X)
+
+	forest := make([]regressorNode, 0, iForest.nTrees)
+	for i := 0; i < iForest.nTrees; i++ {
+		subData := getRandomData(data, iForest.subSpace)
+		var tree regressorNode
+		tree = buildTree(subData, tree, 0, iForest.maxDepth)
+		forest = append(forest, tree)
+	}
+	iForest.trees = forest
+}
+
+// pathLength descends instance through tree and returns the number of
+// edges traversed; external nodes are credited with cFactor of their
+// subtree size, as in Liu et al.'s isolation-forest formulation.
+func pathLength(tree regressorNode, instance []float64, path float64) float64 {
+	path++
+
+	if instance[tree.Feature] <= tree.Threshold {
+		if tree.Left == nil {
+			if tree.LeftPred <= 1 {
+				return path
+			}
+			return path + cFactor(int(tree.LeftPred))
+		}
+		return pathLength(*tree.Left, instance, path)
+	}
+	if tree.Right == nil {
+		if tree.RightPred <= 1 {
+			return path
+		}
+		return path + cFactor(int(tree.RightPred))
+	}
+	return pathLength(*tree.Right, instance, path)
+}
+
+// evaluateInstance collects the path length of instance in every tree.
+func evaluateInstance(instance []float64, forest []regressorNode) []float64 {
+	paths := make([]float64, 0, len(forest))
+	for _, tree := range forest {
+		paths = append(paths, pathLength(tree, instance, 0))
+	}
+	return paths
+}
+
+// cFactor is the average path length of an unsuccessful search in a
+// binary search tree of n nodes: 2*H(n-1) - 2*(n-1)/n, with the
+// harmonic number approximated by ln plus the Euler-Mascheroni
+// constant. Guarded so n <= 1 cannot produce -Inf via log(0).
+func cFactor(n int) float64 {
+	if n <= 1 {
+		return 0
+	}
+	return 2.0*(math.Log(float64(n-1))+0.5772156649) - (float64(2.0*(n-1)) / float64(n))
+}
+
+// anomalyScore computes 2^(-E[path]/c(n)); values near 1 mark
+// anomalies, values well below 0.5 mark normal points.
+func anomalyScore(instance []float64, forest []regressorNode, n int) float64 {
+	paths := evaluateInstance(instance, forest)
+	E := 0.0
+	for _, path := range paths {
+		E += path
+	}
+	E /= float64(len(paths))
+	c := cFactor(n)
+	return math.Pow(2, (-1 * E / c))
+}
+
+// Predict returns the anomaly score of every row of X.
+func (iForest *IsolationForest) Predict(X base.FixedDataGrid) []float64 {
+	data := preprocessData(X)
+
+	preds := make([]float64, 0, len(data))
+	for _, instance := range data {
+		preds = append(preds, anomalyScore(instance, iForest.trees, iForest.subSpace))
+	}
+	return preds
+}
+
+// preprocessData converts X to dense float rows and appends the class
+// attribute as the final column of every row.
+func preprocessData(X base.FixedDataGrid) [][]float64 {
+	data := convertInstancesToProblemVec(X)
+	class, err := regressorConvertInstancesToLabelVec(X)
+	if err != nil {
+		panic(err)
+	}
+	for i, point := range class {
+		data[i] = append(data[i], point)
+	}
+	return data
+}