
Merge pull request #6 from Yushgoel/IsolationForest

Adds the requested Isolation Forest feature for outlier detection.
Ayush Goel 2020-08-30 18:19:39 +05:30 committed by GitHub
commit fef30344d3
7 changed files with 1360 additions and 9 deletions

File diff suppressed because it is too large.


@@ -7,7 +7,6 @@ import (
"github.com/sjwhitworth/golearn/base"
"github.com/sjwhitworth/golearn/trees"
)
func main() {
@@ -45,7 +44,7 @@ func main() {
// Create New Classification Tree
// Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels
-	decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
+	decTree := trees.NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
// Train Tree
err = decTree.Fit(trainData)
@@ -72,7 +71,7 @@ func main() {
trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5)
// Hyperparameters - Loss function, max Depth (-1 will split until pure)
-	regTree := NewDecisionTreeRegressor("mse", -1)
+	regTree := trees.NewDecisionTreeRegressor("mse", -1)
// Train Tree
err = regTree.Fit(trainRegData)


@@ -0,0 +1,57 @@
// Example of how to use Isolation Forest for outlier detection
package main
import (
"fmt"
"github.com/sjwhitworth/golearn/base"
"github.com/sjwhitworth/golearn/trees"
)
func main() {
/* Isolation Forest is used for outlier detection.
The algorithm works by randomly splitting the data, so results won't be exactly reproducible,
but outliers will generally still be classified as outliers. */
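// (Note: the implementation reseeds math/rand from the wall clock on every
// split, so there is no seed parameter that would make a run reproducible.)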
// Load data for outlier detection - a gaussian distribution of 1000 normal datapoints, with 10 outliers at the end
csvData, err := base.ParseCSVToInstances("../datasets/gaussian_outliers.csv", true)
if err != nil {
panic(err)
}
// Create New Isolation Forest with 100 trees, max depth 100, and each tree will use 850 datapoints
forest := trees.NewIsolationForest(100, 100, 850)
// Fit the isolation forest to the data. Note that all class attributes are also used during training,
// so remove any class attributes you don't want to use before calling Fit.
forest.Fit(csvData)
// Make predictions. Generally, Isolation Forest is used for interpolation, not extrapolation.
// Predictions are returned as anomaly scores from 0 to 1: close to 0 - not an outlier, close to 1 - an outlier.
preds := forest.Predict(csvData)
// Let's find the average and minimum anomaly score for the normal data
var avgScore float64
min := 1.0
for i := 0; i < 1000; i++ {
temp := preds[i]
avgScore += temp
if temp < min {
min = temp
}
}
fmt.Println(avgScore / 1000)
fmt.Println(min)
// Now let's print the anomaly scores for the outliers.
// You should find that these values are much higher (around 0.7) compared to the scores for normal data.
fmt.Println("Anomaly scores for the outliers are:")
for i := 1000; i < 1010; i++ {
fmt.Print(" ")
fmt.Println(preds[i])
}
}

trees/isolation.go (new file, 234 lines)

@@ -0,0 +1,234 @@
package trees
import (
"math"
"math/rand"
"time"
"github.com/sjwhitworth/golearn/base"
)
type IsolationForest struct {
nTrees int
maxDepth int
subSpace int
trees []regressorNode
}
// Select a random feature for splitting from the data.
func selectFeature(data [][]float64) int64 {
rand.Seed(time.Now().UnixNano())
return int64(rand.Intn(len(data[0])))
}
// Find the minimum and maximum values of a feature. Used so that we can choose a random threshold.
func minMax(feature int64, data [][]float64) (float64, float64) {
var min, max float64
min = math.Inf(1)
max = math.Inf(-1)
for _, instance := range data {
if instance[feature] > max {
max = instance[feature]
}
if instance[feature] < min {
min = instance[feature]
}
}
return min, max
}
// Select a random threshold between the minimum and maximum of the feature.
func selectValue(min, max float64) float64 {
rand.Seed(time.Now().UnixNano())
val := min + (rand.Float64() * (max - min))
if val == min {
val += 0.000001
} else if val == max {
val -= 0.000001
}
return val
}
// Split the data based on the threshold.
func splitData(val float64, feature int64, data [][]float64) ([][]float64, [][]float64) {
var leftData, rightData [][]float64
for _, instance := range data {
if instance[feature] <= val {
leftData = append(leftData, instance)
} else {
rightData = append(rightData, instance)
}
}
return leftData, rightData
}
// Make sure that the data can still be split (i.e. not all datapoints are duplicates).
func checkData(data [][]float64) bool {
for _, instance := range data {
for i, val := range instance {
if val != data[0][i] {
return true
}
}
}
return false
}
// Recursively build a tree by randomly choosing a feature to split on until nodes are pure or max depth is reached.
func buildTree(data [][]float64, upperNode regressorNode, depth int, maxDepth int) regressorNode {
depth++
upperNode.isNodeNeeded = true
if (depth > maxDepth) || (len(data) <= 1) || !checkData(data) {
upperNode.isNodeNeeded = false
return upperNode
}
var featureToSplit int64
var splitVal float64
var min, max float64
min, max = 0.0, 0.0
for min == max {
featureToSplit = selectFeature(data)
min, max = minMax(featureToSplit, data)
splitVal = selectValue(min, max)
}
leftData, rightData := splitData(splitVal, featureToSplit, data)
upperNode.Feature = featureToSplit
upperNode.Threshold = splitVal
upperNode.LeftPred = float64(len(leftData))
upperNode.RightPred = float64(len(rightData))
var leftNode, rightNode regressorNode
leftNode = buildTree(leftData, leftNode, depth, maxDepth)
rightNode = buildTree(rightData, rightNode, depth, maxDepth)
if leftNode.isNodeNeeded {
upperNode.Left = &leftNode
}
if rightNode.isNodeNeeded {
upperNode.Right = &rightNode
}
return upperNode
}
// Get a random subset of the data. Helps make each tree in the forest different.
func getRandomData(data [][]float64, subSpace int) [][]float64 {
var randomData [][]float64
rand.Seed(time.Now().UnixNano())
for i := 0; i < subSpace; i++ {
randomData = append(randomData, data[rand.Intn(len(data))])
}
return randomData
}
// NewIsolationForest creates a new isolation forest. nTrees is the number of trees in the forest, maxDepth is the maximum depth of each tree, and subSpace is the number of datapoints to use per tree.
func NewIsolationForest(nTrees int, maxDepth int, subSpace int) IsolationForest {
var iForest IsolationForest
iForest.nTrees = nTrees
iForest.maxDepth = maxDepth
iForest.subSpace = subSpace
return iForest
}
// Fit builds the forest from the data, using the configured hyperparameters.
func (iForest *IsolationForest) Fit(X base.FixedDataGrid) {
data := preprocessData(X)
nTrees := iForest.nTrees
subSpace := iForest.subSpace
maxDepth := iForest.maxDepth
var forest []regressorNode
for i := 0; i < nTrees; i++ {
subData := getRandomData(data, subSpace)
var tree regressorNode
tree = buildTree(subData, tree, 0, maxDepth)
forest = append(forest, tree)
}
iForest.trees = forest
}
// Calculate the path length to reach a leaf node for a datapoint. Outliers have smaller path lengths than standard data points.
func pathLength(tree regressorNode, instance []float64, path float64) float64 {
path++
if instance[tree.Feature] <= tree.Threshold {
if tree.Left == nil {
if tree.LeftPred <= 1 {
return path
} else {
return path + cFactor(int(tree.LeftPred))
}
}
path = pathLength(*tree.Left, instance, path)
} else {
if tree.Right == nil {
if tree.RightPred <= 1 {
return path
} else {
return path + cFactor(int(tree.RightPred))
}
}
path = pathLength(*tree.Right, instance, path)
}
return path
}
// Find the path lengths of a datapoint from all trees in the forest.
func evaluateInstance(instance []float64, forest []regressorNode) []float64 {
var paths []float64
for _, tree := range forest {
paths = append(paths, pathLength(tree, instance, 0))
}
return paths
}
// Helper function to calculate anomaly score.
func cFactor(n int) float64 {
return 2.0*(math.Log(float64(n-1))+0.5772156649) - (float64(2.0*(n-1)) / float64(n))
}
// Anomaly score - how anomalous a data point is. Closer to 1 - higher chance of it being an outlier; closer to 0 - lower chance.
func anomalyScore(instance []float64, forest []regressorNode, n int) float64 {
paths := evaluateInstance(instance, forest)
E := 0.0
for _, path := range paths {
E += path
}
E /= float64(len(paths))
c := cFactor(n)
return math.Pow(2, (-1 * E / c))
}
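// For reference, the two functions above implement the scoring formulas from
// the original Isolation Forest paper (Liu, Ting and Zhou, 2008): cFactor
// computes c(n) = 2*H(n-1) - 2*(n-1)/n, the average path length of an
// unsuccessful search in a binary search tree built over n points, where the
// harmonic number is approximated as H(i) ≈ ln(i) + 0.5772156649 (the
// Euler-Mascheroni constant). anomalyScore then computes
// s(x, n) = 2^(-E[h(x)] / c(n)), where E[h(x)] is the mean path length of
// instance x across all trees; pathLength adds c(k) whenever a traversal
// stops at a depth-limited node still holding k > 1 points, to estimate the
// remaining average depth.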
// Return the anomaly score for all datapoints.
func (iForest *IsolationForest) Predict(X base.FixedDataGrid) []float64 {
data := preprocessData(X)
var preds []float64
for _, instance := range data {
score := anomalyScore(instance, iForest.trees, iForest.subSpace)
preds = append(preds, score)
}
return preds
}
// Extract data in the form of floats. Used in Fit and Predict. Note that class labels are treated as normal data because Isolation Forest is unsupervised.
func preprocessData(X base.FixedDataGrid) [][]float64 {
data := convertInstancesToProblemVec(X)
class, err := regressorConvertInstancesToLabelVec(X)
if err != nil {
panic(err)
}
for i, point := range class {
data[i] = append(data[i], point)
}
return data
}

trees/isolation_test.go (new file, 42 lines)

@@ -0,0 +1,42 @@
package trees
import (
"testing"
. "github.com/smartystreets/goconvey/convey"
)
func TestIsolation(t *testing.T) {
Convey("Doing an Isolation Forest Test", t, func() {
var data [][]float64
data = append(data, []float64{8, 9, 8, 3})
data = append(data, []float64{4, 2, 5, 3})
data = append(data, []float64{3, 2, 5, 9})
data = append(data, []float64{2, 1, 5, 9})
featureChosen := selectFeature(data)
So(featureChosen, ShouldNotBeNil)
min, max := minMax(0, data)
So(min, ShouldEqual, 2)
So(max, ShouldEqual, 8)
min, max = minMax(featureChosen, data)
val := selectValue(min, max)
So(val, ShouldBeBetween, min, max)
leftData, rightData := splitData(val, featureChosen, data)
So(len(leftData), ShouldBeGreaterThan, 0)
So(len(rightData), ShouldBeGreaterThan, 0)
checked := checkData(data)
So(checked, ShouldBeTrue)
randomSubset := getRandomData(data, 2)
So(len(randomSubset), ShouldEqual, 2)
})
}


@@ -11,13 +11,13 @@
present, so discretise beforehand (see
filters)
CART (Classification and Regression Trees):
Builds a binary decision tree using the CART algorithm,
using a greedy approach to find the best split at each node.
Can be used for regression and classification.
Attributes have to be FloatAttributes even for classification,
so convert to integer labels beforehand for classification.
RandomTree:
Builds a decision tree using the ID3 algorithm
@@ -29,6 +29,14 @@
present, so discretise beforehand (see
filters)
IsolationForest:
Unsupervised learning model for outlier detection.
Builds trees by randomly picking an attribute and a splitting value.
Attributes must be FloatAttributes.
All class attributes will be treated as normal feature attributes,
so remove any class attributes you don't want used before training.
*/
package trees
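
For quick reference, here is a minimal usage sketch of the API added in this
PR, condensed from the bundled example (the dataset path is the one that
example assumes):

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	// Load the data; class attributes are also used as features by the forest.
	data, err := base.ParseCSVToInstances("../datasets/gaussian_outliers.csv", true)
	if err != nil {
		panic(err)
	}

	// 100 trees, max depth 100, 850 datapoints sampled per tree.
	forest := trees.NewIsolationForest(100, 100, 850)
	forest.Fit(data)

	// Anomaly scores lie in [0, 1]; values near 1 indicate outliers.
	scores := forest.Predict(data)
	fmt.Println(scores[0])
}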