mirror of https://github.com/sjwhitworth/golearn.git
synced 2025-04-28 13:48:56 +08:00

Merge pull request #250 from Yushgoel/IsolationForest_reviewed

Implements Isolation Forest Feature Request

This commit is contained in:
commit 6fed29ee9c
1011  examples/datasets/gaussian_outliers.csv (new file)

File diff suppressed because it is too large.
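The suppressed file is the dataset the new example loads: per the example's comments, roughly 1000 Gaussian-distributed datapoints followed by 10 outliers. For readers who want to reproduce something comparable, here is a hedged sketch of a generator; the column count, column names, spread, and outlier placement are illustrative assumptions, not the committed data.

package main

import (
	"encoding/csv"
	"fmt"
	"math/rand"
	"os"
)

func main() {
	f, err := os.Create("gaussian_outliers.csv")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	w := csv.NewWriter(f)
	defer w.Flush()

	// Hypothetical header; the real column names are not visible in this diff.
	w.Write([]string{"x", "y"})

	// 1000 normal datapoints drawn from a standard Gaussian.
	for i := 0; i < 1000; i++ {
		w.Write([]string{
			fmt.Sprintf("%f", rand.NormFloat64()),
			fmt.Sprintf("%f", rand.NormFloat64()),
		})
	}

	// 10 outliers placed well outside the Gaussian bulk.
	for i := 0; i < 10; i++ {
		w.Write([]string{
			fmt.Sprintf("%f", 8+rand.Float64()),
			fmt.Sprintf("%f", 8+rand.Float64()),
		})
	}
}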
@@ -7,7 +7,6 @@ import (
-
 	"github.com/sjwhitworth/golearn/base"
 	"github.com/sjwhitworth/golearn/trees"

 )

 func main() {
@@ -37,7 +36,7 @@ func main() {
 	*/

 	// Load Titanic Data For classification
-	classificationData, err := base.ParseCSVToInstances("../datasets/titanic.csv", false)
+	classificationData, err := base.ParseCSVToInstances("../../datasets/titanic.csv", false)
 	if err != nil {
 		panic(err)
 	}
@@ -45,7 +44,7 @@ func main() {

 	// Create New Classification Tree
 	// Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels
-	decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
+	decTree := trees.NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})

 	// Train Tree
 	err = decTree.Fit(trainData)
@@ -72,7 +71,7 @@ func main() {
 	trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5)

 	// Hyperparameters - Loss function, max Depth (-1 will split until pure)
-	regTree := NewDecisionTreeRegressor("mse", -1)
+	regTree := trees.NewDecisionTreeRegressor("mse", -1)

 	// Train Tree
 	err = regTree.Fit(trainRegData)
@@ -4,12 +4,13 @@ package main

 import (
 	"fmt"
+	"math/rand"

 	"github.com/sjwhitworth/golearn/base"
 	"github.com/sjwhitworth/golearn/ensemble"
 	"github.com/sjwhitworth/golearn/evaluation"
 	"github.com/sjwhitworth/golearn/filters"
 	"github.com/sjwhitworth/golearn/trees"
-	"math/rand"
 )

 func main() {
@@ -19,7 +20,7 @@ func main() {
 	rand.Seed(44111342)

 	// Load in the iris dataset
-	iris, err := base.ParseCSVToInstances("../datasets/iris_headers.csv", true)
+	iris, err := base.ParseCSVToInstances("../../datasets/iris_headers.csv", true)
 	if err != nil {
 		panic(err)
 	}
57  examples/trees/isolationForest/isolation_forest.go (new file)
@@ -0,0 +1,57 @@
// Example of how to use Isolation Forest for outlier detection

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	/* Isolation Forest is used for outlier detection.
	   The algorithm works by randomly splitting the data, so results won't be exactly reproducible,
	   but generally outliers will still be classified as outliers. */

	// Load data for outlier detection - 1000 Gaussian-distributed datapoints, with 10 outliers at the end
	csvData, err := base.ParseCSVToInstances("../../datasets/gaussian_outliers.csv", true)
	if err != nil {
		panic(err)
	}

	// Create a new Isolation Forest with 100 trees and max depth 100, where each tree uses 850 datapoints
	forest := trees.NewIsolationForest(100, 100, 850)

	// Fit the Isolation Forest to the data. Note that all class attributes are also used during training.
	// Remove any class attributes you don't want to use before calling Fit.
	forest.Fit(csvData)

	// Make predictions. Generally, Isolation Forest is used for interpolation, not extrapolation.
	// Predictions are returned as anomaly scores from 0 to 1: close to 0 - not an outlier, close to 1 - an outlier.
	preds := forest.Predict(csvData)

	// Find the average and minimum anomaly score for the normal data.
	var avgScore float64
	min := 1.0

	for i := 0; i < 1000; i++ {
		temp := preds[i]
		avgScore += temp
		if temp < min {
			min = temp
		}
	}

	fmt.Println(avgScore / 1000)
	fmt.Println(min)

	// Now print the anomaly scores for the outliers.
	// These values should be much higher (around 0.7) compared to the scores for the normal data.
	fmt.Println("Anomaly Scores for outliers are ")
	for i := 1000; i < 1010; i++ {
		fmt.Print(" ")
		fmt.Println(preds[i])
	}
}
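The example prints raw scores; a common follow-up step is to binarize them with a cutoff. A minimal sketch, assuming a hand-picked threshold (flagOutliers is a hypothetical helper, not part of golearn, and 0.6 is an assumed cutoff chosen because the example sees ~0.7 for true outliers):

// flagOutliers is a hypothetical helper: it returns the indices of
// datapoints whose anomaly score exceeds the given threshold.
func flagOutliers(scores []float64, threshold float64) []int {
	var outliers []int
	for i, s := range scores {
		if s > threshold {
			outliers = append(outliers, i)
		}
	}
	return outliers
}

// Usage after the Predict call above, with the assumed cutoff:
//   suspects := flagOutliers(preds, 0.6)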
230  trees/isolation.go (new file)
@@ -0,0 +1,230 @@
package trees

import (
	"math"
	"math/rand"

	"github.com/sjwhitworth/golearn/base"
)

type IsolationForest struct {
	nTrees   int
	maxDepth int
	subSpace int
	trees    []regressorNode
}

// Select a random feature for splitting from the data.
func selectFeature(data [][]float64) int64 {
	return int64(rand.Intn(len(data[0])))
}

// Find the minimum and maximum values of a feature. Used so that we can choose a random threshold.
func minMax(feature int64, data [][]float64) (float64, float64) {

	var min, max float64

	min = math.Inf(1)
	max = math.Inf(-1)
	for _, instance := range data {
		if instance[feature] > max {
			max = instance[feature]
		}
		if instance[feature] < min {
			min = instance[feature]
		}
	}

	return min, max
}

// Select a random threshold between the minimum and maximum of the feature.
func selectValue(min, max float64) float64 {
	val := min + (rand.Float64() * (max - min))
	if val == min {
		val += 0.000001
	} else if val == max {
		val -= 0.000001
	}
	return val
}

// Split the data based on the threshold.
func splitData(val float64, feature int64, data [][]float64) ([][]float64, [][]float64) {
	var leftData, rightData [][]float64
	for _, instance := range data {
		if instance[feature] <= val {
			leftData = append(leftData, instance)
		} else {
			rightData = append(rightData, instance)
		}
	}
	return leftData, rightData
}

// Make sure that the data can still be split (i.e. the datapoints are not all duplicates).
func checkData(data [][]float64) bool {
	for _, instance := range data {
		for i, val := range instance {
			if val != data[0][i] {
				return true
			}
		}
	}
	return false
}

// Recursively build a tree by randomly choosing a feature to split on until nodes are pure or the max depth is reached.
func buildTree(data [][]float64, upperNode regressorNode, depth int, maxDepth int) regressorNode {
	depth++

	upperNode.isNodeNeeded = true
	if (depth > maxDepth) || (len(data) <= 1) || !checkData(data) {
		upperNode.isNodeNeeded = false
		return upperNode
	}

	var featureToSplit int64
	var splitVal float64
	var min, max float64
	min, max = 0.0, 0.0

	for min == max {
		featureToSplit = selectFeature(data)
		min, max = minMax(featureToSplit, data)
		splitVal = selectValue(min, max)
	}

	leftData, rightData := splitData(splitVal, featureToSplit, data)

	upperNode.Feature = featureToSplit
	upperNode.Threshold = splitVal
	upperNode.LeftPred = float64(len(leftData))
	upperNode.RightPred = float64(len(rightData))

	var leftNode, rightNode regressorNode
	leftNode = buildTree(leftData, leftNode, depth, maxDepth)
	rightNode = buildTree(rightData, rightNode, depth, maxDepth)

	if leftNode.isNodeNeeded {
		upperNode.Left = &leftNode
	}
	if rightNode.isNodeNeeded {
		upperNode.Right = &rightNode
	}

	return upperNode
}

// Get a random subset of the data. Helps make each tree in the forest different.
func getRandomData(data [][]float64, subSpace int) [][]float64 {
	var randomData [][]float64
	for i := 0; i < subSpace; i++ {
		randomData = append(randomData, data[rand.Intn(len(data))])
	}
	return randomData
}

// NewIsolationForest creates a new isolation forest. nTrees is the number of trees in the forest, maxDepth is the maximum depth of each tree, and subSpace is the number of datapoints to use per tree.
func NewIsolationForest(nTrees int, maxDepth int, subSpace int) IsolationForest {
	var iForest IsolationForest
	iForest.nTrees = nTrees
	iForest.maxDepth = maxDepth
	iForest.subSpace = subSpace
	return iForest
}

// Fit builds the forest from the hyperparameters and the data.
func (iForest *IsolationForest) Fit(X base.FixedDataGrid) {
	data := preprocessData(X)
	nTrees := iForest.nTrees
	subSpace := iForest.subSpace
	maxDepth := iForest.maxDepth

	var forest []regressorNode
	for i := 0; i < nTrees; i++ {
		subData := getRandomData(data, subSpace)
		var tree regressorNode

		tree = buildTree(subData, tree, 0, maxDepth)
		forest = append(forest, tree)
	}
	iForest.trees = forest
}

// Calculate the path length to reach a leaf node for a datapoint. Outliers have smaller path lengths than standard datapoints.
func pathLength(tree regressorNode, instance []float64, path float64) float64 {
	path++

	if instance[tree.Feature] <= tree.Threshold {
		if tree.Left == nil {
			if tree.LeftPred <= 1 {
				return path
			} else {
				return path + cFactor(int(tree.LeftPred))
			}
		}
		path = pathLength(*tree.Left, instance, path)
	} else {
		if tree.Right == nil {
			if tree.RightPred <= 1 {
				return path
			} else {
				return path + cFactor(int(tree.RightPred))
			}
		}
		path = pathLength(*tree.Right, instance, path)
	}
	return path
}

// Find the path lengths of a datapoint across all trees in the forest.
func evaluateInstance(instance []float64, forest []regressorNode) []float64 {
	var paths []float64
	for _, tree := range forest {
		paths = append(paths, pathLength(tree, instance, 0))
	}
	return paths
}

// Helper function to calculate the normalization factor for the anomaly score.
func cFactor(n int) float64 {
	return 2.0*(math.Log(float64(n-1))+0.5772156649) - (float64(2.0*(n-1)) / float64(n))
}

// Anomaly score - how anomalous a datapoint is. Closer to 1 - higher chance of it being an outlier; closer to 0 - lower chance of it being an outlier.
func anomalyScore(instance []float64, forest []regressorNode, n int) float64 {
	paths := evaluateInstance(instance, forest)
	E := 0.0
	for _, path := range paths {
		E += path
	}
	E /= float64(len(paths))
	c := cFactor(n)
	return math.Pow(2, (-1 * E / c))
}

// Predict returns the anomaly score for all datapoints.
func (iForest *IsolationForest) Predict(X base.FixedDataGrid) []float64 {
	data := preprocessData(X)

	var preds []float64
	for _, instance := range data {
		score := anomalyScore(instance, iForest.trees, iForest.subSpace)
		preds = append(preds, score)
	}
	return preds
}

// Extract the data as floats. Used in Fit and Predict. Note that class labels are treated as normal data because Isolation Forest is unsupervised.
func preprocessData(X base.FixedDataGrid) [][]float64 {
	data := convertInstancesToProblemVec(X)
	class, err := regressorConvertInstancesToLabelVec(X)
	if err != nil {
		panic(err)
	}
	for i, point := range class {
		data[i] = append(data[i], point)
	}
	return data
}
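For reference, the score computed by anomalyScore and cFactor above matches the standard formulation from the original isolation forest paper (Liu, Ting and Zhou, 2008): cFactor implements c(n), the expected path length of an unsuccessful search in a binary search tree over n points, and the anomaly score is

	s(x, n) = 2^{-E[h(x)] / c(n)}, \qquad c(n) = 2\bigl(\ln(n-1) + \gamma\bigr) - \frac{2(n-1)}{n}

where h(x) is the path length of instance x in a single tree, E[h(x)] is its average over the forest, and \gamma \approx 0.5772156649 is the Euler-Mascheroni constant (the literal in cFactor). Scores near 1 mean short average paths, i.e. points that are easy to isolate and therefore likely outliers.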
42  trees/isolation_test.go (new file)
@@ -0,0 +1,42 @@
package trees

import (
	"testing"

	. "github.com/smartystreets/goconvey/convey"
)

func TestIsolation(t *testing.T) {

	Convey("Doing an Isolation Forest Test", t, func() {

		var data [][]float64
		data = append(data, []float64{8, 9, 8, 3})
		data = append(data, []float64{4, 2, 5, 3})
		data = append(data, []float64{3, 2, 5, 9})
		data = append(data, []float64{2, 1, 5, 9})

		featureChosen := selectFeature(data)
		So(featureChosen, ShouldNotBeNil)

		min, max := minMax(0, data)
		So(min, ShouldEqual, 2)
		So(max, ShouldEqual, 8)

		min, max = minMax(featureChosen, data)

		val := selectValue(min, max)
		So(val, ShouldBeBetween, min, max)

		leftData, rightData := splitData(val, featureChosen, data)
		So(len(leftData), ShouldBeGreaterThan, 0)
		So(len(rightData), ShouldBeGreaterThan, 0)

		checked := checkData(data)
		So(checked, ShouldBeTrue)

		randomSubset := getRandomData(data, 2)
		So(len(randomSubset), ShouldEqual, 2)

	})
}
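These helpers are package-private, so the test exercises them directly; it can be run with the standard toolchain via go test ./trees -run TestIsolation.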
@@ -11,13 +11,13 @@
 	present, so discretise beforehand (see
 	filters)

 	CART (Classification and Regression Trees):
 	Builds a binary decision tree using the CART algorithm
 	using a greedy approach to find the best split at each node.

 	Can be used for regression and classification.
 	Attributes have to be FloatAttributes even for classification.
 	Hence, convert to integer labels beforehand for classification.

 	RandomTree:
 	Builds a decision tree using the ID3 algorithm
@@ -29,6 +29,14 @@
 	present, so discretise beforehand (see
 	filters)

+	IsolationForest:
+	Unsupervised learning model for outlier detection.
+
+	Builds a tree by randomly picking an attribute and splitting value.
+
+	Attributes must be FloatAttributes.
+	All Class Attributes will be treated as normal feature attributes,
+	so remove any Class Attributes you don't want used during training beforehand.
 */

 package trees