Mirror of https://github.com/sjwhitworth/golearn.git
Merge pull request #6 from Yushgoel/IsolationForest
Adding Feature Request for Isolation Forest
Commit fef30344d3

examples/datasets/gaussian_outliers.csv (new file, 1011 lines)
File diff suppressed because it is too large.
@@ -7,7 +7,6 @@ import (

 	"github.com/sjwhitworth/golearn/base"
 	"github.com/sjwhitworth/golearn/trees"
 )

 func main() {
@@ -45,7 +44,7 @@ func main() {

 	// Create New Classification Tree
 	// Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels
-	decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
+	decTree := trees.NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})

 	// Train Tree
 	err = decTree.Fit(trainData)
@@ -72,7 +71,7 @@ func main() {
 	trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5)

 	// Hyperparameters - Loss function, max Depth (-1 will split until pure)
-	regTree := NewDecisionTreeRegressor("mse", -1)
+	regTree := trees.NewDecisionTreeRegressor("mse", -1)

 	// Train Tree
 	err = regTree.Fit(trainRegData)
examples/trees/isolationForest/isolation_forest.go (new file, 57 lines)
@@ -0,0 +1,57 @@
// Example of how to use Isolation Forest for outlier detection

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	/* Isolation Forest is used for outlier detection.
	   The algorithm works by randomly splitting the data, so results won't be exactly reproducible,
	   but outliers will generally still be classified as outliers. */

	// Load data for outlier detection: a Gaussian distribution of 1000 normal datapoints,
	// followed by 10 outliers at the end.
	csvData, err := base.ParseCSVToInstances("../datasets/gaussian_outliers.csv", true)
	if err != nil {
		panic(err)
	}

	// Create a new Isolation Forest with 100 trees and max depth 100; each tree uses 850 datapoints.
	forest := trees.NewIsolationForest(100, 100, 850)

	// Fit the Isolation Forest to the data. Note that all class attributes are also used during training.
	// Remove any class attributes you don't want to use before calling Fit.
	forest.Fit(csvData)

	// Make predictions. Generally, Isolation Forest is used for interpolation, not extrapolation.
	// Predictions are returned as anomaly scores from 0 to 1: close to 0 - not an outlier, close to 1 - outlier.
	preds := forest.Predict(csvData)

	// Find the average and minimum anomaly score for the normal data.
	var avgScore float64
	min := 1.0

	for i := 0; i < 1000; i++ {
		temp := preds[i]
		avgScore += temp
		if temp < min {
			min = temp
		}
	}
	fmt.Println(avgScore / 1000)
	fmt.Println(min)

	// Now print the anomaly scores for the outliers.
	// These values should be much higher (around 0.7) compared to the scores for the normal data.
	fmt.Println("Anomaly Scores for outliers are ")
	for i := 1000; i < 1010; i++ {
		fmt.Print(" ")
		fmt.Println(preds[i])
	}
}
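Predict returns raw anomaly scores rather than labels, so a caller usually applies a cutoff. A minimal, self-contained sketch of that step (the 0.6 threshold and the flagOutliers helper are illustrative assumptions, not part of the library):

package main

import "fmt"

// flagOutliers is a hypothetical helper: scores at or above the threshold are flagged as outliers.
func flagOutliers(scores []float64, threshold float64) []bool {
	flags := make([]bool, len(scores))
	for i, s := range scores {
		flags[i] = s >= threshold
	}
	return flags
}

func main() {
	scores := []float64{0.45, 0.52, 0.71} // e.g. values returned by forest.Predict
	fmt.Println(flagOutliers(scores, 0.6)) // [false false true]
}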
trees/isolation.go (new file, 234 lines)
@@ -0,0 +1,234 @@
package trees

import (
	"math"
	"math/rand"
	"time"

	"github.com/sjwhitworth/golearn/base"
)

type IsolationForest struct {
	nTrees   int
	maxDepth int
	subSpace int
	trees    []regressorNode
}

// Select a random feature to split on.
func selectFeature(data [][]float64) int64 {
	rand.Seed(time.Now().UnixNano())
	return int64(rand.Intn(len(data[0])))
}

// Find the minimum and maximum values of a feature, so that a random threshold can be chosen between them.
func minMax(feature int64, data [][]float64) (float64, float64) {

	var min, max float64

	min = math.Inf(1)
	max = math.Inf(-1)
	for _, instance := range data {
		if instance[feature] > max {
			max = instance[feature]
		}
		if instance[feature] < min {
			min = instance[feature]
		}
	}

	return min, max
}

// Select a random threshold strictly between the minimum and maximum of the feature.
func selectValue(min, max float64) float64 {
	rand.Seed(time.Now().UnixNano())
	val := min + (rand.Float64() * (max - min))
	if val == min {
		val += 0.000001
	} else if val == max {
		val -= 0.000001
	}
	return val
}

// Split the data into left and right partitions based on the threshold.
func splitData(val float64, feature int64, data [][]float64) ([][]float64, [][]float64) {
	var leftData, rightData [][]float64
	for _, instance := range data {
		if instance[feature] <= val {
			leftData = append(leftData, instance)
		} else {
			rightData = append(rightData, instance)
		}
	}
	return leftData, rightData
}
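To make the split rule concrete, here is a minimal standalone sketch (splitData is unexported, so this inlines the same comparison; the values and output comments are illustrative):

package main

import "fmt"

func main() {
	// Same rule as splitData: feature value <= threshold goes left, otherwise right.
	data := [][]float64{{8, 9}, {4, 2}, {3, 2}}
	threshold, feature := 4.5, 0
	var left, right [][]float64
	for _, instance := range data {
		if instance[feature] <= threshold {
			left = append(left, instance)
		} else {
			right = append(right, instance)
		}
	}
	fmt.Println(left)  // [[4 2] [3 2]]
	fmt.Println(right) // [[8 9]]
}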
// Make sure that the data can still be split (i.e. not all datapoints are duplicates).
func checkData(data [][]float64) bool {
	for _, instance := range data {
		for i, val := range instance {
			if val != data[0][i] {
				return true
			}
		}
	}
	return false
}

// Recursively build a tree by randomly choosing a feature and threshold to split on,
// until nodes are pure or the maximum depth is reached.
func buildTree(data [][]float64, upperNode regressorNode, depth int, maxDepth int) regressorNode {
	depth++

	upperNode.isNodeNeeded = true
	if (depth > maxDepth) || (len(data) <= 1) || !checkData(data) {
		upperNode.isNodeNeeded = false
		return upperNode
	}

	var featureToSplit int64
	var splitVal float64
	var min, max float64
	min, max = 0.0, 0.0

	for min == max {
		featureToSplit = selectFeature(data)
		min, max = minMax(featureToSplit, data)
		splitVal = selectValue(min, max)
	}

	leftData, rightData := splitData(splitVal, featureToSplit, data)

	upperNode.Feature = featureToSplit
	upperNode.Threshold = splitVal
	// LeftPred/RightPred store the subtree sizes; pathLength uses them for the leaf adjustment below.
	upperNode.LeftPred = float64(len(leftData))
	upperNode.RightPred = float64(len(rightData))

	var leftNode, rightNode regressorNode
	leftNode = buildTree(leftData, leftNode, depth, maxDepth)
	rightNode = buildTree(rightData, rightNode, depth, maxDepth)

	if leftNode.isNodeNeeded {
		upperNode.Left = &leftNode
	}
	if rightNode.isNodeNeeded {
		upperNode.Right = &rightNode
	}

	return upperNode
}

// Get a random subset of the data (sampled with replacement). Helps make each tree in the forest different.
func getRandomData(data [][]float64, subSpace int) [][]float64 {
	var randomData [][]float64
	rand.Seed(time.Now().UnixNano())
	for i := 0; i < subSpace; i++ {
		randomData = append(randomData, data[rand.Intn(len(data))])
	}
	return randomData
}

// Create a new Isolation Forest. nTrees is the number of trees in the forest,
// maxDepth is the maximum depth of each tree, and subSpace is the number of datapoints to use per tree.
func NewIsolationForest(nTrees int, maxDepth int, subSpace int) IsolationForest {
	var iForest IsolationForest
	iForest.nTrees = nTrees
	iForest.maxDepth = maxDepth
	iForest.subSpace = subSpace
	return iForest
}

// Fit builds the forest from the data based on the hyperparameters.
func (iForest *IsolationForest) Fit(X base.FixedDataGrid) {
	data := preprocessData(X)
	nTrees := iForest.nTrees
	subSpace := iForest.subSpace
	maxDepth := iForest.maxDepth

	var forest []regressorNode
	for i := 0; i < nTrees; i++ {
		subData := getRandomData(data, subSpace)
		var tree regressorNode

		tree = buildTree(subData, tree, 0, maxDepth)
		forest = append(forest, tree)
	}
	iForest.trees = forest
}

// Calculate the path length needed to reach a leaf node for a datapoint.
// Outliers have shorter path lengths than normal datapoints.
func pathLength(tree regressorNode, instance []float64, path float64) float64 {
	path++

	if instance[tree.Feature] <= tree.Threshold {
		if tree.Left == nil {
			if tree.LeftPred <= 1 {
				return path
			}
			return path + cFactor(int(tree.LeftPred))
		}
		path = pathLength(*tree.Left, instance, path)
	} else {
		if tree.Right == nil {
			if tree.RightPred <= 1 {
				return path
			}
			return path + cFactor(int(tree.RightPred))
		}
		path = pathLength(*tree.Right, instance, path)
	}
	return path
}

// Find the path lengths of a datapoint across all trees in the forest.
func evaluateInstance(instance []float64, forest []regressorNode) []float64 {
	var paths []float64
	for _, tree := range forest {
		paths = append(paths, pathLength(tree, instance, 0))
	}
	return paths
}

// Helper function for the anomaly score: the average path length of an
// unsuccessful search in a binary search tree of n nodes.
func cFactor(n int) float64 {
	return 2.0*(math.Log(float64(n-1))+0.5772156649) - (float64(2.0*(n-1)) / float64(n))
}

// Anomaly score - how anomalous a datapoint is. Closer to 1 - higher chance of it being an outlier;
// closer to 0 - lower chance of it being an outlier.
func anomalyScore(instance []float64, forest []regressorNode, n int) float64 {
	paths := evaluateInstance(instance, forest)
	E := 0.0
	for _, path := range paths {
		E += path
	}
	E /= float64(len(paths))
	c := cFactor(n)
	return math.Pow(2, (-1 * E / c))
}
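For reference, cFactor and anomalyScore together implement the scoring from the original Isolation Forest paper (Liu, Ting and Zhou). cFactor computes the average path length of an unsuccessful search in a binary search tree of n nodes, which normalises the mean path length before it is exponentiated:

    c(n) = 2*H(n-1) - 2*(n-1)/n,   with H(i) ≈ ln(i) + 0.5772156649 (the Euler-Mascheroni constant)

    s(x, n) = 2^(-E[h(x)] / c(n))

where E[h(x)] is the mean path length of instance x across all trees. A mean path length equal to c(n) yields a score of 0.5; paths much shorter than c(n) push the score towards 1, which is why the outliers in the example land around 0.7 while normal points stay lower.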
// Return the anomaly score for every datapoint.
func (iForest *IsolationForest) Predict(X base.FixedDataGrid) []float64 {
	data := preprocessData(X)

	var preds []float64
	for _, instance := range data {
		score := anomalyScore(instance, iForest.trees, iForest.subSpace)
		preds = append(preds, score)
	}
	return preds
}

// Extract the data as floats. Used in Fit and Predict. Note that class labels are
// treated as normal data because Isolation Forest is unsupervised.
func preprocessData(X base.FixedDataGrid) [][]float64 {
	data := convertInstancesToProblemVec(X)
	class, err := regressorConvertInstancesToLabelVec(X)
	if err != nil {
		panic(err)
	}
	for i, point := range class {
		data[i] = append(data[i], point)
	}
	return data
}
trees/isolation_test.go (new file, 42 lines)
@@ -0,0 +1,42 @@
package trees

import (
	"testing"

	. "github.com/smartystreets/goconvey/convey"
)

func TestIsolation(t *testing.T) {

	Convey("Doing an Isolation Forest Test", t, func() {

		var data [][]float64
		data = append(data, []float64{8, 9, 8, 3})
		data = append(data, []float64{4, 2, 5, 3})
		data = append(data, []float64{3, 2, 5, 9})
		data = append(data, []float64{2, 1, 5, 9})

		featureChosen := selectFeature(data)
		So(featureChosen, ShouldNotBeNil)

		min, max := minMax(0, data)
		So(min, ShouldEqual, 2)
		So(max, ShouldEqual, 8)

		min, max = minMax(featureChosen, data)

		val := selectValue(min, max)
		So(val, ShouldBeBetween, min, max)

		leftData, rightData := splitData(val, featureChosen, data)
		So(len(leftData), ShouldBeGreaterThan, 0)
		So(len(rightData), ShouldBeGreaterThan, 0)

		checked := checkData(data)
		So(checked, ShouldBeTrue)

		randomSubset := getRandomData(data, 2)
		So(len(randomSubset), ShouldEqual, 2)

	})
}
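The suite exercises the random helpers but never pins down the deterministic normaliser. A hedged sketch of one additional check (hypothetical, not part of the commit; it assumes the same package trees and goconvey dot-import as the file above):

func TestCFactorValue(t *testing.T) {
	Convey("cFactor matches its closed form for n = 2", t, func() {
		// c(2) = 2*(ln(1) + 0.5772156649) - 2*(2-1)/2 ≈ 0.1544313298
		So(cFactor(2), ShouldAlmostEqual, 0.1544313298, 0.0001)
	})
}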
@@ -11,13 +11,13 @@
 	present, so discretise beforehand (see
 	filters)

-	CART (Classification and Regression Trees):
-	  Builds a binary decision tree using the CART algorithm,
-	  using a greedy approach to find the best split at each node.
+	CART (Classification and Regression Trees):
+	  Builds a binary decision tree using the CART algorithm,
+	  using a greedy approach to find the best split at each node.

-	  Can be used for regression and classification.
-	  Attributes have to be FloatAttributes even for classification.
-	  Hence, convert to integer labels beforehand for classification.
+	  Can be used for regression and classification.
+	  Attributes have to be FloatAttributes even for classification.
+	  Hence, convert to integer labels beforehand for classification.

 	RandomTree:
 	  Builds a decision tree using the ID3 algorithm
@@ -29,6 +29,14 @@
 	present, so discretise beforehand (see
 	filters)

+	IsolationForest:
+	  Unsupervised learning model for outlier detection.
+
+	  Builds a tree by randomly picking an attribute and a splitting value.
+
+	  Attributes must be FloatAttributes.
+	  All Class Attributes will be treated as normal feature attributes,
+	  so remove any Class Attributes you don't want used during training beforehand.
 */

 package trees