mirror of https://github.com/sjwhitworth/golearn.git
synced 2025-04-28 13:48:56 +08:00

Merge pull request #250 from Yushgoel/IsolationForest_reviewed

Implements Isolation Forest Feature Request

This commit is contained in:
commit 6fed29ee9c
1011  examples/datasets/gaussian_outliers.csv (new file)

File diff suppressed because it is too large.
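The suppressed file is the dataset the new example loads: per the example's comments, roughly 1000 Gaussian-distributed datapoints followed by 10 outliers. For readers who want to reproduce something comparable, here is a hedged sketch of a generator; the column count, column names, spread, and outlier placement are illustrative assumptions, not the committed data.

package main

import (
	"encoding/csv"
	"fmt"
	"math/rand"
	"os"
)

func main() {
	f, err := os.Create("gaussian_outliers.csv")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	w := csv.NewWriter(f)
	defer w.Flush()

	// Hypothetical header; the real column names are not visible in this diff.
	w.Write([]string{"x", "y"})

	// 1000 normal datapoints drawn from a standard Gaussian.
	for i := 0; i < 1000; i++ {
		w.Write([]string{
			fmt.Sprintf("%f", rand.NormFloat64()),
			fmt.Sprintf("%f", rand.NormFloat64()),
		})
	}

	// 10 outliers placed well outside the Gaussian bulk.
	for i := 0; i < 10; i++ {
		w.Write([]string{
			fmt.Sprintf("%f", 8+rand.Float64()),
			fmt.Sprintf("%f", 8+rand.Float64()),
		})
	}
}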
@@ -7,7 +7,6 @@ import (
-
 	"github.com/sjwhitworth/golearn/base"
 	"github.com/sjwhitworth/golearn/trees"

 )

 func main() {
@@ -37,7 +36,7 @@ func main() {
 	*/

 	// Load Titanic Data For classification
-	classificationData, err := base.ParseCSVToInstances("../datasets/titanic.csv", false)
+	classificationData, err := base.ParseCSVToInstances("../../datasets/titanic.csv", false)
 	if err != nil {
 		panic(err)
 	}
@@ -45,7 +44,7 @@ func main() {

 	// Create New Classification Tree
 	// Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels
-	decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
+	decTree := trees.NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})

 	// Train Tree
 	err = decTree.Fit(trainData)
@@ -72,7 +71,7 @@ func main() {
 	trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5)

 	// Hyperparameters - Loss function, max Depth (-1 will split until pure)
-	regTree := NewDecisionTreeRegressor("mse", -1)
+	regTree := trees.NewDecisionTreeRegressor("mse", -1)

 	// Train Tree
 	err = regTree.Fit(trainRegData)
@@ -4,12 +4,13 @@ package main

 import (
 	"fmt"
+	"math/rand"

 	"github.com/sjwhitworth/golearn/base"
 	"github.com/sjwhitworth/golearn/ensemble"
 	"github.com/sjwhitworth/golearn/evaluation"
 	"github.com/sjwhitworth/golearn/filters"
 	"github.com/sjwhitworth/golearn/trees"
-	"math/rand"
 )

 func main() {
@@ -19,7 +20,7 @@ func main() {
 	rand.Seed(44111342)

 	// Load in the iris dataset
-	iris, err := base.ParseCSVToInstances("../datasets/iris_headers.csv", true)
+	iris, err := base.ParseCSVToInstances("../../datasets/iris_headers.csv", true)
 	if err != nil {
 		panic(err)
 	}
57  examples/trees/isolationForest/isolation_forest.go (new file)
@@ -0,0 +1,57 @@
// Example of how to use Isolation Forest for outlier detection

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	/* Isolation Forest is used for outlier detection.
	   The algorithm works by randomly splitting the data, so results won't be exactly reproducible,
	   but generally outliers will still be classified as outliers. */

	// Load data for outlier detection - 1000 Gaussian-distributed datapoints, with 10 outliers at the end
	csvData, err := base.ParseCSVToInstances("../../datasets/gaussian_outliers.csv", true)
	if err != nil {
		panic(err)
	}

	// Create a new Isolation Forest with 100 trees and max depth 100, where each tree uses 850 datapoints
	forest := trees.NewIsolationForest(100, 100, 850)

	// Fit the Isolation Forest to the data. Note that all class attributes are also used during training.
	// Remove any class attributes you don't want to use before calling Fit.
	forest.Fit(csvData)

	// Make predictions. Generally, Isolation Forest is used for interpolation, not extrapolation.
	// Predictions are returned as anomaly scores from 0 to 1: close to 0 - not an outlier, close to 1 - an outlier.
	preds := forest.Predict(csvData)

	// Find the average and minimum anomaly score for the normal data.
	var avgScore float64
	min := 1.0

	for i := 0; i < 1000; i++ {
		temp := preds[i]
		avgScore += temp
		if temp < min {
			min = temp
		}
	}

	fmt.Println(avgScore / 1000)
	fmt.Println(min)

	// Now print the anomaly scores for the outliers.
	// These values should be much higher (around 0.7) compared to the scores for the normal data.
	fmt.Println("Anomaly Scores for outliers are ")
	for i := 1000; i < 1010; i++ {
		fmt.Print(" ")
		fmt.Println(preds[i])
	}
}
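The example prints raw scores; a common follow-up step is to binarize them with a cutoff. A minimal sketch, assuming a hand-picked threshold (flagOutliers is a hypothetical helper, not part of golearn, and 0.6 is an assumed cutoff chosen because the example sees ~0.7 for true outliers):

// flagOutliers is a hypothetical helper: it returns the indices of
// datapoints whose anomaly score exceeds the given threshold.
func flagOutliers(scores []float64, threshold float64) []int {
	var outliers []int
	for i, s := range scores {
		if s > threshold {
			outliers = append(outliers, i)
		}
	}
	return outliers
}

// Usage after the Predict call above, with the assumed cutoff:
//   suspects := flagOutliers(preds, 0.6)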
230  trees/isolation.go (new file)
@@ -0,0 +1,230 @@
package trees

import (
	"math"
	"math/rand"

	"github.com/sjwhitworth/golearn/base"
)

type IsolationForest struct {
	nTrees   int
	maxDepth int
	subSpace int
	trees    []regressorNode
}

// Select a random feature for splitting from the data.
func selectFeature(data [][]float64) int64 {
	return int64(rand.Intn(len(data[0])))
}

// Find the minimum and maximum values of a feature. Used so that we can choose a random threshold.
func minMax(feature int64, data [][]float64) (float64, float64) {

	var min, max float64

	min = math.Inf(1)
	max = math.Inf(-1)
	for _, instance := range data {
		if instance[feature] > max {
			max = instance[feature]
		}
		if instance[feature] < min {
			min = instance[feature]
		}
	}

	return min, max
}

// Select a random threshold between the minimum and maximum of the feature.
func selectValue(min, max float64) float64 {
	val := min + (rand.Float64() * (max - min))
	if val == min {
		val += 0.000001
	} else if val == max {
		val -= 0.000001
	}
	return val
}

// Split the data based on the threshold.
func splitData(val float64, feature int64, data [][]float64) ([][]float64, [][]float64) {
	var leftData, rightData [][]float64
	for _, instance := range data {
		if instance[feature] <= val {
			leftData = append(leftData, instance)
		} else {
			rightData = append(rightData, instance)
		}
	}
	return leftData, rightData
}

// Make sure that the data can still be split (i.e. the datapoints are not all duplicates).
func checkData(data [][]float64) bool {
	for _, instance := range data {
		for i, val := range instance {
			if val != data[0][i] {
				return true
			}
		}
	}
	return false
}

// Recursively build a tree by randomly choosing a feature to split on until nodes are pure or the max depth is reached.
func buildTree(data [][]float64, upperNode regressorNode, depth int, maxDepth int) regressorNode {
	depth++

	upperNode.isNodeNeeded = true
	if (depth > maxDepth) || (len(data) <= 1) || !checkData(data) {
		upperNode.isNodeNeeded = false
		return upperNode
	}

	var featureToSplit int64
	var splitVal float64
	var min, max float64
	min, max = 0.0, 0.0

	for min == max {
		featureToSplit = selectFeature(data)
		min, max = minMax(featureToSplit, data)
		splitVal = selectValue(min, max)
	}

	leftData, rightData := splitData(splitVal, featureToSplit, data)

	upperNode.Feature = featureToSplit
	upperNode.Threshold = splitVal
	upperNode.LeftPred = float64(len(leftData))
	upperNode.RightPred = float64(len(rightData))

	var leftNode, rightNode regressorNode
	leftNode = buildTree(leftData, leftNode, depth, maxDepth)
	rightNode = buildTree(rightData, rightNode, depth, maxDepth)

	if leftNode.isNodeNeeded {
		upperNode.Left = &leftNode
	}
	if rightNode.isNodeNeeded {
		upperNode.Right = &rightNode
	}

	return upperNode
}

// Get a random subset of the data. Helps make each tree in the forest different.
func getRandomData(data [][]float64, subSpace int) [][]float64 {
	var randomData [][]float64
	for i := 0; i < subSpace; i++ {
		randomData = append(randomData, data[rand.Intn(len(data))])
	}
	return randomData
}

// NewIsolationForest creates a new isolation forest. nTrees is the number of trees in the forest, maxDepth is the maximum depth of each tree, and subSpace is the number of datapoints to use per tree.
func NewIsolationForest(nTrees int, maxDepth int, subSpace int) IsolationForest {
	var iForest IsolationForest
	iForest.nTrees = nTrees
	iForest.maxDepth = maxDepth
	iForest.subSpace = subSpace
	return iForest
}

// Fit builds the forest from the hyperparameters and the data.
func (iForest *IsolationForest) Fit(X base.FixedDataGrid) {
	data := preprocessData(X)
	nTrees := iForest.nTrees
	subSpace := iForest.subSpace
	maxDepth := iForest.maxDepth

	var forest []regressorNode
	for i := 0; i < nTrees; i++ {
		subData := getRandomData(data, subSpace)
		var tree regressorNode

		tree = buildTree(subData, tree, 0, maxDepth)
		forest = append(forest, tree)
	}
	iForest.trees = forest
}

// Calculate the path length to reach a leaf node for a datapoint. Outliers have smaller path lengths than standard datapoints.
func pathLength(tree regressorNode, instance []float64, path float64) float64 {
	path++

	if instance[tree.Feature] <= tree.Threshold {
		if tree.Left == nil {
			if tree.LeftPred <= 1 {
				return path
			} else {
				return path + cFactor(int(tree.LeftPred))
			}
		}
		path = pathLength(*tree.Left, instance, path)
	} else {
		if tree.Right == nil {
			if tree.RightPred <= 1 {
				return path
			} else {
				return path + cFactor(int(tree.RightPred))
			}
		}
		path = pathLength(*tree.Right, instance, path)
	}
	return path
}

// Find the path lengths of a datapoint across all trees in the forest.
func evaluateInstance(instance []float64, forest []regressorNode) []float64 {
	var paths []float64
	for _, tree := range forest {
		paths = append(paths, pathLength(tree, instance, 0))
	}
	return paths
}

// Helper function to calculate the normalization factor for the anomaly score.
func cFactor(n int) float64 {
	return 2.0*(math.Log(float64(n-1))+0.5772156649) - (float64(2.0*(n-1)) / float64(n))
}

// Anomaly score - how anomalous a datapoint is. Closer to 1 - higher chance of it being an outlier; closer to 0 - lower chance of it being an outlier.
func anomalyScore(instance []float64, forest []regressorNode, n int) float64 {
	paths := evaluateInstance(instance, forest)
	E := 0.0
	for _, path := range paths {
		E += path
	}
	E /= float64(len(paths))
	c := cFactor(n)
	return math.Pow(2, (-1 * E / c))
}

// Predict returns the anomaly score for all datapoints.
func (iForest *IsolationForest) Predict(X base.FixedDataGrid) []float64 {
	data := preprocessData(X)

	var preds []float64
	for _, instance := range data {
		score := anomalyScore(instance, iForest.trees, iForest.subSpace)
		preds = append(preds, score)
	}
	return preds
}

// Extract the data as floats. Used in Fit and Predict. Note that class labels are treated as normal data because Isolation Forest is unsupervised.
func preprocessData(X base.FixedDataGrid) [][]float64 {
	data := convertInstancesToProblemVec(X)
	class, err := regressorConvertInstancesToLabelVec(X)
	if err != nil {
		panic(err)
	}
	for i, point := range class {
		data[i] = append(data[i], point)
	}
	return data
}
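For reference, the score computed by anomalyScore and cFactor above matches the standard formulation from the original isolation forest paper (Liu, Ting and Zhou, 2008): cFactor implements c(n), the expected path length of an unsuccessful search in a binary search tree over n points, and the anomaly score is

	s(x, n) = 2^{-E[h(x)] / c(n)}, \qquad c(n) = 2\bigl(\ln(n-1) + \gamma\bigr) - \frac{2(n-1)}{n}

where h(x) is the path length of instance x in a single tree, E[h(x)] is its average over the forest, and \gamma \approx 0.5772156649 is the Euler-Mascheroni constant (the literal in cFactor). Scores near 1 mean short average paths, i.e. points that are easy to isolate and therefore likely outliers.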
42  trees/isolation_test.go (new file)
@@ -0,0 +1,42 @@
package trees

import (
	"testing"

	. "github.com/smartystreets/goconvey/convey"
)

func TestIsolation(t *testing.T) {

	Convey("Doing an Isolation Forest Test", t, func() {

		var data [][]float64
		data = append(data, []float64{8, 9, 8, 3})
		data = append(data, []float64{4, 2, 5, 3})
		data = append(data, []float64{3, 2, 5, 9})
		data = append(data, []float64{2, 1, 5, 9})

		featureChosen := selectFeature(data)
		So(featureChosen, ShouldNotBeNil)

		min, max := minMax(0, data)
		So(min, ShouldEqual, 2)
		So(max, ShouldEqual, 8)

		min, max = minMax(featureChosen, data)

		val := selectValue(min, max)
		So(val, ShouldBeBetween, min, max)

		leftData, rightData := splitData(val, featureChosen, data)
		So(len(leftData), ShouldBeGreaterThan, 0)
		So(len(rightData), ShouldBeGreaterThan, 0)

		checked := checkData(data)
		So(checked, ShouldBeTrue)

		randomSubset := getRandomData(data, 2)
		So(len(randomSubset), ShouldEqual, 2)

	})
}
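These helpers are package-private, so the test exercises them directly; it can be run with the standard toolchain via go test ./trees -run TestIsolation.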
@@ -11,13 +11,13 @@
 	present, so discretise beforehand (see
 	filters)

 	CART (Classification and Regression Trees):
 	Builds a binary decision tree using the CART algorithm
 	using a greedy approach to find the best split at each node.

 	Can be used for regression and classification.
 	Attributes have to be FloatAttributes even for classification.
 	Hence, convert to integer labels beforehand for classification.

 	RandomTree:
 	Builds a decision tree using the ID3 algorithm
@@ -29,6 +29,14 @@
 	present, so discretise beforehand (see
 	filters)

+	IsolationForest:
+	Unsupervised learning model for outlier detection.
+
+	Builds a tree by randomly picking an attribute and splitting value.
+
+	Attributes must be FloatAttributes.
+	All Class Attributes will be treated as normal feature attributes,
+	so remove any Class Attributes you don't want used during training beforehand.
 */

 package trees