Mirror of https://github.com/sjwhitworth/golearn.git, synced 2025-04-26 13:49:14 +08:00.
Removing Clutter

Partial modularization of the best-split method: shortens the method by declaring related variables on a single line, removes redundant duplicated helper functions, and moves the shared code into cart_utils.
commit 2d2af0a58f (parent ef751e62c4)
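The heart of the cleanup is routing the impurity-criterion dispatch through one helper instead of repeating the same if/else chain at every call site. Below is a minimal standalone sketch of that pattern; the GINI/ENTROPY constants and the impurity functions are stubbed here for illustration, while the real implementations live in the trees package, as the diff shows:

package main

import "fmt"

// Assumed string constants; the real definitions live in the trees package.
const (
	GINI    = "gini"
	ENTROPY = "entropy"
)

// Stubs standing in for the package's real impurity functions.
func giniImpurity(y []int64, labels []int64) (float64, int64) { return 0.5, labels[0] }
func entropy(y []int64, labels []int64) (float64, int64)      { return 1.0, labels[0] }

// calculateClassificationLoss centralizes the criterion dispatch that was
// previously inlined twice inside classifierBestSplit.
func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64) {
	if criterion == GINI {
		return giniImpurity(y, labels)
	} else if criterion == ENTROPY {
		return entropy(y, labels)
	}
	panic("Invalid impurity function, choose from GINI or ENTROPY")
}

func main() {
	loss, label := calculateClassificationLoss([]int64{0, 1, 1}, []int64{0, 1}, GINI)
	fmt.Println(loss, label) // stubbed values: 0.5 0
}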
trees/cart_classifier.go

@@ -90,6 +90,16 @@ func entropy(y []int64, labels []int64) (float64, int64) {
 	return entropy, maxLabel
 }
 
+func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64) {
+	if criterion == GINI {
+		return giniImpurity(y, labels)
+	} else if criterion == ENTROPY {
+		return entropy(y, labels)
+	} else {
+		panic("Invalid impurity function, choose from GINI or ENTROPY")
+	}
+}
+
 // Split the data into left node and right node based on feature and threshold
 func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) {
 	var left [][]float64
@@ -111,37 +121,6 @@ func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold
 	return left, right, lefty, righty
 }
 
-// Helper Function to check if data point is unique or not.
-// We will use this to isolate unique values of a feature
-func classifierStringInSlice(a float64, list []float64) bool {
-	for _, b := range list {
-		if b == a {
-			return true
-		}
-	}
-	return false
-}
-
-// Isolate only unique values. This way, we can try only unique splits and not redundant ones.
-func classifierFindUnique(data []float64) []float64 {
-	var unique []float64
-	for i := range data {
-		if !classifierStringInSlice(data[i], unique) {
-			unique = append(unique, data[i])
-		}
-	}
-	return unique
-}
-
-// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits.
-func classifierGetFeature(data [][]float64, feature int64) []float64 {
-	var featureVals []float64
-	for i := range data {
-		featureVals = append(featureVals, data[i][feature])
-	}
-	return featureVals
-}
-
 // Function to Create New Decision Tree Classifier.
 // It assigns all of the hyperparameters by user into the tree attributes.
 func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CARTDecisionTreeClassifier {
@@ -153,19 +132,6 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64)
 	return &tree
 }
 
-// Make sure that split being considered has not been done before.
-// Else we will unnecessarily try splits that won't improve Impurity.
-func classifierValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
-	for i := range triedSplits {
-		split := triedSplits[i]
-		featureTried, thresholdTried := split[0], split[1]
-		if int64(featureTried) == feature && thresholdTried == threshold {
-			return false
-		}
-	}
-	return true
-}
-
 // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting
 func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) {
 	s := NewSlice(featureVal)
@@ -202,7 +168,7 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r
 func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	var emptyNode classifierNode
 
-	data := classifierConvertInstancesToProblemVec(X)
+	data := convertInstancesToProblemVec(X)
 	y := classifierConvertInstancesToLabelVec(X)
 	emptyNode = classifierBestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0)
 
@@ -221,40 +187,29 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	}
 
 	numFeatures := len(data[0])
-	var bestGini float64
-	var origGini float64
+	var bestGini, origGini float64
 
 	// Calculate loss based on Criterion Specified by user
-	if criterion == GINI {
-		origGini, upperNode.LeftLabel = giniImpurity(y, labels)
-	} else if criterion == ENTROPY {
-		origGini, upperNode.LeftLabel = entropy(y, labels)
-	} else {
-		panic("Invalid impurity function, choose from GINI or ENTROPY")
-	}
+	origGini, upperNode.LeftLabel = calculateClassificationLoss(y, labels, criterion)
 
 	bestGini = origGini
 
-	bestLeft := data
-	bestRight := data
-	bestLefty := y
-	bestRighty := y
+	bestLeft, bestRight, bestLefty, bestRighty := data, data, y, y
 
 	numData := len(data)
 
-	bestLeftGini := bestGini
-	bestRightGini := bestGini
+	bestLeftGini, bestRightGini := bestGini, bestGini
 
 	upperNode.Use_not = true
 
-	var leftN classifierNode
-	var rightN classifierNode
+	var leftN, rightN classifierNode
 	// Iterate over all features
 	for i := 0; i < numFeatures; i++ {
-		featureVal := classifierGetFeature(data, int64(i))
-		unique := classifierFindUnique(featureVal)
+		featureVal := getFeature(data, int64(i))
+		unique := findUnique(featureVal)
 		sort.Float64s(unique)
-		numUnique := len(unique)
 
 		sortData, sortY := classifierReOrderData(featureVal, data, y)
 
@@ -263,53 +218,43 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 		var left, right [][]float64
 		var lefty, righty []int64
 		// Iterate over all possible thresholds for that feature
-		for j := range unique {
-			if j != (numUnique - 1) {
-				threshold := (unique[j] + unique[j+1]) / 2
-				// Ensure that same split has not been made before
-				if classifierValidate(tree.triedSplits, int64(i), threshold) {
-					// We need to split data from fresh when considering new feature for the first time.
-					// Otherwise, we need to update the split by moving data points from left to right.
-					if firstTime {
-						left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold)
-						firstTime = false
-					} else {
-						left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold)
-					}
-
-					var leftGini float64
-					var rightGini float64
-					var leftLabels int64
-					var rightLabels int64
-
-					if criterion == GINI {
-						leftGini, leftLabels = giniImpurity(lefty, labels)
-						rightGini, rightLabels = giniImpurity(righty, labels)
-					} else if criterion == ENTROPY {
-						leftGini, leftLabels = entropy(lefty, labels)
-						rightGini, rightLabels = entropy(righty, labels)
-					}
-					// Calculate weighted gini impurity of child nodes
-					subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData))
-
-					// If we find a split that reduces impurity
-					if subGini < bestGini {
-						bestGini = subGini
-						bestLeft = left
-						bestRight = right
-						bestLefty = lefty
-						bestRighty = righty
-						upperNode.Threshold = threshold
-						upperNode.Feature = int64(i)
-
-						upperNode.LeftLabel = leftLabels
-						upperNode.RightLabel = rightLabels
-
-						bestLeftGini = leftGini
-						bestRightGini = rightGini
-					}
-				}
-			}
+		for j := 0; j < len(unique)-1; j++ {
+			threshold := (unique[j] + unique[j+1]) / 2
+			// Ensure that same split has not been made before
+			if validate(tree.triedSplits, int64(i), threshold) {
+				// We need to split data from fresh when considering new feature for the first time.
+				// Otherwise, we need to update the split by moving data points from left to right.
+				if firstTime {
+					left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold)
+					firstTime = false
+				} else {
+					left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold)
+				}
+
+				var leftGini, rightGini float64
+				var leftLabels, rightLabels int64
+
+				leftGini, leftLabels = calculateClassificationLoss(lefty, labels, criterion)
+				rightGini, rightLabels = calculateClassificationLoss(righty, labels, criterion)
+
+				// Calculate weighted gini impurity of child nodes
+				subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData))
+
+				// If we find a split that reduces impurity
+				if subGini < bestGini {
+					bestGini = subGini
+
+					bestLeft, bestRight = left, right
+					bestLefty, bestRighty = lefty, righty
+
+					upperNode.Threshold, upperNode.Feature = threshold, int64(i)
+
+					upperNode.LeftLabel, upperNode.RightLabel = leftLabels, rightLabels
+
+					bestLeftGini, bestRightGini = leftGini, rightGini
+				}
+			}
 		}
 	}
@@ -366,10 +311,8 @@ func classifierPrintTreeFromNode(tree classifierNode, spacing string) string {
 		returnString += spacing + "---> True" + "\n"
 		returnString += " " + spacing + "PREDICT "
 		returnString += strconv.FormatInt(tree.LeftLabel, 10) + "\n"
-
 	}
 	if tree.Right == nil {
-
 		returnString += spacing + "---> False" + "\n"
 		returnString += " " + spacing + "PREDICT "
 		returnString += strconv.FormatInt(tree.RightLabel, 10) + "\n"
@@ -409,7 +352,7 @@ func classifierPredictSingle(tree classifierNode, instance []float64) int64 {
 // Given test data, return predictions for every datapoint. calls classifierPredictFromNode
 func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 {
 	root := *tree.RootNode
-	test := classifierConvertInstancesToProblemVec(X_test)
+	test := convertInstancesToProblemVec(X_test)
 	return classifierPredictFromNode(root, test)
 }
 
@@ -429,7 +372,7 @@ func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 {
 // Calls classifierEvaluateFromNode
 func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
-	xTest := classifierConvertInstancesToProblemVec(test)
+	xTest := convertInstancesToProblemVec(test)
 	yTest := classifierConvertInstancesToLabelVec(test)
 	return classifierEvaluateFromNode(rootNode, xTest, yTest)
 }
@@ -447,31 +390,6 @@ func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest []
 	return accuracy
 }
 
-// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
-func classifierConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
-	// Allocate problem array
-	_, rows := X.Size()
-	problemVec := make([][]float64, rows)
-
-	// Retrieve numeric non-class Attributes
-	numericAttrs := base.NonClassFloatAttributes(X)
-	numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
-
-	// Convert each row
-	X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
-		// Allocate a new row
-		probRow := make([]float64, len(numericAttrSpecs))
-		// Read out the row
-		for i, _ := range numericAttrSpecs {
-			probRow[i] = base.UnpackBytesToFloat(row[i])
-		}
-		// Add the row
-		problemVec[rowNo] = probRow
-		return true, nil
-	})
-	return problemVec
-}
-
 // Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
 func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 {
 	// Get the class Attributes
trees/cart_regressor.go

@@ -81,6 +81,16 @@ func mseImpurity(y []float64) (float64, float64) {
 	return meanSquaredError(y, yHat), yHat
 }
 
+func calculateRegressionLoss(y []float64, criterion string) (float64, float64) {
+	if criterion == MAE {
+		return maeImpurity(y)
+	} else if criterion == MSE {
+		return mseImpurity(y)
+	} else {
+		panic("Invalid impurity function, choose from MAE or MSE")
+	}
+}
+
 // Split the data into left and right based on threshold and feature.
 func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) {
 	var left [][]float64
@@ -102,39 +112,6 @@ func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshol
 	return left, right, lefty, righty
 }
 
-// Helper function for finding unique values.
-// Used for isolating unique values in a feature.
-func regressorStringInSlice(a float64, list []float64) bool {
-	for _, b := range list {
-		if b == a {
-			return true
-		}
-	}
-	return false
-}
-
-// Isolate only unique values.
-// This way we can only try unique splits.
-func regressorFindUnique(data []float64) []float64 {
-	var unique []float64
-	for i := range data {
-		if !regressorStringInSlice(data[i], unique) {
-			unique = append(unique, data[i])
-		}
-	}
-	return unique
-}
-
-// Extract out a single feature from data.
-// Reduces complexity in managing splits and sorting
-func regressorGetFeature(data [][]float64, feature int64) []float64 {
-	var featureVals []float64
-	for i := range data {
-		featureVals = append(featureVals, data[i][feature])
-	}
-	return featureVals
-}
-
 // Interface for creating new Decision Tree Regressor
 func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTreeRegressor {
 	var tree CARTDecisionTreeRegressor
@@ -143,19 +120,6 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTre
 	return &tree
 }
 
-// Validate that the split being tested has not been done before.
-// This prevents redundant splits from happening.
-func regressorValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
-	for i := range triedSplits {
-		split := triedSplits[i]
-		featureTried, thresholdTried := split[0], split[1]
-		if int64(featureTried) == feature && thresholdTried == threshold {
-			return false
-		}
-	}
-	return true
-}
-
 // Re order data based on a feature for optimizing code
 // Helps in updating splits without reiterating entire dataset
 func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) {
@@ -204,6 +168,7 @@ func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 // Recursive function - stops if maxDepth is reached or nodes are pure
 func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) regressorNode {
 
+	// Ensure that we have not reached maxDepth. maxDepth = -1 means split until nodes are pure
 	depth++
 
 	if depth > maxDepth && maxDepth != -1 {
@@ -211,39 +176,27 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	}
 
 	numFeatures := len(data[0])
-	var bestLoss float64
-	var origLoss float64
+	var bestLoss, origLoss float64
 
-	if criterion == MAE {
-		origLoss, upperNode.LeftPred = maeImpurity(y)
-	} else if criterion == MSE {
-		origLoss, upperNode.LeftPred = mseImpurity(y)
-	} else {
-		panic("Invalid impurity function, choose from MAE or MSE")
-	}
+	origLoss, upperNode.LeftPred = calculateRegressionLoss(y, criterion)
 
 	bestLoss = origLoss
 
-	bestLeft := data
-	bestRight := data
-	bestLefty := y
-	bestRighty := y
+	bestLeft, bestRight, bestLefty, bestRighty := data, data, y, y
 
 	numData := len(data)
 
-	bestLeftLoss := bestLoss
-	bestRightLoss := bestLoss
+	bestLeftLoss, bestRightLoss := bestLoss, bestLoss
 
 	upperNode.Use_not = true
 
-	var leftN regressorNode
-	var rightN regressorNode
+	var leftN, rightN regressorNode
 	// Iterate over all features
 	for i := 0; i < numFeatures; i++ {
-		featureVal := regressorGetFeature(data, int64(i))
-		unique := regressorFindUnique(featureVal)
+		featureVal := getFeature(data, int64(i))
+		unique := findUnique(featureVal)
 		sort.Float64s(unique)
-		numUnique := len(unique)
 
 		sortData, sortY := regressorReOrderData(featureVal, data, y)
 
@@ -252,49 +205,36 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 		var left, right [][]float64
 		var lefty, righty []float64
 
-		for j := range unique {
-			if j != (numUnique - 1) {
-				threshold := (unique[j] + unique[j+1]) / 2
-				if regressorValidate(tree.triedSplits, int64(i), threshold) {
-					if firstTime {
-						left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold)
-						firstTime = false
-					} else {
-						left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold)
-					}
-
-					var leftLoss float64
-					var rightLoss float64
-					var leftPred float64
-					var rightPred float64
-
-					if criterion == MAE {
-						leftLoss, leftPred = maeImpurity(lefty)
-						rightLoss, rightPred = maeImpurity(righty)
-					} else if criterion == MSE {
-						leftLoss, leftPred = mseImpurity(lefty)
-						rightLoss, rightPred = mseImpurity(righty)
-					}
-
-					subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData))
-
-					if subLoss < bestLoss {
-						bestLoss = subLoss
-						bestLeft = left
-						bestRight = right
-						bestLefty = lefty
-						bestRighty = righty
-						upperNode.Threshold = threshold
-						upperNode.Feature = int64(i)
-
-						upperNode.LeftPred = leftPred
-						upperNode.RightPred = rightPred
-
-						bestLeftLoss = leftLoss
-						bestRightLoss = rightLoss
-					}
-				}
-			}
+		for j := 0; j < len(unique)-1; j++ {
+			threshold := (unique[j] + unique[j+1]) / 2
+			if validate(tree.triedSplits, int64(i), threshold) {
+				if firstTime {
+					left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold)
+					firstTime = false
+				} else {
+					left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold)
+				}
+
+				var leftLoss, rightLoss float64
+				var leftPred, rightPred float64
+
+				leftLoss, leftPred = calculateRegressionLoss(lefty, criterion)
+				rightLoss, rightPred = calculateRegressionLoss(righty, criterion)
+
+				subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData))
+
+				if subLoss < bestLoss {
+					bestLoss = subLoss
+
+					bestLeft, bestRight = left, right
+					bestLefty, bestRighty = lefty, righty
+
+					upperNode.Threshold, upperNode.Feature = threshold, int64(i)
+
+					upperNode.LeftPred, upperNode.RightPred = leftPred, rightPred
+
+					bestLeftLoss, bestRightLoss = leftLoss, rightLoss
+				}
+			}
 		}
 	}
@@ -312,19 +252,16 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 			if leftN.Use_not == true {
 				upperNode.Left = &leftN
 			}
-
 		}
 
 		if bestRightLoss > 0 {
 			tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold})
 			rightN = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth)
 			if rightN.Use_not == true {
 				upperNode.Right = &rightN
 			}
-
 		}
-
 	}
 
 	return upperNode
 }
@@ -349,20 +286,17 @@ func regressorPrintTreeFromNode(tree regressorNode, spacing string) string {
 		returnString += fmt.Sprintf("%.3f", tree.LeftPred) + "\n"
 	}
 	if tree.Right == nil {
-
 		returnString += spacing + "---> False" + "\n"
 		returnString += " " + spacing + "PREDICT "
 		returnString += fmt.Sprintf("%.3f", tree.RightPred) + "\n"
 	}
 
 	if tree.Left != nil {
-		// fmt.Println(spacing + "---> True")
 		returnString += spacing + "---> True" + "\n"
 		returnString += regressorPrintTreeFromNode(*tree.Left, spacing+" ")
 	}
 
 	if tree.Right != nil {
-		// fmt.Println(spacing + "---> False")
 		returnString += spacing + "---> False" + "\n"
 		returnString += regressorPrintTreeFromNode(*tree.Right, spacing+" ")
 	}
trees/cart_test.go

@@ -38,10 +38,10 @@ func TestRegressor(t *testing.T) {
 		So(len(righty), ShouldEqual, 2)
 
 		// Is isolating unique values working properly
-		So(len(classifierFindUnique([]float64{10, 1, 1})), ShouldEqual, 2)
+		So(len(findUnique([]float64{10, 1, 1})), ShouldEqual, 2)
 
 		// is data reordered correctly
-		orderedData, orderedY := classifierReOrderData(classifierGetFeature(classifierData, 1), classifierData, classifiery)
+		orderedData, orderedY := classifierReOrderData(getFeature(classifierData, 1), classifierData, classifiery)
 		fmt.Println(orderedData)
 		fmt.Println(orderedY)
 		So(orderedData[1][1], ShouldEqual, 3.0)
@@ -85,11 +85,8 @@ func TestRegressor(t *testing.T) {
 		So(len(rightData), ShouldEqual, 2)
 		So(len(righty), ShouldEqual, 2)
 
-		// Is isolating unique values working properly
-		So(len(regressorFindUnique([]float64{10, 1, 1})), ShouldEqual, 2)
-
 		// is data reordered correctly
-		regressorOrderedData, regressorOrderedY := regressorReOrderData(regressorGetFeature(data, 1), data, y)
+		regressorOrderedData, regressorOrderedY := regressorReOrderData(getFeature(data, 1), data, y)
 
 		So(regressorOrderedData[1][1], ShouldEqual, 3.0)
 		So(regressorOrderedY[0], ShouldEqual, 2)
trees/cart_utils.go (new file, 74 lines)

@@ -0,0 +1,74 @@
+package trees
+
+import (
+	"github.com/sjwhitworth/golearn/base"
+)
+
+// Helper Function to check if data point is unique or not.
+// We will use this to isolate unique values of a feature
+func stringInSlice(a float64, list []float64) bool {
+	for _, b := range list {
+		if b == a {
+			return true
+		}
+	}
+	return false
+}
+
+// Isolate only unique values. This way, we can try only unique splits and not redundant ones.
+func findUnique(data []float64) []float64 {
+	var unique []float64
+	for i := range data {
+		if !stringInSlice(data[i], unique) {
+			unique = append(unique, data[i])
+		}
+	}
+	return unique
+}
+
+// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits.
+func getFeature(data [][]float64, feature int64) []float64 {
+	var featureVals []float64
+	for i := range data {
+		featureVals = append(featureVals, data[i][feature])
+	}
+	return featureVals
+}
+
+// Make sure that split being considered has not been done before.
+// Else we will unnecessarily try splits that won't improve Impurity.
+func validate(triedSplits [][]float64, feature int64, threshold float64) bool {
+	for i := range triedSplits {
+		split := triedSplits[i]
+		featureTried, thresholdTried := split[0], split[1]
+		if int64(featureTried) == feature && thresholdTried == threshold {
+			return false
+		}
+	}
+	return true
+}
+
+// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
+func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
+	// Allocate problem array
+	_, rows := X.Size()
+	problemVec := make([][]float64, rows)
+
+	// Retrieve numeric non-class Attributes
+	numericAttrs := base.NonClassFloatAttributes(X)
+	numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
+
+	// Convert each row
+	X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
+		// Allocate a new row
+		probRow := make([]float64, len(numericAttrSpecs))
+		// Read out the row
+		for i, _ := range numericAttrSpecs {
+			probRow[i] = base.UnpackBytesToFloat(row[i])
+		}
+		// Add the row
+		problemVec[rowNo] = probRow
+		return true, nil
+	})
+	return problemVec
+}
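For a feel of how the consolidated helpers fit together during split search, here is a minimal standalone sketch exercising getFeature and findUnique on toy data; the function bodies are copied from the cart_utils.go additions above so the snippet runs on its own:

package main

import (
	"fmt"
	"sort"
)

// Copied from the cart_utils.go additions above.
func stringInSlice(a float64, list []float64) bool {
	for _, b := range list {
		if b == a {
			return true
		}
	}
	return false
}

func findUnique(data []float64) []float64 {
	var unique []float64
	for i := range data {
		if !stringInSlice(data[i], unique) {
			unique = append(unique, data[i])
		}
	}
	return unique
}

func getFeature(data [][]float64, feature int64) []float64 {
	var featureVals []float64
	for i := range data {
		featureVals = append(featureVals, data[i][feature])
	}
	return featureVals
}

func main() {
	data := [][]float64{{10, 1}, {1, 3}, {1, 2}}
	featureVal := getFeature(data, 0) // column 0 -> [10 1 1]
	unique := findUnique(featureVal)  // duplicates collapsed -> [10 1]
	sort.Float64s(unique)             // sorted before threshold search, as in classifierBestSplit
	fmt.Println(unique)               // [1 10]
}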