diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index 9ae3b8e..c2ba59c 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -90,6 +90,16 @@ func entropy(y []int64, labels []int64) (float64, int64) {
 	return entropy, maxLabel
 }
 
+func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64) {
+	if criterion == GINI {
+		return giniImpurity(y, labels)
+	} else if criterion == ENTROPY {
+		return entropy(y, labels)
+	} else {
+		panic("Invalid impurity function, choose from GINI or ENTROPY")
+	}
+}
+
 // Split the data into left node and right node based on feature and threshold
 func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) {
 	var left [][]float64
@@ -111,37 +121,6 @@ func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold
 	return left, right, lefty, righty
 }
 
-// Helper Function to check if data point is unique or not.
-// We will use this to isolate unique values of a feature
-func classifierStringInSlice(a float64, list []float64) bool {
-	for _, b := range list {
-		if b == a {
-			return true
-		}
-	}
-	return false
-}
-
-// Isolate only unique values. This way, we can try only unique splits and not redundant ones.
-func classifierFindUnique(data []float64) []float64 {
-	var unique []float64
-	for i := range data {
-		if !classifierStringInSlice(data[i], unique) {
-			unique = append(unique, data[i])
-		}
-	}
-	return unique
-}
-
-// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits.
-func classifierGetFeature(data [][]float64, feature int64) []float64 {
-	var featureVals []float64
-	for i := range data {
-		featureVals = append(featureVals, data[i][feature])
-	}
-	return featureVals
-}
-
 // Function to Create New Decision Tree Classifier.
 // It assigns all of the hyperparameters by user into the tree attributes.
 func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CARTDecisionTreeClassifier {
@@ -153,19 +132,6 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64)
 	return &tree
 }
 
-// Make sure that split being considered has not been done before.
-// Else we will unnecessarily try splits that won't improve Impurity.
-func classifierValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
-	for i := range triedSplits {
-		split := triedSplits[i]
-		featureTried, thresholdTried := split[0], split[1]
-		if int64(featureTried) == feature && thresholdTried == threshold {
-			return false
-		}
-	}
-	return true
-}
-
 // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting
 func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) {
 	s := NewSlice(featureVal)
@@ -202,7 +168,7 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r
 func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	var emptyNode classifierNode
 
-	data := classifierConvertInstancesToProblemVec(X)
+	data := convertInstancesToProblemVec(X)
 	y := classifierConvertInstancesToLabelVec(X)
 
 	emptyNode = classifierBestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0)
@@ -221,40 +187,29 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	}
 
 	numFeatures := len(data[0])
-	var bestGini float64
-	var origGini float64
+	var bestGini, origGini float64
 
 	// Calculate loss based on Criterion Specified by user
-	if criterion == GINI {
-		origGini, upperNode.LeftLabel = giniImpurity(y, labels)
-	} else if criterion == ENTROPY {
-		origGini, upperNode.LeftLabel = entropy(y, labels)
-	} else {
-		panic("Invalid impurity function, choose from GINI or ENTROPY")
-	}
+	origGini, upperNode.LeftLabel = calculateClassificationLoss(y, labels, criterion)
 
 	bestGini = origGini
 
-	bestLeft := data
-	bestRight := data
-	bestLefty := y
-	bestRighty := y
+	bestLeft, bestRight, bestLefty, bestRighty := data, data, y, y
 
 	numData := len(data)
 
-	bestLeftGini := bestGini
-	bestRightGini := bestGini
+	bestLeftGini, bestRightGini := bestGini, bestGini
 
 	upperNode.Use_not = true
 
-	var leftN classifierNode
-	var rightN classifierNode
+	var leftN, rightN classifierNode
+
 	// Iterate over all features
 	for i := 0; i < numFeatures; i++ {
-		featureVal := classifierGetFeature(data, int64(i))
-		unique := classifierFindUnique(featureVal)
+
+		featureVal := getFeature(data, int64(i))
+		unique := findUnique(featureVal)
 		sort.Float64s(unique)
-		numUnique := len(unique)
 
 		sortData, sortY := classifierReOrderData(featureVal, data, y)
 
@@ -263,53 +218,43 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 		var left, right [][]float64
 		var lefty, righty []int64
 		// Iterate over all possible thresholds for that feature
-		for j := range unique {
-			if j != (numUnique - 1) {
-				threshold := (unique[j] + unique[j+1]) / 2
-				// Ensure that same split has not been made before
-				if classifierValidate(tree.triedSplits, int64(i), threshold) {
-					// We need to split data from fresh when considering new feature for the first time.
-					// Otherwise, we need to update the split by moving data points from left to right.
-					if firstTime {
-						left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold)
-						firstTime = false
-					} else {
-						left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold)
-					}
+		for j := 0; j < len(unique)-1; j++ {
-					var leftGini float64
-					var rightGini float64
-					var leftLabels int64
-					var rightLabels int64
-
-					if criterion == GINI {
-						leftGini, leftLabels = giniImpurity(lefty, labels)
-						rightGini, rightLabels = giniImpurity(righty, labels)
-					} else if criterion == ENTROPY {
-						leftGini, leftLabels = entropy(lefty, labels)
-						rightGini, rightLabels = entropy(righty, labels)
-					}
-					// Calculate weighted gini impurity of child nodes
-					subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData))
-
-					// If we find a split that reduces impurity
-					if subGini < bestGini {
-						bestGini = subGini
-						bestLeft = left
-						bestRight = right
-						bestLefty = lefty
-						bestRighty = righty
-						upperNode.Threshold = threshold
-						upperNode.Feature = int64(i)
-
-						upperNode.LeftLabel = leftLabels
-						upperNode.RightLabel = rightLabels
-
-						bestLeftGini = leftGini
-						bestRightGini = rightGini
-					}
+			threshold := (unique[j] + unique[j+1]) / 2
+			// Ensure that same split has not been made before
+			if validate(tree.triedSplits, int64(i), threshold) {
+				// We need to split data from fresh when considering new feature for the first time.
+				// Otherwise, we need to update the split by moving data points from left to right.
+				if firstTime {
+					left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold)
+					firstTime = false
+				} else {
+					left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold)
 				}
+				var leftGini, rightGini float64
+				var leftLabels, rightLabels int64
+
+				leftGini, leftLabels = calculateClassificationLoss(lefty, labels, criterion)
+				rightGini, rightLabels = calculateClassificationLoss(righty, labels, criterion)
+
+				// Calculate weighted gini impurity of child nodes
+				subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData))
+
+				// If we find a split that reduces impurity
+				if subGini < bestGini {
+					bestGini = subGini
+
+					bestLeft, bestRight = left, right
+
+					bestLefty, bestRighty = lefty, righty
+
+					upperNode.Threshold, upperNode.Feature = threshold, int64(i)
+
+					upperNode.LeftLabel, upperNode.RightLabel = leftLabels, rightLabels
+
+					bestLeftGini, bestRightGini = leftGini, rightGini
+				}
 			}
 		}
 	}
@@ -366,10 +311,8 @@ func classifierPrintTreeFromNode(tree classifierNode, spacing string) string {
 		returnString += spacing + "---> True" + "\n"
 		returnString += " " + spacing + "PREDICT "
 		returnString += strconv.FormatInt(tree.LeftLabel, 10) + "\n"
-
 	}
 	if tree.Right == nil {
-
 		returnString += spacing + "---> False" + "\n"
 		returnString += " " + spacing + "PREDICT "
 		returnString += strconv.FormatInt(tree.RightLabel, 10) + "\n"
@@ -409,7 +352,7 @@ func classifierPredictSingle(tree classifierNode, instance []float64) int64 {
 
 // Given test data, return predictions for every datapoint. calls classifierPredictFromNode
 func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 {
 	root := *tree.RootNode
-	test := classifierConvertInstancesToProblemVec(X_test)
+	test := convertInstancesToProblemVec(X_test)
 	return classifierPredictFromNode(root, test)
 }
 
@@ -429,7 +372,7 @@ func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 {
 // Calls classifierEvaluateFromNode
 func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
-	xTest := classifierConvertInstancesToProblemVec(test)
+	xTest := convertInstancesToProblemVec(test)
 	yTest := classifierConvertInstancesToLabelVec(test)
 	return classifierEvaluateFromNode(rootNode, xTest, yTest)
 }
 
@@ -447,31 +390,6 @@ func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest []
 	return accuracy
 }
 
-// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
-func classifierConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
-	// Allocate problem array
-	_, rows := X.Size()
-	problemVec := make([][]float64, rows)
-
-	// Retrieve numeric non-class Attributes
-	numericAttrs := base.NonClassFloatAttributes(X)
-	numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
-
-	// Convert each row
-	X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
-		// Allocate a new row
-		probRow := make([]float64, len(numericAttrSpecs))
-		// Read out the row
-		for i, _ := range numericAttrSpecs {
-			probRow[i] = base.UnpackBytesToFloat(row[i])
-		}
-		// Add the row
-		problemVec[rowNo] = probRow
-		return true, nil
-	})
-	return problemVec
-}
-
 // Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
 func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 {
 	// Get the class Attributes
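The classifier changes above reroute the helper calls (`convertInstancesToProblemVec`, `getFeature`, `findUnique`, `validate`) to the new shared `cart_utils.go` and collapse the per-criterion if/else chains into `calculateClassificationLoss`. For reference, a minimal usage sketch of the public API touched here (`NewDecisionTreeClassifier`, `Fit`, `Predict`, `Evaluate`) follows; the CSV path, the `"gini"` criterion string, and the label set are illustrative assumptions, not part of the patch.

```go
package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	// Any base.FixedDataGrid works here; this path is hypothetical.
	data, err := base.ParseCSVToInstances("datasets/iris_headers.csv", true)
	if err != nil {
		panic(err)
	}

	// Criterion dispatch now runs through calculateClassificationLoss, which
	// panics on anything other than the GINI/ENTROPY constants.
	tree := trees.NewDecisionTreeClassifier("gini", -1, []int64{0, 1, 2})
	tree.Fit(data)

	preds := tree.Predict(data)
	fmt.Println("first prediction:", preds[0])
	fmt.Printf("training accuracy: %.3f\n", tree.Evaluate(data))
}
```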
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index 34b7880..1d2d326 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -81,6 +81,16 @@ func mseImpurity(y []float64) (float64, float64) {
 	return meanSquaredError(y, yHat), yHat
 }
 
+func calculateRegressionLoss(y []float64, criterion string) (float64, float64) {
+	if criterion == MAE {
+		return maeImpurity(y)
+	} else if criterion == MSE {
+		return mseImpurity(y)
+	} else {
+		panic("Invalid impurity function, choose from MAE or MSE")
+	}
+}
+
 // Split the data into left and right based on trehsold and feature.
 func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) {
 	var left [][]float64
@@ -102,39 +112,6 @@ func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshol
 	return left, right, lefty, righty
 }
 
-// Helper function for finding unique values.
-// Used for isolating unique values in a feature.
-func regressorStringInSlice(a float64, list []float64) bool {
-	for _, b := range list {
-		if b == a {
-			return true
-		}
-	}
-	return false
-}
-
-// Isolate only unique values.
-// This way we can only try unique splits.
-func regressorFindUnique(data []float64) []float64 {
-	var unique []float64
-	for i := range data {
-		if !regressorStringInSlice(data[i], unique) {
-			unique = append(unique, data[i])
-		}
-	}
-	return unique
-}
-
-// Extract out a single feature from data.
-// Reduces complexity in managing splits and sorting
-func regressorGetFeature(data [][]float64, feature int64) []float64 {
-	var featureVals []float64
-	for i := range data {
-		featureVals = append(featureVals, data[i][feature])
-	}
-	return featureVals
-}
-
 // Interface for creating new Decision Tree Regressor
 func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTreeRegressor {
 	var tree CARTDecisionTreeRegressor
@@ -143,19 +120,6 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTre
 	return &tree
 }
 
-// Validate that the split being tested has not been done before.
-// This prevents redundant splits from hapenning.
-func regressorValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
-	for i := range triedSplits {
-		split := triedSplits[i]
-		featureTried, thresholdTried := split[0], split[1]
-		if int64(featureTried) == feature && thresholdTried == threshold {
-			return false
-		}
-	}
-	return true
-}
-
 // Re order data based on a feature for optimizing code
 // Helps in updating splits without reiterating entire dataset
 func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) {
@@ -204,6 +168,7 @@ func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 
 // Recursive function - stops if maxDepth is reached or nodes are pure
 func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) regressorNode {
+	// Ensure that we have not reached maxDepth. maxDepth =-1 means split until nodes are pure
 	depth++
 
 	if depth > maxDepth && maxDepth != -1 {
@@ -211,39 +176,27 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	}
 
 	numFeatures := len(data[0])
-	var bestLoss float64
-	var origLoss float64
+	var bestLoss, origLoss float64
 
-	if criterion == MAE {
-		origLoss, upperNode.LeftPred = maeImpurity(y)
-	} else if criterion == MSE {
-		origLoss, upperNode.LeftPred = mseImpurity(y)
-	} else {
-		panic("Invalid impurity function, choose from MAE or MSE")
-	}
+	origLoss, upperNode.LeftPred = calculateRegressionLoss(y, criterion)
 
 	bestLoss = origLoss
 
-	bestLeft := data
-	bestRight := data
-	bestLefty := y
-	bestRighty := y
+	bestLeft, bestRight, bestLefty, bestRighty := data, data, y, y
 
 	numData := len(data)
 
-	bestLeftLoss := bestLoss
-	bestRightLoss := bestLoss
+	bestLeftLoss, bestRightLoss := bestLoss, bestLoss
 
 	upperNode.Use_not = true
 
-	var leftN regressorNode
-	var rightN regressorNode
+	var leftN, rightN regressorNode
 
 	// Iterate over all features
 	for i := 0; i < numFeatures; i++ {
-		featureVal := regressorGetFeature(data, int64(i))
-		unique := regressorFindUnique(featureVal)
+
+		featureVal := getFeature(data, int64(i))
+		unique := findUnique(featureVal)
 		sort.Float64s(unique)
-		numUnique := len(unique)
 
 		sortData, sortY := regressorReOrderData(featureVal, data, y)
 
@@ -252,49 +205,36 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 		var left, right [][]float64
 		var lefty, righty []float64
 
-		for j := range unique {
-			if j != (numUnique - 1) {
-				threshold := (unique[j] + unique[j+1]) / 2
-				if regressorValidate(tree.triedSplits, int64(i), threshold) {
-					if firstTime {
-						left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold)
-						firstTime = false
-					} else {
-						left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold)
-					}
-
-					var leftLoss float64
-					var rightLoss float64
-					var leftPred float64
-					var rightPred float64
-
-					if criterion == MAE {
-						leftLoss, leftPred = maeImpurity(lefty)
-						rightLoss, rightPred = maeImpurity(righty)
-					} else if criterion == MSE {
-						leftLoss, leftPred = mseImpurity(lefty)
-						rightLoss, rightPred = mseImpurity(righty)
-					}
-
-					subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData))
-
-					if subLoss < bestLoss {
-						bestLoss = subLoss
-						bestLeft = left
-						bestRight = right
-						bestLefty = lefty
-						bestRighty = righty
-						upperNode.Threshold = threshold
-						upperNode.Feature = int64(i)
-
-						upperNode.LeftPred = leftPred
-						upperNode.RightPred = rightPred
-
-						bestLeftLoss = leftLoss
-						bestRightLoss = rightLoss
-					}
+		for j := 0; j < len(unique)-1; j++ {
+			threshold := (unique[j] + unique[j+1]) / 2
+			if validate(tree.triedSplits, int64(i), threshold) {
+				if firstTime {
+					left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold)
+					firstTime = false
+				} else {
+					left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold)
 				}
+				var leftLoss, rightLoss float64
+				var leftPred, rightPred float64
+
+				leftLoss, leftPred = calculateRegressionLoss(lefty, criterion)
+				rightLoss, rightPred = calculateRegressionLoss(righty, criterion)
+
+				subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData))
+
+				if subLoss < bestLoss {
+					bestLoss = subLoss
+
+					bestLeft, bestRight = left, right
+					bestLefty, bestRighty = lefty, righty
+
+					upperNode.Threshold, upperNode.Feature = threshold, int64(i)
+
+					upperNode.LeftPred, upperNode.RightPred = leftPred, rightPred
+
+					bestLeftLoss, bestRightLoss = leftLoss, rightLoss
+				}
 			}
 		}
 	}
@@ -312,19 +252,16 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 		if leftN.Use_not == true {
 			upperNode.Left = &leftN
 		}
-
 	}
+
 	if bestRightLoss > 0 {
 		tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold})
 		rightN = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth)
 		if rightN.Use_not == true {
 			upperNode.Right = &rightN
 		}
-
 	}
-
 	}
-
 	return upperNode
 }
@@ -349,20 +286,17 @@ func regressorPrintTreeFromNode(tree regressorNode, spacing string) string {
 		returnString += fmt.Sprintf("%.3f", tree.LeftPred) + "\n"
 	}
 	if tree.Right == nil {
-
 		returnString += spacing + "---> False" + "\n"
 		returnString += " " + spacing + "PREDICT "
 		returnString += fmt.Sprintf("%.3f", tree.RightPred) + "\n"
 	}
 
 	if tree.Left != nil {
-		// fmt.Println(spacing + "---> True")
 		returnString += spacing + "---> True" + "\n"
 		returnString += regressorPrintTreeFromNode(*tree.Left, spacing+" ")
 	}
 
 	if tree.Right != nil {
-		// fmt.Println(spacing + "---> False")
 		returnString += spacing + "---> False" + "\n"
 		returnString += regressorPrintTreeFromNode(*tree.Right, spacing+" ")
 	}
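After this change both `BestSplit` loops share the same shape: midpoint thresholds between consecutive unique feature values, a `validate` check against `triedSplits`, then a weighted child loss compared against the best seen so far. A standalone sketch of that weighted-loss arithmetic follows; `mse` here is a simplified stand-in for `mseImpurity` (which additionally returns the prediction), and the sample values are made up for illustration.

```go
package main

import "fmt"

// mse returns the mean squared error of y around its mean.
func mse(y []float64) float64 {
	var mean float64
	for _, v := range y {
		mean += v
	}
	mean /= float64(len(y))

	var loss float64
	for _, v := range y {
		loss += (v - mean) * (v - mean)
	}
	return loss / float64(len(y))
}

func main() {
	// A candidate threshold sends two targets left and three right.
	lefty := []float64{1.0, 1.2}
	righty := []float64{5.0, 5.5, 6.0}
	numData := float64(len(lefty) + len(righty))

	// Same formula as subLoss in regressorBestSplit: each child's loss is
	// weighted by the fraction of samples it receives.
	subLoss := mse(lefty)*float64(len(lefty))/numData +
		mse(righty)*float64(len(righty))/numData

	fmt.Printf("weighted child loss: %.3f\n", subLoss) // prints 0.104
}
```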
diff --git a/trees/cart_test.go b/trees/cart_test.go
index 047392a..50387b9 100644
--- a/trees/cart_test.go
+++ b/trees/cart_test.go
@@ -38,10 +38,10 @@ func TestRegressor(t *testing.T) {
 		So(len(righty), ShouldEqual, 2)
 
 		// Is isolating unique values working properly
-		So(len(classifierFindUnique([]float64{10, 1, 1})), ShouldEqual, 2)
+		So(len(findUnique([]float64{10, 1, 1})), ShouldEqual, 2)
 
 		// is data reordered correctly
-		orderedData, orderedY := classifierReOrderData(classifierGetFeature(classifierData, 1), classifierData, classifiery)
+		orderedData, orderedY := classifierReOrderData(getFeature(classifierData, 1), classifierData, classifiery)
 		fmt.Println(orderedData)
 		fmt.Println(orderedY)
 		So(orderedData[1][1], ShouldEqual, 3.0)
@@ -85,11 +85,8 @@ func TestRegressor(t *testing.T) {
 		So(len(rightData), ShouldEqual, 2)
 		So(len(righty), ShouldEqual, 2)
 
-		// Is isolating unique values working properly
-		So(len(regressorFindUnique([]float64{10, 1, 1})), ShouldEqual, 2)
-
 		// is data reordered correctly
-		regressorOrderedData, regressorOrderedY := regressorReOrderData(regressorGetFeature(data, 1), data, y)
+		regressorOrderedData, regressorOrderedY := regressorReOrderData(getFeature(data, 1), data, y)
 		So(regressorOrderedData[1][1], ShouldEqual, 3.0)
 		So(regressorOrderedY[0], ShouldEqual, 2)
 
diff --git a/trees/cart_utils.go b/trees/cart_utils.go
new file mode 100644
index 0000000..d3b9b4a
--- /dev/null
+++ b/trees/cart_utils.go
@@ -0,0 +1,74 @@
+package trees
+
+import (
+	"github.com/sjwhitworth/golearn/base"
+)
+
+// Helper Function to check if data point is unique or not.
+// We will use this to isolate unique values of a feature
+func stringInSlice(a float64, list []float64) bool {
+	for _, b := range list {
+		if b == a {
+			return true
+		}
+	}
+	return false
+}
+
+// Isolate only unique values. This way, we can try only unique splits and not redundant ones.
+func findUnique(data []float64) []float64 {
+	var unique []float64
+	for i := range data {
+		if !stringInSlice(data[i], unique) {
+			unique = append(unique, data[i])
+		}
+	}
+	return unique
+}
+
+// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits.
+func getFeature(data [][]float64, feature int64) []float64 {
+	var featureVals []float64
+	for i := range data {
+		featureVals = append(featureVals, data[i][feature])
+	}
+	return featureVals
+}
+
+// Make sure that split being considered has not been done before.
+// Else we will unnecessarily try splits that won't improve Impurity.
+func validate(triedSplits [][]float64, feature int64, threshold float64) bool {
+	for i := range triedSplits {
+		split := triedSplits[i]
+		featureTried, thresholdTried := split[0], split[1]
+		if int64(featureTried) == feature && thresholdTried == threshold {
+			return false
+		}
+	}
+	return true
+}
+
+// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
+func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
+	// Allocate problem array
+	_, rows := X.Size()
+	problemVec := make([][]float64, rows)
+
+	// Retrieve numeric non-class Attributes
+	numericAttrs := base.NonClassFloatAttributes(X)
+	numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
+
+	// Convert each row
+	X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
+		// Allocate a new row
+		probRow := make([]float64, len(numericAttrSpecs))
+		// Read out the row
+		for i, _ := range numericAttrSpecs {
+			probRow[i] = base.UnpackBytesToFloat(row[i])
+		}
+		// Add the row
+		problemVec[rowNo] = probRow
+		return true, nil
+	})
+	return problemVec
+}
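The new `cart_utils.go` carries one copy of the helpers that previously existed in classifier- and regressor-flavoured pairs. A hypothetical companion test (not part of this patch) spelling out their contracts could look like the sketch below; it sits in package trees so it can reach the unexported functions, and the expected values follow directly from the helper definitions above.

```go
package trees

import "testing"

func TestSharedCartUtils(t *testing.T) {
	// findUnique collapses duplicates: {10, 1, 1} has two distinct values.
	if got := len(findUnique([]float64{10, 1, 1})); got != 2 {
		t.Errorf("findUnique: want 2 distinct values, got %d", got)
	}

	// getFeature extracts one column from row-major data.
	data := [][]float64{{1, 9}, {2, 8}}
	if col := getFeature(data, 1); col[0] != 9 || col[1] != 8 {
		t.Errorf("getFeature: want column [9 8], got %v", col)
	}

	// validate rejects a (feature, threshold) pair that has been tried already.
	tried := [][]float64{{0, 4.5}}
	if validate(tried, 0, 4.5) {
		t.Error("validate: an already-tried split should be rejected")
	}
	if !validate(tried, 1, 4.5) {
		t.Error("validate: a fresh split should be accepted")
	}
}
```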