Add base.FixedDataGrid integration to CTree Predict and Evaluate

2025-04-26 13:49:14 +08:00 · 2020-07-18 10:47:22 +05:30 · 2020-07-18 10:47:22 +05:30 · d1228c5508
commit d1228c5508
parent 8848652943
2 changed files with 38 additions and 34 deletions
--- a/linear_models/logistic.go
+++ b/linear_models/logistic.go
@ -3,6 +3,7 @@ package linear_models
 import (
 	"errors"
 	"fmt"
+
 	"github.com/sjwhitworth/golearn/base"
 )

--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@ -22,6 +22,7 @@ type CNode struct {
 }

 // CTree: Tree struct for Decision Tree Classifier
+type CTree struct {
 	RootNode    *CNode
 	criterion   string
 	maxDepth    int64
@ -81,7 +82,7 @@ func entropy(y []int64, labels []int64) (float64, int64) {
 }

 // Split the data into left node and right node based on feature and threshold - only needed for fresh nodes
-func testSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) {
+func ctestSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) {
 	var left [][]float64
 	var right [][]float64
 	var lefty []int64
@ -102,7 +103,7 @@ func testSplit(data [][]float64, feature int64, y []int64, threshold float64) ([
 }

 // Helper Function to check if data point is unique or not
-func stringInSlice(a float64, list []float64) bool {
+func cstringInSlice(a float64, list []float64) bool {
 	for _, b := range list {
 		if b == a {
 			return true
@ -112,10 +113,10 @@ func stringInSlice(a float64, list []float64) bool {
 }

 // Isolate only unique values. Needed for splitting data.
-func findUnique(data []float64) []float64 {
+func cfindUnique(data []float64) []float64 {
 	var unique []float64
 	for i := range data {
-		if !stringInSlice(data[i], unique) {
+		if !cstringInSlice(data[i], unique) {
 			unique = append(unique, data[i])
 		}
 	}
@ -123,7 +124,7 @@ func findUnique(data []float64) []float64 {
 }

 // Isolate only the feature being considered for splitting
-func getFeature(data [][]float64, feature int64) []float64 {
+func cgetFeature(data [][]float64, feature int64) []float64 {
 	var featureVals []float64
 	for i := range data {
 		featureVals = append(featureVals, data[i][feature])
@ -142,7 +143,7 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64)
 }

 // Make sure that split being considered has not been done before
-func validate(triedSplits [][]float64, feature int64, threshold float64) bool {
+func cvalidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	for i := range triedSplits {
 		split := triedSplits[i]
 		featureTried, thresholdTried := split[0], split[1]
@ -176,7 +177,7 @@ func cNewSlice(n []float64) *cSlice {
 }

 // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting
-func reOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) {
+func creOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) {
 	s := cNewSlice(featureVal)
 	sort.Sort(s)

@ -194,7 +195,7 @@ func reOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64
 }

 // Change data in Left Node and Right Node based on change in threshold
-func updateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) {
+func cupdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) {

 	for right[0][feature] < threshold {
 		left = append(left, right[0])
@ -212,13 +213,13 @@ func (tree *CTree) Fit(X base.FixedDataGrid) {

 	data := classifierConvertInstancesToProblemVec(X)
 	y := classifierConvertInstancesToLabelVec(X)
-	emptyNode = bestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0)
+	emptyNode = cbestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0)

 	tree.RootNode = &emptyNode
 }

 // Iterativly find and record the best split - recursive function
-func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode {
+func cbestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode {

 	// Ensure that we have not reached maxDepth. maxDepth =-1 means split until nodes are pure
 	depth++
@ -258,12 +259,12 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod
 	var rightN CNode
 	// Iterate over all features
 	for i := 0; i < numFeatures; i++ {
-		featureVal := getFeature(data, int64(i))
-		unique := findUnique(featureVal)
+		featureVal := cgetFeature(data, int64(i))
+		unique := cfindUnique(featureVal)
 		sort.Float64s(unique)
 		numUnique := len(unique)

-		sortData, sortY := reOrderData(featureVal, data, y)
+		sortData, sortY := creOrderData(featureVal, data, y)

 		firstTime := true

@ -274,14 +275,14 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod
 			if j != (numUnique - 1) {
 				threshold := (unique[j] + unique[j+1]) / 2
 				// Ensure that same split has not been made before
-				if validate(tree.triedSplits, int64(i), threshold) {
+				if cvalidate(tree.triedSplits, int64(i), threshold) {
 					// We need to split data from fresh when considering new feature for the first time.
 					// Otherwise, we need to update the split by moving data points from left to right.
 					if firstTime {
-						left, right, lefty, righty = testSplit(sortData, int64(i), sortY, threshold)
+						left, right, lefty, righty = ctestSplit(sortData, int64(i), sortY, threshold)
 						firstTime = false
 					} else {
-						left, lefty, right, righty = updateSplit(left, lefty, right, righty, int64(i), threshold)
+						left, lefty, right, righty = cupdateSplit(left, lefty, right, righty, int64(i), threshold)
 					}

 					var leftGini float64
@ -332,7 +333,7 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod
 		if bestLeftGini > 0 {
 			tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold})
 			// Recursive splitting logic
-			leftN = bestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth)
+			leftN = cbestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth)
 			if leftN.Use_not == true {
 				upperNode.Left = &leftN
 			}
@ -342,7 +343,7 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod
 		if bestRightGini > 0 {
 			tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold})
 			// Recursive splitting logic
-			rightN = bestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth)
+			rightN = cbestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth)
 			if rightN.Use_not == true {
 				upperNode.Right = &rightN
 			}
@ -357,11 +358,11 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod
 // PrintTree : this function prints out entire tree for visualization - visible to user
 func (tree *CTree) PrintTree() {
 	rootNode := *tree.RootNode
-	printTreeFromNode(rootNode, "")
+	cprintTreeFromNode(rootNode, "")
 }

 // Tree struct has root node. That is used to print tree - invisible to user but called from PrintTree
-func printTreeFromNode(tree CNode, spacing string) float64 {
+func cprintTreeFromNode(tree CNode, spacing string) float64 {

 	fmt.Print(spacing + "Feature ")
 	fmt.Print(tree.Feature)
@ -381,59 +382,61 @@ func printTreeFromNode(tree CNode, spacing string) float64 {

 	if tree.Left != nil {
 		fmt.Println(spacing + "---> True")
-		printTreeFromNode(*tree.Left, spacing+"  ")
+		cprintTreeFromNode(*tree.Left, spacing+"  ")
 	}

 	if tree.Right != nil {
 		fmt.Println(spacing + "---> False")
-		printTreeFromNode(*tree.Right, spacing+"  ")
+		cprintTreeFromNode(*tree.Right, spacing+"  ")
 	}

 	return 0.0
 }

 // Predict a single data point by traversing the entire tree
-func predictSingle(tree CNode, instance []float64) int64 {
+func cpredictSingle(tree CNode, instance []float64) int64 {
 	if instance[tree.Feature] < tree.Threshold {
 		if tree.Left == nil {
 			return tree.LeftLabel
 		} else {
-			return predictSingle(*tree.Left, instance)
+			return cpredictSingle(*tree.Left, instance)
 		}
 	} else {
 		if tree.Right == nil {
 			return tree.RightLabel
 		} else {
-			return predictSingle(*tree.Right, instance)
+			return cpredictSingle(*tree.Right, instance)
 		}
 	}
 }

 // Predict is visible to user. Given test data, they receive predictions for every datapoint.
-func (tree *CTree) Predict(test [][]float64) []int64 {
+func (tree *CTree) Predict(X_test base.FixedDataGrid) []int64 {
 	root := *tree.RootNode
-
-	return predictFromNode(root, test)
+	test := classifierConvertInstancesToProblemVec(X_test)
+	return cpredictFromNode(root, test)
 }

 // This function uses the rootnode from Predict. It is invisible to user, but called from predict method.
-func predictFromNode(tree CNode, test [][]float64) []int64 {
+func cpredictFromNode(tree CNode, test [][]float64) []int64 {
 	var preds []int64
 	for i := range test {
-		iPred := predictSingle(tree, test[i])
+		iPred := cpredictSingle(tree, test[i])
 		preds = append(preds, iPred)
 	}
 	return preds
 }

 // Given Test data and label, return the accuracy of the classifier. Data has to be in float slice format before feeding.
-func (tree *CTree) Evaluate(xTest [][]float64, yTest []int64) float64 {
+func (tree *CTree) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
-	return evaluateFromNode(rootNode, xTest, yTest)
+	xTest := classifierConvertInstancesToProblemVec(test)
+	yTest := classifierConvertInstancesToLabelVec(test)
+	return cevaluateFromNode(rootNode, xTest, yTest)
 }

-func evaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 {
-	preds := predictFromNode(tree, xTest)
+func cevaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 {
+	preds := cpredictFromNode(tree, xTest)
 	accuracy := 0.0
 	for i := range preds {
 		if preds[i] == yTest[i] {