From 884865294358810caf94588772398bf1aa1d8a5f Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 16 Jul 2020 13:37:34 +0530 Subject: [PATCH 01/24] Added Decision Tree Classifier CART implementation of Decision Tree Classifier, based on Gini Impurity or Entropy, as selected by the user. --- trees/cart_classifier.go | 495 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 495 insertions(+) create mode 100644 trees/cart_classifier.go diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go new file mode 100644 index 0000000..c1e4043 --- /dev/null +++ b/trees/cart_classifier.go @@ -0,0 +1,495 @@ +package trees + +import ( + "fmt" + "math" + "sort" + "strings" + + "github.com/sjwhitworth/golearn/base" +) + +// CNode is Node struct for Decision Tree Classifier +type CNode struct { + Left *CNode + Right *CNode + Threshold float64 + Feature int64 + LeftLabel int64 + RightLabel int64 + Use_not bool + maxDepth int64 +} + +// CTree: Tree struct for Decision Tree Classifier + RootNode *CNode + criterion string + maxDepth int64 + labels []int64 + triedSplits [][]float64 +} + +// Calculate Gini Impurity of Target Labels +func giniImpurity(y []int64, labels []int64) (float64, int64) { + nInstances := len(y) + gini := 0.0 + maxLabelCount := 0 + var maxLabel int64 = 0 + for label := range labels { + numLabel := 0 + for target := range y { + if y[target] == labels[label] { + numLabel++ + } + } + p := float64(numLabel) / float64(nInstances) + gini += p * (1 - p) + if numLabel > maxLabelCount { + maxLabel = labels[label] + maxLabelCount = numLabel + } + } + return gini, maxLabel +} + +// Calculate Entropy loss of Target Labels +func entropy(y []int64, labels []int64) (float64, int64) { + nInstances := len(y) + entropy := 0.0 + maxLabelCount := 0 + var maxLabel int64 = 0 + for label := range labels { + numLabel := 0 + for target := range y { + if y[target] == labels[label] { + numLabel++ + } + } + p := float64(numLabel) / float64(nInstances) + + logP := math.Log2(p) + if p == 0 { + logP = 0 + } + entropy += -p * logP + if numLabel > maxLabelCount { + maxLabel = labels[label] + maxLabelCount = numLabel + } + } + return entropy, maxLabel +} + +// Split the data into left node and right node based on feature and threshold - only needed for fresh nodes +func testSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) { + var left [][]float64 + var right [][]float64 + var lefty []int64 + var righty []int64 + + for i := range data { + example := data[i] + if example[feature] < threshold { + left = append(left, example) + lefty = append(lefty, y[i]) + } else { + right = append(right, example) + righty = append(righty, y[i]) + } + } + + return left, right, lefty, righty +} + +// Helper Function to check if data point is unique or not +func stringInSlice(a float64, list []float64) bool { + for _, b := range list { + if b == a { + return true + } + } + return false +} + +// Isolate only unique values. Needed for splitting data. 
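+// For example, findUnique([]float64{2, 1, 2, 3}) returns [2, 1, 3] in
+// first-seen order; bestSplit sorts the result before deriving candidate
+// thresholds from consecutive pairs of unique values.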
+func findUnique(data []float64) []float64 { + var unique []float64 + for i := range data { + if !stringInSlice(data[i], unique) { + unique = append(unique, data[i]) + } + } + return unique +} + +// Isolate only the feature being considered for splitting +func getFeature(data [][]float64, feature int64) []float64 { + var featureVals []float64 + for i := range data { + featureVals = append(featureVals, data[i][feature]) + } + return featureVals +} + +// Function to Create New Decision Tree Classifier +func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CTree { + var tree CTree + tree.criterion = strings.ToLower(criterion) + tree.maxDepth = maxDepth + tree.labels = labels + + return &tree +} + +// Make sure that split being considered has not been done before +func validate(triedSplits [][]float64, feature int64, threshold float64) bool { + for i := range triedSplits { + split := triedSplits[i] + featureTried, thresholdTried := split[0], split[1] + if int64(featureTried) == feature && thresholdTried == threshold { + return false + } + } + return true +} + +// Helper struct for re-rdering data +type cSlice struct { + sort.Float64Slice + Idx []int +} + +// Helper function for re-ordering data +func (s cSlice) cSwap(i, j int) { + s.Float64Slice.Swap(i, j) + s.Idx[i], s.Idx[j] = s.Idx[j], s.Idx[i] +} + +// Final Helper Function for re-ordering data +func cNewSlice(n []float64) *cSlice { + s := &cSlice{Float64Slice: sort.Float64Slice(n), Idx: make([]int, len(n))} + + for i := range s.Idx { + s.Idx[i] = i + } + return s +} + +// Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting +func reOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) { + s := cNewSlice(featureVal) + sort.Sort(s) + + indexes := s.Idx + + var dataSorted [][]float64 + var ySorted []int64 + + for _, index := range indexes { + dataSorted = append(dataSorted, data[index]) + ySorted = append(ySorted, y[index]) + } + + return dataSorted, ySorted +} + +// Change data in Left Node and Right Node based on change in threshold +func updateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { + + for right[0][feature] < threshold { + left = append(left, right[0]) + right = right[1:] + lefty = append(lefty, righty[0]) + righty = righty[1:] + } + + return left, lefty, right, righty +} + +// Fit - Method visible to user to train tree +func (tree *CTree) Fit(X base.FixedDataGrid) { + var emptyNode CNode + + data := classifierConvertInstancesToProblemVec(X) + y := classifierConvertInstancesToLabelVec(X) + emptyNode = bestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) + + tree.RootNode = &emptyNode +} + +// Iterativly find and record the best split - recursive function +func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode { + + // Ensure that we have not reached maxDepth. 
maxDepth =-1 means split until nodes are pure + depth++ + + if maxDepth != -1 && depth > maxDepth { + return upperNode + } + + numFeatures := len(data[0]) + var bestGini float64 + var origGini float64 + + // Calculate loss based on Criterion Specified by user + if criterion == "gini" { + origGini, upperNode.LeftLabel = giniImpurity(y, labels) + } else if criterion == "entropy" { + origGini, upperNode.LeftLabel = entropy(y, labels) + } else { + panic("Invalid impurity function, choose from GINI or ENTROPY") + } + + bestGini = origGini + + bestLeft := data + bestRight := data + bestLefty := y + bestRighty := y + + numData := len(data) + + bestLeftGini := bestGini + bestRightGini := bestGini + + upperNode.Use_not = true + + var leftN CNode + var rightN CNode + // Iterate over all features + for i := 0; i < numFeatures; i++ { + featureVal := getFeature(data, int64(i)) + unique := findUnique(featureVal) + sort.Float64s(unique) + numUnique := len(unique) + + sortData, sortY := reOrderData(featureVal, data, y) + + firstTime := true + + var left, right [][]float64 + var lefty, righty []int64 + // Iterate over all possible thresholds for that feature + for j := range unique { + if j != (numUnique - 1) { + threshold := (unique[j] + unique[j+1]) / 2 + // Ensure that same split has not been made before + if validate(tree.triedSplits, int64(i), threshold) { + // We need to split data from fresh when considering new feature for the first time. + // Otherwise, we need to update the split by moving data points from left to right. + if firstTime { + left, right, lefty, righty = testSplit(sortData, int64(i), sortY, threshold) + firstTime = false + } else { + left, lefty, right, righty = updateSplit(left, lefty, right, righty, int64(i), threshold) + } + + var leftGini float64 + var rightGini float64 + var leftLabels int64 + var rightLabels int64 + + if criterion == "gini" { + leftGini, leftLabels = giniImpurity(lefty, labels) + rightGini, rightLabels = giniImpurity(righty, labels) + } else if criterion == "entropy" { + leftGini, leftLabels = entropy(lefty, labels) + rightGini, rightLabels = entropy(righty, labels) + } + // Calculate weighted gini impurity of child nodes + subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData)) + + // If we find a split that reduces impurity + if subGini < bestGini { + bestGini = subGini + bestLeft = left + bestRight = right + bestLefty = lefty + bestRighty = righty + upperNode.Threshold = threshold + upperNode.Feature = int64(i) + + upperNode.LeftLabel = leftLabels + upperNode.RightLabel = rightLabels + + bestLeftGini = leftGini + bestRightGini = rightGini + } + } + + } + } + } + // If no split was found, we don't want to use this node, so we will flag it + if bestGini == origGini { + upperNode.Use_not = false + return upperNode + } + // Until nodes are not pure + if bestGini > 0 { + + // If left node is pure, no need to split on left side again + if bestLeftGini > 0 { + tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) + // Recursive splitting logic + leftN = bestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) + if leftN.Use_not == true { + upperNode.Left = &leftN + } + + } + // If right node is pure, no need to split on right side again + if bestRightGini > 0 { + tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) + // Recursive splitting logic + rightN = bestSplit(tree, bestRight, 
bestRighty, labels, rightN, criterion, maxDepth, depth) + if rightN.Use_not == true { + upperNode.Right = &rightN + } + + } + + } + // Return the node - contains all information regarding feature and threshold. + return upperNode +} + +// PrintTree : this function prints out entire tree for visualization - visible to user +func (tree *CTree) PrintTree() { + rootNode := *tree.RootNode + printTreeFromNode(rootNode, "") +} + +// Tree struct has root node. That is used to print tree - invisible to user but called from PrintTree +func printTreeFromNode(tree CNode, spacing string) float64 { + + fmt.Print(spacing + "Feature ") + fmt.Print(tree.Feature) + fmt.Print(" < ") + fmt.Println(tree.Threshold) + + if tree.Left == nil { + fmt.Println(spacing + "---> True") + fmt.Print(" " + spacing + "PREDICT ") + fmt.Println(tree.LeftLabel) + } + if tree.Right == nil { + fmt.Println(spacing + "---> FALSE") + fmt.Print(" " + spacing + "PREDICT ") + fmt.Println(tree.RightLabel) + } + + if tree.Left != nil { + fmt.Println(spacing + "---> True") + printTreeFromNode(*tree.Left, spacing+" ") + } + + if tree.Right != nil { + fmt.Println(spacing + "---> False") + printTreeFromNode(*tree.Right, spacing+" ") + } + + return 0.0 +} + +// Predict a single data point by traversing the entire tree +func predictSingle(tree CNode, instance []float64) int64 { + if instance[tree.Feature] < tree.Threshold { + if tree.Left == nil { + return tree.LeftLabel + } else { + return predictSingle(*tree.Left, instance) + } + } else { + if tree.Right == nil { + return tree.RightLabel + } else { + return predictSingle(*tree.Right, instance) + } + } +} + +// Predict is visible to user. Given test data, they receive predictions for every datapoint. +func (tree *CTree) Predict(test [][]float64) []int64 { + root := *tree.RootNode + + return predictFromNode(root, test) +} + +// This function uses the rootnode from Predict. It is invisible to user, but called from predict method. +func predictFromNode(tree CNode, test [][]float64) []int64 { + var preds []int64 + for i := range test { + iPred := predictSingle(tree, test[i]) + preds = append(preds, iPred) + } + return preds +} + +// Given Test data and label, return the accuracy of the classifier. Data has to be in float slice format before feeding. +func (tree *CTree) Evaluate(xTest [][]float64, yTest []int64) float64 { + rootNode := *tree.RootNode + return evaluateFromNode(rootNode, xTest, yTest) +} + +func evaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 { + preds := predictFromNode(tree, xTest) + accuracy := 0.0 + for i := range preds { + if preds[i] == yTest[i] { + accuracy++ + } + } + accuracy /= float64(len(yTest)) + return accuracy +} + +// Helper function to convert base.FixedDataGrid into required format. 
Called in Fit +func classifierConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { + // Allocate problem array + _, rows := X.Size() + problemVec := make([][]float64, rows) + + // Retrieve numeric non-class Attributes + numericAttrs := base.NonClassFloatAttributes(X) + numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) + + // Convert each row + X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { + // Allocate a new row + probRow := make([]float64, len(numericAttrSpecs)) + // Read out the row + for i, _ := range numericAttrSpecs { + probRow[i] = base.UnpackBytesToFloat(row[i]) + } + // Add the row + problemVec[rowNo] = probRow + return true, nil + }) + return problemVec +} + +// Helper function to convert base.FixedDataGrid into required format. Called in Fit +func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 { + // Get the class Attributes + classAttrs := X.AllClassAttributes() + // Only support 1 class Attribute + if len(classAttrs) != 1 { + panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) + } + // ClassAttribute must be numeric + if _, ok := classAttrs[0].(*base.FloatAttribute); !ok { + panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) + } + // Allocate return structure + _, rows := X.Size() + // labelVec := make([]float64, rows) + labelVec := make([]int64, rows) + // Resolve class Attribute specification + classAttrSpecs := base.ResolveAttributes(X, classAttrs) + X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { + labelVec[rowNo] = int64(base.UnpackBytesToFloat(row[0])) + return true, nil + }) + return labelVec +} From d1228c55083966c4e3aa8b99abe3968e28ab3c18 Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 18 Jul 2020 10:47:22 +0530 Subject: [PATCH 02/24] Adding Integration For Fixed Data Grid in Predict And Evaluate --- linear_models/logistic.go | 1 + trees/cart_classifier.go | 71 ++++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/linear_models/logistic.go b/linear_models/logistic.go index 96c3206..14ff0d2 100644 --- a/linear_models/logistic.go +++ b/linear_models/logistic.go @@ -3,6 +3,7 @@ package linear_models import ( "errors" "fmt" + "github.com/sjwhitworth/golearn/base" ) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index c1e4043..90139bf 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -22,6 +22,7 @@ type CNode struct { } // CTree: Tree struct for Decision Tree Classifier +type CTree struct { RootNode *CNode criterion string maxDepth int64 @@ -81,7 +82,7 @@ func entropy(y []int64, labels []int64) (float64, int64) { } // Split the data into left node and right node based on feature and threshold - only needed for fresh nodes -func testSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) { +func ctestSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) { var left [][]float64 var right [][]float64 var lefty []int64 @@ -102,7 +103,7 @@ func testSplit(data [][]float64, feature int64, y []int64, threshold float64) ([ } // Helper Function to check if data point is unique or not -func stringInSlice(a float64, list []float64) bool { +func cstringInSlice(a float64, list []float64) bool { for _, b := range list { if b == a { return true @@ -112,10 +113,10 @@ func stringInSlice(a float64, list []float64) bool { } // Isolate only 
unique values. Needed for splitting data. -func findUnique(data []float64) []float64 { +func cfindUnique(data []float64) []float64 { var unique []float64 for i := range data { - if !stringInSlice(data[i], unique) { + if !cstringInSlice(data[i], unique) { unique = append(unique, data[i]) } } @@ -123,7 +124,7 @@ func findUnique(data []float64) []float64 { } // Isolate only the feature being considered for splitting -func getFeature(data [][]float64, feature int64) []float64 { +func cgetFeature(data [][]float64, feature int64) []float64 { var featureVals []float64 for i := range data { featureVals = append(featureVals, data[i][feature]) @@ -142,7 +143,7 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) } // Make sure that split being considered has not been done before -func validate(triedSplits [][]float64, feature int64, threshold float64) bool { +func cvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { for i := range triedSplits { split := triedSplits[i] featureTried, thresholdTried := split[0], split[1] @@ -176,7 +177,7 @@ func cNewSlice(n []float64) *cSlice { } // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting -func reOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) { +func creOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) { s := cNewSlice(featureVal) sort.Sort(s) @@ -194,7 +195,7 @@ func reOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64 } // Change data in Left Node and Right Node based on change in threshold -func updateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { +func cupdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { for right[0][feature] < threshold { left = append(left, right[0]) @@ -212,13 +213,13 @@ func (tree *CTree) Fit(X base.FixedDataGrid) { data := classifierConvertInstancesToProblemVec(X) y := classifierConvertInstancesToLabelVec(X) - emptyNode = bestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) + emptyNode = cbestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) tree.RootNode = &emptyNode } // Iterativly find and record the best split - recursive function -func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode { +func cbestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode { // Ensure that we have not reached maxDepth. 
maxDepth =-1 means split until nodes are pure depth++ @@ -258,12 +259,12 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod var rightN CNode // Iterate over all features for i := 0; i < numFeatures; i++ { - featureVal := getFeature(data, int64(i)) - unique := findUnique(featureVal) + featureVal := cgetFeature(data, int64(i)) + unique := cfindUnique(featureVal) sort.Float64s(unique) numUnique := len(unique) - sortData, sortY := reOrderData(featureVal, data, y) + sortData, sortY := creOrderData(featureVal, data, y) firstTime := true @@ -274,14 +275,14 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod if j != (numUnique - 1) { threshold := (unique[j] + unique[j+1]) / 2 // Ensure that same split has not been made before - if validate(tree.triedSplits, int64(i), threshold) { + if cvalidate(tree.triedSplits, int64(i), threshold) { // We need to split data from fresh when considering new feature for the first time. // Otherwise, we need to update the split by moving data points from left to right. if firstTime { - left, right, lefty, righty = testSplit(sortData, int64(i), sortY, threshold) + left, right, lefty, righty = ctestSplit(sortData, int64(i), sortY, threshold) firstTime = false } else { - left, lefty, right, righty = updateSplit(left, lefty, right, righty, int64(i), threshold) + left, lefty, right, righty = cupdateSplit(left, lefty, right, righty, int64(i), threshold) } var leftGini float64 @@ -332,7 +333,7 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod if bestLeftGini > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) // Recursive splitting logic - leftN = bestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) + leftN = cbestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) if leftN.Use_not == true { upperNode.Left = &leftN } @@ -342,7 +343,7 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod if bestRightGini > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) // Recursive splitting logic - rightN = bestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth) + rightN = cbestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth) if rightN.Use_not == true { upperNode.Right = &rightN } @@ -357,11 +358,11 @@ func bestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNod // PrintTree : this function prints out entire tree for visualization - visible to user func (tree *CTree) PrintTree() { rootNode := *tree.RootNode - printTreeFromNode(rootNode, "") + cprintTreeFromNode(rootNode, "") } // Tree struct has root node. 
That is used to print tree - invisible to user but called from PrintTree -func printTreeFromNode(tree CNode, spacing string) float64 { +func cprintTreeFromNode(tree CNode, spacing string) float64 { fmt.Print(spacing + "Feature ") fmt.Print(tree.Feature) @@ -381,59 +382,61 @@ func printTreeFromNode(tree CNode, spacing string) float64 { if tree.Left != nil { fmt.Println(spacing + "---> True") - printTreeFromNode(*tree.Left, spacing+" ") + cprintTreeFromNode(*tree.Left, spacing+" ") } if tree.Right != nil { fmt.Println(spacing + "---> False") - printTreeFromNode(*tree.Right, spacing+" ") + cprintTreeFromNode(*tree.Right, spacing+" ") } return 0.0 } // Predict a single data point by traversing the entire tree -func predictSingle(tree CNode, instance []float64) int64 { +func cpredictSingle(tree CNode, instance []float64) int64 { if instance[tree.Feature] < tree.Threshold { if tree.Left == nil { return tree.LeftLabel } else { - return predictSingle(*tree.Left, instance) + return cpredictSingle(*tree.Left, instance) } } else { if tree.Right == nil { return tree.RightLabel } else { - return predictSingle(*tree.Right, instance) + return cpredictSingle(*tree.Right, instance) } } } // Predict is visible to user. Given test data, they receive predictions for every datapoint. -func (tree *CTree) Predict(test [][]float64) []int64 { +func (tree *CTree) Predict(X_test base.FixedDataGrid) []int64 { root := *tree.RootNode - - return predictFromNode(root, test) + test := classifierConvertInstancesToProblemVec(X_test) + return cpredictFromNode(root, test) } // This function uses the rootnode from Predict. It is invisible to user, but called from predict method. -func predictFromNode(tree CNode, test [][]float64) []int64 { +func cpredictFromNode(tree CNode, test [][]float64) []int64 { var preds []int64 for i := range test { - iPred := predictSingle(tree, test[i]) + iPred := cpredictSingle(tree, test[i]) preds = append(preds, iPred) } return preds } // Given Test data and label, return the accuracy of the classifier. Data has to be in float slice format before feeding. 
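// Accuracy is the fraction of test rows whose predicted label matches the
// true label: (correct predictions) / (total predictions).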
-func (tree *CTree) Evaluate(xTest [][]float64, yTest []int64) float64 {
+func (tree *CTree) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
-	return evaluateFromNode(rootNode, xTest, yTest)
+	xTest := classifierConvertInstancesToProblemVec(test)
+	yTest := classifierConvertInstancesToLabelVec(test)
+	return cevaluateFromNode(rootNode, xTest, yTest)
 }
 
-func evaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 {
-	preds := predictFromNode(tree, xTest)
+func cevaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 {
+	preds := cpredictFromNode(tree, xTest)
 	accuracy := 0.0
 	for i := range preds {
 		if preds[i] == yTest[i] {
 			accuracy++
 		}
 	}
 	accuracy /= float64(len(yTest))
 	return accuracy
 }

From 16eac7d86d464f6059fc842fbc1e845c07af5ff2 Mon Sep 17 00:00:00 2001
From: Ayush
Date: Sat, 18 Jul 2020 12:26:50 +0530
Subject: [PATCH 03/24] Adding Regression Trees

---
 trees/cart_classifier.go |   2 +
 trees/cart_regressor.go  | 446 +++++++++++++++++++++++++++++++++++++++
 trees/tmp                | Bin 413 -> 409 bytes
 3 files changed, 448 insertions(+)
 create mode 100644 trees/cart_regressor.go

diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index 90139bf..373fdad 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -9,6 +9,8 @@ import (
 	"github.com/sjwhitworth/golearn/base"
 )
 
+// The "c" prefix to function names indicates that they were tailored for classification
+
 // CNode is Node struct for Decision Tree Classifier
 type CNode struct {
 	Left *CNode
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
new file mode 100644
index 0000000..2a962e8
--- /dev/null
+++ b/trees/cart_regressor.go
@@ -0,0 +1,446 @@
+package trees
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strings"
+
+	"github.com/sjwhitworth/golearn/base"
+)
+
+// The "r" prefix to all function names indicates that they were tailored to support regression.
+
+// See cart_classifier for details on functions.
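+// Unlike CNode, the leaves of an RNode tree store a constant numeric
+// prediction: the mean of the target values that reach them. For example,
+// for y = {1.0, 3.0} the best constant prediction is 2.0, so mseImpurity(y)
+// below returns (1.0, 2.0): an MSE of 1.0 and a mean of 2.0.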
+type RNode struct { + Left *RNode + Right *RNode + Threshold float64 + Feature int64 + LeftPred float64 + RightPred float64 + Use_not bool +} + +type RTree struct { + RootNode *RNode + criterion string + maxDepth int64 + triedSplits [][]float64 +} + +func meanAbsoluteError(y []float64, yBar float64) float64 { + error := 0.0 + for _, target := range y { + error += math.Abs(target - yBar) + } + error /= float64(len(y)) + return error +} + +func average(y []float64) float64 { + mean := 0.0 + for _, value := range y { + mean += value + } + mean /= float64(len(y)) + return mean +} + +func maeImpurity(y []float64) (float64, float64) { + yHat := average(y) + return meanAbsoluteError(y, yHat), yHat +} + +func meanSquaredError(y []float64, yBar float64) float64 { + error := 0.0 + for _, target := range y { + item_error := target - yBar + error += math.Pow(item_error, 2) + } + error /= float64(len(y)) + return error +} + +func mseImpurity(y []float64) (float64, float64) { + yHat := average(y) + return meanSquaredError(y, yHat), yHat +} + +func rtestSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) { + var left [][]float64 + var lefty []float64 + var right [][]float64 + var righty []float64 + + for i := range data { + example := data[i] + if example[feature] < threshold { + left = append(left, example) + lefty = append(lefty, y[i]) + } else { + right = append(right, example) + righty = append(righty, y[i]) + } + } + + return left, right, lefty, righty +} + +func rstringInSlice(a float64, list []float64) bool { + for _, b := range list { + if b == a { + return true + } + } + return false +} + +func rfindUnique(data []float64) []float64 { + var unique []float64 + for i := range data { + if !rstringInSlice(data[i], unique) { + unique = append(unique, data[i]) + } + } + return unique +} + +func rgetFeature(data [][]float64, feature int64) []float64 { + var featureVals []float64 + for i := range data { + featureVals = append(featureVals, data[i][feature]) + } + return featureVals +} + +func NewDecisionTreeRegressor(criterion string, maxDepth int64) *RTree { + var tree RTree + tree.maxDepth = maxDepth + tree.criterion = strings.ToLower(criterion) + return &tree +} + +func rvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { + for i := range triedSplits { + split := triedSplits[i] + featureTried, thresholdTried := split[0], split[1] + if int64(featureTried) == feature && thresholdTried == threshold { + return false + } + } + return true +} + +// Helper struct for re-rdering data +type rSlice struct { + sort.Float64Slice + Idx []int +} + +// Helper function for re-ordering data +func (s rSlice) rSwap(i, j int) { + s.Float64Slice.Swap(i, j) + s.Idx[i], s.Idx[j] = s.Idx[j], s.Idx[i] +} + +// Final Helper Function for re-ordering data +func rNewSlice(n []float64) *rSlice { + s := &rSlice{Float64Slice: sort.Float64Slice(n), Idx: make([]int, len(n))} + + for i := range s.Idx { + s.Idx[i] = i + } + return s +} + +func rreOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) { + s := rNewSlice(featureVal) + sort.Sort(s) + + indexes := s.Idx + + var dataSorted [][]float64 + var ySorted []float64 + + for _, index := range indexes { + dataSorted = append(dataSorted, data[index]) + ySorted = append(ySorted, y[index]) + } + + return dataSorted, ySorted + +} + +func rupdateSplit(left [][]float64, lefty []float64, right [][]float64, righty []float64, feature int64, threshold float64) 
([][]float64, []float64, [][]float64, []float64) { + + for right[0][feature] < threshold { + left = append(left, right[0]) + right = right[1:] + lefty = append(lefty, righty[0]) + righty = righty[1:] + } + + return left, lefty, right, righty +} + +func sum(y []int64) int64 { + var sum_ int64 = 0 + for i := range y { + sum_ += y[i] + } + return sum_ +} + +// Extra Method for creating simple to use interface. Many params are either redundant for user but are needed only for recursive logic. +func (tree *RTree) Fit(X base.FixedDataGrid) { + var emptyNode RNode + data := regressorConvertInstancesToProblemVec(X) + y := regressorConvertInstancesToLabelVec(X) + + emptyNode = rbestSplit(*tree, data, y, emptyNode, tree.criterion, tree.maxDepth, 0) + + tree.RootNode = &emptyNode +} + +// Essentially the Fit Method +func rbestSplit(tree RTree, data [][]float64, y []float64, upperNode RNode, criterion string, maxDepth int64, depth int64) RNode { + + depth++ + + if depth > maxDepth && maxDepth != -1 { + return upperNode + } + + numFeatures := len(data[0]) + var bestLoss float64 + var origLoss float64 + + if criterion == "mae" { + origLoss, upperNode.LeftPred = maeImpurity(y) + } else { + origLoss, upperNode.LeftPred = mseImpurity(y) + } + + bestLoss = origLoss + + bestLeft := data + bestRight := data + bestLefty := y + bestRighty := y + + numData := len(data) + + bestLeftLoss := bestLoss + bestRightLoss := bestLoss + + upperNode.Use_not = true + + var leftN RNode + var rightN RNode + // Iterate over all features + for i := 0; i < numFeatures; i++ { + featureVal := rgetFeature(data, int64(i)) + unique := rfindUnique(featureVal) + sort.Float64s(unique) + numUnique := len(unique) + + sortData, sortY := rreOrderData(featureVal, data, y) + + firstTime := true + + var left, right [][]float64 + var lefty, righty []float64 + + for j := range unique { + if j != (numUnique - 1) { + threshold := (unique[j] + unique[j+1]) / 2 + if rvalidate(tree.triedSplits, int64(i), threshold) { + if firstTime { + left, right, lefty, righty = rtestSplit(sortData, int64(i), sortY, threshold) + firstTime = false + } else { + left, lefty, right, righty = rupdateSplit(left, lefty, right, righty, int64(i), threshold) + } + + var leftLoss float64 + var rightLoss float64 + var leftPred float64 + var rightPred float64 + + if criterion == "mae" { + leftLoss, leftPred = maeImpurity(lefty) + rightLoss, rightPred = maeImpurity(righty) + } else { + leftLoss, leftPred = mseImpurity(lefty) + rightLoss, rightPred = mseImpurity(righty) + } + + subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData)) + + if subLoss < bestLoss { + bestLoss = subLoss + bestLeft = left + bestRight = right + bestLefty = lefty + bestRighty = righty + upperNode.Threshold = threshold + upperNode.Feature = int64(i) + + upperNode.LeftPred = leftPred + upperNode.RightPred = rightPred + + bestLeftLoss = leftLoss + bestRightLoss = rightLoss + } + } + + } + } + } + + if bestLoss == origLoss { + upperNode.Use_not = false + return upperNode + } + + if bestLoss > 0 { + + if bestLeftLoss > 0 { + tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) + leftN = rbestSplit(tree, bestLeft, bestLefty, leftN, criterion, maxDepth, depth) + if leftN.Use_not == true { + upperNode.Left = &leftN + } + + } + if bestRightLoss > 0 { + tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) + rightN = rbestSplit(tree, bestRight, bestRighty, 
rightN, criterion, maxDepth, depth) + if rightN.Use_not == true { + upperNode.Right = &rightN + } + + } + + } + + return upperNode +} + +func (tree *RTree) PrintTree() { + rootNode := *tree.RootNode + printTreeFromNode(rootNode, "") +} + +func printTreeFromNode(tree RNode, spacing string) float64 { + + fmt.Print(spacing + "Feature ") + fmt.Print(tree.Feature) + fmt.Print(" < ") + fmt.Println(tree.Threshold) + + if tree.Left == nil { + fmt.Println(spacing + "---> True") + fmt.Print(" " + spacing + "PREDICT ") + fmt.Println(tree.LeftPred) + } + if tree.Right == nil { + fmt.Println(spacing + "---> FALSE") + fmt.Print(" " + spacing + "PREDICT ") + fmt.Println(tree.RightPred) + } + + if tree.Left != nil { + fmt.Println(spacing + "---> True") + printTreeFromNode(*tree.Left, spacing+" ") + } + + if tree.Right != nil { + fmt.Println(spacing + "---> False") + printTreeFromNode(*tree.Right, spacing+" ") + } + + return 0.0 +} + +func predictSingle(tree RNode, instance []float64) float64 { + if instance[tree.Feature] < tree.Threshold { + if tree.Left == nil { + return tree.LeftPred + } else { + return predictSingle(*tree.Left, instance) + } + } else { + if tree.Right == nil { + return tree.RightPred + } else { + return predictSingle(*tree.Right, instance) + } + } +} + +func (tree *RTree) Predict(X_test base.FixedDataGrid) []float64 { + root := *tree.RootNode + test := regressorConvertInstancesToProblemVec(X_test) + return predictFromNode(root, test) +} + +func predictFromNode(tree RNode, test [][]float64) []float64 { + var preds []float64 + for i := range test { + i_pred := predictSingle(tree, test[i]) + preds = append(preds, i_pred) + } + return preds +} + +// Helper function to convert base.FixedDataGrid into required format. Called in Fit +func regressorConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { + // Allocate problem array + _, rows := X.Size() + problemVec := make([][]float64, rows) + + // Retrieve numeric non-class Attributes + numericAttrs := base.NonClassFloatAttributes(X) + numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) + + // Convert each row + X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { + // Allocate a new row + probRow := make([]float64, len(numericAttrSpecs)) + // Read out the row + for i, _ := range numericAttrSpecs { + probRow[i] = base.UnpackBytesToFloat(row[i]) + } + // Add the row + problemVec[rowNo] = probRow + return true, nil + }) + return problemVec +} + +// Helper function to convert base.FixedDataGrid into required format. 
Called in Fit
+func regressorConvertInstancesToLabelVec(X base.FixedDataGrid) []float64 {
+	// Get the class Attributes
+	classAttrs := X.AllClassAttributes()
+	// Only support 1 class Attribute
+	if len(classAttrs) != 1 {
+		panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs)))
+	}
+	// ClassAttribute must be numeric
+	if _, ok := classAttrs[0].(*base.FloatAttribute); !ok {
+		panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0]))
+	}
+	// Allocate return structure
+	_, rows := X.Size()
+	labelVec := make([]float64, rows)
+	// Resolve class Attribute specification
+	classAttrSpecs := base.ResolveAttributes(X, classAttrs)
+	X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
+		labelVec[rowNo] = base.UnpackBytesToFloat(row[0])
+		return true, nil
+	})
+	return labelVec
+}
diff --git a/trees/tmp b/trees/tmp
index 28c93c507c8869a97a9ff1d9eecbd160475a62e3..af98d1a33b82338d7466955c2c6aafb41cd3496c 100644
GIT binary patch
(two base85-encoded binary deltas omitted)

From: Ayush
Date: Sat, 18 Jul 2020 14:21:50 +0530
Subject: [PATCH 04/24] Added Comments for Regressor

---
 trees/cart_classifier.go |  4 ++--
 trees/cart_regressor.go  | 40 +++++++++++++++++++++---------------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index 373fdad..f9cb6a1 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -449,7 +449,7 @@ func cevaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 {
 	return accuracy
 }
 
-// Helper function to convert base.FixedDataGrid into required format. Called in Fit
+// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
 func classifierConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
 	// Allocate problem array
 	_, rows := X.Size()
@@ -474,7 +474,7 @@ func classifierConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
 	return problemVec
 }
 
-// Helper function to convert base.FixedDataGrid into required format. Called in Fit
+// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict
 func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 {
 	// Get the class Attributes
 	classAttrs := X.AllClassAttributes()
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index 2a962e8..7ec044a 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -11,7 +11,7 @@ import (
 
 // The "r" prefix to all function names indicates that they were tailored to support regression.
 
-// See cart_classifier for details on functions.
+// RNode - Node struct for Decision Tree Regressor type RNode struct { Left *RNode Right *RNode @@ -22,6 +22,7 @@ type RNode struct { Use_not bool } +// RTree - Tree struct for Decision Tree Regressor type RTree struct { RootNode *RNode criterion string @@ -29,6 +30,7 @@ type RTree struct { triedSplits [][]float64 } +// Calculate Mean Absolute Error for a constant prediction func meanAbsoluteError(y []float64, yBar float64) float64 { error := 0.0 for _, target := range y { @@ -38,6 +40,7 @@ func meanAbsoluteError(y []float64, yBar float64) float64 { return error } +// Find average func average(y []float64) float64 { mean := 0.0 for _, value := range y { @@ -47,26 +50,30 @@ func average(y []float64) float64 { return mean } +// Turn Mean Absolute Error into impurity function for decision trees. func maeImpurity(y []float64) (float64, float64) { yHat := average(y) return meanAbsoluteError(y, yHat), yHat } +// Calculate Mean Squared Error for constant prediction func meanSquaredError(y []float64, yBar float64) float64 { error := 0.0 for _, target := range y { - item_error := target - yBar - error += math.Pow(item_error, 2) + itemError := target - yBar + error += math.Pow(itemError, 2) } error /= float64(len(y)) return error } +// Convert mean squared error into impurity function for decision trees func mseImpurity(y []float64) (float64, float64) { yHat := average(y) return meanSquaredError(y, yHat), yHat } +// Split the data based on threshold and feature for testing information gain func rtestSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) { var left [][]float64 var lefty []float64 @@ -87,6 +94,7 @@ func rtestSplit(data [][]float64, feature int64, y []float64, threshold float64) return left, right, lefty, righty } +// Helper function for finding unique values func rstringInSlice(a float64, list []float64) bool { for _, b := range list { if b == a { @@ -96,6 +104,7 @@ func rstringInSlice(a float64, list []float64) bool { return false } +// Return only unique values of a feature func rfindUnique(data []float64) []float64 { var unique []float64 for i := range data { @@ -106,6 +115,7 @@ func rfindUnique(data []float64) []float64 { return unique } +// Extract out a single feature from data func rgetFeature(data [][]float64, feature int64) []float64 { var featureVals []float64 for i := range data { @@ -114,6 +124,7 @@ func rgetFeature(data [][]float64, feature int64) []float64 { return featureVals } +// Interface for creating new Decision Tree Regressor - cals rbestSplit() func NewDecisionTreeRegressor(criterion string, maxDepth int64) *RTree { var tree RTree tree.maxDepth = maxDepth @@ -121,6 +132,7 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *RTree { return &tree } +// Validate that the split being tested has not been done before. 
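+// Each recursive call records its chosen (feature, threshold) pair in
+// triedSplits, and rvalidate skips any candidate pair that was already
+// used higher up the tree.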
func rvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { for i := range triedSplits { split := triedSplits[i] @@ -154,6 +166,7 @@ func rNewSlice(n []float64) *rSlice { return s } +// Re order data based on a feature for optimizing code func rreOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) { s := rNewSlice(featureVal) sort.Sort(s) @@ -169,9 +182,9 @@ func rreOrderData(featureVal []float64, data [][]float64, y []float64) ([][]floa } return dataSorted, ySorted - } +// Update the left and right data based on change in threshold func rupdateSplit(left [][]float64, lefty []float64, right [][]float64, righty []float64, feature int64, threshold float64) ([][]float64, []float64, [][]float64, []float64) { for right[0][feature] < threshold { @@ -184,14 +197,6 @@ func rupdateSplit(left [][]float64, lefty []float64, right [][]float64, righty [ return left, lefty, right, righty } -func sum(y []int64) int64 { - var sum_ int64 = 0 - for i := range y { - sum_ += y[i] - } - return sum_ -} - // Extra Method for creating simple to use interface. Many params are either redundant for user but are needed only for recursive logic. func (tree *RTree) Fit(X base.FixedDataGrid) { var emptyNode RNode @@ -203,7 +208,7 @@ func (tree *RTree) Fit(X base.FixedDataGrid) { tree.RootNode = &emptyNode } -// Essentially the Fit Method +// Essentially the Fit Method - Impelements recursive logic func rbestSplit(tree RTree, data [][]float64, y []float64, upperNode RNode, criterion string, maxDepth int64, depth int64) RNode { depth++ @@ -328,11 +333,13 @@ func rbestSplit(tree RTree, data [][]float64, y []float64, upperNode RNode, crit return upperNode } +// Print Tree for Visualtion - calls printTreeFromNode() func (tree *RTree) PrintTree() { rootNode := *tree.RootNode printTreeFromNode(rootNode, "") } +// Use tree's root node to print out entire tree func printTreeFromNode(tree RNode, spacing string) float64 { fmt.Print(spacing + "Feature ") @@ -364,6 +371,7 @@ func printTreeFromNode(tree RNode, spacing string) float64 { return 0.0 } +// Predict a single data point func predictSingle(tree RNode, instance []float64) float64 { if instance[tree.Feature] < tree.Threshold { if tree.Left == nil { @@ -380,12 +388,14 @@ func predictSingle(tree RNode, instance []float64) float64 { } } +// Predict method for multiple data points. Calls predictFromNode() func (tree *RTree) Predict(X_test base.FixedDataGrid) []float64 { root := *tree.RootNode test := regressorConvertInstancesToProblemVec(X_test) return predictFromNode(root, test) } +// Use tree's root node to print out entire tree func predictFromNode(tree RNode, test [][]float64) []float64 { var preds []float64 for i := range test { @@ -395,7 +405,7 @@ func predictFromNode(tree RNode, test [][]float64) []float64 { return preds } -// Helper function to convert base.FixedDataGrid into required format. Called in Fit +// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict func regressorConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { // Allocate problem array _, rows := X.Size() @@ -420,7 +430,7 @@ func regressorConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { return problemVec } -// Helper function to convert base.FixedDataGrid into required format. Called in Fit +// Helper function to convert base.FixedDataGrid into required format. 
Called in Fit, Predict func regressorConvertInstancesToLabelVec(X base.FixedDataGrid) []float64 { // Get the class Attributes classAttrs := X.AllClassAttributes() From c0837595238c35be0f0dfad8f05061811412007e Mon Sep 17 00:00:00 2001 From: Ayush Date: Wed, 22 Jul 2020 14:34:59 +0530 Subject: [PATCH 05/24] Adding Changes --- trees/cart_classifier.go | 59 +++++++++++++++++---------------- trees/cart_regressor.go | 70 +++++++++++++++++++++------------------- 2 files changed, 68 insertions(+), 61 deletions(-) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index f9cb6a1..29646fb 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "sort" + "strconv" "strings" "github.com/sjwhitworth/golearn/base" @@ -23,8 +24,8 @@ type CNode struct { maxDepth int64 } -// CTree: Tree struct for Decision Tree Classifier -type CTree struct { +// CARTDecisionTreeClassifier: Tree struct for Decision Tree Classifier +type CARTDecisionTreeClassifier struct { RootNode *CNode criterion string maxDepth int64 @@ -135,8 +136,8 @@ func cgetFeature(data [][]float64, feature int64) []float64 { } // Function to Create New Decision Tree Classifier -func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CTree { - var tree CTree +func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CARTDecisionTreeClassifier { + var tree CARTDecisionTreeClassifier tree.criterion = strings.ToLower(criterion) tree.maxDepth = maxDepth tree.labels = labels @@ -210,7 +211,7 @@ func cupdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []i } // Fit - Method visible to user to train tree -func (tree *CTree) Fit(X base.FixedDataGrid) { +func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) { var emptyNode CNode data := classifierConvertInstancesToProblemVec(X) @@ -221,7 +222,7 @@ func (tree *CTree) Fit(X base.FixedDataGrid) { } // Iterativly find and record the best split - recursive function -func cbestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode { +func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode { // Ensure that we have not reached maxDepth. maxDepth =-1 means split until nodes are pure depth++ @@ -358,41 +359,43 @@ func cbestSplit(tree CTree, data [][]float64, y []int64, labels []int64, upperNo } // PrintTree : this function prints out entire tree for visualization - visible to user -func (tree *CTree) PrintTree() { +func (tree *CARTDecisionTreeClassifier) String() string { rootNode := *tree.RootNode - cprintTreeFromNode(rootNode, "") + return cprintTreeFromNode(rootNode, "") } -// Tree struct has root node. 
That is used to print tree - invisible to user but called from PrintTree -func cprintTreeFromNode(tree CNode, spacing string) float64 { - - fmt.Print(spacing + "Feature ") - fmt.Print(tree.Feature) - fmt.Print(" < ") - fmt.Println(tree.Threshold) +func cprintTreeFromNode(tree CNode, spacing string) string { + returnString := "" + returnString += spacing + "Feature " + returnString += strconv.FormatInt(tree.Feature, 10) + returnString += " < " + returnString += fmt.Sprintf("%.3f", tree.Threshold) + returnString += "\n" if tree.Left == nil { - fmt.Println(spacing + "---> True") - fmt.Print(" " + spacing + "PREDICT ") - fmt.Println(tree.LeftLabel) + returnString += spacing + "---> True" + "\n" + returnString += " " + spacing + "PREDICT " + returnString += strconv.FormatInt(tree.LeftLabel, 10) + "\n" + } if tree.Right == nil { - fmt.Println(spacing + "---> FALSE") - fmt.Print(" " + spacing + "PREDICT ") - fmt.Println(tree.RightLabel) + + returnString += spacing + "---> False" + "\n" + returnString += " " + spacing + "PREDICT " + returnString += strconv.FormatInt(tree.RightLabel, 10) + "\n" } if tree.Left != nil { - fmt.Println(spacing + "---> True") - cprintTreeFromNode(*tree.Left, spacing+" ") + returnString += spacing + "---> True" + "\n" + returnString += cprintTreeFromNode(*tree.Left, spacing+" ") } if tree.Right != nil { - fmt.Println(spacing + "---> False") - cprintTreeFromNode(*tree.Right, spacing+" ") + returnString += spacing + "---> False" + "\n" + returnString += cprintTreeFromNode(*tree.Right, spacing+" ") } - return 0.0 + return returnString } // Predict a single data point by traversing the entire tree @@ -413,7 +416,7 @@ func cpredictSingle(tree CNode, instance []float64) int64 { } // Predict is visible to user. Given test data, they receive predictions for every datapoint. -func (tree *CTree) Predict(X_test base.FixedDataGrid) []int64 { +func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 { root := *tree.RootNode test := classifierConvertInstancesToProblemVec(X_test) return cpredictFromNode(root, test) @@ -430,7 +433,7 @@ func cpredictFromNode(tree CNode, test [][]float64) []int64 { } // Given Test data and label, return the accuracy of the classifier. Data has to be in float slice format before feeding. 
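// With PrintTree replaced by String in this patch, both tree types satisfy
// fmt.Stringer, so a fitted tree can be rendered via fmt.Println(tree).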
-func (tree *CTree) Evaluate(test base.FixedDataGrid) float64 { +func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 { rootNode := *tree.RootNode xTest := classifierConvertInstancesToProblemVec(test) yTest := classifierConvertInstancesToLabelVec(test) diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go index 7ec044a..48e61d0 100644 --- a/trees/cart_regressor.go +++ b/trees/cart_regressor.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "sort" + "strconv" "strings" "github.com/sjwhitworth/golearn/base" @@ -22,8 +23,8 @@ type RNode struct { Use_not bool } -// RTree - Tree struct for Decision Tree Regressor -type RTree struct { +// CARTDecisionTreeRegressor - Tree struct for Decision Tree Regressor +type CARTDecisionTreeRegressor struct { RootNode *RNode criterion string maxDepth int64 @@ -125,8 +126,8 @@ func rgetFeature(data [][]float64, feature int64) []float64 { } // Interface for creating new Decision Tree Regressor - cals rbestSplit() -func NewDecisionTreeRegressor(criterion string, maxDepth int64) *RTree { - var tree RTree +func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTreeRegressor { + var tree CARTDecisionTreeRegressor tree.maxDepth = maxDepth tree.criterion = strings.ToLower(criterion) return &tree @@ -198,7 +199,7 @@ func rupdateSplit(left [][]float64, lefty []float64, right [][]float64, righty [ } // Extra Method for creating simple to use interface. Many params are either redundant for user but are needed only for recursive logic. -func (tree *RTree) Fit(X base.FixedDataGrid) { +func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) { var emptyNode RNode data := regressorConvertInstancesToProblemVec(X) y := regressorConvertInstancesToLabelVec(X) @@ -209,7 +210,7 @@ func (tree *RTree) Fit(X base.FixedDataGrid) { } // Essentially the Fit Method - Impelements recursive logic -func rbestSplit(tree RTree, data [][]float64, y []float64, upperNode RNode, criterion string, maxDepth int64, depth int64) RNode { +func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode RNode, criterion string, maxDepth int64, depth int64) RNode { depth++ @@ -334,72 +335,75 @@ func rbestSplit(tree RTree, data [][]float64, y []float64, upperNode RNode, crit } // Print Tree for Visualtion - calls printTreeFromNode() -func (tree *RTree) PrintTree() { +func (tree *CARTDecisionTreeRegressor) String() string { rootNode := *tree.RootNode - printTreeFromNode(rootNode, "") + return rprintTreeFromNode(rootNode, "") } -// Use tree's root node to print out entire tree -func printTreeFromNode(tree RNode, spacing string) float64 { - - fmt.Print(spacing + "Feature ") - fmt.Print(tree.Feature) - fmt.Print(" < ") - fmt.Println(tree.Threshold) +func rprintTreeFromNode(tree RNode, spacing string) string { + returnString := "" + returnString += spacing + "Feature " + returnString += strconv.FormatInt(tree.Feature, 10) + returnString += " < " + returnString += fmt.Sprintf("%.3f", tree.Threshold) + returnString += "\n" if tree.Left == nil { - fmt.Println(spacing + "---> True") - fmt.Print(" " + spacing + "PREDICT ") - fmt.Println(tree.LeftPred) + returnString += spacing + "---> True" + "\n" + returnString += " " + spacing + "PREDICT " + returnString += fmt.Sprintf("%.3f", tree.LeftPred) + "\n" } if tree.Right == nil { - fmt.Println(spacing + "---> FALSE") - fmt.Print(" " + spacing + "PREDICT ") - fmt.Println(tree.RightPred) + + returnString += spacing + "---> False" + "\n" + returnString += " " + spacing + "PREDICT " + 
returnString += fmt.Sprintf("%.3f", tree.RightPred) + "\n" } if tree.Left != nil { - fmt.Println(spacing + "---> True") - printTreeFromNode(*tree.Left, spacing+" ") + // fmt.Println(spacing + "---> True") + returnString += spacing + "---> True" + "\n" + returnString += rprintTreeFromNode(*tree.Left, spacing+" ") } if tree.Right != nil { - fmt.Println(spacing + "---> False") - printTreeFromNode(*tree.Right, spacing+" ") + // fmt.Println(spacing + "---> False") + returnString += spacing + "---> False" + "\n" + returnString += rprintTreeFromNode(*tree.Right, spacing+" ") } - return 0.0 + return returnString } // Predict a single data point -func predictSingle(tree RNode, instance []float64) float64 { +func rpredictSingle(tree RNode, instance []float64) float64 { if instance[tree.Feature] < tree.Threshold { if tree.Left == nil { return tree.LeftPred } else { - return predictSingle(*tree.Left, instance) + return rpredictSingle(*tree.Left, instance) } } else { if tree.Right == nil { return tree.RightPred } else { - return predictSingle(*tree.Right, instance) + return rpredictSingle(*tree.Right, instance) } } } // Predict method for multiple data points. Calls predictFromNode() -func (tree *RTree) Predict(X_test base.FixedDataGrid) []float64 { +func (tree *CARTDecisionTreeRegressor) Predict(X_test base.FixedDataGrid) []float64 { root := *tree.RootNode test := regressorConvertInstancesToProblemVec(X_test) - return predictFromNode(root, test) + return rpredictFromNode(root, test) } // Use tree's root node to print out entire tree -func predictFromNode(tree RNode, test [][]float64) []float64 { +func rpredictFromNode(tree RNode, test [][]float64) []float64 { var preds []float64 for i := range test { - i_pred := predictSingle(tree, test[i]) + i_pred := rpredictSingle(tree, test[i]) preds = append(preds, i_pred) } return preds From b16b60fcb56f801070b89a4867e69422a78dd81a Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 23 Jul 2020 16:45:31 +0530 Subject: [PATCH 06/24] Adding Example script for CART --- examples/datasets/boston_house_prices.csv | 1460 +++++++++++++++++++++ examples/datasets/titanic.csv | 889 +++++++++++++ examples/trees/cart.go | 61 + trees/cart_classifier.go | 2 +- 4 files changed, 2411 insertions(+), 1 deletion(-) create mode 100644 examples/datasets/boston_house_prices.csv create mode 100644 examples/datasets/titanic.csv create mode 100644 examples/trees/cart.go diff --git a/examples/datasets/boston_house_prices.csv b/examples/datasets/boston_house_prices.csv new file mode 100644 index 0000000..a330ca0 --- /dev/null +++ b/examples/datasets/boston_house_prices.csv @@ -0,0 +1,1460 @@ +7,208500 +6,181500 +7,223500 +7,140000 +8,250000 +5,143000 +8,307000 +7,200000 +7,129900 +5,118000 +5,129500 +9,345000 +5,144000 +7,279500 +6,157000 +7,132000 +6,149000 +4,90000 +5,159000 +5,139000 +8,325300 +7,139400 +8,230000 +5,129900 +5,154000 +8,256300 +5,134800 +8,306000 +5,207500 +4,68500 +4,40000 +5,149350 +8,179900 +5,165500 +9,277500 +8,309000 +5,145000 +5,153000 +5,109000 +4,82000 +6,160000 +5,170000 +5,144000 +5,130250 +5,141000 +9,319900 +7,239686 +8,249700 +4,113000 +5,127000 +6,177000 +6,114500 +5,110000 +9,385000 +5,130000 +6,180500 +8,172500 +7,196500 +10,438780 +5,124900 +6,158000 +5,101000 +8,202500 +7,140000 +7,219500 +8,317000 +7,180000 +7,226000 +4,80000 +7,225000 +7,244000 +4,129500 +7,185000 +5,144900 +3,107400 +4,91000 +4,135750 +5,127000 +4,136500 +5,110000 +6,193500 +6,153500 +8,245000 +5,126500 +7,168500 +8,260000 +6,174000 +6,164500 +3,85000 +4,123600 +4,109900 +5,98600 
+5,163500 +6,133900 +6,204750 +6,185000 +7,214000 +4,94750 +5,83000 +4,128950 +6,205000 +6,178000 +5,118964 +7,198900 +7,169500 +8,250000 +4,100000 +5,115000 +5,115000 +6,190000 +6,136900 +7,180000 +7,383970 +6,217000 +6,259500 +6,176000 +5,139000 +5,155000 +7,320000 +6,163990 +6,180000 +4,100000 +6,136000 +6,153900 +6,181000 +6,84500 +6,128000 +5,87000 +6,155000 +5,150000 +7,226000 +6,244000 +5,150750 +8,220000 +5,180000 +7,174000 +5,143000 +7,171000 +8,230000 +6,231500 +4,115000 +7,260000 +5,166000 +7,204000 +5,125000 +6,130000 +5,105000 +7,222500 +7,141000 +5,115000 +5,122000 +8,372402 +6,190000 +6,235000 +6,125000 +6,79000 +5,109500 +8,269500 +7,254900 +7,320000 +6,162500 +9,412500 +7,220000 +4,103200 +6,152000 +5,127500 +5,190000 +8,325624 +7,183500 +8,228000 +5,128500 +6,215000 +7,239000 +6,163000 +6,184000 +6,243000 +6,211000 +5,172500 +9,501837 +5,100000 +6,177000 +7,200100 +5,120000 +7,200000 +5,127000 +10,475000 +7,173000 +5,135000 +5,153337 +8,286000 +8,315000 +7,184000 +7,192000 +7,130000 +5,127000 +6,148500 +7,311872 +8,235000 +6,104000 +8,274900 +4,140000 +6,171500 +6,112000 +6,149000 +5,110000 +7,180500 +5,143900 +4,141000 +7,277000 +6,145000 +5,98000 +6,186000 +7,252678 +5,156000 +6,161750 +5,134450 +7,210000 +4,107000 +7,311500 +7,167240 +7,204900 +6,200000 +6,179900 +4,97000 +10,386250 +5,112000 +7,290000 +6,106000 +5,125000 +7,192500 +6,148000 +8,403000 +6,94500 +5,128200 +6,216500 +6,89500 +7,185500 +7,194500 +8,318000 +6,113000 +8,262500 +5,110500 +5,79000 +6,120000 +7,205000 +7,241500 +6,137000 +6,140000 +7,180000 +6,277000 +3,76500 +8,235000 +6,173000 +6,158000 +5,145000 +7,230000 +6,207500 +7,220000 +7,231500 +5,97000 +6,176000 +8,276000 +6,151000 +5,130000 +5,73000 +6,175500 +6,185000 +5,179500 +5,120500 +6,148000 +8,266000 +7,241500 +8,290000 +6,139000 +5,124500 +7,205000 +7,201000 +4,141000 +9,415298 +7,192000 +7,228500 +6,185000 +7,207500 +8,244600 +6,179200 +7,164700 +6,159000 +4,88000 +5,122000 +6,153575 +8,233230 +5,135900 +5,131000 +7,235000 +6,167000 +6,142500 +5,152000 +7,239000 +6,175000 +6,158500 +5,157000 +8,267000 +7,205000 +5,149900 +7,295000 +8,305900 +7,225000 +6,89500 +4,82500 +9,360000 +6,165600 +6,132000 +5,119900 +7,375000 +7,178000 +7,188500 +7,260000 +8,270000 +7,260000 +7,187500 +9,342643 +8,354000 +7,301000 +3,126175 +7,242000 +5,87000 +8,324000 +6,145250 +6,214500 +5,78000 +5,119000 +5,139000 +8,284000 +7,207000 +6,192000 +5,228950 +9,377426 +7,214000 +7,202500 +6,155000 +8,202900 +4,82000 +3,87500 +9,266000 +5,85000 +6,140200 +6,151500 +6,157500 +7,154000 +9,437154 +9,318061 +7,190000 +5,95000 +6,105900 +6,140000 +6,177500 +6,173000 +5,134000 +5,130000 +8,280000 +6,156000 +5,145000 +7,198500 +6,118000 +6,190000 +5,147000 +6,159000 +6,165000 +5,132000 +5,162000 +6,172400 +4,134432 +6,125000 +5,123000 +7,219500 +1,61000 +5,148000 +8,340000 +9,394432 +6,179000 +5,127000 +7,187750 +7,213500 +6,76000 +6,240000 +8,192000 +5,81000 +6,125000 +7,191000 +10,426000 +5,119000 +6,215000 +5,106500 +4,100000 +5,109000 +5,129000 +5,123000 +5,169500 +5,67000 +7,241000 +8,245500 +7,164990 +5,108000 +8,258000 +6,168000 +4,150000 +6,115000 +6,177000 +7,280000 +8,339750 +5,60000 +5,145000 +7,222000 +5,115000 +7,228000 +7,181134 +6,149500 +6,239000 +5,126000 +5,142000 +7,206300 +6,215000 +5,113000 +8,315000 +6,139000 +7,135000 +7,275000 +4,109008 +7,195400 +6,175000 +6,85400 +6,79900 +5,122500 +6,181000 +4,81000 +7,212000 +6,116000 +6,119000 +5,90350 +6,110000 +10,555000 +4,118000 +5,162900 +7,172500 +7,210000 +6,127500 +6,190000 +7,199900 +6,119500 +3,120000 
+6,110000 +7,280000 +6,204000 +8,210000 +5,188000 +7,175500 +5,98000 +4,256000 +8,161000 +5,110000 +8,263435 +7,155000 +5,62383 +6,188700 +5,124000 +7,178740 +7,167000 +5,146500 +8,250000 +6,187000 +8,212000 +7,190000 +6,148000 +8,440000 +8,251000 +5,132500 +6,208900 +9,380000 +8,297000 +4,89471 +9,326000 +9,374000 +7,155000 +6,164000 +5,132500 +5,147000 +5,156000 +5,175000 +5,160000 +4,86000 +5,115000 +6,133000 +6,172785 +5,155000 +5,91300 +4,34900 +8,430000 +7,184000 +5,130000 +5,120000 +6,113000 +7,226700 +5,140000 +7,289000 +6,147000 +5,124500 +8,215000 +6,208300 +7,161000 +5,124500 +5,164900 +7,202665 +5,129900 +6,134000 +5,96500 +10,402861 +6,158000 +7,265000 +6,211000 +7,234000 +4,106250 +6,150000 +6,159000 +10,184750 +7,315750 +7,176000 +5,132000 +9,446261 +4,86000 +6,200624 +6,175000 +6,128000 +5,107500 +1,39300 +8,178000 +5,107500 +7,188000 +4,111250 +5,158000 +8,272000 +9,315000 +8,248000 +7,213250 +7,133000 +7,179665 +7,229000 +6,210000 +5,129500 +5,125000 +7,263000 +6,140000 +5,112500 +8,255500 +4,108000 +7,284000 +5,113000 +5,141000 +4,108000 +7,175000 +7,234000 +5,121500 +5,170000 +5,108000 +6,185000 +7,268000 +6,128000 +9,325000 +7,214000 +8,316600 +5,135960 +5,142600 +6,120000 +7,224500 +7,170000 +5,139000 +5,118500 +7,145000 +5,164500 +7,146000 +5,131500 +6,181900 +8,253293 +6,118500 +10,325000 +4,133000 +8,369900 +6,130000 +5,137000 +5,143000 +5,79500 +7,185900 +10,451950 +5,138000 +6,140000 +5,110000 +8,319000 +6,114504 +7,194201 +5,217500 +6,151000 +8,275000 +6,141000 +8,220000 +7,151000 +7,221000 +7,205000 +5,152000 +5,225000 +8,359100 +4,118500 +9,313000 +6,148000 +8,261500 +5,147000 +4,75500 +6,137500 +6,183200 +6,105500 +9,314813 +8,305000 +3,67000 +6,240000 +5,135000 +6,168500 +6,165150 +6,160000 +5,139900 +6,153000 +5,135000 +6,168500 +5,124000 +8,209500 +7,82500 +5,139400 +6,144000 +6,200000 +2,60000 +5,93000 +5,85000 +8,264561 +8,274000 +7,226000 +8,345000 +5,152000 +9,370878 +6,143250 +5,98300 +6,155000 +6,155000 +4,84500 +7,205950 +4,108000 +7,191000 +6,135000 +8,350000 +6,88000 +5,145500 +7,149000 +6,97500 +5,167000 +7,197900 +8,402000 +6,110000 +4,137500 +8,423000 +8,230500 +6,129000 +6,193500 +5,168000 +4,137500 +6,173500 +6,103600 +6,165000 +6,257500 +6,140000 +6,148500 +4,87000 +5,109500 +8,372500 +5,128500 +6,143000 +5,159434 +6,173000 +9,285000 +7,221000 +7,207500 +7,227875 +7,148800 +8,392000 +6,194700 +6,141000 +10,755000 +7,335000 +5,108480 +5,141500 +6,176000 +5,89000 +5,123500 +5,138500 +7,196000 +8,312500 +7,140000 +8,361919 +5,140000 +7,213000 +4,55000 +7,302000 +8,254000 +7,179540 +5,109900 +3,52000 +4,102776 +8,189000 +4,129000 +6,130500 +6,165000 +7,159500 +5,157000 +7,341000 +5,128500 +8,275000 +6,143000 +4,124500 +4,135000 +9,320000 +4,120500 +6,222000 +7,194500 +5,110000 +4,103000 +8,236500 +7,187500 +7,222500 +5,131400 +5,108000 +7,163000 +3,93500 +8,239900 +5,179000 +7,190000 +5,132000 +6,142000 +7,179000 +5,175000 +8,180000 +8,299800 +7,236000 +7,265979 +7,260400 +4,98000 +4,96500 +7,162000 +6,217000 +8,275500 +6,156000 +6,172500 +8,212000 +6,158900 +7,179400 +8,290000 +6,127500 +5,100000 +7,215200 +8,337000 +8,270000 +9,264132 +7,196500 +6,160000 +7,216837 +8,538000 +5,134900 +4,102000 +6,107000 +5,114500 +8,395000 +6,162000 +7,221500 +5,142500 +5,144000 +6,135000 +7,176000 +6,175900 +7,187100 +5,165500 +6,128000 +6,161500 +5,139000 +7,233000 +4,107900 +6,187500 +7,160200 +6,146800 +7,269790 +8,225000 +7,194500 +6,171000 +6,143500 +5,110000 +9,485000 +5,175000 +6,200000 +4,109900 +7,189000 +9,582933 +5,118000 +7,227680 +5,135500 
+5,223500 +5,159950 +5,106000 +6,181000 +6,144500 +5,55993 +6,157900 +5,116000 +7,224900 +5,137000 +8,271000 +6,155000 +7,224000 +7,183000 +4,93000 +7,225000 +6,139500 +8,232600 +10,385000 +5,109500 +7,189000 +5,185000 +7,147400 +6,166000 +7,151000 +7,237000 +6,167000 +5,139950 +4,128000 +5,153500 +6,100000 +5,144000 +5,130500 +6,140000 +5,157500 +6,174900 +5,141000 +5,153900 +5,171000 +7,213000 +5,133500 +6,240000 +6,187000 +6,131500 +8,215000 +7,164000 +6,158000 +5,170000 +5,127000 +6,147000 +6,174000 +7,152000 +6,250000 +7,189950 +5,131500 +6,152000 +5,132500 +7,250580 +5,148500 +8,248900 +4,129000 +5,169000 +7,236000 +5,109500 +6,200500 +5,116000 +5,133000 +5,66500 +8,303477 +4,132250 +9,350000 +5,148000 +5,136500 +5,157000 +7,187500 +6,178000 +4,118500 +5,100000 +9,328900 +5,145000 +5,135500 +8,268000 +6,149500 +5,122900 +6,172500 +6,154500 +5,165000 +5,118858 +6,140000 +4,106500 +5,142953 +9,611657 +5,135000 +4,110000 +5,153000 +7,180000 +7,240000 +5,125500 +5,128000 +8,255000 +7,250000 +5,131000 +6,174000 +5,154300 +5,143500 +5,88000 +5,145000 +6,173733 +4,75000 +2,35311 +4,135000 +7,238000 +6,176500 +6,201000 +5,145900 +6,169990 +6,193000 +6,207500 +5,175000 +8,285000 +7,176000 +8,236500 +7,222000 +8,201000 +5,117500 +9,320000 +7,190000 +7,242000 +4,79900 +7,184900 +7,253000 +7,239799 +7,244400 +6,150900 +7,214000 +4,150000 +5,143000 +6,137500 +5,124900 +5,143000 +8,270000 +7,192500 +6,197500 +5,129000 +5,119900 +5,133900 +5,172000 +6,127500 +6,145000 +6,124000 +5,132000 +7,185000 +7,155000 +5,116500 +6,272000 +6,155000 +9,239000 +7,214900 +6,178900 +5,160000 +5,135000 +3,37900 +6,140000 +4,135000 +7,173000 +6,99500 +7,182000 +7,167500 +7,165000 +4,85500 +7,199900 +4,110000 +5,139000 +7,178400 +8,336000 +7,159895 +8,255900 +5,126000 +5,125000 +6,117000 +9,395192 +6,195000 +7,197000 +8,348000 +8,168000 +6,187000 +6,173900 +10,337500 +4,121600 +5,136500 +6,185000 +3,91000 +7,206000 +3,82000 +5,86000 +8,232000 +5,136905 +7,181000 +5,149900 +6,163500 +4,88000 +7,240000 +5,102000 +5,135000 +5,100000 +6,165000 +5,85000 +6,119200 +8,227000 +7,203000 +8,187500 +7,160000 +7,213490 +4,176000 +7,194000 +5,87000 +7,191000 +8,287000 +5,112500 +5,167500 +8,293077 +5,105000 +6,118000 +5,160000 +7,197000 +8,310000 +7,230000 +5,119750 +4,84000 +9,315500 +8,287000 +4,97000 +4,80000 +5,155000 +6,173000 +6,196000 +7,262280 +8,278000 +3,139600 +9,556581 +5,145000 +5,115000 +4,84900 +7,176485 +7,200141 +6,165000 +5,144500 +8,255000 +6,180000 +7,185850 +7,248000 +9,335000 +6,220000 +8,213500 +3,81000 +5,90000 +6,110500 +5,154000 +7,328000 +6,178000 +6,167900 +6,151400 +5,135000 +5,135000 +6,154000 +5,91500 +6,159500 +7,194000 +7,219500 +5,170000 +5,138800 +6,155900 +5,126000 +6,145000 +5,133000 +7,192000 +6,160000 +6,187500 +6,147000 +4,83500 +8,252000 +7,137500 +8,197000 +3,92900 +7,160000 +6,136500 +5,146000 +5,129000 +6,176432 +6,127000 +8,170000 +4,128000 +7,157000 +2,60000 +5,119500 +5,135000 +6,159500 +5,106000 +8,325000 +7,179900 +7,274725 +6,181000 +8,280000 +6,188000 +7,205000 +5,129900 +5,134500 +5,117000 +8,318000 +8,184100 +5,130000 +5,140000 +5,133700 +6,118400 +7,212900 +4,112000 +5,118000 +7,163900 +4,115000 +7,174000 +7,259000 +7,215000 +5,140000 +4,135000 +5,93500 +6,117500 +8,239500 +6,169000 +6,102000 +6,119000 +5,94000 +6,196000 +5,144000 +5,139000 +5,197500 +8,424870 +5,80000 +4,80000 +5,149000 +6,180000 +7,174500 +7,116900 +7,143000 +6,124000 +5,149900 +6,230000 +6,120500 +7,201800 +5,218000 +5,179900 +7,230000 +8,235128 +6,185000 +6,146000 +6,224000 +5,129000 +4,108959 +5,194000 
+7,233170 +8,245350 +6,173000 +6,235000 +10,625000 +6,171000 +6,163000 +7,171900 +5,200500 +6,239000 +8,285000 +5,119500 +6,115000 +5,154900 +5,93000 +7,250000 +8,392500 +10,745000 +5,120000 +5,186700 +5,104900 +3,95000 +8,262000 +7,195000 +7,189000 +4,168000 +8,174000 +5,125000 +6,165000 +6,158000 +6,176000 +7,219210 +7,144000 +7,178000 +4,148000 +4,116050 +7,197900 +5,117000 +7,213000 +5,153500 +7,271900 +4,107000 +6,200000 +5,140000 +8,290000 +6,189000 +8,164000 +4,113000 +4,145000 +5,134500 +5,125000 +6,112000 +8,229456 +4,80500 +6,91500 +5,115000 +5,134000 +6,143000 +5,137900 +7,184000 +6,145000 +6,214000 +5,147000 +9,367294 +5,127000 +5,190000 +5,132500 +4,101800 +5,142000 +5,130000 +5,138887 +7,175500 +7,195000 +6,142500 +8,265900 +7,224900 +7,248328 +7,170000 +10,465000 +8,230000 +6,178000 +7,186500 +6,169900 +6,129500 +5,119000 +7,244000 +7,171750 +5,130000 +7,294000 +7,165400 +6,127500 +8,301500 +5,99900 +7,190000 +6,151000 +6,181000 +5,128900 +4,161500 +6,180500 +6,181000 +7,183900 +7,122000 +9,378500 +8,381000 +5,144000 +7,260000 +6,185750 +5,137000 +6,177000 +5,139000 +5,137000 +6,162000 +6,197900 +8,237000 +4,68400 +7,227000 +7,180000 +5,150500 +6,139000 +6,169000 +6,132500 +6,143000 +5,190000 +8,278000 +8,281000 +5,180500 +5,119500 +5,107500 +7,162900 +5,115000 +5,138500 +5,155000 +6,140000 +10,160000 +5,154000 +7,225000 +6,177500 +8,290000 +7,232000 +7,130000 +9,325000 +7,202500 +5,138000 +5,147000 +6,179200 +7,335000 +7,203000 +8,302000 +9,333168 +4,119000 +6,206900 +8,295493 +7,208900 +8,275000 +4,111000 +6,156500 +3,72500 +7,190000 +4,82500 +8,147000 +4,55000 +3,79000 +5,130500 +6,256000 +7,176500 +8,227000 +5,132500 +4,100000 +5,125500 +5,125000 +6,167900 +5,135000 +4,52500 +7,200000 +5,128500 +4,123000 +6,155000 +8,228500 +6,177000 +7,155835 +4,108500 +7,262500 +8,283463 +7,215000 +8,122000 +5,200000 +6,171000 +6,134900 +8,410000 +7,235000 +7,170000 +5,110000 +5,149900 +6,177500 +9,315000 +5,189000 +7,260000 +4,104900 +6,156932 +7,144152 +7,216000 +7,193000 +5,127000 +6,144000 +8,232000 +4,105000 +6,165500 +7,274300 +10,466500 +7,250000 +8,239000 +6,91000 +5,117000 +6,83000 +5,167500 +3,58500 +6,237500 +7,157000 +5,112000 +6,105000 +4,125500 +7,250000 +6,136000 +9,377500 +6,131000 +7,235000 +5,124000 +5,123000 +6,163000 +7,246578 +8,281213 +5,160000 +5,137500 +5,138000 +6,137450 +6,120000 +6,193000 +7,193879 +8,282922 +3,105000 +8,275000 +5,133000 +5,112000 +4,125500 +7,215000 +7,230000 +6,140000 +4,90000 +8,257000 +6,207000 +7,175900 +4,122500 +8,340000 +5,124000 +6,223000 +6,179900 +6,127500 +6,136500 +6,274970 +5,144000 +6,142000 +7,271000 +5,140000 +5,119000 +6,182900 +5,192140 +6,143750 +4,64500 +6,186500 +5,160000 +6,174000 +4,120500 +8,394617 +6,149700 +7,197000 +6,191000 +6,149300 +10,310000 +6,121000 +7,179600 +6,129000 +5,157900 +8,240000 +4,112000 +5,92000 +5,136000 +8,287090 +5,145000 +5,84500 +7,185000 +6,175000 +6,210000 +7,266500 +5,142125 +5,147500 \ No newline at end of file diff --git a/examples/datasets/titanic.csv b/examples/datasets/titanic.csv new file mode 100644 index 0000000..bb66baf --- /dev/null +++ b/examples/datasets/titanic.csv @@ -0,0 +1,889 @@ +0,3,1,2 +1,1,0,0 +1,3,0,2 +1,1,0,2 +0,3,1,2 +0,3,1,1 +0,1,1,2 +0,3,1,2 +1,3,0,2 +1,2,0,0 +1,3,0,2 +1,1,0,2 +0,3,1,2 +0,3,1,2 +0,3,0,2 +1,2,0,2 +0,3,1,1 +1,2,1,2 +0,3,0,2 +1,3,0,0 +0,2,1,2 +1,2,1,2 +1,3,0,1 +1,1,1,2 +0,3,0,2 +1,3,0,2 +0,3,1,0 +0,1,1,2 +1,3,0,1 +0,3,1,2 +0,1,1,0 +1,1,0,0 +1,3,0,1 +0,2,1,2 +0,1,1,0 +0,1,1,2 +1,3,1,0 +0,3,1,2 +0,3,0,2 +1,3,0,0 +0,3,0,2 +0,2,0,2 +0,3,1,0 +1,2,0,0 
+1,3,0,1 +0,3,1,2 +0,3,1,1 +1,3,0,1 +0,3,1,0 +0,3,0,2 +0,3,1,2 +0,3,1,2 +1,1,0,0 +1,2,0,2 +0,1,1,0 +1,1,1,2 +1,2,0,2 +0,3,1,0 +1,2,0,2 +0,3,1,2 +0,3,1,0 +0,1,1,2 +0,3,1,2 +0,1,1,0 +1,3,1,0 +1,2,0,2 +0,3,1,2 +1,3,0,2 +0,3,1,2 +0,2,1,2 +0,3,0,2 +0,2,1,2 +0,3,1,0 +1,3,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,2,1,2 +1,3,0,2 +0,3,1,2 +1,3,1,2 +1,3,0,1 +0,1,1,2 +1,2,0,2 +1,3,0,2 +0,3,1,2 +0,3,1,2 +1,1,0,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,1,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,1,1,0 +1,1,1,0 +1,2,0,2 +0,2,1,2 +0,3,0,2 +0,3,1,2 +0,1,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,3,0,2 +1,3,1,2 +0,3,1,2 +1,3,0,1 +0,1,1,2 +0,3,0,0 +0,3,1,2 +0,3,0,2 +0,3,0,0 +0,3,1,2 +0,3,1,1 +0,2,1,2 +0,1,1,0 +0,3,0,2 +0,2,1,2 +0,3,1,2 +0,2,1,0 +1,2,0,2 +0,1,1,2 +1,3,1,0 +0,3,1,1 +1,3,1,2 +1,3,0,0 +0,3,1,2 +0,3,1,0 +0,3,1,2 +0,3,0,2 +1,2,0,2 +0,2,1,2 +0,2,1,0 +1,1,0,2 +0,1,1,2 +0,3,1,2 +0,1,1,0 +0,3,0,0 +1,3,0,2 +1,3,0,2 +0,3,1,1 +0,2,1,2 +0,2,1,2 +1,3,1,2 +0,3,0,2 +0,2,1,2 +0,2,1,2 +0,2,1,2 +1,1,0,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,1,1,0 +1,3,0,1 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,2,0,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,3,1,2 +1,1,0,2 +0,3,0,2 +0,1,1,2 +0,3,1,2 +0,1,1,2 +0,3,1,1 +1,3,0,2 +0,3,1,2 +0,1,1,0 +0,3,1,2 +0,3,1,2 +0,1,0,0 +0,2,1,2 +0,3,1,2 +0,3,0,2 +0,2,1,0 +0,3,1,2 +1,2,1,2 +1,3,0,2 +0,1,1,2 +1,3,0,1 +1,1,1,2 +0,3,1,1 +0,3,1,2 +1,2,0,2 +0,2,1,2 +1,3,0,2 +1,2,1,2 +1,1,0,0 +1,1,0,0 +0,3,1,1 +0,3,1,2 +1,3,0,1 +0,2,0,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,0 +1,3,1,2 +0,3,0,2 +0,3,1,2 +1,3,1,0 +1,3,0,1 +1,1,1,0 +0,3,1,2 +1,2,0,2 +0,3,1,2 +0,2,1,2 +0,3,1,1 +1,1,0,0 +1,3,0,2 +0,2,1,2 +1,1,0,0 +0,2,1,2 +1,3,1,2 +0,2,1,2 +0,3,1,2 +0,3,1,2 +1,1,1,2 +0,3,1,2 +1,2,1,2 +0,3,1,2 +0,2,1,2 +0,3,0,2 +1,1,0,2 +0,3,1,2 +0,2,1,2 +1,3,0,2 +0,2,1,2 +0,3,0,2 +0,2,1,2 +1,2,0,2 +0,2,1,2 +0,2,1,2 +0,3,0,0 +1,3,0,1 +0,2,1,2 +0,3,1,2 +0,3,1,0 +0,1,1,1 +0,3,0,2 +1,2,0,2 +1,1,1,2 +0,2,1,2 +0,3,1,2 +0,3,0,2 +0,1,1,2 +0,3,1,2 +0,3,0,2 +1,3,0,0 +1,1,0,0 +1,1,0,2 +1,1,0,0 +1,2,0,2 +0,3,1,1 +1,3,1,2 +0,1,1,2 +0,1,1,2 +0,3,0,1 +0,2,1,2 +0,3,1,2 +1,3,1,2 +1,1,0,2 +1,1,0,2 +0,1,1,2 +1,3,1,2 +1,2,0,2 +0,1,1,0 +1,3,0,1 +1,1,0,2 +0,3,0,2 +0,2,1,2 +0,3,1,1 +1,3,0,2 +0,3,1,1 +0,3,1,2 +0,3,1,2 +1,3,1,2 +0,1,1,2 +0,3,1,0 +1,3,1,2 +0,3,1,2 +1,2,1,2 +1,3,0,1 +1,1,0,2 +1,1,0,0 +0,2,1,0 +0,3,0,2 +0,3,1,2 +0,1,1,0 +0,3,1,0 +0,1,0,2 +1,1,1,2 +1,1,0,0 +1,3,0,1 +1,3,1,1 +0,3,1,2 +1,2,0,1 +0,3,1,2 +1,1,1,2 +1,1,0,0 +1,1,0,0 +0,2,1,0 +1,1,0,0 +1,1,0,0 +1,1,0,0 +0,2,0,2 +0,3,1,2 +0,2,1,2 +1,3,0,2 +1,2,0,2 +0,2,1,2 +1,1,0,2 +1,1,0,0 +0,3,1,2 +0,3,1,2 +1,2,0,1 +1,2,0,2 +0,3,1,2 +1,1,0,0 +0,3,1,2 +1,2,0,2 +1,3,0,2 +1,1,0,0 +1,3,0,1 +0,1,1,2 +0,1,1,2 +0,3,1,2 +1,1,0,2 +0,3,1,2 +0,1,1,2 +1,1,0,0 +1,3,1,2 +0,1,1,2 +1,2,1,2 +1,1,0,2 +0,2,1,2 +0,2,1,2 +0,2,1,2 +1,2,0,2 +1,2,0,2 +1,3,0,2 +1,3,1,2 +0,3,1,2 +0,3,1,2 +0,1,1,2 +0,3,1,0 +0,3,1,2 +0,3,1,0 +0,3,1,2 +1,1,0,2 +0,2,0,2 +1,3,0,1 +1,3,0,1 +0,3,1,2 +0,2,1,0 +0,3,0,0 +0,3,1,2 +0,3,1,1 +0,3,1,2 +1,1,0,0 +1,3,0,0 +1,3,0,1 +1,1,0,0 +1,1,1,0 +0,3,1,2 +0,3,1,2 +0,1,1,0 +0,3,0,2 +1,1,0,0 +1,3,0,2 +0,1,1,0 +0,3,1,0 +0,3,1,2 +1,1,0,0 +1,3,0,0 +0,3,1,2 +1,1,0,2 +0,3,1,2 +0,2,1,2 +0,3,1,2 +1,2,0,2 +0,3,1,1 +1,2,0,0 +1,1,1,2 +1,3,1,2 +0,3,1,2 +1,1,0,0 +1,3,0,2 +0,3,1,2 +0,3,0,2 +0,2,1,2 +0,2,1,2 +1,2,0,2 +1,3,1,2 +0,3,1,2 +0,3,0,2 +0,3,1,2 +0,3,0,2 +0,2,1,2 +0,3,1,2 +1,2,1,2 +0,3,1,2 +0,3,0,2 +0,3,1,2 +0,3,1,1 +1,1,0,1 +0,2,1,2 +1,3,1,2 +0,3,0,2 +1,2,0,2 +1,2,0,2 +0,2,1,2 +0,3,0,2 +0,3,1,0 +0,3,1,1 +0,3,1,2 +0,3,0,2 +0,3,1,2 +0,3,1,2 +1,2,0,2 +1,2,0,2 +0,3,1,1 +1,3,1,2 +1,1,1,2 +1,3,0,2 +1,2,0,2 +0,3,1,2 +0,1,1,2 +1,1,0,2 +0,3,0,2 +1,2,0,2 +0,1,1,2 +0,2,1,2 
+1,2,0,2 +0,3,1,2 +0,3,1,2 +1,2,0,2 +1,3,1,2 +1,1,1,2 +1,2,0,2 +1,1,1,2 +1,3,0,0 +1,1,1,2 +0,2,1,2 +0,3,1,2 +0,1,1,0 +1,1,1,0 +0,3,1,2 +1,3,1,0 +0,1,1,2 +1,1,0,2 +1,2,0,2 +0,3,1,1 +1,1,1,2 +0,3,1,2 +0,1,1,2 +0,2,1,2 +0,3,1,2 +0,3,1,2 +0,2,1,2 +0,1,1,2 +0,3,1,1 +1,3,0,0 +0,3,1,2 +0,3,1,2 +1,2,0,2 +1,2,0,0 +0,3,0,2 +0,1,1,2 +0,2,1,2 +0,3,1,2 +0,3,1,2 +1,3,0,2 +0,3,1,2 +0,2,1,2 +0,3,1,2 +1,3,0,2 +1,1,1,0 +0,3,0,2 +1,1,0,2 +0,1,1,0 +0,3,1,2 +1,3,1,2 +0,3,1,2 +0,3,1,2 +0,1,1,2 +0,1,1,0 +0,3,1,2 +0,3,1,0 +1,1,0,0 +0,3,1,2 +0,1,0,2 +0,3,1,2 +0,3,1,2 +0,3,0,1 +0,3,0,1 +0,3,0,2 +1,1,0,2 +0,1,1,0 +1,2,0,2 +1,1,1,2 +0,3,1,2 +1,3,1,2 +1,3,1,1 +0,3,1,2 +1,1,1,2 +1,1,0,0 +0,3,1,2 +0,1,1,2 +1,2,0,2 +0,3,1,1 +1,2,0,2 +0,3,1,2 +1,1,0,2 +0,3,1,2 +0,3,1,0 +1,1,0,0 +0,3,1,0 +0,3,1,1 +1,2,0,2 +0,1,1,2 +0,3,1,2 +0,2,1,2 +1,2,0,2 +0,3,1,0 +0,3,1,0 +1,3,0,0 +0,3,0,2 +1,2,0,2 +0,1,1,2 +1,1,0,0 +0,3,1,2 +1,1,0,0 +1,1,0,2 +0,3,0,2 +0,3,0,2 +1,2,1,2 +0,1,1,0 +0,1,1,2 +1,2,0,2 +1,2,1,0 +0,3,1,2 +1,2,1,2 +1,1,1,0 +0,2,1,2 +0,3,1,1 +1,3,1,0 +1,3,0,2 +0,1,1,2 +1,1,0,0 +0,1,1,0 +1,1,0,2 +1,3,0,2 +0,3,1,1 +0,3,1,2 +0,2,1,2 +0,3,1,2 +0,3,0,2 +0,3,1,2 +0,3,1,2 +0,3,0,2 +0,3,1,0 +1,3,1,2 +1,2,1,2 +1,1,0,2 +1,1,1,2 +1,3,0,1 +0,3,1,2 +0,3,1,2 +1,2,0,2 +1,1,0,2 +0,3,0,0 +1,3,1,2 +1,2,0,2 +1,1,0,0 +0,2,1,2 +0,1,1,0 +0,3,1,0 +1,1,0,2 +0,2,1,2 +1,1,1,0 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,1,0,0 +0,3,1,2 +0,3,0,1 +0,2,1,2 +0,3,1,2 +1,2,0,2 +0,3,1,2 +0,3,1,0 +1,1,1,0 +1,2,0,2 +0,3,1,2 +0,1,1,2 +0,3,1,2 +1,1,1,0 +0,3,1,2 +0,3,1,2 +1,1,1,2 +1,2,0,0 +1,1,0,2 +0,3,0,2 +0,3,1,2 +1,3,0,1 +0,3,1,1 +0,3,1,2 +1,2,0,2 +0,3,1,2 +0,3,0,2 +1,2,0,2 +0,2,1,2 +0,3,1,0 +1,1,1,2 +1,3,1,0 +0,3,1,2 +0,3,1,2 +0,1,1,2 +0,2,1,1 +1,1,0,2 +0,3,1,2 +0,3,1,1 +1,1,1,2 +0,3,1,2 +1,1,1,0 +0,1,1,2 +0,3,0,2 +1,2,0,2 +0,3,1,2 +0,2,1,2 +0,3,0,2 +0,3,1,2 +0,3,1,2 +1,1,0,0 +0,3,0,2 +1,3,1,2 +1,3,0,0 +1,1,1,0 +0,3,1,2 +1,1,1,0 +0,3,1,2 +1,3,0,2 +0,3,1,2 +1,2,0,2 +0,3,1,2 +1,3,0,1 +0,3,0,1 +0,2,1,2 +0,3,1,2 +0,3,0,1 +0,2,1,2 +0,1,1,0 +1,1,1,2 +0,3,1,0 +0,1,1,2 +0,3,1,2 +1,3,1,2 +0,2,1,2 +0,2,1,2 +0,3,1,2 +0,3,1,2 +1,1,0,2 +1,2,0,2 +0,1,1,2 +0,2,1,2 +1,2,1,2 +0,2,1,2 +0,3,1,2 +0,3,1,2 +1,3,0,2 +0,3,0,2 +1,1,1,0 +0,3,0,1 +1,1,1,0 +0,3,1,2 +0,3,1,2 +0,2,1,2 +0,2,1,0 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,1,0,2 +1,1,1,2 +1,3,0,0 +1,3,1,2 +0,3,1,0 +0,1,1,2 +0,2,1,2 +0,3,1,2 +1,3,0,1 +0,1,1,0 +0,3,1,2 +1,1,0,0 +1,1,1,2 +0,3,0,0 +0,3,1,1 +0,3,1,2 +0,2,1,2 +1,2,0,2 +1,1,1,2 +1,1,0,2 +1,3,1,0 +1,1,0,0 +0,1,1,2 +1,1,1,2 +0,3,1,2 +0,2,1,2 +0,3,1,2 +1,1,0,0 +1,2,0,2 +0,3,1,1 +0,3,1,2 +1,2,0,2 +0,3,1,2 +0,2,1,2 +0,2,1,2 +1,1,1,2 +0,3,1,2 +1,2,0,2 +1,3,0,1 +0,2,1,2 +0,3,0,2 +1,1,0,2 +0,3,1,0 +0,2,1,2 +0,2,1,2 +0,2,1,2 +0,3,1,2 +0,3,0,2 +1,1,1,0 +0,3,1,2 +0,3,1,2 +1,1,1,2 +0,1,1,2 +1,1,0,0 +0,3,1,2 +1,3,1,2 +0,1,1,2 +0,3,1,2 +1,2,0,2 +0,1,1,2 +0,3,1,1 +1,2,0,2 +1,3,1,2 +0,3,1,2 +0,3,1,2 +1,2,0,2 +1,2,1,2 +0,3,1,2 +0,2,1,2 +0,3,1,2 +1,1,0,2 +0,3,1,2 +0,3,1,2 +1,3,1,0 +1,1,0,2 +0,3,1,2 +1,1,0,2 +0,1,1,0 +0,3,0,1 +0,3,1,1 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,2,0,2 +0,3,1,0 +1,2,0,2 +0,3,1,2 +0,3,1,1 +1,3,0,2 +0,3,1,1 +1,1,0,2 +1,3,0,0 +1,1,0,2 +0,1,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,3,0,2 +0,3,1,1 +1,3,1,2 +0,1,1,0 +0,3,1,1 +0,2,1,2 +0,3,0,2 +0,1,1,0 +0,3,1,2 +0,2,1,2 +1,1,0,2 +1,3,0,2 +0,3,1,0 +0,3,0,2 +0,2,1,2 +1,2,0,2 +1,1,1,2 +1,3,1,0 +1,3,1,2 +0,3,1,2 +0,1,1,2 +0,3,0,2 +0,2,1,2 +1,1,0,2 +0,3,1,2 +0,3,1,2 +0,2,1,2 +0,3,0,2 +0,3,1,2 +0,1,1,2 +0,3,0,2 +0,2,1,0 +0,3,1,2 +0,3,1,2 +1,1,0,2 +1,3,1,2 +0,1,1,2 +1,3,0,2 +0,3,1,2 +0,3,1,1 +0,3,1,2 +1,2,1,0 +1,3,1,1 +1,3,0,0 +1,2,1,2 +0,3,1,0 +0,3,1,2 +0,3,1,2 +1,1,0,0 
+0,3,1,2 +0,3,1,2 +1,3,1,2 +1,1,1,0 +0,3,1,2 +0,2,1,2 +1,1,0,0 +0,3,1,0 +0,3,1,2 +0,3,1,2 +0,3,1,2 +0,3,1,0 +0,2,1,2 +1,1,0,0 +0,3,1,2 +0,3,1,2 +0,3,0,0 +1,1,0,2 +0,2,0,2 +1,3,0,2 +1,1,0,2 +1,1,1,2 +1,3,0,0 +0,3,1,0 +0,3,1,2 +0,2,1,2 +1,1,0,2 +0,3,0,2 +0,2,1,2 +1,2,0,2 +1,2,0,0 +0,1,1,2 +0,3,1,2 +1,3,1,2 +0,3,1,2 +1,1,0,2 +0,1,1,2 +0,3,1,2 +1,2,0,0 +1,3,0,0 +0,3,1,2 +0,3,1,2 +0,3,1,2 +1,1,0,0 +1,2,0,2 +0,3,1,2 +0,3,0,2 +0,2,1,2 +0,3,1,2 +0,3,0,1 +0,2,1,2 +1,1,0,2 +0,3,0,2 +1,1,1,0 +0,3,1,1 \ No newline at end of file diff --git a/examples/trees/cart.go b/examples/trees/cart.go new file mode 100644 index 0000000..931b7e5 --- /dev/null +++ b/examples/trees/cart.go @@ -0,0 +1,61 @@ +// Example of how to use CART trees for both Classification and Regression + +package main + +import ( + "fmt" + + "github.com/sjwhitworth/golearn/base" +) + +func main() { + + // Load Titanic Data For classification + classificationData, err := base.ParseCSVToInstances("../datasets/titanic.csv", false) + if err != nil { + panic(err) + } + trainData, testData := base.InstancesTrainTestSplit(classificationData, 0.5) + + // Create New Classification Tree + // Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels + decTree = NewDecisionTreeClassifier("entropy", -1, []int64{0, 1}) + + // Train Tree + decTree.Fit(trainData) + // Print out tree for visualization - shows splits and feature and predictions + fmt.Println(decTree.String()) + + // Access Predictions + classificationPreds := decTree.Predict(testData) + + fmt.Println("Titanic Predictions") + fmt.Println(classificationPreds) + + // Evaluate Accuracy on Test Data + fmt.Println(decTree.Evaluate(testData)) + + // Load House Price Data For Regression + + regressionData, err := base.ParseCSVToInstances("../datasets/boston_house_prices.csv", false) + if err != nil { + panic(err) + } + trainRegData, testRegData := base.InstancesTrainTestSplit(regressionData, 0.5) + + // Hyperparameters - Loss function, max Depth (-1 will split until pure) + regTree := NewDecisionTreeRegressor("mse", -1) + + // Train Tree + regTree.Fit(trainRegData) + + // Print out tree for visualization + fmt.Println(regTree.String()) + + // Access Predictions + regressionPreds := regTree.Predict(testRegData) + + fmt.Println("Boston House Price Predictions") + fmt.Println(regressionPreds) + +} diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index 29646fb..39b7165 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -432,7 +432,7 @@ func cpredictFromNode(tree CNode, test [][]float64) []int64 { return preds } -// Given Test data and label, return the accuracy of the classifier. Data has to be in float slice format before feeding. +// Given Test data and label, return the accuracy of the classifier. 
diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index 29646fb..39b7165 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -432,7 +432,7 @@ func cpredictFromNode(tree CNode, test [][]float64) []int64 {
 	return preds
 }
 
-// Given Test data and label, return the accuracy of the classifier. Data has to be in float slice format before feeding.
+// Given Test data and label, return the accuracy of the classifier.
 func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
 	xTest := classifierConvertInstancesToProblemVec(test)

From c0c3b2e1bf54a57272bf86ddaa5df36bc087b05a Mon Sep 17 00:00:00 2001
From: Ayush
Date: Sat, 25 Jul 2020 13:22:15 +0530
Subject: [PATCH 07/24] Fixing Sorting

cSlice and rSlice named their swap methods cSwap and rSwap, so
sort.Sort never called them: it fell back to the promoted
sort.Float64Slice.Swap, and the Idx permutation was never updated.
The shared Slice type names the method Swap, which overrides the
promoted method, satisfies sort.Interface, and keeps Idx in step
with the sorted values.
---
 trees/cart_classifier.go | 24 +-----------------------
 trees/cart_regressor.go  | 24 +-----------------------
 trees/sorter.go          | 27 +++++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 46 deletions(-)
 create mode 100644 trees/sorter.go

diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index 39b7165..7f7575d 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -157,31 +157,9 @@ func cvalidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	return true
 }
 
-// Helper struct for re-rdering data
-type cSlice struct {
-	sort.Float64Slice
-	Idx []int
-}
-
-// Helper function for re-ordering data
-func (s cSlice) cSwap(i, j int) {
-	s.Float64Slice.Swap(i, j)
-	s.Idx[i], s.Idx[j] = s.Idx[j], s.Idx[i]
-}
-
-// Final Helper Function for re-ordering data
-func cNewSlice(n []float64) *cSlice {
-	s := &cSlice{Float64Slice: sort.Float64Slice(n), Idx: make([]int, len(n))}
-
-	for i := range s.Idx {
-		s.Idx[i] = i
-	}
-	return s
-}
-
 // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting
 func creOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) {
-	s := cNewSlice(featureVal)
+	s := NewSlice(featureVal)
 	sort.Sort(s)
 
 	indexes := s.Idx
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index 48e61d0..6841a71 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -145,31 +145,9 @@ func rvalidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	return true
 }
 
-// Helper struct for re-rdering data
-type rSlice struct {
-	sort.Float64Slice
-	Idx []int
-}
-
-// Helper function for re-ordering data
-func (s rSlice) rSwap(i, j int) {
-	s.Float64Slice.Swap(i, j)
-	s.Idx[i], s.Idx[j] = s.Idx[j], s.Idx[i]
-}
-
-// Final Helper Function for re-ordering data
-func rNewSlice(n []float64) *rSlice {
-	s := &rSlice{Float64Slice: sort.Float64Slice(n), Idx: make([]int, len(n))}
-
-	for i := range s.Idx {
-		s.Idx[i] = i
-	}
-	return s
-}
-
 // Re order data based on a feature for optimizing code
 func rreOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) {
-	s := rNewSlice(featureVal)
+	s := NewSlice(featureVal)
 	sort.Sort(s)
 
 	indexes := s.Idx
diff --git a/trees/sorter.go b/trees/sorter.go
new file mode 100644
index 0000000..157b25d
--- /dev/null
+++ b/trees/sorter.go
@@ -0,0 +1,27 @@
+package trees
+
+import (
+	"sort"
+)
+
+// Slice is a sort.Float64Slice that keeps track of each value's original index.
+type Slice struct {
+	sort.Float64Slice
+	Idx []int
+}
+
+// Swap swaps both the values and their indices, so sort.Sort keeps Idx in step with the data.
+func (s Slice) Swap(i, j int) {
+	s.Float64Slice.Swap(i, j)
+	s.Idx[i], s.Idx[j] = s.Idx[j], s.Idx[i]
+}
+
+// NewSlice wraps a []float64 in a Slice whose Idx is initialised to 0..n-1.
+func NewSlice(n []float64) *Slice {
+	s := &Slice{Float64Slice: sort.Float64Slice(n), Idx: make([]int, len(n))}
+
+	for i := range s.Idx {
+		s.Idx[i] = i
+	}
+	return s
+}
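Since every split search in both trees now funnels through this one sorter, the index bookkeeping deserves a quick illustration. A standalone sketch, assuming the patch above is applied so that the trees package exports Slice and NewSlice:

package main

import (
	"fmt"
	"sort"

	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	// Wrap one feature column; Idx starts as the identity permutation.
	featureVal := []float64{0.9, 0.1, 0.5}
	s := trees.NewSlice(featureVal)
	sort.Sort(s)
	// Values come back ascending and Idx records where each value
	// started, so the rows of X and y can be reordered to match.
	fmt.Println(s.Float64Slice, s.Idx) // [0.1 0.5 0.9] [1 2 0]
}

From abed408f9bcd41fc9757fbbed1b783f572a11845 Mon Sep 17 00:00:00 2001
From: Ayush
Date: Sun, 26 Jul 2020 11:21:20 +0530
Subject: [PATCH 08/24] Updating Dataset + Naming

---
 examples/datasets/titanic.csv | 1732 ++++++++++++++++-----------------
 trees/cart_classifier.go      |   76 +-
 trees/cart_regressor.go       |   90 +-
 3 files changed, 949 insertions(+), 949 deletions(-)

diff --git a/examples/datasets/titanic.csv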
b/examples/datasets/titanic.csv index bb66baf..fe6999f 100644 --- a/examples/datasets/titanic.csv +++ b/examples/datasets/titanic.csv @@ -1,889 +1,889 @@ -0,3,1,2 +3,1,2,0 +1,0,0,1 +3,0,2,1 +1,0,2,1 +3,1,2,0 +3,1,1,0 +1,1,2,0 +3,1,2,0 +3,0,2,1 +2,0,0,1 +3,0,2,1 +1,0,2,1 +3,1,2,0 +3,1,2,0 +3,0,2,0 +2,0,2,1 +3,1,1,0 +2,1,2,1 +3,0,2,0 +3,0,0,1 +2,1,2,0 +2,1,2,1 +3,0,1,1 +1,1,2,1 +3,0,2,0 +3,0,2,1 +3,1,0,0 +1,1,2,0 +3,0,1,1 +3,1,2,0 1,1,0,0 -1,3,0,2 -1,1,0,2 -0,3,1,2 -0,3,1,1 -0,1,1,2 -0,3,1,2 -1,3,0,2 -1,2,0,0 -1,3,0,2 -1,1,0,2 -0,3,1,2 -0,3,1,2 -0,3,0,2 -1,2,0,2 -0,3,1,1 -1,2,1,2 -0,3,0,2 -1,3,0,0 -0,2,1,2 -1,2,1,2 -1,3,0,1 -1,1,1,2 -0,3,0,2 -1,3,0,2 -0,3,1,0 -0,1,1,2 -1,3,0,1 -0,3,1,2 -0,1,1,0 +1,0,0,1 +3,0,1,1 +2,1,2,0 1,1,0,0 -1,3,0,1 -0,2,1,2 -0,1,1,0 -0,1,1,2 -1,3,1,0 -0,3,1,2 -0,3,0,2 -1,3,0,0 -0,3,0,2 -0,2,0,2 -0,3,1,0 -1,2,0,0 -1,3,0,1 -0,3,1,2 -0,3,1,1 -1,3,0,1 -0,3,1,0 -0,3,0,2 -0,3,1,2 -0,3,1,2 +1,1,2,0 +3,1,0,1 +3,1,2,0 +3,0,2,0 +3,0,0,1 +3,0,2,0 +2,0,2,0 +3,1,0,0 +2,0,0,1 +3,0,1,1 +3,1,2,0 +3,1,1,0 +3,0,1,1 +3,1,0,0 +3,0,2,0 +3,1,2,0 +3,1,2,0 +1,0,0,1 +2,0,2,1 1,1,0,0 -1,2,0,2 -0,1,1,0 -1,1,1,2 -1,2,0,2 -0,3,1,0 -1,2,0,2 -0,3,1,2 -0,3,1,0 -0,1,1,2 -0,3,1,2 -0,1,1,0 -1,3,1,0 -1,2,0,2 -0,3,1,2 -1,3,0,2 -0,3,1,2 -0,2,1,2 -0,3,0,2 -0,2,1,2 -0,3,1,0 -1,3,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,2,1,2 -1,3,0,2 -0,3,1,2 -1,3,1,2 -1,3,0,1 -0,1,1,2 -1,2,0,2 -1,3,0,2 -0,3,1,2 -0,3,1,2 -1,1,0,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,1,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,1,1,0 -1,1,1,0 -1,2,0,2 -0,2,1,2 -0,3,0,2 -0,3,1,2 -0,1,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,3,0,2 -1,3,1,2 -0,3,1,2 -1,3,0,1 -0,1,1,2 -0,3,0,0 -0,3,1,2 -0,3,0,2 -0,3,0,0 -0,3,1,2 -0,3,1,1 -0,2,1,2 -0,1,1,0 -0,3,0,2 -0,2,1,2 -0,3,1,2 -0,2,1,0 -1,2,0,2 -0,1,1,2 -1,3,1,0 -0,3,1,1 -1,3,1,2 -1,3,0,0 -0,3,1,2 -0,3,1,0 -0,3,1,2 -0,3,0,2 -1,2,0,2 -0,2,1,2 -0,2,1,0 -1,1,0,2 -0,1,1,2 -0,3,1,2 -0,1,1,0 -0,3,0,0 -1,3,0,2 -1,3,0,2 -0,3,1,1 -0,2,1,2 -0,2,1,2 -1,3,1,2 -0,3,0,2 -0,2,1,2 -0,2,1,2 -0,2,1,2 -1,1,0,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,1,1,0 -1,3,0,1 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,2,0,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,3,1,2 -1,1,0,2 -0,3,0,2 -0,1,1,2 -0,3,1,2 -0,1,1,2 -0,3,1,1 -1,3,0,2 -0,3,1,2 -0,1,1,0 -0,3,1,2 -0,3,1,2 -0,1,0,0 -0,2,1,2 -0,3,1,2 -0,3,0,2 -0,2,1,0 -0,3,1,2 -1,2,1,2 -1,3,0,2 -0,1,1,2 -1,3,0,1 -1,1,1,2 -0,3,1,1 -0,3,1,2 -1,2,0,2 -0,2,1,2 -1,3,0,2 -1,2,1,2 +1,1,2,1 +2,0,2,1 +3,1,0,0 +2,0,2,1 +3,1,2,0 +3,1,0,0 +1,1,2,0 +3,1,2,0 1,1,0,0 +3,1,0,1 +2,0,2,1 +3,1,2,0 +3,0,2,1 +3,1,2,0 +2,1,2,0 +3,0,2,0 +2,1,2,0 +3,1,0,0 +3,1,2,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +2,1,2,1 +3,0,2,1 +3,1,2,0 +3,1,2,1 +3,0,1,1 +1,1,2,0 +2,0,2,1 +3,0,2,1 +3,1,2,0 +3,1,2,0 +1,0,2,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +1,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 1,1,0,0 -0,3,1,1 -0,3,1,2 -1,3,0,1 -0,2,0,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,0 -1,3,1,2 -0,3,0,2 -0,3,1,2 -1,3,1,0 -1,3,0,1 -1,1,1,0 -0,3,1,2 -1,2,0,2 -0,3,1,2 -0,2,1,2 -0,3,1,1 -1,1,0,0 -1,3,0,2 -0,2,1,2 -1,1,0,0 -0,2,1,2 -1,3,1,2 -0,2,1,2 -0,3,1,2 -0,3,1,2 -1,1,1,2 -0,3,1,2 -1,2,1,2 -0,3,1,2 -0,2,1,2 -0,3,0,2 -1,1,0,2 -0,3,1,2 -0,2,1,2 -1,3,0,2 -0,2,1,2 -0,3,0,2 -0,2,1,2 -1,2,0,2 -0,2,1,2 -0,2,1,2 -0,3,0,0 -1,3,0,1 -0,2,1,2 -0,3,1,2 -0,3,1,0 -0,1,1,1 -0,3,0,2 -1,2,0,2 -1,1,1,2 -0,2,1,2 -0,3,1,2 -0,3,0,2 -0,1,1,2 -0,3,1,2 -0,3,0,2 -1,3,0,0 -1,1,0,0 -1,1,0,2 -1,1,0,0 -1,2,0,2 -0,3,1,1 -1,3,1,2 -0,1,1,2 -0,1,1,2 -0,3,0,1 -0,2,1,2 -0,3,1,2 -1,3,1,2 -1,1,0,2 -1,1,0,2 -0,1,1,2 -1,3,1,2 -1,2,0,2 -0,1,1,0 -1,3,0,1 -1,1,0,2 -0,3,0,2 -0,2,1,2 -0,3,1,1 -1,3,0,2 -0,3,1,1 -0,3,1,2 -0,3,1,2 -1,3,1,2 -0,1,1,2 -0,3,1,0 -1,3,1,2 -0,3,1,2 
-1,2,1,2 -1,3,0,1 -1,1,0,2 -1,1,0,0 -0,2,1,0 -0,3,0,2 -0,3,1,2 -0,1,1,0 -0,3,1,0 -0,1,0,2 -1,1,1,2 -1,1,0,0 -1,3,0,1 -1,3,1,1 -0,3,1,2 -1,2,0,1 -0,3,1,2 -1,1,1,2 -1,1,0,0 -1,1,0,0 -0,2,1,0 -1,1,0,0 -1,1,0,0 -1,1,0,0 -0,2,0,2 -0,3,1,2 -0,2,1,2 -1,3,0,2 -1,2,0,2 -0,2,1,2 -1,1,0,2 -1,1,0,0 -0,3,1,2 -0,3,1,2 -1,2,0,1 -1,2,0,2 -0,3,1,2 -1,1,0,0 -0,3,1,2 -1,2,0,2 -1,3,0,2 -1,1,0,0 -1,3,0,1 -0,1,1,2 -0,1,1,2 -0,3,1,2 -1,1,0,2 -0,3,1,2 -0,1,1,2 -1,1,0,0 -1,3,1,2 -0,1,1,2 -1,2,1,2 -1,1,0,2 -0,2,1,2 -0,2,1,2 -0,2,1,2 -1,2,0,2 -1,2,0,2 -1,3,0,2 -1,3,1,2 -0,3,1,2 -0,3,1,2 -0,1,1,2 -0,3,1,0 -0,3,1,2 -0,3,1,0 -0,3,1,2 -1,1,0,2 -0,2,0,2 -1,3,0,1 -1,3,0,1 -0,3,1,2 -0,2,1,0 -0,3,0,0 -0,3,1,2 -0,3,1,1 -0,3,1,2 -1,1,0,0 -1,3,0,0 -1,3,0,1 -1,1,0,0 -1,1,1,0 -0,3,1,2 -0,3,1,2 -0,1,1,0 -0,3,0,2 -1,1,0,0 -1,3,0,2 -0,1,1,0 -0,3,1,0 -0,3,1,2 -1,1,0,0 -1,3,0,0 -0,3,1,2 -1,1,0,2 -0,3,1,2 -0,2,1,2 -0,3,1,2 -1,2,0,2 -0,3,1,1 -1,2,0,0 -1,1,1,2 -1,3,1,2 -0,3,1,2 -1,1,0,0 -1,3,0,2 -0,3,1,2 -0,3,0,2 -0,2,1,2 -0,2,1,2 -1,2,0,2 -1,3,1,2 -0,3,1,2 -0,3,0,2 -0,3,1,2 -0,3,0,2 -0,2,1,2 -0,3,1,2 -1,2,1,2 -0,3,1,2 -0,3,0,2 -0,3,1,2 -0,3,1,1 1,1,0,1 -0,2,1,2 -1,3,1,2 -0,3,0,2 -1,2,0,2 -1,2,0,2 -0,2,1,2 -0,3,0,2 -0,3,1,0 -0,3,1,1 -0,3,1,2 -0,3,0,2 -0,3,1,2 -0,3,1,2 -1,2,0,2 -1,2,0,2 -0,3,1,1 -1,3,1,2 -1,1,1,2 -1,3,0,2 -1,2,0,2 -0,3,1,2 -0,1,1,2 -1,1,0,2 -0,3,0,2 -1,2,0,2 -0,1,1,2 -0,2,1,2 -1,2,0,2 -0,3,1,2 -0,3,1,2 -1,2,0,2 -1,3,1,2 -1,1,1,2 -1,2,0,2 -1,1,1,2 -1,3,0,0 -1,1,1,2 -0,2,1,2 -0,3,1,2 -0,1,1,0 +2,0,2,1 +2,1,2,0 +3,0,2,0 +3,1,2,0 +1,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +3,0,2,1 +3,1,2,1 +3,1,2,0 +3,0,1,1 +1,1,2,0 +3,0,0,0 +3,1,2,0 +3,0,2,0 +3,0,0,0 +3,1,2,0 +3,1,1,0 +2,1,2,0 +1,1,0,0 +3,0,2,0 +2,1,2,0 +3,1,2,0 +2,1,0,0 +2,0,2,1 +1,1,2,0 +3,1,0,1 +3,1,1,0 +3,1,2,1 +3,0,0,1 +3,1,2,0 +3,1,0,0 +3,1,2,0 +3,0,2,0 +2,0,2,1 +2,1,2,0 +2,1,0,0 +1,0,2,1 +1,1,2,0 +3,1,2,0 +1,1,0,0 +3,0,0,0 +3,0,2,1 +3,0,2,1 +3,1,1,0 +2,1,2,0 +2,1,2,0 +3,1,2,1 +3,0,2,0 +2,1,2,0 +2,1,2,0 +2,1,2,0 +1,0,2,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +1,1,0,0 +3,0,1,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +2,0,2,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,2,1 +1,0,2,1 +3,0,2,0 +1,1,2,0 +3,1,2,0 +1,1,2,0 +3,1,1,0 +3,0,2,1 +3,1,2,0 +1,1,0,0 +3,1,2,0 +3,1,2,0 +1,0,0,0 +2,1,2,0 +3,1,2,0 +3,0,2,0 +2,1,0,0 +3,1,2,0 +2,1,2,1 +3,0,2,1 +1,1,2,0 +3,0,1,1 +1,1,2,1 +3,1,1,0 +3,1,2,0 +2,0,2,1 +2,1,2,0 +3,0,2,1 +2,1,2,1 +1,0,0,1 +1,0,0,1 +3,1,1,0 +3,1,2,0 +3,0,1,1 +2,0,2,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,0,0 +3,1,2,1 +3,0,2,0 +3,1,2,0 +3,1,0,1 +3,0,1,1 +1,1,0,1 +3,1,2,0 +2,0,2,1 +3,1,2,0 +2,1,2,0 +3,1,1,0 +1,0,0,1 +3,0,2,1 +2,1,2,0 +1,0,0,1 +2,1,2,0 +3,1,2,1 +2,1,2,0 +3,1,2,0 +3,1,2,0 +1,1,2,1 +3,1,2,0 +2,1,2,1 +3,1,2,0 +2,1,2,0 +3,0,2,0 +1,0,2,1 +3,1,2,0 +2,1,2,0 +3,0,2,1 +2,1,2,0 +3,0,2,0 +2,1,2,0 +2,0,2,1 +2,1,2,0 +2,1,2,0 +3,0,0,0 +3,0,1,1 +2,1,2,0 +3,1,2,0 +3,1,0,0 1,1,1,0 -0,3,1,2 -1,3,1,0 -0,1,1,2 -1,1,0,2 -1,2,0,2 -0,3,1,1 -1,1,1,2 -0,3,1,2 -0,1,1,2 -0,2,1,2 -0,3,1,2 -0,3,1,2 -0,2,1,2 -0,1,1,2 -0,3,1,1 -1,3,0,0 -0,3,1,2 -0,3,1,2 -1,2,0,2 -1,2,0,0 -0,3,0,2 -0,1,1,2 -0,2,1,2 -0,3,1,2 -0,3,1,2 -1,3,0,2 -0,3,1,2 -0,2,1,2 -0,3,1,2 -1,3,0,2 -1,1,1,0 -0,3,0,2 -1,1,0,2 -0,1,1,0 -0,3,1,2 -1,3,1,2 -0,3,1,2 -0,3,1,2 -0,1,1,2 -0,1,1,0 -0,3,1,2 -0,3,1,0 +3,0,2,0 +2,0,2,1 +1,1,2,1 +2,1,2,0 +3,1,2,0 +3,0,2,0 +1,1,2,0 +3,1,2,0 +3,0,2,0 +3,0,0,1 +1,0,0,1 +1,0,2,1 +1,0,0,1 +2,0,2,1 +3,1,1,0 +3,1,2,1 +1,1,2,0 +1,1,2,0 +3,0,1,0 +2,1,2,0 +3,1,2,0 +3,1,2,1 +1,0,2,1 +1,0,2,1 +1,1,2,0 +3,1,2,1 +2,0,2,1 1,1,0,0 -0,3,1,2 -0,1,0,2 -0,3,1,2 -0,3,1,2 -0,3,0,1 -0,3,0,1 -0,3,0,2 -1,1,0,2 -0,1,1,0 -1,2,0,2 -1,1,1,2 -0,3,1,2 
-1,3,1,2 -1,3,1,1 -0,3,1,2 -1,1,1,2 +3,0,1,1 +1,0,2,1 +3,0,2,0 +2,1,2,0 +3,1,1,0 +3,0,2,1 +3,1,1,0 +3,1,2,0 +3,1,2,0 +3,1,2,1 +1,1,2,0 +3,1,0,0 +3,1,2,1 +3,1,2,0 +2,1,2,1 +3,0,1,1 +1,0,2,1 +1,0,0,1 +2,1,0,0 +3,0,2,0 +3,1,2,0 1,1,0,0 -0,3,1,2 -0,1,1,2 -1,2,0,2 -0,3,1,1 -1,2,0,2 -0,3,1,2 -1,1,0,2 -0,3,1,2 -0,3,1,0 +3,1,0,0 +1,0,2,0 +1,1,2,1 +1,0,0,1 +3,0,1,1 +3,1,1,1 +3,1,2,0 +2,0,1,1 +3,1,2,0 +1,1,2,1 +1,0,0,1 +1,0,0,1 +2,1,0,0 +1,0,0,1 +1,0,0,1 +1,0,0,1 +2,0,2,0 +3,1,2,0 +2,1,2,0 +3,0,2,1 +2,0,2,1 +2,1,2,0 +1,0,2,1 +1,0,0,1 +3,1,2,0 +3,1,2,0 +2,0,1,1 +2,0,2,1 +3,1,2,0 +1,0,0,1 +3,1,2,0 +2,0,2,1 +3,0,2,1 +1,0,0,1 +3,0,1,1 +1,1,2,0 +1,1,2,0 +3,1,2,0 +1,0,2,1 +3,1,2,0 +1,1,2,0 +1,0,0,1 +3,1,2,1 +1,1,2,0 +2,1,2,1 +1,0,2,1 +2,1,2,0 +2,1,2,0 +2,1,2,0 +2,0,2,1 +2,0,2,1 +3,0,2,1 +3,1,2,1 +3,1,2,0 +3,1,2,0 +1,1,2,0 +3,1,0,0 +3,1,2,0 +3,1,0,0 +3,1,2,0 +1,0,2,1 +2,0,2,0 +3,0,1,1 +3,0,1,1 +3,1,2,0 +2,1,0,0 +3,0,0,0 +3,1,2,0 +3,1,1,0 +3,1,2,0 +1,0,0,1 +3,0,0,1 +3,0,1,1 +1,0,0,1 +1,1,0,1 +3,1,2,0 +3,1,2,0 1,1,0,0 -0,3,1,0 -0,3,1,1 -1,2,0,2 -0,1,1,2 -0,3,1,2 -0,2,1,2 -1,2,0,2 -0,3,1,0 -0,3,1,0 -1,3,0,0 -0,3,0,2 -1,2,0,2 -0,1,1,2 +3,0,2,0 +1,0,0,1 +3,0,2,1 1,1,0,0 -0,3,1,2 +3,1,0,0 +3,1,2,0 +1,0,0,1 +3,0,0,1 +3,1,2,0 +1,0,2,1 +3,1,2,0 +2,1,2,0 +3,1,2,0 +2,0,2,1 +3,1,1,0 +2,0,0,1 +1,1,2,1 +3,1,2,1 +3,1,2,0 +1,0,0,1 +3,0,2,1 +3,1,2,0 +3,0,2,0 +2,1,2,0 +2,1,2,0 +2,0,2,1 +3,1,2,1 +3,1,2,0 +3,0,2,0 +3,1,2,0 +3,0,2,0 +2,1,2,0 +3,1,2,0 +2,1,2,1 +3,1,2,0 +3,0,2,0 +3,1,2,0 +3,1,1,0 +1,0,1,1 +2,1,2,0 +3,1,2,1 +3,0,2,0 +2,0,2,1 +2,0,2,1 +2,1,2,0 +3,0,2,0 +3,1,0,0 +3,1,1,0 +3,1,2,0 +3,0,2,0 +3,1,2,0 +3,1,2,0 +2,0,2,1 +2,0,2,1 +3,1,1,0 +3,1,2,1 +1,1,2,1 +3,0,2,1 +2,0,2,1 +3,1,2,0 +1,1,2,0 +1,0,2,1 +3,0,2,0 +2,0,2,1 +1,1,2,0 +2,1,2,0 +2,0,2,1 +3,1,2,0 +3,1,2,0 +2,0,2,1 +3,1,2,1 +1,1,2,1 +2,0,2,1 +1,1,2,1 +3,0,0,1 +1,1,2,1 +2,1,2,0 +3,1,2,0 1,1,0,0 -1,1,0,2 -0,3,0,2 -0,3,0,2 -1,2,1,2 -0,1,1,0 -0,1,1,2 -1,2,0,2 -1,2,1,0 -0,3,1,2 -1,2,1,2 -1,1,1,0 -0,2,1,2 -0,3,1,1 -1,3,1,0 -1,3,0,2 -0,1,1,2 +1,1,0,1 +3,1,2,0 +3,1,0,1 +1,1,2,0 +1,0,2,1 +2,0,2,1 +3,1,1,0 +1,1,2,1 +3,1,2,0 +1,1,2,0 +2,1,2,0 +3,1,2,0 +3,1,2,0 +2,1,2,0 +1,1,2,0 +3,1,1,0 +3,0,0,1 +3,1,2,0 +3,1,2,0 +2,0,2,1 +2,0,0,1 +3,0,2,0 +1,1,2,0 +2,1,2,0 +3,1,2,0 +3,1,2,0 +3,0,2,1 +3,1,2,0 +2,1,2,0 +3,1,2,0 +3,0,2,1 +1,1,0,1 +3,0,2,0 +1,0,2,1 1,1,0,0 -0,1,1,0 -1,1,0,2 -1,3,0,2 -0,3,1,1 -0,3,1,2 -0,2,1,2 -0,3,1,2 -0,3,0,2 -0,3,1,2 -0,3,1,2 -0,3,0,2 -0,3,1,0 -1,3,1,2 -1,2,1,2 -1,1,0,2 -1,1,1,2 -1,3,0,1 -0,3,1,2 -0,3,1,2 -1,2,0,2 -1,1,0,2 -0,3,0,0 -1,3,1,2 -1,2,0,2 +3,1,2,0 +3,1,2,1 +3,1,2,0 +3,1,2,0 +1,1,2,0 1,1,0,0 -0,2,1,2 -0,1,1,0 -0,3,1,0 -1,1,0,2 -0,2,1,2 -1,1,1,0 -0,3,1,2 -0,3,1,2 -0,3,1,2 +3,1,2,0 +3,1,0,0 +1,0,0,1 +3,1,2,0 +1,0,2,0 +3,1,2,0 +3,1,2,0 +3,0,1,0 +3,0,1,0 +3,0,2,0 +1,0,2,1 1,1,0,0 -0,3,1,2 -0,3,0,1 -0,2,1,2 -0,3,1,2 -1,2,0,2 -0,3,1,2 -0,3,1,0 -1,1,1,0 -1,2,0,2 -0,3,1,2 -0,1,1,2 -0,3,1,2 -1,1,1,0 -0,3,1,2 -0,3,1,2 -1,1,1,2 -1,2,0,0 -1,1,0,2 -0,3,0,2 -0,3,1,2 -1,3,0,1 -0,3,1,1 -0,3,1,2 -1,2,0,2 -0,3,1,2 -0,3,0,2 -1,2,0,2 -0,2,1,2 -0,3,1,0 -1,1,1,2 -1,3,1,0 -0,3,1,2 -0,3,1,2 -0,1,1,2 -0,2,1,1 -1,1,0,2 -0,3,1,2 -0,3,1,1 -1,1,1,2 -0,3,1,2 -1,1,1,0 -0,1,1,2 -0,3,0,2 -1,2,0,2 -0,3,1,2 -0,2,1,2 -0,3,0,2 -0,3,1,2 -0,3,1,2 +2,0,2,1 +1,1,2,1 +3,1,2,0 +3,1,2,1 +3,1,1,1 +3,1,2,0 +1,1,2,1 +1,0,0,1 +3,1,2,0 +1,1,2,0 +2,0,2,1 +3,1,1,0 +2,0,2,1 +3,1,2,0 +1,0,2,1 +3,1,2,0 +3,1,0,0 +1,0,0,1 +3,1,0,0 +3,1,1,0 +2,0,2,1 +1,1,2,0 +3,1,2,0 +2,1,2,0 +2,0,2,1 +3,1,0,0 +3,1,0,0 +3,0,0,1 +3,0,2,0 +2,0,2,1 +1,1,2,0 +1,0,0,1 +3,1,2,0 +1,0,0,1 +1,0,2,1 +3,0,2,0 +3,0,2,0 +2,1,2,1 
1,1,0,0 -0,3,0,2 -1,3,1,2 -1,3,0,0 -1,1,1,0 -0,3,1,2 -1,1,1,0 -0,3,1,2 -1,3,0,2 -0,3,1,2 -1,2,0,2 -0,3,1,2 -1,3,0,1 -0,3,0,1 -0,2,1,2 -0,3,1,2 -0,3,0,1 -0,2,1,2 -0,1,1,0 -1,1,1,2 -0,3,1,0 -0,1,1,2 -0,3,1,2 -1,3,1,2 -0,2,1,2 -0,2,1,2 -0,3,1,2 -0,3,1,2 -1,1,0,2 -1,2,0,2 -0,1,1,2 -0,2,1,2 -1,2,1,2 -0,2,1,2 -0,3,1,2 -0,3,1,2 -1,3,0,2 -0,3,0,2 -1,1,1,0 -0,3,0,1 -1,1,1,0 -0,3,1,2 -0,3,1,2 -0,2,1,2 -0,2,1,0 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,1,0,2 -1,1,1,2 -1,3,0,0 -1,3,1,2 -0,3,1,0 -0,1,1,2 -0,2,1,2 -0,3,1,2 -1,3,0,1 -0,1,1,0 -0,3,1,2 +1,1,2,0 +2,0,2,1 +2,1,0,1 +3,1,2,0 +2,1,2,1 +1,1,0,1 +2,1,2,0 +3,1,1,0 +3,1,0,1 +3,0,2,1 +1,1,2,0 +1,0,0,1 1,1,0,0 -1,1,1,2 -0,3,0,0 -0,3,1,1 -0,3,1,2 -0,2,1,2 -1,2,0,2 -1,1,1,2 -1,1,0,2 -1,3,1,0 +1,0,2,1 +3,0,2,1 +3,1,1,0 +3,1,2,0 +2,1,2,0 +3,1,2,0 +3,0,2,0 +3,1,2,0 +3,1,2,0 +3,0,2,0 +3,1,0,0 +3,1,2,1 +2,1,2,1 +1,0,2,1 +1,1,2,1 +3,0,1,1 +3,1,2,0 +3,1,2,0 +2,0,2,1 +1,0,2,1 +3,0,0,0 +3,1,2,1 +2,0,2,1 +1,0,0,1 +2,1,2,0 1,1,0,0 -0,1,1,2 -1,1,1,2 -0,3,1,2 -0,2,1,2 -0,3,1,2 +3,1,0,0 +1,0,2,1 +2,1,2,0 +1,1,0,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +1,0,0,1 +3,1,2,0 +3,0,1,0 +2,1,2,0 +3,1,2,0 +2,0,2,1 +3,1,2,0 +3,1,0,0 +1,1,0,1 +2,0,2,1 +3,1,2,0 +1,1,2,0 +3,1,2,0 +1,1,0,1 +3,1,2,0 +3,1,2,0 +1,1,2,1 +2,0,0,1 +1,0,2,1 +3,0,2,0 +3,1,2,0 +3,0,1,1 +3,1,1,0 +3,1,2,0 +2,0,2,1 +3,1,2,0 +3,0,2,0 +2,0,2,1 +2,1,2,0 +3,1,0,0 +1,1,2,1 +3,1,0,1 +3,1,2,0 +3,1,2,0 +1,1,2,0 +2,1,1,0 +1,0,2,1 +3,1,2,0 +3,1,1,0 +1,1,2,1 +3,1,2,0 +1,1,0,1 +1,1,2,0 +3,0,2,0 +2,0,2,1 +3,1,2,0 +2,1,2,0 +3,0,2,0 +3,1,2,0 +3,1,2,0 +1,0,0,1 +3,0,2,0 +3,1,2,1 +3,0,0,1 +1,1,0,1 +3,1,2,0 +1,1,0,1 +3,1,2,0 +3,0,2,1 +3,1,2,0 +2,0,2,1 +3,1,2,0 +3,0,1,1 +3,0,1,0 +2,1,2,0 +3,1,2,0 +3,0,1,0 +2,1,2,0 1,1,0,0 -1,2,0,2 -0,3,1,1 -0,3,1,2 -1,2,0,2 -0,3,1,2 -0,2,1,2 -0,2,1,2 -1,1,1,2 -0,3,1,2 -1,2,0,2 -1,3,0,1 -0,2,1,2 -0,3,0,2 -1,1,0,2 -0,3,1,0 -0,2,1,2 -0,2,1,2 -0,2,1,2 -0,3,1,2 -0,3,0,2 -1,1,1,0 -0,3,1,2 -0,3,1,2 -1,1,1,2 -0,1,1,2 +1,1,2,1 +3,1,0,0 +1,1,2,0 +3,1,2,0 +3,1,2,1 +2,1,2,0 +2,1,2,0 +3,1,2,0 +3,1,2,0 +1,0,2,1 +2,0,2,1 +1,1,2,0 +2,1,2,0 +2,1,2,1 +2,1,2,0 +3,1,2,0 +3,1,2,0 +3,0,2,1 +3,0,2,0 +1,1,0,1 +3,0,1,0 +1,1,0,1 +3,1,2,0 +3,1,2,0 +2,1,2,0 +2,1,0,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +1,0,2,1 +1,1,2,1 +3,0,0,1 +3,1,2,1 +3,1,0,0 +1,1,2,0 +2,1,2,0 +3,1,2,0 +3,0,1,1 1,1,0,0 -0,3,1,2 -1,3,1,2 -0,1,1,2 -0,3,1,2 -1,2,0,2 -0,1,1,2 -0,3,1,1 -1,2,0,2 -1,3,1,2 -0,3,1,2 -0,3,1,2 -1,2,0,2 -1,2,1,2 -0,3,1,2 -0,2,1,2 -0,3,1,2 -1,1,0,2 -0,3,1,2 -0,3,1,2 -1,3,1,0 -1,1,0,2 -0,3,1,2 -1,1,0,2 -0,1,1,0 -0,3,0,1 -0,3,1,1 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,2,0,2 -0,3,1,0 -1,2,0,2 -0,3,1,2 -0,3,1,1 -1,3,0,2 -0,3,1,1 -1,1,0,2 -1,3,0,0 -1,1,0,2 -0,1,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,3,0,2 -0,3,1,1 -1,3,1,2 -0,1,1,0 -0,3,1,1 -0,2,1,2 -0,3,0,2 -0,1,1,0 -0,3,1,2 -0,2,1,2 -1,1,0,2 -1,3,0,2 -0,3,1,0 -0,3,0,2 -0,2,1,2 -1,2,0,2 -1,1,1,2 -1,3,1,0 -1,3,1,2 -0,3,1,2 -0,1,1,2 -0,3,0,2 -0,2,1,2 -1,1,0,2 -0,3,1,2 -0,3,1,2 -0,2,1,2 -0,3,0,2 -0,3,1,2 -0,1,1,2 -0,3,0,2 -0,2,1,0 -0,3,1,2 -0,3,1,2 -1,1,0,2 -1,3,1,2 -0,1,1,2 -1,3,0,2 -0,3,1,2 -0,3,1,1 -0,3,1,2 -1,2,1,0 -1,3,1,1 -1,3,0,0 -1,2,1,2 -0,3,1,0 -0,3,1,2 -0,3,1,2 +3,1,2,0 +1,0,0,1 +1,1,2,1 +3,0,0,0 +3,1,1,0 +3,1,2,0 +2,1,2,0 +2,0,2,1 +1,1,2,1 +1,0,2,1 +3,1,0,1 +1,0,0,1 +1,1,2,0 +1,1,2,1 +3,1,2,0 +2,1,2,0 +3,1,2,0 +1,0,0,1 +2,0,2,1 +3,1,1,0 +3,1,2,0 +2,0,2,1 +3,1,2,0 +2,1,2,0 +2,1,2,0 +1,1,2,1 +3,1,2,0 +2,0,2,1 +3,0,1,1 +2,1,2,0 +3,0,2,0 +1,0,2,1 +3,1,0,0 +2,1,2,0 +2,1,2,0 +2,1,2,0 +3,1,2,0 +3,0,2,0 +1,1,0,1 +3,1,2,0 +3,1,2,0 +1,1,2,1 +1,1,2,0 +1,0,0,1 +3,1,2,0 +3,1,2,1 +1,1,2,0 +3,1,2,0 +2,0,2,1 +1,1,2,0 +3,1,1,0 +2,0,2,1 
+3,1,2,1 +3,1,2,0 +3,1,2,0 +2,0,2,1 +2,1,2,1 +3,1,2,0 +2,1,2,0 +3,1,2,0 +1,0,2,1 +3,1,2,0 +3,1,2,0 +3,1,0,1 +1,0,2,1 +3,1,2,0 +1,0,2,1 1,1,0,0 -0,3,1,2 -0,3,1,2 -1,3,1,2 -1,1,1,0 -0,3,1,2 -0,2,1,2 +3,0,1,0 +3,1,1,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +2,0,2,0 +3,1,0,0 +2,0,2,1 +3,1,2,0 +3,1,1,0 +3,0,2,1 +3,1,1,0 +1,0,2,1 +3,0,0,1 +1,0,2,1 +1,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +3,0,2,1 +3,1,1,0 +3,1,2,1 1,1,0,0 -0,3,1,0 -0,3,1,2 -0,3,1,2 -0,3,1,2 -0,3,1,0 -0,2,1,2 +3,1,1,0 +2,1,2,0 +3,0,2,0 1,1,0,0 -0,3,1,2 -0,3,1,2 -0,3,0,0 -1,1,0,2 -0,2,0,2 -1,3,0,2 -1,1,0,2 -1,1,1,2 -1,3,0,0 -0,3,1,0 -0,3,1,2 -0,2,1,2 -1,1,0,2 -0,3,0,2 -0,2,1,2 -1,2,0,2 -1,2,0,0 -0,1,1,2 -0,3,1,2 -1,3,1,2 -0,3,1,2 -1,1,0,2 -0,1,1,2 -0,3,1,2 -1,2,0,0 -1,3,0,0 -0,3,1,2 -0,3,1,2 -0,3,1,2 -1,1,0,0 -1,2,0,2 -0,3,1,2 -0,3,0,2 -0,2,1,2 -0,3,1,2 -0,3,0,1 -0,2,1,2 -1,1,0,2 -0,3,0,2 -1,1,1,0 -0,3,1,1 \ No newline at end of file +3,1,2,0 +2,1,2,0 +1,0,2,1 +3,0,2,1 +3,1,0,0 +3,0,2,0 +2,1,2,0 +2,0,2,1 +1,1,2,1 +3,1,0,1 +3,1,2,1 +3,1,2,0 +1,1,2,0 +3,0,2,0 +2,1,2,0 +1,0,2,1 +3,1,2,0 +3,1,2,0 +2,1,2,0 +3,0,2,0 +3,1,2,0 +1,1,2,0 +3,0,2,0 +2,1,0,0 +3,1,2,0 +3,1,2,0 +1,0,2,1 +3,1,2,1 +1,1,2,0 +3,0,2,1 +3,1,2,0 +3,1,1,0 +3,1,2,0 +2,1,0,1 +3,1,1,1 +3,0,0,1 +2,1,2,1 +3,1,0,0 +3,1,2,0 +3,1,2,0 +1,0,0,1 +3,1,2,0 +3,1,2,0 +3,1,2,1 +1,1,0,1 +3,1,2,0 +2,1,2,0 +1,0,0,1 +3,1,0,0 +3,1,2,0 +3,1,2,0 +3,1,2,0 +3,1,0,0 +2,1,2,0 +1,0,0,1 +3,1,2,0 +3,1,2,0 +3,0,0,0 +1,0,2,1 +2,0,2,0 +3,0,2,1 +1,0,2,1 +1,1,2,1 +3,0,0,1 +3,1,0,0 +3,1,2,0 +2,1,2,0 +1,0,2,1 +3,0,2,0 +2,1,2,0 +2,0,2,1 +2,0,0,1 +1,1,2,0 +3,1,2,0 +3,1,2,1 +3,1,2,0 +1,0,2,1 +1,1,2,0 +3,1,2,0 +2,0,0,1 +3,0,0,1 +3,1,2,0 +3,1,2,0 +3,1,2,0 +1,0,0,1 +2,0,2,1 +3,1,2,0 +3,0,2,0 +2,1,2,0 +3,1,2,0 +3,0,1,0 +2,1,2,0 +1,0,2,1 +3,0,2,0 +1,1,0,1 +3,1,1,0 \ No newline at end of file diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index 7f7575d..dc54d19 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -13,9 +13,9 @@ import ( // The "c" prefix to function names indicates that they were tailored for classification // CNode is Node struct for Decision Tree Classifier -type CNode struct { - Left *CNode - Right *CNode +type classifierNode struct { + Left *classifierNode + Right *classifierNode Threshold float64 Feature int64 LeftLabel int64 @@ -26,7 +26,7 @@ type CNode struct { // CARTDecisionTreeClassifier: Tree struct for Decision Tree Classifier type CARTDecisionTreeClassifier struct { - RootNode *CNode + RootNode *classifierNode criterion string maxDepth int64 labels []int64 @@ -85,7 +85,7 @@ func entropy(y []int64, labels []int64) (float64, int64) { } // Split the data into left node and right node based on feature and threshold - only needed for fresh nodes -func ctestSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) { +func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) { var left [][]float64 var right [][]float64 var lefty []int64 @@ -106,7 +106,7 @@ func ctestSplit(data [][]float64, feature int64, y []int64, threshold float64) ( } // Helper Function to check if data point is unique or not -func cstringInSlice(a float64, list []float64) bool { +func classifierStringInSlice(a float64, list []float64) bool { for _, b := range list { if b == a { return true @@ -116,10 +116,10 @@ func cstringInSlice(a float64, list []float64) bool { } // Isolate only unique values. Needed for splitting data. 
-func cfindUnique(data []float64) []float64 { +func classifierFindUnique(data []float64) []float64 { var unique []float64 for i := range data { - if !cstringInSlice(data[i], unique) { + if !classifierStringInSlice(data[i], unique) { unique = append(unique, data[i]) } } @@ -127,7 +127,7 @@ func cfindUnique(data []float64) []float64 { } // Isolate only the feature being considered for splitting -func cgetFeature(data [][]float64, feature int64) []float64 { +func classifierGetFeature(data [][]float64, feature int64) []float64 { var featureVals []float64 for i := range data { featureVals = append(featureVals, data[i][feature]) @@ -146,7 +146,7 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) } // Make sure that split being considered has not been done before -func cvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { +func classifierValidate(triedSplits [][]float64, feature int64, threshold float64) bool { for i := range triedSplits { split := triedSplits[i] featureTried, thresholdTried := split[0], split[1] @@ -158,7 +158,7 @@ func cvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { } // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting -func creOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) { +func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) { s := NewSlice(featureVal) sort.Sort(s) @@ -176,7 +176,7 @@ func creOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float6 } // Change data in Left Node and Right Node based on change in threshold -func cupdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { +func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { for right[0][feature] < threshold { left = append(left, right[0]) @@ -190,17 +190,17 @@ func cupdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []i // Fit - Method visible to user to train tree func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) { - var emptyNode CNode + var emptyNode classifierNode data := classifierConvertInstancesToProblemVec(X) y := classifierConvertInstancesToLabelVec(X) - emptyNode = cbestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) + emptyNode = classifierBestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) tree.RootNode = &emptyNode } // Iterativly find and record the best split - recursive function -func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode CNode, criterion string, maxDepth int64, depth int64) CNode { +func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode classifierNode, criterion string, maxDepth int64, depth int64) classifierNode { // Ensure that we have not reached maxDepth. 
maxDepth =-1 means split until nodes are pure depth++ @@ -236,16 +236,16 @@ func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, la upperNode.Use_not = true - var leftN CNode - var rightN CNode + var leftN classifierNode + var rightN classifierNode // Iterate over all features for i := 0; i < numFeatures; i++ { - featureVal := cgetFeature(data, int64(i)) - unique := cfindUnique(featureVal) + featureVal := classifierGetFeature(data, int64(i)) + unique := classifierFindUnique(featureVal) sort.Float64s(unique) numUnique := len(unique) - sortData, sortY := creOrderData(featureVal, data, y) + sortData, sortY := classifierReOrderData(featureVal, data, y) firstTime := true @@ -256,14 +256,14 @@ func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, la if j != (numUnique - 1) { threshold := (unique[j] + unique[j+1]) / 2 // Ensure that same split has not been made before - if cvalidate(tree.triedSplits, int64(i), threshold) { + if classifierValidate(tree.triedSplits, int64(i), threshold) { // We need to split data from fresh when considering new feature for the first time. // Otherwise, we need to update the split by moving data points from left to right. if firstTime { - left, right, lefty, righty = ctestSplit(sortData, int64(i), sortY, threshold) + left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold) firstTime = false } else { - left, lefty, right, righty = cupdateSplit(left, lefty, right, righty, int64(i), threshold) + left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold) } var leftGini float64 @@ -314,7 +314,7 @@ func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, la if bestLeftGini > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) // Recursive splitting logic - leftN = cbestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) + leftN = classifierBestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) if leftN.Use_not == true { upperNode.Left = &leftN } @@ -324,7 +324,7 @@ func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, la if bestRightGini > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) // Recursive splitting logic - rightN = cbestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth) + rightN = classifierBestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth) if rightN.Use_not == true { upperNode.Right = &rightN } @@ -339,10 +339,10 @@ func cbestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, la // PrintTree : this function prints out entire tree for visualization - visible to user func (tree *CARTDecisionTreeClassifier) String() string { rootNode := *tree.RootNode - return cprintTreeFromNode(rootNode, "") + return classifierPrintTreeFromNode(rootNode, "") } -func cprintTreeFromNode(tree CNode, spacing string) string { +func classifierPrintTreeFromNode(tree classifierNode, spacing string) string { returnString := "" returnString += spacing + "Feature " returnString += strconv.FormatInt(tree.Feature, 10) @@ -365,30 +365,30 @@ func cprintTreeFromNode(tree CNode, spacing string) string { if tree.Left != nil { returnString += spacing + "---> True" + "\n" - returnString += cprintTreeFromNode(*tree.Left, spacing+" ") + returnString += classifierPrintTreeFromNode(*tree.Left, 
spacing+" ") } if tree.Right != nil { returnString += spacing + "---> False" + "\n" - returnString += cprintTreeFromNode(*tree.Right, spacing+" ") + returnString += classifierPrintTreeFromNode(*tree.Right, spacing+" ") } return returnString } // Predict a single data point by traversing the entire tree -func cpredictSingle(tree CNode, instance []float64) int64 { +func classifierPredictSingle(tree classifierNode, instance []float64) int64 { if instance[tree.Feature] < tree.Threshold { if tree.Left == nil { return tree.LeftLabel } else { - return cpredictSingle(*tree.Left, instance) + return classifierPredictSingle(*tree.Left, instance) } } else { if tree.Right == nil { return tree.RightLabel } else { - return cpredictSingle(*tree.Right, instance) + return classifierPredictSingle(*tree.Right, instance) } } } @@ -397,14 +397,14 @@ func cpredictSingle(tree CNode, instance []float64) int64 { func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 { root := *tree.RootNode test := classifierConvertInstancesToProblemVec(X_test) - return cpredictFromNode(root, test) + return classifierPredictFromNode(root, test) } // This function uses the rootnode from Predict. It is invisible to user, but called from predict method. -func cpredictFromNode(tree CNode, test [][]float64) []int64 { +func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 { var preds []int64 for i := range test { - iPred := cpredictSingle(tree, test[i]) + iPred := classifierPredictSingle(tree, test[i]) preds = append(preds, iPred) } return preds @@ -415,11 +415,11 @@ func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float6 rootNode := *tree.RootNode xTest := classifierConvertInstancesToProblemVec(test) yTest := classifierConvertInstancesToLabelVec(test) - return cevaluateFromNode(rootNode, xTest, yTest) + return classifierEvaluateFromNode(rootNode, xTest, yTest) } -func cevaluateFromNode(tree CNode, xTest [][]float64, yTest []int64) float64 { - preds := cpredictFromNode(tree, xTest) +func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest []int64) float64 { + preds := classifierPredictFromNode(tree, xTest) accuracy := 0.0 for i := range preds { if preds[i] == yTest[i] { diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go index 6841a71..d894db5 100644 --- a/trees/cart_regressor.go +++ b/trees/cart_regressor.go @@ -13,9 +13,9 @@ import ( // The "r" prefix to all function names indicates that they were tailored to support regression. 
// RNode - Node struct for Decision Tree Regressor -type RNode struct { - Left *RNode - Right *RNode +type regressorNode struct { + Left *regressorNode + Right *regressorNode Threshold float64 Feature int64 LeftPred float64 @@ -25,22 +25,12 @@ type RNode struct { // CARTDecisionTreeRegressor - Tree struct for Decision Tree Regressor type CARTDecisionTreeRegressor struct { - RootNode *RNode + RootNode *regressorNode criterion string maxDepth int64 triedSplits [][]float64 } -// Calculate Mean Absolute Error for a constant prediction -func meanAbsoluteError(y []float64, yBar float64) float64 { - error := 0.0 - for _, target := range y { - error += math.Abs(target - yBar) - } - error /= float64(len(y)) - return error -} - // Find average func average(y []float64) float64 { mean := 0.0 @@ -51,6 +41,16 @@ func average(y []float64) float64 { return mean } +// Calculate Mean Absolute Error for a constant prediction +func meanAbsoluteError(y []float64, yBar float64) float64 { + error := 0.0 + for _, target := range y { + error += math.Abs(target - yBar) + } + error /= float64(len(y)) + return error +} + // Turn Mean Absolute Error into impurity function for decision trees. func maeImpurity(y []float64) (float64, float64) { yHat := average(y) @@ -75,7 +75,7 @@ func mseImpurity(y []float64) (float64, float64) { } // Split the data based on threshold and feature for testing information gain -func rtestSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) { +func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) { var left [][]float64 var lefty []float64 var right [][]float64 @@ -96,7 +96,7 @@ func rtestSplit(data [][]float64, feature int64, y []float64, threshold float64) } // Helper function for finding unique values -func rstringInSlice(a float64, list []float64) bool { +func regressorStringInSlice(a float64, list []float64) bool { for _, b := range list { if b == a { return true @@ -106,10 +106,10 @@ func rstringInSlice(a float64, list []float64) bool { } // Return only unique values of a feature -func rfindUnique(data []float64) []float64 { +func regressorFindUnique(data []float64) []float64 { var unique []float64 for i := range data { - if !rstringInSlice(data[i], unique) { + if !regressorStringInSlice(data[i], unique) { unique = append(unique, data[i]) } } @@ -117,7 +117,7 @@ func rfindUnique(data []float64) []float64 { } // Extract out a single feature from data -func rgetFeature(data [][]float64, feature int64) []float64 { +func regressorGetFeature(data [][]float64, feature int64) []float64 { var featureVals []float64 for i := range data { featureVals = append(featureVals, data[i][feature]) @@ -134,7 +134,7 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTre } // Validate that the split being tested has not been done before. 
-func rvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { +func regressorValidate(triedSplits [][]float64, feature int64, threshold float64) bool { for i := range triedSplits { split := triedSplits[i] featureTried, thresholdTried := split[0], split[1] @@ -146,7 +146,7 @@ func rvalidate(triedSplits [][]float64, feature int64, threshold float64) bool { } // Re order data based on a feature for optimizing code -func rreOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) { +func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) { s := NewSlice(featureVal) sort.Sort(s) @@ -164,7 +164,7 @@ func rreOrderData(featureVal []float64, data [][]float64, y []float64) ([][]floa } // Update the left and right data based on change in threshold -func rupdateSplit(left [][]float64, lefty []float64, right [][]float64, righty []float64, feature int64, threshold float64) ([][]float64, []float64, [][]float64, []float64) { +func regressorUpdateSplit(left [][]float64, lefty []float64, right [][]float64, righty []float64, feature int64, threshold float64) ([][]float64, []float64, [][]float64, []float64) { for right[0][feature] < threshold { left = append(left, right[0]) @@ -178,17 +178,17 @@ func rupdateSplit(left [][]float64, lefty []float64, right [][]float64, righty [ // Extra Method for creating simple to use interface. Many params are either redundant for user but are needed only for recursive logic. func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) { - var emptyNode RNode + var emptyNode regressorNode data := regressorConvertInstancesToProblemVec(X) y := regressorConvertInstancesToLabelVec(X) - emptyNode = rbestSplit(*tree, data, y, emptyNode, tree.criterion, tree.maxDepth, 0) + emptyNode = regressorBestSplit(*tree, data, y, emptyNode, tree.criterion, tree.maxDepth, 0) tree.RootNode = &emptyNode } // Essentially the Fit Method - Impelements recursive logic -func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode RNode, criterion string, maxDepth int64, depth int64) RNode { +func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) regressorNode { depth++ @@ -220,16 +220,16 @@ func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, u upperNode.Use_not = true - var leftN RNode - var rightN RNode + var leftN regressorNode + var rightN regressorNode // Iterate over all features for i := 0; i < numFeatures; i++ { - featureVal := rgetFeature(data, int64(i)) - unique := rfindUnique(featureVal) + featureVal := regressorGetFeature(data, int64(i)) + unique := regressorFindUnique(featureVal) sort.Float64s(unique) numUnique := len(unique) - sortData, sortY := rreOrderData(featureVal, data, y) + sortData, sortY := regressorReOrderData(featureVal, data, y) firstTime := true @@ -239,12 +239,12 @@ func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, u for j := range unique { if j != (numUnique - 1) { threshold := (unique[j] + unique[j+1]) / 2 - if rvalidate(tree.triedSplits, int64(i), threshold) { + if regressorValidate(tree.triedSplits, int64(i), threshold) { if firstTime { - left, right, lefty, righty = rtestSplit(sortData, int64(i), sortY, threshold) + left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold) firstTime = false } else { - left, lefty, right, righty = rupdateSplit(left, lefty, 
right, righty, int64(i), threshold) + left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold) } var leftLoss float64 @@ -292,7 +292,7 @@ func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, u if bestLeftLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) - leftN = rbestSplit(tree, bestLeft, bestLefty, leftN, criterion, maxDepth, depth) + leftN = regressorBestSplit(tree, bestLeft, bestLefty, leftN, criterion, maxDepth, depth) if leftN.Use_not == true { upperNode.Left = &leftN } @@ -300,7 +300,7 @@ func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, u } if bestRightLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) - rightN = rbestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth) + rightN = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth) if rightN.Use_not == true { upperNode.Right = &rightN } @@ -315,10 +315,10 @@ func rbestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, u // Print Tree for Visualtion - calls printTreeFromNode() func (tree *CARTDecisionTreeRegressor) String() string { rootNode := *tree.RootNode - return rprintTreeFromNode(rootNode, "") + return regressorPrintTreeFromNode(rootNode, "") } -func rprintTreeFromNode(tree RNode, spacing string) string { +func regressorPrintTreeFromNode(tree regressorNode, spacing string) string { returnString := "" returnString += spacing + "Feature " returnString += strconv.FormatInt(tree.Feature, 10) @@ -341,31 +341,31 @@ func rprintTreeFromNode(tree RNode, spacing string) string { if tree.Left != nil { // fmt.Println(spacing + "---> True") returnString += spacing + "---> True" + "\n" - returnString += rprintTreeFromNode(*tree.Left, spacing+" ") + returnString += regressorPrintTreeFromNode(*tree.Left, spacing+" ") } if tree.Right != nil { // fmt.Println(spacing + "---> False") returnString += spacing + "---> False" + "\n" - returnString += rprintTreeFromNode(*tree.Right, spacing+" ") + returnString += regressorPrintTreeFromNode(*tree.Right, spacing+" ") } return returnString } // Predict a single data point -func rpredictSingle(tree RNode, instance []float64) float64 { +func regressorPredictSingle(tree regressorNode, instance []float64) float64 { if instance[tree.Feature] < tree.Threshold { if tree.Left == nil { return tree.LeftPred } else { - return rpredictSingle(*tree.Left, instance) + return regressorPredictSingle(*tree.Left, instance) } } else { if tree.Right == nil { return tree.RightPred } else { - return rpredictSingle(*tree.Right, instance) + return regressorPredictSingle(*tree.Right, instance) } } } @@ -374,14 +374,14 @@ func rpredictSingle(tree RNode, instance []float64) float64 { func (tree *CARTDecisionTreeRegressor) Predict(X_test base.FixedDataGrid) []float64 { root := *tree.RootNode test := regressorConvertInstancesToProblemVec(X_test) - return rpredictFromNode(root, test) + return regressorPredictFromNode(root, test) } // Use tree's root node to print out entire tree -func rpredictFromNode(tree RNode, test [][]float64) []float64 { +func regressorPredictFromNode(tree regressorNode, test [][]float64) []float64 { var preds []float64 for i := range test { - i_pred := rpredictSingle(tree, test[i]) + i_pred := regressorPredictSingle(tree, test[i]) preds = append(preds, i_pred) } return preds From 91a27e3ca0f4d93f6331b647c1b5aeaef50a59e1 
Mon Sep 17 00:00:00 2001 From: Ayush Date: Mon, 27 Jul 2020 15:03:12 +0530 Subject: [PATCH 09/24] Fixing Comments --- examples/trees/cart.go | 1 - trees/cart_classifier.go | 53 ++++++++++++++++++++++++++-------------- trees/cart_regressor.go | 52 +++++++++++++++++++++++++++------------ 3 files changed, 71 insertions(+), 35 deletions(-) diff --git a/examples/trees/cart.go b/examples/trees/cart.go index 931b7e5..833e287 100644 --- a/examples/trees/cart.go +++ b/examples/trees/cart.go @@ -36,7 +36,6 @@ func main() { fmt.Println(decTree.Evaluate(testData)) // Load House Price Data For Regression - regressionData, err := base.ParseCSVToInstances("../datasets/boston_house_prices.csv", false) if err != nil { panic(err) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index dc54d19..9ae3b8e 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -10,9 +10,13 @@ import ( "github.com/sjwhitworth/golearn/base" ) -// The "c" prefix to function names indicates that they were tailored for classification +const ( + GINI string = "gini" + ENTROPY string = "entropy" +) -// CNode is Node struct for Decision Tree Classifier +// CNode is Node struct for Decision Tree Classifier. +// It holds the information for each split (which feature to use, what threshold, and which label to assign for each side of the split) type classifierNode struct { Left *classifierNode Right *classifierNode @@ -25,6 +29,8 @@ type classifierNode struct { } // CARTDecisionTreeClassifier: Tree struct for Decision Tree Classifier +// It contains the rootNode, as well as all of the hyperparameters chosen by the user. +// It also keeps track of all splits done at the tree level. type CARTDecisionTreeClassifier struct { RootNode *classifierNode criterion string @@ -84,7 +90,7 @@ func entropy(y []int64, labels []int64) (float64, int64) { return entropy, maxLabel } -// Split the data into left node and right node based on feature and threshold - only needed for fresh nodes +// Split the data into left node and right node based on feature and threshold func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) { var left [][]float64 var right [][]float64 @@ -105,7 +111,8 @@ func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold return left, right, lefty, righty } -// Helper Function to check if data point is unique or not +// Helper Function to check if data point is unique or not. +// We will use this to isolate unique values of a feature func classifierStringInSlice(a float64, list []float64) bool { for _, b := range list { if b == a { @@ -115,7 +122,7 @@ func classifierStringInSlice(a float64, list []float64) bool { return false } -// Isolate only unique values. Needed for splitting data. +// Isolate only unique values. This way, we can try only unique splits and not redundant ones. func classifierFindUnique(data []float64) []float64 { var unique []float64 for i := range data { @@ -126,7 +133,7 @@ func classifierFindUnique(data []float64) []float64 { return unique } -// Isolate only the feature being considered for splitting +// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits. 
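+// For example, extracting feature 1 from the rows {1, 3, 6} and {1, 2, 3}
+// yields the column {3, 2}.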
 func classifierGetFeature(data [][]float64, feature int64) []float64 {
 	var featureVals []float64
 	for i := range data {
@@ -135,7 +142,8 @@ func classifierGetFeature(data [][]float64, feature int64) []float64 {
 	return featureVals
 }
 
-// Function to Create New Decision Tree Classifier
+// Function to Create New Decision Tree Classifier.
+// It assigns all of the hyperparameters chosen by the user into the tree attributes.
 func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CARTDecisionTreeClassifier {
 	var tree CARTDecisionTreeClassifier
 	tree.criterion = strings.ToLower(criterion)
@@ -145,7 +153,8 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64)
 	return &tree
 }
 
-// Make sure that split being considered has not been done before
+// Make sure that split being considered has not been done before.
+// Else we will unnecessarily try splits that won't improve Impurity.
 func classifierValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	for i := range triedSplits {
 		split := triedSplits[i]
@@ -175,7 +184,7 @@ func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([
 	return dataSorted, ySorted
 }
 
-// Change data in Left Node and Right Node based on change in threshold
+// Update the left and right side of the split based on the threshold.
 func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) {
 
 	for right[0][feature] < threshold {
@@ -188,7 +197,8 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r
 	return left, lefty, right, righty
 }
 
-// Fit - Method visible to user to train tree
+// Fit - Creates an Empty Root Node
+// Trains the tree by calling recursive function classifierBestSplit
 func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	var emptyNode classifierNode
 
@@ -199,7 +209,8 @@ func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	tree.RootNode = &emptyNode
 }
 
-// Iterativly find and record the best split - recursive function
+// Iteratively find and record the best split
+// Stop if depth reaches maxDepth or nodes are pure
 func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode classifierNode, criterion string, maxDepth int64, depth int64) classifierNode {
 	// Ensure that we have not reached maxDepth. maxDepth =-1 means split until nodes are pure
@@ -214,9 +225,9 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	var origGini float64
 	// Calculate loss based on Criterion Specified by user
-	if criterion == "gini" {
+	if criterion == GINI {
 		origGini, upperNode.LeftLabel = giniImpurity(y, labels)
-	} else if criterion == "entropy" {
+	} else if criterion == ENTROPY {
 		origGini, upperNode.LeftLabel = entropy(y, labels)
 	} else {
 		panic("Invalid impurity function, choose from GINI or ENTROPY")
@@ -271,10 +282,10 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	var leftLabels int64
 	var rightLabels int64
 
-	if criterion == "gini" {
+	if criterion == GINI {
 		leftGini, leftLabels = giniImpurity(lefty, labels)
 		rightGini, rightLabels = giniImpurity(righty, labels)
-	} else if criterion == "entropy" {
+	} else if criterion == ENTROPY {
 		leftGini, leftLabels = entropy(lefty, labels)
 		rightGini, rightLabels = entropy(righty, labels)
 	}
@@ -336,7 +347,8 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	return upperNode
 }
 
-// PrintTree : this function prints out entire tree for visualization - visible to user
+// String : this function prints out entire tree for visualization.
+// Calls a recursive function to print the tree - classifierPrintTreeFromNode
 func (tree *CARTDecisionTreeClassifier) String() string {
 	rootNode := *tree.RootNode
 	return classifierPrintTreeFromNode(rootNode, "")
@@ -377,6 +389,7 @@ func classifierPrintTreeFromNode(tree classifierNode, spacing string) string {
 }
 
 // Predict a single data point by traversing the entire tree
+// Uses recursive logic to navigate the tree.
 func classifierPredictSingle(tree classifierNode, instance []float64) int64 {
 	if instance[tree.Feature] < tree.Threshold {
 		if tree.Left == nil {
@@ -393,14 +406,15 @@ func classifierPredictSingle(tree classifierNode, instance []float64) int64 {
 	}
 }
 
-// Predict is visible to user. Given test data, they receive predictions for every datapoint.
+// Given test data, return predictions for every datapoint. calls classifierPredictFromNode
 func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 {
 	root := *tree.RootNode
 	test := classifierConvertInstancesToProblemVec(X_test)
 	return classifierPredictFromNode(root, test)
 }
 
-// This function uses the rootnode from Predict. It is invisible to user, but called from predict method.
+// This function uses the rootnode from Predict.
+// It iterates through every data point and calls the recursive function to give predictions and then summarizes them.
 func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 {
 	var preds []int64
 	for i := range test {
@@ -411,6 +425,8 @@ func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 {
 }
 
 // Given Test data and label, return the accuracy of the classifier.
+// First it retrieves predictions from the data, then compares for accuracy.
+// Calls classifierEvaluateFromNode
 func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
 	xTest := classifierConvertInstancesToProblemVec(test)
 	yTest := classifierConvertInstancesToLabelVec(test)
 	return classifierEvaluateFromNode(rootNode, xTest, yTest)
 }
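At this point the classifier's public surface is Fit, Predict, Evaluate and String. A usage sketch in the spirit of examples/trees/cart.go; the CSV path and base.InstancesTrainTestSplit are taken from golearn's stock examples and are assumptions here, not something this patch adds:

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	// Titanic data with binary labels, as in examples/trees/cart.go.
	data, err := base.ParseCSVToInstances("../datasets/titanic.csv", false)
	if err != nil {
		panic(err)
	}
	train, test := base.InstancesTrainTestSplit(data, 0.8)

	tree := trees.NewDecisionTreeClassifier("gini", -1, []int64{0, 1})
	tree.Fit(train)

	fmt.Println(tree)                // String() renders the learned splits
	fmt.Println(tree.Predict(test))  // one int64 label per test row
	fmt.Println(tree.Evaluate(test)) // accuracy on the held-out rows
}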
@@ -418,6 +434,7 @@ func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float6
+// Retrieve predictions and then calculate accuracy.
 func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest []int64) float64 {
 	preds := classifierPredictFromNode(tree, xTest)
 	accuracy := 0.0
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index 6841a71..d894db5 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -10,9 +10,14 @@ import (
 	"github.com/sjwhitworth/golearn/base"
 )
 
-// The "r" prefix to all function names indicates that they were tailored to support regression.
+const (
+	MAE string = "mae"
+	MSE string = "mse"
+)
 
 // RNode - Node struct for Decision Tree Regressor
+// It holds the information for each split
+// Which feature to use, threshold, left prediction and right prediction
 type regressorNode struct {
 	Left      *regressorNode
 	Right     *regressorNode
 	Threshold float64
 	Feature   int64
 	LeftPred  float64
@@ -24,6 +29,8 @@ type regressorNode struct {
 
 // CARTDecisionTreeRegressor - Tree struct for Decision Tree Regressor
+// It contains the rootNode, as well as the hyperparameters chosen by user.
+// Also keeps track of splits used at tree level.
 type CARTDecisionTreeRegressor struct {
 	RootNode    *regressorNode
 	criterion   string
@@ -74,7 +81,7 @@ func mseImpurity(y []float64) (float64, float64) {
 	return meanSquaredError(y, yHat), yHat
 }
 
-// Split the data based on threshold and feature for testing information gain
+// Split the data into left and right based on threshold and feature.
 func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) {
 	var left [][]float64
 	var lefty []float64
 	var right [][]float64
@@ -105,7 +113,8 @@ func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshol
 	return left, right, lefty, righty
 }
 
-// Helper function for finding unique values
+// Helper function for finding unique values.
+// Used for isolating unique values in a feature.
 func regressorStringInSlice(a float64, list []float64) bool {
 	for _, b := range list {
 		if b == a {
 			return true
@@ -116,7 +125,8 @@ func regressorStringInSlice(a float64, list []float64) bool {
 	return false
 }
 
-// Return only unique values of a feature
+// Isolate only unique values.
+// This way we can only try unique splits.
 func regressorFindUnique(data []float64) []float64 {
 	var unique []float64
 	for i := range data {
@@ -125,7 +135,8 @@ func regressorFindUnique(data []float64) []float64 {
 	return unique
 }
 
-// Extract out a single feature from data
+// Extract out a single feature from data.
+// Reduces complexity in managing splits and sorting
 func regressorGetFeature(data [][]float64, feature int64) []float64 {
 	var featureVals []float64
 	for i := range data {
@@ -134,7 +144,7 @@ func regressorGetFeature(data [][]float64, feature int64) []float64 {
 	return featureVals
 }
 
-// Interface for creating new Decision Tree Regressor - cals rbestSplit()
+// Interface for creating new Decision Tree Regressor
 func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTreeRegressor {
 	var tree CARTDecisionTreeRegressor
 	tree.maxDepth = maxDepth
@@ -143,6 +153,7 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTre
 }
 
 // Validate that the split being tested has not been done before.
+// This prevents redundant splits from happening.
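+// Each entry of triedSplits is a (feature index, threshold) pair, and a
+// candidate split is rejected only when both components match exactly.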
 func regressorValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	for i := range triedSplits {
 		split := triedSplits[i]
@@ -146,6 +157,7 @@ func regressorValidate(triedSplits [][]float64, feature int64, threshold float64
 }
 
 // Re order data based on a feature for optimizing code
+// Helps in updating splits without reiterating entire dataset
 func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) {
 	s := NewSlice(featureVal)
 	sort.Sort(s)
@@ -176,7 +188,8 @@ func regressorUpdateSplit(left [][]float64, lefty []float64, right [][]float64,
 	return left, lefty, right, righty
 }
 
-// Extra Method for creating simple to use interface. Many params are either redundant for user but are needed only for recursive logic.
+// Fit - Build the tree using the data
+// Creates empty root node and builds tree by calling regressorBestSplit
 func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 	var emptyNode regressorNode
 	data := regressorConvertInstancesToProblemVec(X)
@@ -187,7 +200,8 @@ func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 	tree.RootNode = &emptyNode
 }
 
-// Essentially the Fit Method - Impelements recursive logic
+// Builds the tree by iteratively finding the best split.
+// Recursive function - stops if maxDepth is reached or nodes are pure
 func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) regressorNode {
 
 	depth++
@@ -200,10 +214,12 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	var bestLoss float64
 	var origLoss float64
 
-	if criterion == "mae" {
+	if criterion == MAE {
 		origLoss, upperNode.LeftPred = maeImpurity(y)
-	} else {
+	} else if criterion == MSE {
 		origLoss, upperNode.LeftPred = mseImpurity(y)
+	} else {
+		panic("Invalid impurity function, choose from MAE or MSE")
 	}
 
 	bestLoss = origLoss
@@ -252,10 +268,10 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	var leftPred float64
 	var rightPred float64
 
-	if criterion == "mae" {
+	if criterion == MAE {
 		leftLoss, leftPred = maeImpurity(lefty)
 		rightLoss, rightPred = maeImpurity(righty)
-	} else {
+	} else if criterion == MSE {
 		leftLoss, leftPred = mseImpurity(lefty)
 		rightLoss, rightPred = mseImpurity(righty)
 	}
@@ -312,12 +328,13 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	return upperNode
 }
 
-// Print Tree for Visualtion - calls printTreeFromNode()
+// Print Tree for Visualization - calls regressorPrintTreeFromNode()
 func (tree *CARTDecisionTreeRegressor) String() string {
 	rootNode := *tree.RootNode
 	return regressorPrintTreeFromNode(rootNode, "")
 }
 
+// Recursively explore the entire tree and print out all details such as threshold, feature, prediction
 func regressorPrintTreeFromNode(tree regressorNode, spacing string) string {
 	returnString := ""
 	returnString += spacing + "Feature "
@@ -353,7 +370,8 @@ func regressorPrintTreeFromNode(tree regressorNode, spacing string) string {
 	return returnString
 }
 
-// Predict a single data point
+// Predict a single data point by navigating from the root node.
+// Uses recursive logic
 func regressorPredictSingle(tree regressorNode, instance []float64) float64 {
 	if instance[tree.Feature] < tree.Threshold {
 		if tree.Left == nil {
@@ -370,14 +388,16 @@ func regressorPredictSingle(tree regressorNode, instance []float64) float64 {
 	}
 }
-// Predict method for multiple data points. Calls predictFromNode()
+// Predict method for multiple data points.
+// First converts input data into usable format, and then calls regressorPredictFromNode
 func (tree *CARTDecisionTreeRegressor) Predict(X_test base.FixedDataGrid) []float64 {
 	root := *tree.RootNode
 	test := regressorConvertInstancesToProblemVec(X_test)
 	return regressorPredictFromNode(root, test)
 }
 
-// Use tree's root node to print out entire tree
+// Use the tree's root node to predict for the entire test set.
+// Iterates over all data points and calls regressorPredictSingle to predict individual datapoints.
 func regressorPredictFromNode(tree regressorNode, test [][]float64) []float64 {
 	var preds []float64
 	for i := range test {

From ef751e62c484badf66a142053e7c7b55eb5e38f2 Mon Sep 17 00:00:00 2001
From: Ayush
Date: Mon, 27 Jul 2020 17:08:44 +0530
Subject: [PATCH 10/24] Adding cart_test.go

---
 trees/cart_test.go | 109 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 trees/cart_test.go

diff --git a/trees/cart_test.go b/trees/cart_test.go
new file mode 100644
index 0000000..047392a
--- /dev/null
+++ b/trees/cart_test.go
@@ -0,0 +1,109 @@
+package trees
+
+import (
+	"fmt"
+	"testing"
+
+	. "github.com/smartystreets/goconvey/convey"
+)
+
+func TestRegressor(t *testing.T) {
+
+	Convey("Doing a CART Test", t, func() {
+		// For Classification Trees:
+
+		// Is Gini being calculated correctly
+		gini, giniMaxLabel := giniImpurity([]int64{1, 0, 0, 1}, []int64{0, 1})
+		So(gini, ShouldEqual, 0.5)
+		So(giniMaxLabel, ShouldNotBeNil)
+
+		// Is Entropy being calculated correctly
+		entropy, entropyMaxLabel := entropy([]int64{1, 0, 0, 1}, []int64{0, 1})
+		So(entropy, ShouldEqual, 1.0)
+		So(entropyMaxLabel, ShouldNotBeNil)
+
+		// Is Data being split into left and right properly
+		classifierData := [][]float64{[]float64{1, 3, 6},
+			[]float64{1, 2, 3},
+			[]float64{1, 9, 6},
+			[]float64{1, 11, 1}}
+
+		classifiery := []int64{0, 1, 0, 0}
+
+		leftdata, rightdata, lefty, righty := classifierCreateSplit(classifierData, 1, classifiery, 5.0)
+
+		So(len(leftdata), ShouldEqual, 2)
+		So(len(lefty), ShouldEqual, 2)
+		So(len(rightdata), ShouldEqual, 2)
+		So(len(righty), ShouldEqual, 2)
+
+		// Is isolating unique values working properly
+		So(len(classifierFindUnique([]float64{10, 1, 1})), ShouldEqual, 2)
+
+		// is data reordered correctly
+		orderedData, orderedY := classifierReOrderData(classifierGetFeature(classifierData, 1), classifierData, classifiery)
+		fmt.Println(orderedData)
+		fmt.Println(orderedY)
+		So(orderedData[1][1], ShouldEqual, 3.0)
+		So(orderedY[0], ShouldEqual, 1)
+
+		// Is split being updated properly based on threshold
+		leftdata, lefty, rightdata, righty = classifierUpdateSplit(leftdata, lefty, rightdata, righty, 1, 9.5)
+		So(len(leftdata), ShouldEqual, 3)
+		So(len(rightdata), ShouldEqual, 1)
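+		// (With the rows sorted on feature 1 as {2, 3, 9, 11}, raising the
+		// threshold from 5.0 to 9.5 moves only the row with value 9 across,
+		// which is why the sizes become 3 and 1.)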
+
+		// Is the root Node null when tree is not trained?
+		tree := NewDecisionTreeClassifier("gini", -1, []int64{0, 1})
+		So(tree.RootNode, ShouldBeNil)
+		So(tree.triedSplits, ShouldBeEmpty)
+
+		// ------------------------------------------
+		// For Regression Trees
+
+		// Is MAE being calculated correctly
+		mae, maeMaxLabel := maeImpurity([]float64{1, 3, 5})
+		So(mae, ShouldEqual, (4.0 / 3.0))
+		So(maeMaxLabel, ShouldNotBeNil)
+
+		// Is MSE being calculated correctly
+		mse, mseMaxLabel := mseImpurity([]float64{1, 3, 5})
+		So(mse, ShouldEqual, (8.0 / 3.0))
+		So(mseMaxLabel, ShouldNotBeNil)
+
+		// Is Data being split into left and right properly
+		data := [][]float64{[]float64{1, 3, 6},
+			[]float64{1, 2, 3},
+			[]float64{1, 9, 6},
+			[]float64{1, 11, 1}}
+
+		y := []float64{1, 2, 3, 4}
+
+		leftData, rightData, leftY, rightY := regressorCreateSplit(data, 1, y, 5.0)
+
+		So(len(leftData), ShouldEqual, 2)
+		So(len(lefty), ShouldEqual, 2)
+		So(len(rightData), ShouldEqual, 2)
+		So(len(righty), ShouldEqual, 2)
+
+		// Is isolating unique values working properly
+		So(len(regressorFindUnique([]float64{10, 1, 1})), ShouldEqual, 2)
+
+		// is data reordered correctly
+		regressorOrderedData, regressorOrderedY := regressorReOrderData(regressorGetFeature(data, 1), data, y)
+
+		So(regressorOrderedData[1][1], ShouldEqual, 3.0)
+		So(regressorOrderedY[0], ShouldEqual, 2)
+
+		// Is split being updated properly based on threshold
+		leftData, leftY, rightData, rightY = regressorUpdateSplit(leftData, leftY, rightData, rightY, 1, 9.5)
+		So(len(leftData), ShouldEqual, 3)
+		So(len(rightData), ShouldEqual, 1)
+
+		// Is the root Node null when tree is not trained?
+		regressorTree := NewDecisionTreeRegressor("mae", -1)
+		So(regressorTree.RootNode, ShouldBeNil)
+		So(regressorTree.triedSplits, ShouldBeEmpty)
+
+	})
+
+}

From 2d2af0a58f54044bdbe10238e4aba88f2a8d45cf Mon Sep 17 00:00:00 2001
From: Ayush
Date: Tue, 28 Jul 2020 14:17:18 +0530
Subject: [PATCH 11/24] Removing Clutter

Partial modularization of the best split method. Shortened the method by
declaring variables on the same line as well. Also removing redundant
functions, and adding them into cart_utils.

---
 trees/cart_classifier.go | 194 +++++++++++----------------------------
 trees/cart_regressor.go  | 162 ++++++++++----------------------
 trees/cart_test.go       |   9 +-
 trees/cart_utils.go      |  74 +++++++++++++++
 4 files changed, 181 insertions(+), 258 deletions(-)
 create mode 100644 trees/cart_utils.go

diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index 9ae3b8e..c2ba59c 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -90,6 +90,16 @@ func entropy(y []int64, labels []int64) (float64, int64) {
 	return entropy, maxLabel
 }
 
+func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64) {
+	if criterion == GINI {
+		return giniImpurity(y, labels)
+	} else if criterion == ENTROPY {
+		return entropy(y, labels)
+	} else {
+		panic("Invalid impurity function, choose from GINI or ENTROPY")
+	}
+}
+
 // Split the data into left node and right node based on feature and threshold
 func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) {
 	var left [][]float64
 	var right [][]float64
@@ -111,37 +121,6 @@ func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold
 	return left, right, lefty, righty
 }
 
-// Helper Function to check if data point is unique or not.
-// We will use this to isolate unique values of a feature -func classifierStringInSlice(a float64, list []float64) bool { - for _, b := range list { - if b == a { - return true - } - } - return false -} - -// Isolate only unique values. This way, we can try only unique splits and not redundant ones. -func classifierFindUnique(data []float64) []float64 { - var unique []float64 - for i := range data { - if !classifierStringInSlice(data[i], unique) { - unique = append(unique, data[i]) - } - } - return unique -} - -// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits. -func classifierGetFeature(data [][]float64, feature int64) []float64 { - var featureVals []float64 - for i := range data { - featureVals = append(featureVals, data[i][feature]) - } - return featureVals -} - // Function to Create New Decision Tree Classifier. // It assigns all of the hyperparameters by user into the tree attributes. func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CARTDecisionTreeClassifier { @@ -153,19 +132,6 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) return &tree } -// Make sure that split being considered has not been done before. -// Else we will unnecessarily try splits that won't improve Impurity. -func classifierValidate(triedSplits [][]float64, feature int64, threshold float64) bool { - for i := range triedSplits { - split := triedSplits[i] - featureTried, thresholdTried := split[0], split[1] - if int64(featureTried) == feature && thresholdTried == threshold { - return false - } - } - return true -} - // Reorder the data by feature being considered. Optimizes code by reducing the number of times we have to loop over data for splitting func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([][]float64, []int64) { s := NewSlice(featureVal) @@ -202,7 +168,7 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) { var emptyNode classifierNode - data := classifierConvertInstancesToProblemVec(X) + data := convertInstancesToProblemVec(X) y := classifierConvertInstancesToLabelVec(X) emptyNode = classifierBestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) @@ -221,40 +187,29 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] } numFeatures := len(data[0]) - var bestGini float64 - var origGini float64 + var bestGini, origGini float64 // Calculate loss based on Criterion Specified by user - if criterion == GINI { - origGini, upperNode.LeftLabel = giniImpurity(y, labels) - } else if criterion == ENTROPY { - origGini, upperNode.LeftLabel = entropy(y, labels) - } else { - panic("Invalid impurity function, choose from GINI or ENTROPY") - } + origGini, upperNode.LeftLabel = calculateClassificationLoss(y, labels, criterion) bestGini = origGini - bestLeft := data - bestRight := data - bestLefty := y - bestRighty := y + bestLeft, bestRight, bestLefty, bestRighty := data, data, y, y numData := len(data) - bestLeftGini := bestGini - bestRightGini := bestGini + bestLeftGini, bestRightGini := bestGini, bestGini upperNode.Use_not = true - var leftN classifierNode - var rightN classifierNode + var leftN, rightN classifierNode + // Iterate over all features for i := 0; i < numFeatures; i++ { - featureVal := classifierGetFeature(data, int64(i)) - unique := classifierFindUnique(featureVal) + + featureVal := getFeature(data, 
int64(i)) + unique := findUnique(featureVal) sort.Float64s(unique) - numUnique := len(unique) sortData, sortY := classifierReOrderData(featureVal, data, y) @@ -263,53 +218,43 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] var left, right [][]float64 var lefty, righty []int64 // Iterate over all possible thresholds for that feature - for j := range unique { - if j != (numUnique - 1) { - threshold := (unique[j] + unique[j+1]) / 2 - // Ensure that same split has not been made before - if classifierValidate(tree.triedSplits, int64(i), threshold) { - // We need to split data from fresh when considering new feature for the first time. - // Otherwise, we need to update the split by moving data points from left to right. - if firstTime { - left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold) - firstTime = false - } else { - left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold) - } + for j := 0; j < len(unique)-1; j++ { - var leftGini float64 - var rightGini float64 - var leftLabels int64 - var rightLabels int64 - - if criterion == GINI { - leftGini, leftLabels = giniImpurity(lefty, labels) - rightGini, rightLabels = giniImpurity(righty, labels) - } else if criterion == ENTROPY { - leftGini, leftLabels = entropy(lefty, labels) - rightGini, rightLabels = entropy(righty, labels) - } - // Calculate weighted gini impurity of child nodes - subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData)) - - // If we find a split that reduces impurity - if subGini < bestGini { - bestGini = subGini - bestLeft = left - bestRight = right - bestLefty = lefty - bestRighty = righty - upperNode.Threshold = threshold - upperNode.Feature = int64(i) - - upperNode.LeftLabel = leftLabels - upperNode.RightLabel = rightLabels - - bestLeftGini = leftGini - bestRightGini = rightGini - } + threshold := (unique[j] + unique[j+1]) / 2 + // Ensure that same split has not been made before + if validate(tree.triedSplits, int64(i), threshold) { + // We need to split data from fresh when considering new feature for the first time. + // Otherwise, we need to update the split by moving data points from left to right. 
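+					// Because the rows are sorted on this feature, each new threshold
+					// only moves the rows whose value falls between the old and new
+					// thresholds, instead of re-partitioning the whole dataset.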
+ if firstTime { + left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold) + firstTime = false + } else { + left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold) } + var leftGini, rightGini float64 + var leftLabels, rightLabels int64 + + leftGini, leftLabels = calculateClassificationLoss(lefty, labels, criterion) + rightGini, rightLabels = calculateClassificationLoss(righty, labels, criterion) + + // Calculate weighted gini impurity of child nodes + subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData)) + + // If we find a split that reduces impurity + if subGini < bestGini { + bestGini = subGini + + bestLeft, bestRight = left, right + + bestLefty, bestRighty = lefty, righty + + upperNode.Threshold, upperNode.Feature = threshold, int64(i) + + upperNode.LeftLabel, upperNode.RightLabel = leftLabels, rightLabels + + bestLeftGini, bestRightGini = leftGini, rightGini + } } } } @@ -366,10 +311,8 @@ func classifierPrintTreeFromNode(tree classifierNode, spacing string) string { returnString += spacing + "---> True" + "\n" returnString += " " + spacing + "PREDICT " returnString += strconv.FormatInt(tree.LeftLabel, 10) + "\n" - } if tree.Right == nil { - returnString += spacing + "---> False" + "\n" returnString += " " + spacing + "PREDICT " returnString += strconv.FormatInt(tree.RightLabel, 10) + "\n" @@ -409,7 +352,7 @@ func classifierPredictSingle(tree classifierNode, instance []float64) int64 { // Given test data, return predictions for every datapoint. calls classifierPredictFromNode func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 { root := *tree.RootNode - test := classifierConvertInstancesToProblemVec(X_test) + test := convertInstancesToProblemVec(X_test) return classifierPredictFromNode(root, test) } @@ -429,7 +372,7 @@ func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 { // Calls classifierEvaluateFromNode func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 { rootNode := *tree.RootNode - xTest := classifierConvertInstancesToProblemVec(test) + xTest := convertInstancesToProblemVec(test) yTest := classifierConvertInstancesToLabelVec(test) return classifierEvaluateFromNode(rootNode, xTest, yTest) } @@ -447,31 +390,6 @@ func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest [] return accuracy } -// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict -func classifierConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { - // Allocate problem array - _, rows := X.Size() - problemVec := make([][]float64, rows) - - // Retrieve numeric non-class Attributes - numericAttrs := base.NonClassFloatAttributes(X) - numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) - - // Convert each row - X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { - // Allocate a new row - probRow := make([]float64, len(numericAttrSpecs)) - // Read out the row - for i, _ := range numericAttrSpecs { - probRow[i] = base.UnpackBytesToFloat(row[i]) - } - // Add the row - problemVec[rowNo] = probRow - return true, nil - }) - return problemVec -} - // Helper function to convert base.FixedDataGrid into required format. 
Called in Fit, Predict
 func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 {
 	// Get the class Attributes
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index 34b7880..1d2d326 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -81,6 +81,16 @@ func mseImpurity(y []float64) (float64, float64) {
 	return meanSquaredError(y, yHat), yHat
 }
 
+func calculateRegressionLoss(y []float64, criterion string) (float64, float64) {
+	if criterion == MAE {
+		return maeImpurity(y)
+	} else if criterion == MSE {
+		return mseImpurity(y)
+	} else {
+		panic("Invalid impurity function, choose from MAE or MSE")
+	}
+}
+
 // Split the data into left and right based on threshold and feature.
 func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) {
 	var left [][]float64
 	var lefty []float64
 	var right [][]float64
@@ -102,39 +112,6 @@ func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshol
 	return left, right, lefty, righty
 }
 
-// Helper function for finding unique values.
-// Used for isolating unique values in a feature.
-func regressorStringInSlice(a float64, list []float64) bool {
-	for _, b := range list {
-		if b == a {
-			return true
-		}
-	}
-	return false
-}
-
-// Isolate only unique values.
-// This way we can only try unique splits.
-func regressorFindUnique(data []float64) []float64 {
-	var unique []float64
-	for i := range data {
-		if !regressorStringInSlice(data[i], unique) {
-			unique = append(unique, data[i])
-		}
-	}
-	return unique
-}
-
-// Extract out a single feature from data.
-// Reduces complexity in managing splits and sorting
-func regressorGetFeature(data [][]float64, feature int64) []float64 {
-	var featureVals []float64
-	for i := range data {
-		featureVals = append(featureVals, data[i][feature])
-	}
-	return featureVals
-}
-
 // Interface for creating new Decision Tree Regressor
 func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTreeRegressor {
 	var tree CARTDecisionTreeRegressor
 	tree.maxDepth = maxDepth
@@ -143,19 +120,6 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTre
 	return &tree
 }
 
-// Validate that the split being tested has not been done before.
-// This prevents redundant splits from happening.
-func regressorValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
-	for i := range triedSplits {
-		split := triedSplits[i]
-		featureTried, thresholdTried := split[0], split[1]
-		if int64(featureTried) == feature && thresholdTried == threshold {
-			return false
-		}
-	}
-	return true
-}
-
 // Re order data based on a feature for optimizing code
 // Helps in updating splits without reiterating entire dataset
 func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) {
 	s := NewSlice(featureVal)
 	sort.Sort(s)
@@ -204,6 +168,7 @@ func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 // Recursive function - stops if maxDepth is reached or nodes are pure
 func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
+	// Ensure that we have not reached maxDepth.
maxDepth =-1 means split until nodes are pure depth++ if depth > maxDepth && maxDepth != -1 { @@ -211,39 +176,27 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl } numFeatures := len(data[0]) - var bestLoss float64 - var origLoss float64 + var bestLoss, origLoss float64 - if criterion == MAE { - origLoss, upperNode.LeftPred = maeImpurity(y) - } else if criterion == MSE { - origLoss, upperNode.LeftPred = mseImpurity(y) - } else { - panic("Invalid impurity function, choose from MAE or MSE") - } + origLoss, upperNode.LeftPred = calculateRegressionLoss(y, criterion) bestLoss = origLoss - bestLeft := data - bestRight := data - bestLefty := y - bestRighty := y + bestLeft, bestRight, bestLefty, bestRighty := data, data, y, y numData := len(data) - bestLeftLoss := bestLoss - bestRightLoss := bestLoss + bestLeftLoss, bestRightLoss := bestLoss, bestLoss upperNode.Use_not = true - var leftN regressorNode - var rightN regressorNode + var leftN, rightN regressorNode // Iterate over all features for i := 0; i < numFeatures; i++ { - featureVal := regressorGetFeature(data, int64(i)) - unique := regressorFindUnique(featureVal) + + featureVal := getFeature(data, int64(i)) + unique := findUnique(featureVal) sort.Float64s(unique) - numUnique := len(unique) sortData, sortY := regressorReOrderData(featureVal, data, y) @@ -252,49 +205,36 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl var left, right [][]float64 var lefty, righty []float64 - for j := range unique { - if j != (numUnique - 1) { - threshold := (unique[j] + unique[j+1]) / 2 - if regressorValidate(tree.triedSplits, int64(i), threshold) { - if firstTime { - left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold) - firstTime = false - } else { - left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold) - } - - var leftLoss float64 - var rightLoss float64 - var leftPred float64 - var rightPred float64 - - if criterion == MAE { - leftLoss, leftPred = maeImpurity(lefty) - rightLoss, rightPred = maeImpurity(righty) - } else if criterion == MSE { - leftLoss, leftPred = mseImpurity(lefty) - rightLoss, rightPred = mseImpurity(righty) - } - - subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData)) - - if subLoss < bestLoss { - bestLoss = subLoss - bestLeft = left - bestRight = right - bestLefty = lefty - bestRighty = righty - upperNode.Threshold = threshold - upperNode.Feature = int64(i) - - upperNode.LeftPred = leftPred - upperNode.RightPred = rightPred - - bestLeftLoss = leftLoss - bestRightLoss = rightLoss - } + for j := 0; j < len(unique)-1; j++ { + threshold := (unique[j] + unique[j+1]) / 2 + if validate(tree.triedSplits, int64(i), threshold) { + if firstTime { + left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold) + firstTime = false + } else { + left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold) } + var leftLoss, rightLoss float64 + var leftPred, rightPred float64 + + leftLoss, leftPred = calculateRegressionLoss(lefty, criterion) + rightLoss, rightPred = calculateRegressionLoss(righty, criterion) + + subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData)) + + if subLoss < bestLoss { + bestLoss = subLoss + + bestLeft, bestRight = left, right + bestLefty, bestRighty = lefty, righty + + upperNode.Threshold, 
upperNode.Feature = threshold, int64(i) + + upperNode.LeftPred, upperNode.RightPred = leftPred, rightPred + + bestLeftLoss, bestRightLoss = leftLoss, rightLoss + } } } } @@ -312,19 +252,16 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl if leftN.Use_not == true { upperNode.Left = &leftN } - } + if bestRightLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) rightN = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth) if rightN.Use_not == true { upperNode.Right = &rightN } - } - } - return upperNode } @@ -349,20 +286,17 @@ func regressorPrintTreeFromNode(tree regressorNode, spacing string) string { returnString += fmt.Sprintf("%.3f", tree.LeftPred) + "\n" } if tree.Right == nil { - returnString += spacing + "---> False" + "\n" returnString += " " + spacing + "PREDICT " returnString += fmt.Sprintf("%.3f", tree.RightPred) + "\n" } if tree.Left != nil { - // fmt.Println(spacing + "---> True") returnString += spacing + "---> True" + "\n" returnString += regressorPrintTreeFromNode(*tree.Left, spacing+" ") } if tree.Right != nil { - // fmt.Println(spacing + "---> False") returnString += spacing + "---> False" + "\n" returnString += regressorPrintTreeFromNode(*tree.Right, spacing+" ") } diff --git a/trees/cart_test.go b/trees/cart_test.go index 047392a..50387b9 100644 --- a/trees/cart_test.go +++ b/trees/cart_test.go @@ -38,10 +38,10 @@ func TestRegressor(t *testing.T) { So(len(righty), ShouldEqual, 2) // Is isolating unique values working properly - So(len(classifierFindUnique([]float64{10, 1, 1})), ShouldEqual, 2) + So(len(findUnique([]float64{10, 1, 1})), ShouldEqual, 2) // is data reordered correctly - orderedData, orderedY := classifierReOrderData(classifierGetFeature(classifierData, 1), classifierData, classifiery) + orderedData, orderedY := classifierReOrderData(getFeature(classifierData, 1), classifierData, classifiery) fmt.Println(orderedData) fmt.Println(orderedY) So(orderedData[1][1], ShouldEqual, 3.0) @@ -85,11 +85,8 @@ func TestRegressor(t *testing.T) { So(len(rightData), ShouldEqual, 2) So(len(righty), ShouldEqual, 2) - // Is isolating unique values working properly - So(len(regressorFindUnique([]float64{10, 1, 1})), ShouldEqual, 2) - // is data reordered correctly - regressorOrderedData, regressorOrderedY := regressorReOrderData(regressorGetFeature(data, 1), data, y) + regressorOrderedData, regressorOrderedY := regressorReOrderData(getFeature(data, 1), data, y) So(regressorOrderedData[1][1], ShouldEqual, 3.0) So(regressorOrderedY[0], ShouldEqual, 2) diff --git a/trees/cart_utils.go b/trees/cart_utils.go new file mode 100644 index 0000000..d3b9b4a --- /dev/null +++ b/trees/cart_utils.go @@ -0,0 +1,74 @@ +package trees + +import ( + "github.com/sjwhitworth/golearn/base" +) + +// Helper Function to check if data point is unique or not. +// We will use this to isolate unique values of a feature +func stringInSlice(a float64, list []float64) bool { + for _, b := range list { + if b == a { + return true + } + } + return false +} + +// Isolate only unique values. This way, we can try only unique splits and not redundant ones. +func findUnique(data []float64) []float64 { + var unique []float64 + for i := range data { + if !stringInSlice(data[i], unique) { + unique = append(unique, data[i]) + } + } + return unique +} + +// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits. 
+func getFeature(data [][]float64, feature int64) []float64 { + var featureVals []float64 + for i := range data { + featureVals = append(featureVals, data[i][feature]) + } + return featureVals +} + +// Make sure that split being considered has not been done before. +// Else we will unnecessarily try splits that won't improve Impurity. +func validate(triedSplits [][]float64, feature int64, threshold float64) bool { + for i := range triedSplits { + split := triedSplits[i] + featureTried, thresholdTried := split[0], split[1] + if int64(featureTried) == feature && thresholdTried == threshold { + return false + } + } + return true +} + +// Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict +func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { + // Allocate problem array + _, rows := X.Size() + problemVec := make([][]float64, rows) + + // Retrieve numeric non-class Attributes + numericAttrs := base.NonClassFloatAttributes(X) + numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) + + // Convert each row + X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { + // Allocate a new row + probRow := make([]float64, len(numericAttrSpecs)) + // Read out the row + for i, _ := range numericAttrSpecs { + probRow[i] = base.UnpackBytesToFloat(row[i]) + } + // Add the row + problemVec[rowNo] = probRow + return true, nil + }) + return problemVec +} From 1954aae7a685bdcef60080aa096cc6033e9012aa Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 30 Jul 2020 10:27:16 +0530 Subject: [PATCH 12/24] Changing name of Use_not --- trees/cart_classifier.go | 25 ++++++++++++------------- trees/cart_regressor.go | 22 +++++++++++----------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index c2ba59c..17a3ee7 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -18,14 +18,13 @@ const ( // CNode is Node struct for Decision Tree Classifier. 
// It holds the information for each split (which feature to use, what threshold, and which label to assign for each side of the split)
 type classifierNode struct {
-	Left       *classifierNode
-	Right      *classifierNode
-	Threshold  float64
-	Feature    int64
-	LeftLabel  int64
-	RightLabel int64
-	Use_not    bool
-	maxDepth   int64
+	Left         *classifierNode
+	Right        *classifierNode
+	Threshold    float64
+	Feature      int64
+	LeftLabel    int64
+	RightLabel   int64
+	isNodeNeeded bool
 }
 
 // CARTDecisionTreeClassifier: Tree struct for Decision Tree Classifier
@@ -163,7 +162,7 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r
 	return left, lefty, right, righty
 }
 
 // Fit - Creates an Empty Root Node
 // Trains the tree by calling recursive function classifierBestSplit
 func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	var emptyNode classifierNode
@@ -200,7 +199,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	bestLeftGini, bestRightGini := bestGini, bestGini
 
-	upperNode.Use_not = true
+	upperNode.isNodeNeeded = true
 
 	var leftN, rightN classifierNode
 
@@ -260,7 +259,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	}
 	// If no split was found, we don't want to use this node, so we will flag it
 	if bestGini == origGini {
-		upperNode.Use_not = false
+		upperNode.isNodeNeeded = false
 		return upperNode
 	}
 	// Until nodes are not pure
@@ -271,7 +270,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 			tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold})
 			// Recursive splitting logic
 			leftN = classifierBestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth)
-			if leftN.Use_not == true {
+			if leftN.isNodeNeeded == true {
 				upperNode.Left = &leftN
 			}
 
@@ -281,7 +280,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 			tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold})
 			// Recursive splitting logic
 			rightN = classifierBestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth)
-			if rightN.Use_not == true {
+			if rightN.isNodeNeeded == true {
 				upperNode.Right = &rightN
 			}
 
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index 1d2d326..b94da1d 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -19,13 +19,13 @@ const (
 // It holds the information for each split
 // Which feature to use, threshold, left prediction and right prediction
 type regressorNode struct {
-	Left      *regressorNode
-	Right     *regressorNode
-	Threshold float64
-	Feature   int64
-	LeftPred  float64
-	RightPred float64
-	Use_not   bool
+	Left         *regressorNode
+	Right        *regressorNode
+	Threshold    float64
+	Feature      int64
+	LeftPred     float64
+	RightPred    float64
+	isNodeNeeded bool
 }
 
 // CARTDecisionTreeRegressor - Tree struct for Decision Tree Regressor
@@ -188,7 +188,7 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	bestLeftLoss, bestRightLoss := bestLoss, bestLoss
 
-	upperNode.Use_not = true
+	upperNode.isNodeNeeded = true
 
 	var leftN, rightN regressorNode
 	// Iterate over all features
@@ -240,7 +240,7 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	}
 
 	if bestLoss == origLoss {
-		upperNode.Use_not = false
+		upperNode.isNodeNeeded = false
 		return upperNode
 	}
 
@@ -249,7 +249,7 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data
[][]float64, y []fl if bestLeftLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) leftN = regressorBestSplit(tree, bestLeft, bestLefty, leftN, criterion, maxDepth, depth) - if leftN.Use_not == true { + if leftN.isNodeNeeded == true { upperNode.Left = &leftN } } @@ -257,7 +257,7 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl if bestRightLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) rightN = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth) - if rightN.Use_not == true { + if rightN.isNodeNeeded == true { upperNode.Right = &rightN } } From d587340e4a995f17aa70dc52507e7d8c91341480 Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 30 Jul 2020 11:21:06 +0530 Subject: [PATCH 13/24] Renaming Impurity Functions --- trees/cart_classifier.go | 8 ++++---- trees/cart_regressor.go | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index 17a3ee7..fee9043 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -39,7 +39,7 @@ type CARTDecisionTreeClassifier struct { } // Calculate Gini Impurity of Target Labels -func giniImpurity(y []int64, labels []int64) (float64, int64) { +func computeGiniImpurityAndModeLabel(y []int64, labels []int64) (float64, int64) { nInstances := len(y) gini := 0.0 maxLabelCount := 0 @@ -62,7 +62,7 @@ func giniImpurity(y []int64, labels []int64) (float64, int64) { } // Calculate Entropy loss of Target Labels -func entropy(y []int64, labels []int64) (float64, int64) { +func computeEntropyAndModeLabel(y []int64, labels []int64) (float64, int64) { nInstances := len(y) entropy := 0.0 maxLabelCount := 0 @@ -91,9 +91,9 @@ func entropy(y []int64, labels []int64) (float64, int64) { func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64) { if criterion == GINI { - return giniImpurity(y, labels) + return computeGiniImpurityAndModeLabel(y, labels) } else if criterion == ENTROPY { - return entropy(y, labels) + return computeEntropyAndModeLabel(y, labels) } else { panic("Invalid impurity function, choose from GINI or ENTROPY") } diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go index b94da1d..3509a15 100644 --- a/trees/cart_regressor.go +++ b/trees/cart_regressor.go @@ -59,7 +59,7 @@ func meanAbsoluteError(y []float64, yBar float64) float64 { } // Turn Mean Absolute Error into impurity function for decision trees. 
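The new names spell out the contract: each impurity helper returns both the loss and the value a leaf would predict, the majority label for classification and the mean for regression. A standalone hand check of the Gini pair; the 0.5 matches the test added earlier in the series:

package main

import "fmt"

func main() {
	y := []int64{1, 0, 0, 1}
	labels := []int64{0, 1}

	gini, bestLabel, bestCount := 0.0, int64(0), 0
	for _, label := range labels {
		count := 0
		for _, t := range y {
			if t == label {
				count++
			}
		}
		p := float64(count) / float64(len(y))
		gini += p * (1 - p) // 0.25 + 0.25 = 0.5 for a 2/2 split
		if count > bestCount {
			bestLabel, bestCount = label, count
		}
	}
	fmt.Println(gini, bestLabel) // 0.5 and label 0 (first of the tied labels)
}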
-func maeImpurity(y []float64) (float64, float64) {
+func computeMaeImpurityAndAverage(y []float64) (float64, float64) {
 	yHat := average(y)
 	return meanAbsoluteError(y, yHat), yHat
 }
@@ -76,16 +76,16 @@ func meanSquaredError(y []float64, yBar float64) float64 {
 }
 
 // Convert mean squared error into impurity function for decision trees
-func mseImpurity(y []float64) (float64, float64) {
+func computeMseImpurityAndAverage(y []float64) (float64, float64) {
 	yHat := average(y)
 	return meanSquaredError(y, yHat), yHat
 }
 
 func calculateRegressionLoss(y []float64, criterion string) (float64, float64) {
 	if criterion == MAE {
-		return maeImpurity(y)
+		return computeMaeImpurityAndAverage(y)
 	} else if criterion == MSE {
-		return mseImpurity(y)
+		return computeMseImpurityAndAverage(y)
 	} else {
 		panic("Invalid impurity function, choose from MAE or MSE")
 	}

From 7276108661eec19f3da12131389eeb2013314075 Mon Sep 17 00:00:00 2001
From: Ayush
Date: Thu, 30 Jul 2020 11:48:50 +0530
Subject: [PATCH 14/24] Adding Documentation

Comparison in performance and implementation with sklearn.

---
 examples/trees/cart.go | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/examples/trees/cart.go b/examples/trees/cart.go
index 833e287..f465d54 100644
--- a/examples/trees/cart.go
+++ b/examples/trees/cart.go
@@ -9,6 +9,22 @@ import (
 )
 
 func main() {
+	/* Performance of CART Algorithm:
+
+	Training Time for Titanic Dataset ≈ 713 µs
+	Prediction Time for Titanic Dataset ≈ 133 µs
+
+	Sklearn:
+	Training Time for Titanic Dataset ≈ 8.8 µs
+	Prediction Time for Titanic Dataset ≈ 7.87 µs
+
+	This implementation and scikit-learn produce the exact same tree for the exact same dataset.
+	Predictions on the same test set also yield the exact same accuracy.
+
+	This implementation is optimized to prevent redundant iterations over the dataset, but it is not completely optimized. Also, sklearn makes use of numpy to access columns easily, whereas here a complete iteration is required.
+	In terms of Hyperparameters, this implementation gives you the ability to choose the impurity function and the maxDepth.
+	Many of the other hyperparameters available in sklearn are not here, but depth-based pruning and the choice of impurity function are included.
+	*/
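Numbers like the ones quoted above come from a small timing harness; a sketch, assuming the same example dataset and golearn's stock train/test splitter (absolute timings will vary by machine):

package main

import (
	"fmt"
	"time"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	data, err := base.ParseCSVToInstances("../datasets/titanic.csv", false)
	if err != nil {
		panic(err)
	}
	train, test := base.InstancesTrainTestSplit(data, 0.8)

	tree := trees.NewDecisionTreeClassifier("gini", -1, []int64{0, 1})

	start := time.Now()
	tree.Fit(train)
	fmt.Println("train:", time.Since(start))

	start = time.Now()
	tree.Predict(test)
	fmt.Println("predict:", time.Since(start))
}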
+ */ // Load Titanic Data For classification classificationData, err := base.ParseCSVToInstances("../datasets/titanic.csv", false) From 7f8ce6d1138c7d045073cdd9025034710ab60802 Mon Sep 17 00:00:00 2001 From: Ayush Date: Fri, 31 Jul 2020 11:01:20 +0530 Subject: [PATCH 15/24] Removing Panics --- trees/cart_classifier.go | 75 ++++++++++++++++++++++++++-------------- trees/cart_regressor.go | 64 ++++++++++++++++++++++------------ trees/cart_test.go | 8 ++--- 3 files changed, 96 insertions(+), 51 deletions(-) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index fee9043..828f2dc 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -1,6 +1,7 @@ package trees import ( + "errors" "fmt" "math" "sort" @@ -89,13 +90,15 @@ func computeEntropyAndModeLabel(y []int64, labels []int64) (float64, int64) { return entropy, maxLabel } -func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64) { +func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64, error) { if criterion == GINI { - return computeGiniImpurityAndModeLabel(y, labels) + loss, modeLabel := computeGiniImpurityAndModeLabel(y, labels) + return loss, modeLabel, nil } else if criterion == ENTROPY { - return computeEntropyAndModeLabel(y, labels) + loss, modeLabel := computeEntropyAndModeLabel(y, labels) + return loss, modeLabel, nil } else { - panic("Invalid impurity function, choose from GINI or ENTROPY") + return 0, 0, errors.New("Invalid impurity function, choose from GINI or ENTROPY") } } @@ -164,32 +167,44 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r // Fit - Creates an Emppty Root Node2 // Trains the tree by calling recursive function classifierBestSplit -func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) { +func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) error { var emptyNode classifierNode + var err error data := convertInstancesToProblemVec(X) - y := classifierConvertInstancesToLabelVec(X) - emptyNode = classifierBestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) + y, err := classifierConvertInstancesToLabelVec(X) + if err != nil { + return err + } + emptyNode, err = classifierBestSplit(*tree, data, y, tree.labels, emptyNode, tree.criterion, tree.maxDepth, 0) + + if err != nil { + return err + } tree.RootNode = &emptyNode + return nil } // Iteratively find and record the best split // Stop if depth reaches maxDepth or nodes are pure -func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode classifierNode, criterion string, maxDepth int64, depth int64) classifierNode { +func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode classifierNode, criterion string, maxDepth int64, depth int64) (classifierNode, error) { // Ensure that we have not reached maxDepth.
maxDepth =-1 means split until nodes are pure depth++ if maxDepth != -1 && depth > maxDepth { - return upperNode + return upperNode, nil } numFeatures := len(data[0]) var bestGini, origGini float64 - + var err error // Calculate loss based on Criterion Specified by user - origGini, upperNode.LeftLabel = calculateClassificationLoss(y, labels, criterion) + origGini, upperNode.LeftLabel, err = calculateClassificationLoss(y, labels, criterion) + if err != nil { + return upperNode, err + } bestGini = origGini @@ -234,8 +249,8 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] var leftGini, rightGini float64 var leftLabels, rightLabels int64 - leftGini, leftLabels = calculateClassificationLoss(lefty, labels, criterion) - rightGini, rightLabels = calculateClassificationLoss(righty, labels, criterion) + leftGini, leftLabels, _ = calculateClassificationLoss(lefty, labels, criterion) + rightGini, rightLabels, _ = calculateClassificationLoss(righty, labels, criterion) // Calculate weighted gini impurity of child nodes subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData)) @@ -260,7 +275,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] // If no split was found, we don't want to use this node, so we will flag it if bestGini == origGini { upperNode.isNodeNeeded = false - return upperNode + return upperNode, nil } // Until nodes are not pure if bestGini > 0 { @@ -269,7 +284,10 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] if bestLeftGini > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) // Recursive splitting logic - leftN = classifierBestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) + leftN, err = classifierBestSplit(tree, bestLeft, bestLefty, labels, leftN, criterion, maxDepth, depth) + if err != nil { + return upperNode, err + } if leftN.isNodeNeeded == true { upperNode.Left = &leftN } @@ -279,7 +297,10 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] if bestRightGini > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) // Recursive splitting logic - rightN = classifierBestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth) + rightN, err = classifierBestSplit(tree, bestRight, bestRighty, labels, rightN, criterion, maxDepth, depth) + if err != nil { + return upperNode, err + } if rightN.isNodeNeeded == true { upperNode.Right = &rightN } @@ -288,7 +309,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] } // Return the node - contains all information regarding feature and threshold. - return upperNode + return upperNode, nil } // String : this function prints out entire tree for visualization. @@ -369,11 +390,14 @@ func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 { // Given Test data and label, return the accuracy of the classifier. // First it retrieves predictions from the data, then compares for accuracy.
// Calls classifierEvaluateFromNode -func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 { +func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) (float64, error) { rootNode := *tree.RootNode xTest := convertInstancesToProblemVec(test) - yTest := classifierConvertInstancesToLabelVec(test) - return classifierEvaluateFromNode(rootNode, xTest, yTest) + yTest, err := classifierConvertInstancesToLabelVec(test) + if err != nil { + return 0, err + } + return classifierEvaluateFromNode(rootNode, xTest, yTest), nil } // Retrieve predictions and then calculate accuracy. @@ -390,20 +414,21 @@ func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest [] } // Helper function to convert base.FixedDataGrid into required format. Called in Fit, Predict -func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 { +func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) ([]int64, error) { // Get the class Attributes classAttrs := X.AllClassAttributes() // Only support 1 class Attribute if len(classAttrs) != 1 { - panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) + return []int64{0}, errors.New(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) + } // ClassAttribute must be numeric if _, ok := classAttrs[0].(*base.FloatAttribute); !ok { - panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) + return []int64{0}, errors.New(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) } // Allocate return structure _, rows := X.Size() - // labelVec := make([]float64, rows) + labelVec := make([]int64, rows) // Resolve class Attribute specification classAttrSpecs := base.ResolveAttributes(X, classAttrs) @@ -411,5 +436,5 @@ func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) []int64 { labelVec[rowNo] = int64(base.UnpackBytesToFloat(row[0])) return true, nil }) - return labelVec + return labelVec, nil } diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go index 3509a15..96d3405 100644 --- a/trees/cart_regressor.go +++ b/trees/cart_regressor.go @@ -1,6 +1,7 @@ package trees import ( + "errors" "fmt" "math" "sort" @@ -81,11 +82,13 @@ func computeMseImpurityAndAverage(y []float64) (float64, float64) { return meanSquaredError(y, yHat), yHat } -func calculateRegressionLoss(y []float64, criterion string) (float64, float64) { +func calculateRegressionLoss(y []float64, criterion string) (float64, float64, error) { if criterion == MAE { - return computeMaeImpurityAndAverage(y) + loss, avg := computeMaeImpurityAndAverage(y) + return loss, avg, nil } else if criterion == MSE { - return computeMseImpurityAndAverage(y) + loss, avg := computeMseImpurityAndAverage(y) + return loss, avg, nil } else { panic("Invalid impurity function, choose from MAE or MSE") } @@ -154,31 +157,42 @@ func regressorUpdateSplit(left [][]float64, lefty []float64, right [][]float64, // Fit - Build the tree using the data // Creates empty root node and builds tree by calling regressorBestSplit -func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) { +func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) error { var emptyNode regressorNode + var err error + data := regressorConvertInstancesToProblemVec(X) - y := regressorConvertInstancesToLabelVec(X) - - emptyNode = regressorBestSplit(*tree, data, y, emptyNode, tree.criterion, tree.maxDepth, 0) + y, err := regressorConvertInstancesToLabelVec(X) + if err != nil { + return err + } + emptyNode, err = 
regressorBestSplit(*tree, data, y, emptyNode, tree.criterion, tree.maxDepth, 0) + if err != nil { + return err + } tree.RootNode = &emptyNode + return nil } // Builds the tree by iteratively finding the best split. // Recursive function - stops if maxDepth is reached or nodes are pure -func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) regressorNode { +func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) (regressorNode, error) { // Ensure that we have not reached maxDepth. maxDepth =-1 means split until nodes are pure depth++ if depth > maxDepth && maxDepth != -1 { - return upperNode + return upperNode, nil } numFeatures := len(data[0]) var bestLoss, origLoss float64 - - origLoss, upperNode.LeftPred = calculateRegressionLoss(y, criterion) + var err error + origLoss, upperNode.LeftPred, err = calculateRegressionLoss(y, criterion) + if err != nil { + return upperNode, err + } bestLoss = origLoss @@ -218,8 +232,8 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl var leftLoss, rightLoss float64 var leftPred, rightPred float64 - leftLoss, leftPred = calculateRegressionLoss(lefty, criterion) - rightLoss, rightPred = calculateRegressionLoss(righty, criterion) + leftLoss, leftPred, _ = calculateRegressionLoss(lefty, criterion) + rightLoss, rightPred, _ = calculateRegressionLoss(righty, criterion) subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData)) @@ -241,14 +255,17 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl if bestLoss == origLoss { upperNode.isNodeNeeded = false - return upperNode + return upperNode, nil } if bestLoss > 0 { if bestLeftLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) - leftN = regressorBestSplit(tree, bestLeft, bestLefty, leftN, criterion, maxDepth, depth) + leftN, err = regressorBestSplit(tree, bestLeft, bestLefty, leftN, criterion, maxDepth, depth) + if err != nil { + return upperNode, err + } if leftN.isNodeNeeded == true { upperNode.Left = &leftN } @@ -256,13 +273,16 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl if bestRightLoss > 0 { tree.triedSplits = append(tree.triedSplits, []float64{float64(upperNode.Feature), upperNode.Threshold}) - rightN = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth) + rightN, err = regressorBestSplit(tree, bestRight, bestRighty, rightN, criterion, maxDepth, depth) + if err != nil { + return upperNode, err + } if rightN.isNodeNeeded == true { upperNode.Right = &rightN } } } - return upperNode + return upperNode, nil } // Print Tree for Visualization - calls regressorPrintTreeFromNode() @@ -367,20 +387,20 @@ func regressorConvertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { } // Helper function to convert base.FixedDataGrid into required format.
Called in Fit, Predict -func regressorConvertInstancesToLabelVec(X base.FixedDataGrid) []float64 { +func regressorConvertInstancesToLabelVec(X base.FixedDataGrid) ([]float64, error) { // Get the class Attributes classAttrs := X.AllClassAttributes() // Only support 1 class Attribute if len(classAttrs) != 1 { - panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) + return []float64{0}, errors.New(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) } // ClassAttribute must be numeric if _, ok := classAttrs[0].(*base.FloatAttribute); !ok { - panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) + return []float64{0}, errors.New(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) } // Allocate return structure _, rows := X.Size() - // labelVec := make([]float64, rows) + labelVec := make([]float64, rows) // Resolve class Attribute specification classAttrSpecs := base.ResolveAttributes(X, classAttrs) @@ -388,5 +408,5 @@ func regressorConvertInstancesToLabelVec(X base.FixedDataGrid) []float64 { labelVec[rowNo] = base.UnpackBytesToFloat(row[0]) return true, nil }) - return labelVec + return labelVec, nil } diff --git a/trees/cart_test.go b/trees/cart_test.go index 50387b9..99374c8 100644 --- a/trees/cart_test.go +++ b/trees/cart_test.go @@ -13,12 +13,12 @@ func TestRegressor(t *testing.T) { // For Classification Trees: // Is Gini being calculated correctly - gini, giniMaxLabel := giniImpurity([]int64{1, 0, 0, 1}, []int64{0, 1}) + gini, giniMaxLabel := computeGiniImpurityAndModeLabel([]int64{1, 0, 0, 1}, []int64{0, 1}) So(gini, ShouldEqual, 0.5) So(giniMaxLabel, ShouldNotBeNil) // Is Entropy being calculated correctly - entropy, entropyMaxLabel := entropy([]int64{1, 0, 0, 1}, []int64{0, 1}) + entropy, entropyMaxLabel := computeEntropyAndModeLabel([]int64{1, 0, 0, 1}, []int64{0, 1}) So(entropy, ShouldEqual, 1.0) So(entropyMaxLabel, ShouldNotBeNil) @@ -61,12 +61,12 @@ func TestRegressor(t *testing.T) { // For Regression Trees // Is MAE being calculated correctly - mae, maeMaxLabel := maeImpurity([]float64{1, 3, 5}) + mae, maeMaxLabel := computeMaeImpurityAndAverage([]float64{1, 3, 5}) So(mae, ShouldEqual, (4.0 / 3.0)) So(maeMaxLabel, ShouldNotBeNil) // Is MSE being calculated correctly - mse, mseMaxLabel := mseImpurity([]float64{1, 3, 5}) + mse, mseMaxLabel := computeMseImpurityAndAverage([]float64{1, 3, 5}) So(mse, ShouldEqual, (8.0 / 3.0)) So(mseMaxLabel, ShouldNotBeNil) From ae2338c2c1b98fecf56b193591a331fdf3eca76f Mon Sep 17 00:00:00 2001 From: Ayush Date: Fri, 31 Jul 2020 12:38:34 +0530 Subject: [PATCH 16/24] Updating package level details --- trees/trees.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/trees/trees.go b/trees/trees.go index ae8271e..d968b3e 100644 --- a/trees/trees.go +++ b/trees/trees.go @@ -11,6 +11,14 @@ present, so discretise beforehand (see filters) + CART (Classification and Regression Trees): + Builds a binary decision tree using the CART algorithm, + with a greedy approach to find the best split at each node. + + Can be used for regression and classification. + Attributes have to be FloatAttributes even for classification. + Hence, convert to integer labels beforehand for classification.
+ RandomTree: Builds a decision tree using the ID3 algorithm by picking the Attribute amongst those From 9d1ac82a40d6141e1bd0cdd6d1dd68bc430d981b Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 1 Aug 2020 11:25:53 +0530 Subject: [PATCH 17/24] Optimizing Loss Calculation --- examples/trees/cart.go | 12 +++++++--- trees/cart_classifier.go | 51 ++++++++++++++++++++-------------------- trees/cart_test.go | 8 +++---- trees/cart_utils.go | 21 +++++------------ 4 files changed, 43 insertions(+), 49 deletions(-) diff --git a/examples/trees/cart.go b/examples/trees/cart.go index f465d54..a6fc909 100644 --- a/examples/trees/cart.go +++ b/examples/trees/cart.go @@ -35,10 +35,13 @@ func main() { // Create New Classification Tree // Hyperparameters - loss function, max Depth (-1 will split until pure), list of unique labels - decTree = NewDecisionTreeClassifier("entropy", -1, []int64{0, 1}) + decTree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1}) // Train Tree - decTree.Fit(trainData) + err = decTree.Fit(trainData) + if err != nil { + panic(err) + } // Print out tree for visualization - shows splits and feature and predictions fmt.Println(decTree.String()) @@ -62,7 +65,10 @@ func main() { regTree := NewDecisionTreeRegressor("mse", -1) // Train Tree - regTree.Fit(trainRegData) + err = regTree.Fit(trainRegData) + if err != nil { + panic(err) + } // Print out tree for visualization fmt.Println(regTree.String()) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index 828f2dc..bb9af51 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -39,25 +39,31 @@ type CARTDecisionTreeClassifier struct { triedSplits [][]float64 } +// Convert a series of labels to frequency map for efficient impurity calculation +func convertToMap(y []int64, labels []int64) map[int64]int { + labelCount := make(map[int64]int) + for _, label := range labels { + labelCount[label] = 0 + } + for _, value := range y { + labelCount[value]++ + } + return labelCount +} + // Calculate Gini Impurity of Target Labels func computeGiniImpurityAndModeLabel(y []int64, labels []int64) (float64, int64) { nInstances := len(y) gini := 0.0 - maxLabelCount := 0 var maxLabel int64 = 0 - for label := range labels { - numLabel := 0 - for target := range y { - if y[target] == labels[label] { - numLabel++ - } + + labelCount := convertToMap(y, labels) + for _, label := range labels { + if labelCount[label] > labelCount[maxLabel] { + maxLabel = label } - p := float64(numLabel) / float64(nInstances) + p := float64(labelCount[label]) / float64(nInstances) gini += p * (1 - p) - if numLabel > maxLabelCount { - maxLabel = labels[label] - maxLabelCount = numLabel - } } return gini, maxLabel } @@ -66,26 +72,19 @@ func computeGiniImpurityAndModeLabel(y []int64, labels []int64) (float64, int64) func computeEntropyAndModeLabel(y []int64, labels []int64) (float64, int64) { nInstances := len(y) entropy := 0.0 - maxLabelCount := 0 var maxLabel int64 = 0 - for label := range labels { - numLabel := 0 - for target := range y { - if y[target] == labels[label] { - numLabel++ - } - } - p := float64(numLabel) / float64(nInstances) + labelCount := convertToMap(y, labels) + for _, label := range labels { + if labelCount[label] > labelCount[maxLabel] { + maxLabel = label + } + p := float64(labelCount[label]) / float64(nInstances) logP := math.Log2(p) if p == 0 { logP = 0 } - entropy += -p * logP - if numLabel > maxLabelCount { - maxLabel = labels[label] - maxLabelCount = numLabel - } + entropy += (-p * logP) } return entropy, maxLabel } diff 
--git a/trees/cart_test.go b/trees/cart_test.go index 99374c8..3edee6d 100644 --- a/trees/cart_test.go +++ b/trees/cart_test.go @@ -1,7 +1,6 @@ package trees import ( - "fmt" "testing" . "github.com/smartystreets/goconvey/convey" ) @@ -42,8 +41,7 @@ func TestRegressor(t *testing.T) { // is data reordered correctly orderedData, orderedY := classifierReOrderData(getFeature(classifierData, 1), classifierData, classifiery) - fmt.Println(orderedData) - fmt.Println(orderedY) + So(orderedData[1][1], ShouldEqual, 3.0) So(orderedY[0], ShouldEqual, 1) @@ -81,9 +79,9 @@ func TestRegressor(t *testing.T) { leftData, rightData, leftY, rightY := regressorCreateSplit(data, 1, y, 5.0) So(len(leftData), ShouldEqual, 2) - So(len(lefty), ShouldEqual, 2) + So(len(leftY), ShouldEqual, 2) So(len(rightData), ShouldEqual, 2) - So(len(righty), ShouldEqual, 2) + So(len(rightY), ShouldEqual, 2) // is data reordered correctly regressorOrderedData, regressorOrderedY := regressorReOrderData(getFeature(data, 1), data, y) diff --git a/trees/cart_utils.go b/trees/cart_utils.go index d3b9b4a..251dee9 100644 --- a/trees/cart_utils.go +++ b/trees/cart_utils.go @@ -4,23 +4,14 @@ import ( "github.com/sjwhitworth/golearn/base" ) -// Helper Function to check if data point is unique or not. -// We will use this to isolate unique values of a feature -func stringInSlice(a float64, list []float64) bool { - for _, b := range list { - if b == a { - return true - } - } - return false -} - // Isolate only unique values. This way, we can try only unique splits and not redundant ones. func findUnique(data []float64) []float64 { - var unique []float64 - for i := range data { - if !stringInSlice(data[i], unique) { - unique = append(unique, data[i]) + keys := make(map[float64]bool) + unique := []float64{} + for _, entry := range data { + if _, value := keys[entry]; !value { + keys[entry] = true + unique = append(unique, entry) } } return unique From 6a42fcd4aede0d430800cfb1a668e04cf172d386 Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 1 Aug 2020 11:36:53 +0530 Subject: [PATCH 18/24] catching nInstances == 0 --- trees/cart_classifier.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index bb9af51..cf9f98d 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -90,6 +90,9 @@ func computeEntropyAndModeLabel(y []int64, labels []int64) (float64, int64) { } func calculateClassificationLoss(y []int64, labels []int64, criterion string) (float64, int64, error) { + if len(y) == 0 { + return 0, 0, errors.New("Need at least 1 value to compute impurity") + } if criterion == GINI { From cd2b86aa2edbf1a609e2925d1bfa6abf29f1e6f5 Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 1 Aug 2020 11:43:14 +0530 Subject: [PATCH 19/24] Changing var name --- trees/cart_classifier.go | 20 ++++++++++---------- trees/cart_regressor.go | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index cf9f98d..1ed92d1 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -155,16 +155,16 @@ func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([ } // Update the left and right side of the split based on the threshold.
-func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { +func classifierUpdateSplit(left [][]float64, leftY []int64, right [][]float64, rightY []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) { for right[0][feature] < threshold { left = append(left, right[0]) right = right[1:] - lefty = append(lefty, righty[0]) - righty = righty[1:] + leftY = append(leftY, rightY[0]) + rightY = rightY[1:] } - return left, lefty, right, righty + return left, leftY, right, rightY } // Fit - Creates an Emppty Root Node2 @@ -232,7 +232,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] firstTime := true var left, right [][]float64 - var lefty, righty []int64 + var leftY, rightY []int64 // Iterate over all possible thresholds for that feature for j := 0; j < len(unique)-1; j++ { @@ -242,17 +242,17 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] // We need to split the data from scratch when considering a new feature for the first time. // Otherwise, we need to update the split by moving data points from left to right. if firstTime { - left, right, lefty, righty = classifierCreateSplit(sortData, int64(i), sortY, threshold) + left, right, leftY, rightY = classifierCreateSplit(sortData, int64(i), sortY, threshold) firstTime = false } else { - left, lefty, right, righty = classifierUpdateSplit(left, lefty, right, righty, int64(i), threshold) + left, leftY, right, rightY = classifierUpdateSplit(left, leftY, right, rightY, int64(i), threshold) } var leftGini, rightGini float64 var leftLabels, rightLabels int64 - leftGini, leftLabels, _ = calculateClassificationLoss(lefty, labels, criterion) - rightGini, rightLabels, _ = calculateClassificationLoss(righty, labels, criterion) + leftGini, leftLabels, _ = calculateClassificationLoss(leftY, labels, criterion) + rightGini, rightLabels, _ = calculateClassificationLoss(rightY, labels, criterion) // Calculate weighted gini impurity of child nodes subGini := (leftGini * float64(len(left)) / float64(numData)) + (rightGini * float64(len(right)) / float64(numData)) @@ -263,7 +263,7 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y [] bestLeft, bestRight = left, right - bestLefty, bestRighty = lefty, righty + bestLefty, bestRighty = leftY, rightY upperNode.Threshold, upperNode.Feature = threshold, int64(i) diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go index 96d3405..69ae9d6 100644 --- a/trees/cart_regressor.go +++ b/trees/cart_regressor.go @@ -143,16 +143,16 @@ func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ( } // Update the left and right data based on change in threshold -func regressorUpdateSplit(left [][]float64, lefty []float64, right [][]float64, righty []float64, feature int64, threshold float64) ([][]float64, []float64, [][]float64, []float64) { +func regressorUpdateSplit(left [][]float64, leftY []float64, right [][]float64, rightY []float64, feature int64, threshold float64) ([][]float64, []float64, [][]float64, []float64) { for right[0][feature] < threshold { left = append(left, right[0]) right = right[1:] - lefty = append(lefty, righty[0]) - righty = righty[1:] + leftY = append(leftY, rightY[0]) + rightY = rightY[1:] } - return left, lefty, right, righty + return left, leftY, right, rightY } @@ -217,23 +217,23 @@ func
regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl firstTime := true var left, right [][]float64 - var lefty, righty []float64 + var leftY, rightY []float64 for j := 0; j < len(unique)-1; j++ { threshold := (unique[j] + unique[j+1]) / 2 if validate(tree.triedSplits, int64(i), threshold) { if firstTime { - left, right, lefty, righty = regressorCreateSplit(sortData, int64(i), sortY, threshold) + left, right, leftY, rightY = regressorCreateSplit(sortData, int64(i), sortY, threshold) firstTime = false } else { - left, lefty, right, righty = regressorUpdateSplit(left, lefty, right, righty, int64(i), threshold) + left, leftY, right, rightY = regressorUpdateSplit(left, leftY, right, rightY, int64(i), threshold) } var leftLoss, rightLoss float64 var leftPred, rightPred float64 - leftLoss, leftPred, _ = calculateRegressionLoss(lefty, criterion) - rightLoss, rightPred, _ = calculateRegressionLoss(righty, criterion) + leftLoss, leftPred, _ = calculateRegressionLoss(leftY, criterion) + rightLoss, rightPred, _ = calculateRegressionLoss(rightY, criterion) subLoss := (leftLoss * float64(len(left)) / float64(numData)) + (rightLoss * float64(len(right)) / float64(numData)) @@ -241,7 +241,7 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl bestLoss = subLoss bestLeft, bestRight = left, right - bestLefty, bestRighty = lefty, righty + bestLefty, bestRighty = leftY, rightY upperNode.Threshold, upperNode.Feature = threshold, int64(i) From 8ae385ca25a807a0e268324c91a89d7088d75a01 Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 1 Aug 2020 13:16:34 +0530 Subject: [PATCH 20/24] Complexity Analysis for Algorithm --- examples/trees/cart.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/trees/cart.go b/examples/trees/cart.go index a6fc909..22d3763 100644 --- a/examples/trees/cart.go +++ b/examples/trees/cart.go @@ -11,13 +11,21 @@ import ( func main() { /* Performance of CART Algorithm: - Training Time for Titanic Dataset ≈ 713 µs - Prediction Time for Titanic Dataset ≈ 133 µs + Training Time for Titanic Dataset ≈ 611 µs + Prediction Time for Titanic Dataset ≈ 101 µs + + Complexity Analysis: + 1x Dataset -- x ms + 2x Dataset -- 1.7x ms + 128x Dataset -- 74x ms + + Complexity is sub-linear Sklearn: Training Time for Titanic Dataset ≈ 8.8 µs Prediction Time for Titanic Dataset ≈ 7.87 µs + This implementation and scikit-learn produce the exact same tree for the exact same dataset. Predictions on the same test set also yield the exact same accuracy.
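The timing and complexity figures quoted above come without a benchmark harness anywhere in the series. A minimal sketch of how the training-time number could be measured with Go's built-in testing package follows — the benchmark name and the CSV path are assumptions for illustration, not part of the patches:

package trees

import (
	"testing"

	"github.com/sjwhitworth/golearn/base"
)

// BenchmarkCARTClassifierFit is a hypothetical benchmark (not part of this
// series) for reproducing the training-time figure above. It assumes the
// Titanic CSV shipped under examples/datasets.
func BenchmarkCARTClassifierFit(b *testing.B) {
	data, err := base.ParseCSVToInstances("../examples/datasets/titanic.csv", false)
	if err != nil {
		b.Fatal(err)
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Same hyperparameters as examples/trees/cart.go: entropy criterion,
		// unlimited depth, binary labels.
		tree := NewDecisionTreeClassifier("entropy", -1, []int64{0, 1})
		if err := tree.Fit(data); err != nil {
			b.Fatal(err)
		}
	}
}

Running it with `go test -bench CARTClassifierFit ./trees` would report ns/op for a full Fit, which is the quantity the comparison with sklearn relies on.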
From cad05a087a1785511040d223d57bf47473f8d84c Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 1 Aug 2020 15:11:38 +0530 Subject: [PATCH 21/24] Updating Logistic.go --- linear_models/logistic.go | 1 - 1 file changed, 1 deletion(-) diff --git a/linear_models/logistic.go b/linear_models/logistic.go index 14ff0d2..96c3206 100644 --- a/linear_models/logistic.go +++ b/linear_models/logistic.go @@ -3,7 +3,6 @@ package linear_models import ( "errors" "fmt" - "github.com/sjwhitworth/golearn/base" ) From e55a329d8aa8938e52e2bfe3622d175ad59d39ee Mon Sep 17 00:00:00 2001 From: Ayush Date: Sat, 1 Aug 2020 15:32:59 +0530 Subject: [PATCH 22/24] Fixing Bug --- examples/trees/cart.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/trees/cart.go b/examples/trees/cart.go index 22d3763..5a19cab 100644 --- a/examples/trees/cart.go +++ b/examples/trees/cart.go @@ -6,6 +6,8 @@ import ( "fmt" "github.com/sjwhitworth/golearn/base" + "github.com/sjwhitworth/golearn/trees" + ) func main() { From b689fe0c58d68d6d91ef77729f884ba5190a25aa Mon Sep 17 00:00:00 2001 From: Ayush Date: Mon, 3 Aug 2020 09:41:35 +0530 Subject: [PATCH 23/24] Fixing Typo + tmp file --- trees/cart_classifier.go | 3 +-- trees/tmp | Bin 409 -> 413 bytes 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go index 1ed92d1..daf6a29 100644 --- a/trees/cart_classifier.go +++ b/trees/cart_classifier.go @@ -167,7 +167,7 @@ func classifierUpdateSplit(left [][]float64, leftY []int64, right [][]float64, r return left, leftY, right, rightY } -// Fit - Creates an Emppty Root Node2 +// Fit - Creates an Empty Root Node // Trains the tree by calling recursive function classifierBestSplit func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) error { var emptyNode classifierNode @@ -422,7 +422,6 @@ func classifierConvertInstancesToLabelVec(X base.FixedDataGrid) ([]int64, error) // Only support 1 class Attribute if len(classAttrs) != 1 { return []int64{0}, errors.New(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) - } // ClassAttribute must be numeric if _, ok := classAttrs[0].(*base.FloatAttribute); !ok { diff --git a/trees/tmp b/trees/tmp index af98d1a33b82338d7466955c2c6aafb41cd3496c..28c93c507c8869a97a9ff1d9eecbd160475a62e3 100644 GIT binary patch delta 383 zcmV-_0f7FQ1DykqCx0+EHfB(um3vS?X>mzn5rcsNrRJb}Zis;N{G8OpqC5pdy&>b| zQ9woD>l)(d;uzvcOFIbVe?wyv^!#sRXfT@p=>e*h((;RP6HDUDQj3Z+^YfIf40V)} za}tY-Gt)9ti0fhdY1DOMmCw~M9DHzboLCB!AxFoTN!GL1B&^oYGP5If}!4!aq=jj zBJgz$addGEaipakgz~?ku?brKH!w0c7|s9m0M$xq`9-;jCGlmcMa7xI!eho ziN(d4X_=`-N>)mqF2+haO3rYhFqlFmLp=i}WX18hsU?XiiGL-DN>+KLIXSfh?YB`t z7ytkO0RR8&l(7oJKn#Z8C3A(~+*y{GQ<_?=WEG9b$g#CLN_mO7si>J7QdTG> ZCzhn9=NDxrC*~-PX4s($00000|NlQXwFm$J From 27b86ce3ea4e3f9137c6acb0c1f8790fe2813806 Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 6 Aug 2020 21:43:47 +0530 Subject: [PATCH 24/24] Delete tmp --- trees/tmp | Bin 413 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 trees/tmp diff --git a/trees/tmp b/trees/tmp deleted file mode 100644 index 28c93c507c8869a97a9ff1d9eecbd160475a62e3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 413 zcmV;O0b>3iiwFP!00000|8n*Tj`wx+^K^3!4q>1XFn|DDw1I)4F@g;eH#9IeHfB(u zm3vS?X>mzn5rcsNrRJb}Zis;N{G8OpqC5pdy&>b|Q9woD>l)(d;uzvcOFIbVe?wyv z^!#sRXfT@p=>e*h((;RP6HDUDQj3Z+^YfIf40V)}a}tY-Gt)9ti12+Ha$PFaQ7m0RR8&l(DMAFbqZiMRSHE z!!!72OkS_prqrzLIs~%qyf?>Yc>n+a|NjC-5V&l2EGa3XksSo
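Taken together, the series leaves the classifier with error-returning Fit and Evaluate methods in place of the original panics. A minimal end-to-end sketch of the API as it stands after PATCH 24 — the dataset path and the 50/50 train/test split are assumptions modelled on examples/trees/cart.go:

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/trees"
)

func main() {
	// Assumed path, following examples/trees/cart.go.
	data, err := base.ParseCSVToInstances("../datasets/titanic.csv", false)
	if err != nil {
		panic(err)
	}
	trainData, testData := base.InstancesTrainTestSplit(data, 0.50)

	// Hyperparameters: impurity criterion, maxDepth (-1 splits until pure),
	// and the list of unique integer labels.
	tree := trees.NewDecisionTreeClassifier("gini", -1, []int64{0, 1})

	// Fit now returns an error instead of panicking (PATCH 15).
	if err := tree.Fit(trainData); err != nil {
		panic(err)
	}

	// Print the learned splits for visualization.
	fmt.Println(tree.String())

	// Evaluate also returns an error now.
	accuracy, err := tree.Evaluate(testData)
	if err != nil {
		panic(err)
	}
	fmt.Println("Accuracy:", accuracy)
}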