From 91a27e3ca0f4d93f6331b647c1b5aeaef50a59e1 Mon Sep 17 00:00:00 2001
From: Ayush <ayushgoel2004@gmail.com>
Date: Mon, 27 Jul 2020 15:03:12 +0530
Subject: [PATCH] Fixing Comments

---
 examples/trees/cart.go   |  1 -
 trees/cart_classifier.go | 53 ++++++++++++++++++++++++++--------------
 trees/cart_regressor.go  | 52 +++++++++++++++++++++++++++------------
 3 files changed, 71 insertions(+), 35 deletions(-)

diff --git a/examples/trees/cart.go b/examples/trees/cart.go
index 931b7e5..833e287 100644
--- a/examples/trees/cart.go
+++ b/examples/trees/cart.go
@@ -36,7 +36,6 @@ func main() {
 	fmt.Println(decTree.Evaluate(testData))
 
 	// Load House Price Data For Regression
-
 	regressionData, err := base.ParseCSVToInstances("../datasets/boston_house_prices.csv", false)
 	if err != nil {
 		panic(err)
diff --git a/trees/cart_classifier.go b/trees/cart_classifier.go
index dc54d19..9ae3b8e 100644
--- a/trees/cart_classifier.go
+++ b/trees/cart_classifier.go
@@ -10,9 +10,13 @@ import (
 	"github.com/sjwhitworth/golearn/base"
 )
 
-// The "c" prefix to function names indicates that they were tailored for classification
+const (
+	GINI    string = "gini"
+	ENTROPY string = "entropy"
+)
 
-// CNode is Node struct for Decision Tree Classifier
+// classifierNode is the node struct for the Decision Tree Classifier.
+// It holds the information for each split (which feature to use, what threshold, and which label to assign for each side of the split).
 type classifierNode struct {
 	Left       *classifierNode
 	Right      *classifierNode
@@ -25,6 +29,8 @@ type classifierNode struct {
 }
 
 // CARTDecisionTreeClassifier: Tree struct for Decision Tree Classifier
+// It contains the rootNode, as well as all of the hyperparameters chosen by the user.
+// It also keeps track of all splits done at the tree level.
 type CARTDecisionTreeClassifier struct {
 	RootNode    *classifierNode
 	criterion   string
@@ -84,7 +90,7 @@ func entropy(y []int64, labels []int64) (float64, int64) {
 	return entropy, maxLabel
 }
 
-// Split the data into left node and right node based on feature and threshold - only needed for fresh nodes
+// Split the data into left node and right node based on feature and threshold
 func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold float64) ([][]float64, [][]float64, []int64, []int64) {
 	var left [][]float64
 	var right [][]float64
@@ -105,7 +111,8 @@ func classifierCreateSplit(data [][]float64, feature int64, y []int64, threshold
 	return left, right, lefty, righty
 }
 
-// Helper Function to check if data point is unique or not
+// Helper Function to check if data point is unique or not.
+// We will use this to isolate unique values of a feature
 func classifierStringInSlice(a float64, list []float64) bool {
 	for _, b := range list {
 		if b == a {
@@ -115,7 +122,7 @@ func classifierStringInSlice(a float64, list []float64) bool {
 	return false
 }
 
-// Isolate only unique values. Needed for splitting data.
+// Isolate only unique values. This way, we can try only unique splits and not redundant ones.
 func classifierFindUnique(data []float64) []float64 {
 	var unique []float64
 	for i := range data {
@@ -126,7 +133,7 @@ func classifierFindUnique(data []float64) []float64 {
 	return unique
 }
 
-// Isolate only the feature being considered for splitting
+// Isolate only the feature being considered for splitting. Reduces the complexity in managing splits.
 func classifierGetFeature(data [][]float64, feature int64) []float64 {
 	var featureVals []float64
 	for i := range data {
@@ -135,7 +142,8 @@ func classifierGetFeature(data [][]float64, feature int64) []float64 {
 	return featureVals
 }
 
-// Function to Create New Decision Tree Classifier
+// Function to create a new Decision Tree Classifier.
+// It assigns the hyperparameters given by the user to the tree's attributes.
 func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64) *CARTDecisionTreeClassifier {
 	var tree CARTDecisionTreeClassifier
 	tree.criterion = strings.ToLower(criterion)
@@ -145,7 +153,8 @@ func NewDecisionTreeClassifier(criterion string, maxDepth int64, labels []int64)
 	return &tree
 }
 
-// Make sure that split being considered has not been done before
+// Make sure that split being considered has not been done before.
+// Else we will unnecessarily try splits that won't improve Impurity.
 func classifierValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	for i := range triedSplits {
 		split := triedSplits[i]
@@ -175,7 +184,7 @@ func classifierReOrderData(featureVal []float64, data [][]float64, y []int64) ([
 	return dataSorted, ySorted
 }
 
-// Change data in Left Node and Right Node based on change in threshold
+// Update the left and right side of the split based on the threshold.
 func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, righty []int64, feature int64, threshold float64) ([][]float64, []int64, [][]float64, []int64) {
 
 	for right[0][feature] < threshold {
@@ -188,7 +197,8 @@ func classifierUpdateSplit(left [][]float64, lefty []int64, right [][]float64, r
 	return left, lefty, right, righty
 }
 
-// Fit - Method visible to user to train tree
+// Fit - Creates an empty root node.
+// Trains the tree by calling the recursive function classifierBestSplit.
 func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	var emptyNode classifierNode
 
@@ -199,7 +209,8 @@ func (tree *CARTDecisionTreeClassifier) Fit(X base.FixedDataGrid) {
 	tree.RootNode = &emptyNode
 }
 
-// Iterativly find and record the best split - recursive function
+// Iteratively find and record the best split.
+// Stops if depth reaches maxDepth or nodes are pure.
 func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []int64, labels []int64, upperNode classifierNode, criterion string, maxDepth int64, depth int64) classifierNode {
 
 	// Ensure that we have not reached maxDepth. maxDepth =-1 means split until nodes are pure
@@ -214,9 +225,9 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	var origGini float64
 
 	// Calculate loss based on Criterion Specified by user
-	if criterion == "gini" {
+	if criterion == GINI {
 		origGini, upperNode.LeftLabel = giniImpurity(y, labels)
-	} else if criterion == "entropy" {
+	} else if criterion == ENTROPY {
 		origGini, upperNode.LeftLabel = entropy(y, labels)
 	} else {
 		panic("Invalid impurity function, choose from GINI or ENTROPY")
@@ -271,10 +282,10 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 					var leftLabels int64
 					var rightLabels int64
 
-					if criterion == "gini" {
+					if criterion == GINI {
 						leftGini, leftLabels = giniImpurity(lefty, labels)
 						rightGini, rightLabels = giniImpurity(righty, labels)
-					} else if criterion == "entropy" {
+					} else if criterion == ENTROPY {
 						leftGini, leftLabels = entropy(lefty, labels)
 						rightGini, rightLabels = entropy(righty, labels)
 					}
@@ -336,7 +347,8 @@ func classifierBestSplit(tree CARTDecisionTreeClassifier, data [][]float64, y []
 	return upperNode
 }
 
-// PrintTree : this function prints out entire tree for visualization - visible to user
+// String returns a textual representation of the entire tree for visualization.
+// Calls a recursive function to build the string - classifierPrintTreeFromNode.
 func (tree *CARTDecisionTreeClassifier) String() string {
 	rootNode := *tree.RootNode
 	return classifierPrintTreeFromNode(rootNode, "")
@@ -377,6 +389,7 @@ func classifierPrintTreeFromNode(tree classifierNode, spacing string) string {
 }
 
 // Predict a single data point by traversing the entire tree
+// Uses recursive logic to navigate the tree.
 func classifierPredictSingle(tree classifierNode, instance []float64) int64 {
 	if instance[tree.Feature] < tree.Threshold {
 		if tree.Left == nil {
@@ -393,14 +406,15 @@ func classifierPredictSingle(tree classifierNode, instance []float64) int64 {
 	}
 }
 
-// Predict is visible to user. Given test data, they receive predictions for every datapoint.
+// Given test data, return predictions for every datapoint. Calls classifierPredictFromNode.
 func (tree *CARTDecisionTreeClassifier) Predict(X_test base.FixedDataGrid) []int64 {
 	root := *tree.RootNode
 	test := classifierConvertInstancesToProblemVec(X_test)
 	return classifierPredictFromNode(root, test)
 }
 
-// This function uses the rootnode from Predict. It is invisible to user, but called from predict method.
+// This function uses the rootnode from Predict.
+// It iterates through every data point and calls the recursive function to give predictions and then summarizes them.
 func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 {
 	var preds []int64
 	for i := range test {
@@ -411,6 +425,8 @@ func classifierPredictFromNode(tree classifierNode, test [][]float64) []int64 {
 }
 
 // Given Test data and label, return the accuracy of the classifier.
+// First it retrieves predictions from the data, then compares them for accuracy.
+// Calls classifierEvaluateFromNode.
 func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float64 {
 	rootNode := *tree.RootNode
 	xTest := classifierConvertInstancesToProblemVec(test)
@@ -418,6 +434,7 @@ func (tree *CARTDecisionTreeClassifier) Evaluate(test base.FixedDataGrid) float6
 	return classifierEvaluateFromNode(rootNode, xTest, yTest)
 }
 
+// Retrieve predictions and then calculate accuracy.
 func classifierEvaluateFromNode(tree classifierNode, xTest [][]float64, yTest []int64) float64 {
 	preds := classifierPredictFromNode(tree, xTest)
 	accuracy := 0.0
diff --git a/trees/cart_regressor.go b/trees/cart_regressor.go
index d894db5..34b7880 100644
--- a/trees/cart_regressor.go
+++ b/trees/cart_regressor.go
@@ -10,9 +10,14 @@ import (
 	"github.com/sjwhitworth/golearn/base"
 )
 
-// The "r" prefix to all function names indicates that they were tailored to support regression.
+const (
+	MAE string = "mae"
+	MSE string = "mse"
+)
 
 // RNode - Node struct for Decision Tree Regressor
+// It holds the information for each split
+// Which feature to use, threshold, left prediction and right prediction
 type regressorNode struct {
 	Left      *regressorNode
 	Right     *regressorNode
@@ -24,6 +29,8 @@ type regressorNode struct {
 }
 
 // CARTDecisionTreeRegressor - Tree struct for Decision Tree Regressor
+// It contains the rootNode, as well as the hyperparameters chosen by user.
+// Also keeps track of splits used at tree level.
 type CARTDecisionTreeRegressor struct {
 	RootNode    *regressorNode
 	criterion   string
@@ -74,7 +81,7 @@ func mseImpurity(y []float64) (float64, float64) {
 	return meanSquaredError(y, yHat), yHat
 }
 
-// Split the data based on threshold and feature for testing information gain
+// Split the data into left and right based on threshold and feature.
 func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshold float64) ([][]float64, [][]float64, []float64, []float64) {
 	var left [][]float64
 	var lefty []float64
@@ -95,7 +102,8 @@ func regressorCreateSplit(data [][]float64, feature int64, y []float64, threshol
 	return left, right, lefty, righty
 }
 
-// Helper function for finding unique values
+// Helper function for finding unique values.
+// Used for isolating unique values in a feature.
 func regressorStringInSlice(a float64, list []float64) bool {
 	for _, b := range list {
 		if b == a {
@@ -105,7 +113,8 @@ func regressorStringInSlice(a float64, list []float64) bool {
 	return false
 }
 
-// Return only unique values of a feature
+// Isolate only unique values.
+// This way we only try unique splits.
 func regressorFindUnique(data []float64) []float64 {
 	var unique []float64
 	for i := range data {
@@ -116,7 +125,8 @@ func regressorFindUnique(data []float64) []float64 {
 	return unique
 }
 
-// Extract out a single feature from data
+// Extract out a single feature from data.
+// Reduces complexity in managing splits and sorting
 func regressorGetFeature(data [][]float64, feature int64) []float64 {
 	var featureVals []float64
 	for i := range data {
@@ -125,7 +135,7 @@ func regressorGetFeature(data [][]float64, feature int64) []float64 {
 	return featureVals
 }
 
-// Interface for creating new Decision Tree Regressor - cals rbestSplit()
+// Interface for creating new Decision Tree Regressor
 func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTreeRegressor {
 	var tree CARTDecisionTreeRegressor
 	tree.maxDepth = maxDepth
@@ -134,6 +144,7 @@ func NewDecisionTreeRegressor(criterion string, maxDepth int64) *CARTDecisionTre
 }
 
 // Validate that the split being tested has not been done before.
+// This prevents redundant splits from happening.
 func regressorValidate(triedSplits [][]float64, feature int64, threshold float64) bool {
 	for i := range triedSplits {
 		split := triedSplits[i]
@@ -146,6 +157,7 @@ func regressorValidate(triedSplits [][]float64, feature int64, threshold float64
 }
 
 // Re order data based on a feature for optimizing code
+// Helps in updating splits without reiterating entire dataset
 func regressorReOrderData(featureVal []float64, data [][]float64, y []float64) ([][]float64, []float64) {
 	s := NewSlice(featureVal)
 	sort.Sort(s)
@@ -176,7 +188,8 @@ func regressorUpdateSplit(left [][]float64, lefty []float64, right [][]float64,
 	return left, lefty, right, righty
 }
 
-// Extra Method for creating simple to use interface. Many params are either redundant for user but are needed only for recursive logic.
+// Fit - Build the tree using the data
+// Creates empty root node and builds tree by calling regressorBestSplit
 func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 	var emptyNode regressorNode
 	data := regressorConvertInstancesToProblemVec(X)
@@ -187,7 +200,8 @@ func (tree *CARTDecisionTreeRegressor) Fit(X base.FixedDataGrid) {
 	tree.RootNode = &emptyNode
 }
 
-// Essentially the Fit Method - Impelements recursive logic
+// Builds the tree by iteratively finding the best split.
+// Recursive function - stops if maxDepth is reached or nodes are pure
 func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []float64, upperNode regressorNode, criterion string, maxDepth int64, depth int64) regressorNode {
 
 	depth++
@@ -200,10 +214,12 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	var bestLoss float64
 	var origLoss float64
 
-	if criterion == "mae" {
+	if criterion == MAE {
 		origLoss, upperNode.LeftPred = maeImpurity(y)
-	} else {
+	} else if criterion == MSE {
 		origLoss, upperNode.LeftPred = mseImpurity(y)
+	} else {
+		panic("Invalid impurity function, choose from MAE or MSE")
 	}
 
 	bestLoss = origLoss
@@ -252,10 +268,10 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 					var leftPred float64
 					var rightPred float64
 
-					if criterion == "mae" {
+					if criterion == MAE {
 						leftLoss, leftPred = maeImpurity(lefty)
 						rightLoss, rightPred = maeImpurity(righty)
-					} else {
+					} else if criterion == MSE {
 						leftLoss, leftPred = mseImpurity(lefty)
 						rightLoss, rightPred = mseImpurity(righty)
 					}
@@ -312,12 +328,13 @@ func regressorBestSplit(tree CARTDecisionTreeRegressor, data [][]float64, y []fl
 	return upperNode
 }
 
-// Print Tree for Visualtion - calls printTreeFromNode()
+// Print Tree for visualization - calls regressorPrintTreeFromNode()
 func (tree *CARTDecisionTreeRegressor) String() string {
 	rootNode := *tree.RootNode
 	return regressorPrintTreeFromNode(rootNode, "")
 }
 
+// Recursively explore the entire tree and print out all details such as threshold, feature, prediction
 func regressorPrintTreeFromNode(tree regressorNode, spacing string) string {
 	returnString := ""
 	returnString += spacing + "Feature "
@@ -353,7 +370,8 @@ func regressorPrintTreeFromNode(tree regressorNode, spacing string) string {
 	return returnString
 }
 
-// Predict a single data point
+// Predict a single data point by walking down the tree from the given node.
+// Uses recursive logic.
 func regressorPredictSingle(tree regressorNode, instance []float64) float64 {
 	if instance[tree.Feature] < tree.Threshold {
 		if tree.Left == nil {
@@ -370,14 +388,16 @@ func regressorPredictSingle(tree regressorNode, instance []float64) float64 {
 	}
 }
 
-// Predict method for multiple data points. Calls predictFromNode()
+// Predict method for multiple data points.
+// First converts input data into usable format, and then calls regressorPredictFromNode
 func (tree *CARTDecisionTreeRegressor) Predict(X_test base.FixedDataGrid) []float64 {
 	root := *tree.RootNode
 	test := regressorConvertInstancesToProblemVec(X_test)
 	return regressorPredictFromNode(root, test)
 }
 
-// Use tree's root node to print out entire tree
+// Use the tree's root node to predict every data point.
+// Iterates over all data points and calls regressorPredictSingle to predict individual datapoints.
 func regressorPredictFromNode(tree regressorNode, test [][]float64) []float64 {
 	var preds []float64
 	for i := range test {