1
0
mirror of https://github.com/sjwhitworth/golearn.git synced 2025-04-25 13:48:49 +08:00

base: Cleaned up duplicate Attribute resolution functions

This commit is contained in:
Richard Townsend 2014-08-03 12:31:26 +01:00
parent ff97065261
commit 47341b2869
15 changed files with 151 additions and 155 deletions

View File

@ -129,7 +129,7 @@ func ParseCSVBuildInstances(filepath string, hasHeaders bool, u UpdatableDataGri
rowCounter := 0
specs := ResolveAllAttributes(u, u.AllAttributes())
specs := ResolveAttributes(u, u.AllAttributes())
for {
record, err := reader.Read()

View File

@ -379,7 +379,7 @@ func (inst *DenseInstances) Size() (int, int) {
// swapRows swaps over rows i and j
func (inst *DenseInstances) swapRows(i, j int) {
as := GetAllAttributeSpecs(inst)
as := ResolveAllAttributes(inst)
for _, a := range as {
v1 := inst.Get(a, i)
v2 := inst.Get(a, j)
@ -424,7 +424,7 @@ func (inst *DenseInstances) String() string {
var buffer bytes.Buffer
// Get all Attribute information
as := GetAllAttributeSpecs(inst)
as := ResolveAllAttributes(inst)
// Print header
cols, rows := inst.Size()

View File

@ -153,7 +153,7 @@ func (l *LazilyFilteredInstances) MapOverRows(asv []AttributeSpec, mapFunc func(
func (l *LazilyFilteredInstances) RowString(row int) string {
var buffer bytes.Buffer
as := GetAllAttributeSpecs(l) // Retrieve all Attribute data
as := ResolveAllAttributes(l) // Retrieve all Attribute data
first := true // Decide whether to prefix
for _, a := range as {
@ -188,7 +188,7 @@ func (l *LazilyFilteredInstances) String() string {
}
// Get all Attribute information
as := GetAllAttributeSpecs(l)
as := ResolveAllAttributes(l)
// Print header
buffer.WriteString("Lazily filtered instances using ")

View File

@ -17,8 +17,8 @@ func TestLazySortDesc(testEnv *testing.T) {
return
}
as1 := GetAllAttributeSpecs(inst1)
as2 := GetAllAttributeSpecs(inst2)
as1 := ResolveAllAttributes(inst1)
as2 := ResolveAllAttributes(inst2)
if isSortedDesc(inst1, as1[0]) {
testEnv.Error("Can't test descending sort order")
@ -44,7 +44,7 @@ func TestLazySortDesc(testEnv *testing.T) {
func TestLazySortAsc(testEnv *testing.T) {
inst, err := ParseCSVToInstances("../examples/datasets/iris_headers.csv", true)
as1 := GetAllAttributeSpecs(inst)
as1 := ResolveAllAttributes(inst)
if isSortedAsc(inst, as1[0]) {
testEnv.Error("Can't test ascending sort on something ascending already")
}
@ -67,7 +67,7 @@ func TestLazySortAsc(testEnv *testing.T) {
testEnv.Error(err)
return
}
as2 := GetAllAttributeSpecs(inst2)
as2 := ResolveAllAttributes(inst2)
if !isSortedAsc(inst2, as2[0]) {
testEnv.Error("This file should be sorted in ascending order")
}

View File

@ -44,8 +44,8 @@ func TestSortDesc(testEnv *testing.T) {
return
}
as1 := GetAllAttributeSpecs(inst1)
as2 := GetAllAttributeSpecs(inst2)
as1 := ResolveAllAttributes(inst1)
as2 := ResolveAllAttributes(inst2)
if isSortedDesc(inst1, as1[0]) {
testEnv.Error("Can't test descending sort order")
@ -71,7 +71,7 @@ func TestSortDesc(testEnv *testing.T) {
func TestSortAsc(testEnv *testing.T) {
inst, err := ParseCSVToInstances("../examples/datasets/iris_headers.csv", true)
as1 := GetAllAttributeSpecs(inst)
as1 := ResolveAllAttributes(inst)
if isSortedAsc(inst, as1[0]) {
testEnv.Error("Can't test ascending sort on something ascending already")
}
@ -90,7 +90,7 @@ func TestSortAsc(testEnv *testing.T) {
testEnv.Error(err)
return
}
as2 := GetAllAttributeSpecs(inst2)
as2 := ResolveAllAttributes(inst2)
if !isSortedAsc(inst2, as2[0]) {
testEnv.Error("This file should be sorted in ascending order")
}

View File

@ -38,9 +38,9 @@ func NonClassAttributes(d DataGrid) []Attribute {
return AttributeDifferenceReferences(allAttrs, classAttrs)
}
// ResolveAllAttributes returns AttributeSpecs describing
// ResolveAttributes returns AttributeSpecs describing
// all of the Attributes.
func ResolveAllAttributes(d DataGrid, attrs []Attribute) []AttributeSpec {
func ResolveAttributes(d DataGrid, attrs []Attribute) []AttributeSpec {
ret := make([]AttributeSpec, len(attrs))
for i, a := range attrs {
spec, err := d.GetAttribute(a)
@ -52,25 +52,9 @@ func ResolveAllAttributes(d DataGrid, attrs []Attribute) []AttributeSpec {
return ret
}
// GetAllAttributeSpecs retrieves every Attribute specification
// from a given DataGrid. Useful in conjunction with MapOverRows.
func GetAllAttributeSpecs(from DataGrid) []AttributeSpec {
attrs := from.AllAttributes()
return GetSomeAttributeSpecs(from, attrs)
}
// GetSomeAttributeSpecs returns a subset of Attribute specifications
// from a given DataGrid.
func GetSomeAttributeSpecs(from DataGrid, attrs []Attribute) []AttributeSpec {
ret := make([]AttributeSpec, len(attrs))
for i, a := range attrs {
as, err := from.GetAttribute(a)
if err != nil {
panic(err)
}
ret[i] = as
}
return ret
// ResolveAllAttributes returns an AttributeSpec for every Attribute
// held by d, by delegating to ResolveAttributes over d.AllAttributes().
// Useful in conjunction with MapOverRows.
func ResolveAllAttributes(d DataGrid) []AttributeSpec {
return ResolveAttributes(d, d.AllAttributes())
}
func buildAttrSet(a []Attribute) map[Attribute]bool {

View File

@ -144,7 +144,7 @@ func DecomposeOnAttributeValues(inst FixedDataGrid, at Attribute) map[string]Fix
rowMaps := make(map[string][]int)
// Build full Attribute set
fullAttrSpec := ResolveAllAttributes(inst, newAttrs)
fullAttrSpec := ResolveAttributes(inst, newAttrs)
fullAttrSpec = append(fullAttrSpec, attrSpec)
// Decompose

View File

@ -78,7 +78,7 @@ func NewInstancesViewFromRows(src FixedDataGrid, rows map[int]int) *InstancesVie
func NewInstancesViewFromVisible(src FixedDataGrid, rows []int, attrs []Attribute) *InstancesView {
ret := &InstancesView{
src,
GetSomeAttributeSpecs(src, attrs),
ResolveAttributes(src, attrs),
make(map[int]int),
make(map[Attribute]bool),
true,
@ -99,7 +99,7 @@ func NewInstancesViewFromVisible(src FixedDataGrid, rows []int, attrs []Attribut
func NewInstancesViewFromAttrs(src FixedDataGrid, attrs []Attribute) *InstancesView {
ret := &InstancesView{
src,
GetSomeAttributeSpecs(src, attrs),
ResolveAttributes(src, attrs),
nil,
make(map[Attribute]bool),
false,
@ -252,7 +252,7 @@ func (v *InstancesView) String() string {
maxRows := 30
// Get all Attribute information
as := GetAllAttributeSpecs(v)
as := ResolveAllAttributes(v)
// Print header
cols, rows := v.Size()
@ -305,7 +305,7 @@ func (v *InstancesView) String() string {
// RowString returns a string representation of a given row.
func (v *InstancesView) RowString(row int) string {
var buffer bytes.Buffer
as := GetAllAttributeSpecs(v)
as := ResolveAllAttributes(v)
first := true
for _, a := range as {
val := v.Get(a, row)

View File

@ -46,7 +46,7 @@ func main() {
// for doing so is not very sophisticated.
// First, have to resolve Attribute Specifications
as := base.ResolveAllAttributes(rawData, rawData.AllAttributes())
as := base.ResolveAttributes(rawData, rawData.AllAttributes())
// Attribute Specifications describe where a given column lives
rawData.Set(as[0], 0, as[0].GetAttribute().GetSysValFromString("1.00"))

View File

@ -112,7 +112,7 @@ func TestChiMerge2(testEnv *testing.T) {
// Sort the instances
allAttrs := inst.AllAttributes()
sortAttrSpecs := base.ResolveAllAttributes(inst, allAttrs)[0:1]
sortAttrSpecs := base.ResolveAttributes(inst, allAttrs)[0:1]
instSorted, err := base.Sort(inst, base.Ascending, sortAttrSpecs)
if err != nil {
panic(err)

View File

@ -65,8 +65,8 @@ func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
ret := base.GeneratePredictionVector(what)
// Resolve Attribute specifications for both
whatAttrSpecs := base.ResolveAllAttributes(what, allNumericAttrs)
trainAttrSpecs := base.ResolveAllAttributes(KNN.TrainingData, allNumericAttrs)
whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs)
trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs)
// Reserve storage for most the most similar items
distances := make(map[int]float64)

View File

@ -34,7 +34,7 @@ func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
// Retrieve numeric non-class Attributes
numericAttrs := base.NonClassFloatAttributes(X)
numericAttrSpecs := base.ResolveAllAttributes(X, numericAttrs)
numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
// Convert each row
X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
@ -66,7 +66,7 @@ func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 {
_, rows := X.Size()
labelVec := make([]float64, rows)
// Resolve class Attribute specification
classAttrSpecs := base.ResolveAllAttributes(X, classAttrs)
classAttrSpecs := base.ResolveAttributes(X, classAttrs)
X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
labelVec[rowNo] = base.UnpackBytesToFloat(row[0])
return true, nil
@ -90,10 +90,10 @@ func (lr *LogisticRegression) Predict(X base.FixedDataGrid) base.FixedDataGrid {
}
// Generate return structure
ret := base.GeneratePredictionVector(X)
classAttrSpecs := base.ResolveAllAttributes(ret, classAttrs)
classAttrSpecs := base.ResolveAttributes(ret, classAttrs)
// Retrieve numeric non-class Attributes
numericAttrs := base.NonClassFloatAttributes(X)
numericAttrSpecs := base.ResolveAllAttributes(X, numericAttrs)
numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
// Allocate row storage
row := make([]float64, len(numericAttrSpecs))

View File

@ -112,7 +112,7 @@ func (b *BaggedModel) Predict(from base.FixedDataGrid) base.FixedDataGrid {
for { // Need to resolve the voting problem
incoming, ok := <-votes
if ok {
cSpecs := base.ResolveAllAttributes(incoming, incoming.AllClassAttributes())
cSpecs := base.ResolveAttributes(incoming, incoming.AllClassAttributes())
incoming.MapOverRows(cSpecs, func(row [][]byte, predRow int) (bool, error) {
// Check if we've seen this class before...
if _, ok := voting[predRow]; !ok {

View File

@ -1,8 +1,8 @@
package naive
import (
"math"
base "github.com/sjwhitworth/golearn/base"
base "github.com/sjwhitworth/golearn/base"
"math"
)
// A Bernoulli Naive Bayes Classifier. Naive Bayes classifiers assumes
@ -37,91 +37,103 @@ import (
// Information Retrieval. Cambridge University Press, pp. 234-265.
// http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
base.BaseEstimator
// Conditional probability for each term. This vector should be
// accessed in the following way: p(f|c) = condProb[c][f].
// Logarithm is used in order to avoid underflow.
condProb map[string][]float64
// Number of instances in each class. This is necessary in order to
// calculate the laplace smooth value during the Predict step.
classInstances map[string]int
// Number of instances used in training.
trainingInstances int
// Number of features in the training set
features int
base.BaseEstimator
// Conditional probability for each term. This vector should be
// accessed in the following way: p(f|c) = condProb[c][f].
// Logarithm is used in order to avoid underflow.
condProb map[string][]float64
// Number of instances in each class. This is necessary in order to
// calculate the laplace smooth value during the Predict step.
classInstances map[string]int
// Number of instances used in training.
trainingInstances int
// Number of features in the training set
features int
}
// Create a new Bernoulli Naive Bayes Classifier. The argument 'classes'
// is the number of possible labels in the classification task.
func NewBernoulliNBClassifier() *BernoulliNBClassifier {
nb := BernoulliNBClassifier{}
nb.condProb = make(map[string][]float64)
nb.features = 0
nb.trainingInstances = 0
return &nb
nb := BernoulliNBClassifier{}
nb.condProb = make(map[string][]float64)
nb.features = 0
nb.trainingInstances = 0
return &nb
}
// Fill data matrix with Bernoulli Naive Bayes model. All values
// necessary for calculating prior probability and p(f_i)
func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) {
// Number of features and instances in this training set
nb.trainingInstances = X.Rows
nb.features = 0
if X.Rows > 0 {
nb.features = len(X.GetRowVectorWithoutClass(0))
}
// Check that all Attributes are binary
classAttrs := X.AllClassAttributes()
allAttrs := X.AllAttributes()
featAttrs := base.AttributeDifferenceReference(allAttrs, classAttrs)
for i := range featAttrs {
if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok {
panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i]))
}
}
featAttrSpecs := base.ResolveAllAttributes(featAttrs, X)
// Number of instances in class
nb.classInstances = make(map[string]int)
// Check that only one classAttribute is defined
if len(classAttrs) > 0 {
panic("Only one class Attribute can be used")
}
// Number of documents with given term (by class)
docsContainingTerm := make(map[string][]int)
// Number of features and instances in this training set
nb.features, nb.trainingInstances() = X.Size()
// This algorithm could be vectorized after binarizing the data
// matrix. Since mat64 doesn't have this function, a iterative
// version is used.
for r := 0; r < X.Rows; r++ {
class := X.GetClass(r)
docVector := X.GetRowVectorWithoutClass(r)
// Number of instances in class
nb.classInstances = make(map[string]int)
// increment number of instances in class
t, ok := nb.classInstances[class]
if !ok { t = 0 }
nb.classInstances[class] = t + 1
// Number of documents with given term (by class)
docsContainingTerm := make(map[string][]int)
// This algorithm could be vectorized after binarizing the data
// matrix. Since mat64 doesn't have this function, a iterative
// version is used.
X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) {
class := base.GetClass(X, r)
for feat := 0; feat < len(docVector); feat++ {
v := docVector[feat]
// In Bernoulli Naive Bayes the presence and absence of
// features are considered. All non-zero values are
// treated as presence.
if v > 0 {
// Update number of times this feature appeared within
// given label.
t, ok := docsContainingTerm[class]
if !ok {
t = make([]int, nb.features)
docsContainingTerm[class] = t
}
t[feat] += 1
}
}
}
// increment number of instances in class
t, ok := nb.classInstances[class]
if !ok {
t = 0
}
nb.classInstances[class] = t + 1
// Pre-calculate conditional probabilities for each class
for c, _ := range nb.classInstances {
nb.condProb[c] = make([]float64, nb.features)
for feat := 0; feat < nb.features; feat++ {
classTerms, _ := docsContainingTerm[c]
numDocs := classTerms[feat]
docsInClass, _ := nb.classInstances[c]
for feat := 0; feat < len(docVector); feat++ {
v := docVector[feat]
// In Bernoulli Naive Bayes the presence and absence of
// features are considered. All non-zero values are
// treated as presence.
if v[0] > 0 {
// Update number of times this feature appeared within
// given label.
t, ok := docsContainingTerm[class]
if !ok {
t = make([]int, nb.features)
docsContainingTerm[class] = t
}
t[feat] += 1
}
}
})
classCondProb, _ := nb.condProb[c]
// Calculate conditional probability with laplace smoothing
classCondProb[feat] = float64(numDocs + 1) / float64(docsInClass + 1)
}
}
// Pre-calculate conditional probabilities for each class
for c, _ := range nb.classInstances {
nb.condProb[c] = make([]float64, nb.features)
for feat := 0; feat < nb.features; feat++ {
classTerms, _ := docsContainingTerm[c]
numDocs := classTerms[feat]
docsInClass, _ := nb.classInstances[c]
classCondProb, _ := nb.condProb[c]
// Calculate conditional probability with laplace smoothing
classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1)
}
}
}
// Use trained model to predict test vector's class. The following
@ -134,43 +146,43 @@ func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
// IMPORTANT: PredictOne panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string {
if nb.features == 0 {
panic("Fit should be called before predicting")
}
if nb.features == 0 {
panic("Fit should be called before predicting")
}
if len(vector) != nb.features {
panic("Different dimensions in Train and Test sets")
}
if len(vector) != nb.features {
panic("Different dimensions in Train and Test sets")
}
// Currently only the predicted class is returned.
bestScore := -math.MaxFloat64
bestClass := ""
// Currently only the predicted class is returned.
bestScore := -math.MaxFloat64
bestClass := ""
for class, classCount := range nb.classInstances {
// Init classScore with log(prior)
classScore := math.Log((float64(classCount))/float64(nb.trainingInstances))
for f := 0; f < nb.features; f++ {
if vector[f] > 0 {
// Test document has feature c
classScore += math.Log(nb.condProb[class][f])
} else {
if nb.condProb[class][f] == 1.0 {
// special case when prob = 1.0, consider laplace
// smooth
classScore += math.Log(1.0 / float64(nb.classInstances[class] + 1))
} else {
classScore += math.Log(1.0 - nb.condProb[class][f])
}
}
}
for class, classCount := range nb.classInstances {
// Init classScore with log(prior)
classScore := math.Log((float64(classCount)) / float64(nb.trainingInstances))
for f := 0; f < nb.features; f++ {
if vector[f] > 0 {
// Test document has feature c
classScore += math.Log(nb.condProb[class][f])
} else {
if nb.condProb[class][f] == 1.0 {
// special case when prob = 1.0, consider laplace
// smooth
classScore += math.Log(1.0 / float64(nb.classInstances[class]+1))
} else {
classScore += math.Log(1.0 - nb.condProb[class][f])
}
}
}
if classScore > bestScore {
bestScore = classScore
bestClass = class
}
}
if classScore > bestScore {
bestScore = classScore
bestClass = class
}
}
return bestClass
return bestClass
}
// Predict is just a wrapper for the PredictOne function.
@ -178,9 +190,9 @@ func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string {
// IMPORTANT: Predict panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) Predict(what *base.Instances) *base.Instances {
ret := what.GeneratePredictionVector()
for i := 0; i < what.Rows; i++ {
ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i)))
}
return ret
ret := what.GeneratePredictionVector()
for i := 0; i < what.Rows; i++ {
ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i)))
}
return ret
}

View File

@ -203,7 +203,7 @@ func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) base.FixedDataGrid {
panic(err)
}
predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes())
predAttrSpecs := base.ResolveAllAttributes(what, predAttrs)
predAttrSpecs := base.ResolveAttributes(what, predAttrs)
what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
cur := d
for {