From 47341b2869ef07e989475b6ab6b3292b8413a86a Mon Sep 17 00:00:00 2001 From: Richard Townsend Date: Sun, 3 Aug 2014 12:31:26 +0100 Subject: [PATCH] base: Cleaned up duplicate Attribute resolution functions --- base/csv.go | 2 +- base/dense.go | 4 +- base/filtered.go | 4 +- base/lazy_sort_test.go | 8 +- base/sort_test.go | 8 +- base/util_attributes.go | 26 +--- base/util_instances.go | 2 +- base/view.go | 8 +- examples/instances/instances.go | 2 +- filters/chimerge_test.go | 2 +- knn/knn.go | 4 +- linear_models/logistic.go | 8 +- meta/bagging.go | 2 +- naive/bernoulli_nb.go | 224 +++++++++++++++++--------------- trees/id3.go | 2 +- 15 files changed, 151 insertions(+), 155 deletions(-) diff --git a/base/csv.go b/base/csv.go index 95bcf27..7d5e43e 100644 --- a/base/csv.go +++ b/base/csv.go @@ -129,7 +129,7 @@ func ParseCSVBuildInstances(filepath string, hasHeaders bool, u UpdatableDataGri rowCounter := 0 - specs := ResolveAllAttributes(u, u.AllAttributes()) + specs := ResolveAttributes(u, u.AllAttributes()) for { record, err := reader.Read() diff --git a/base/dense.go b/base/dense.go index 19d4689..b94df5f 100644 --- a/base/dense.go +++ b/base/dense.go @@ -379,7 +379,7 @@ func (inst *DenseInstances) Size() (int, int) { // swapRows swaps over rows i and j func (inst *DenseInstances) swapRows(i, j int) { - as := GetAllAttributeSpecs(inst) + as := ResolveAllAttributes(inst) for _, a := range as { v1 := inst.Get(a, i) v2 := inst.Get(a, j) @@ -424,7 +424,7 @@ func (inst *DenseInstances) String() string { var buffer bytes.Buffer // Get all Attribute information - as := GetAllAttributeSpecs(inst) + as := ResolveAllAttributes(inst) // Print header cols, rows := inst.Size() diff --git a/base/filtered.go b/base/filtered.go index 8b3e7b2..941d2f1 100644 --- a/base/filtered.go +++ b/base/filtered.go @@ -153,7 +153,7 @@ func (l *LazilyFilteredInstances) MapOverRows(asv []AttributeSpec, mapFunc func( func (l *LazilyFilteredInstances) RowString(row int) string { var buffer 
bytes.Buffer - as := GetAllAttributeSpecs(l) // Retrieve all Attribute data + as := ResolveAllAttributes(l) // Retrieve all Attribute data first := true // Decide whether to prefix for _, a := range as { @@ -188,7 +188,7 @@ func (l *LazilyFilteredInstances) String() string { } // Get all Attribute information - as := GetAllAttributeSpecs(l) + as := ResolveAllAttributes(l) // Print header buffer.WriteString("Lazily filtered instances using ") diff --git a/base/lazy_sort_test.go b/base/lazy_sort_test.go index 8ab2ecd..5f5e23f 100644 --- a/base/lazy_sort_test.go +++ b/base/lazy_sort_test.go @@ -17,8 +17,8 @@ func TestLazySortDesc(testEnv *testing.T) { return } - as1 := GetAllAttributeSpecs(inst1) - as2 := GetAllAttributeSpecs(inst2) + as1 := ResolveAllAttributes(inst1) + as2 := ResolveAllAttributes(inst2) if isSortedDesc(inst1, as1[0]) { testEnv.Error("Can't test descending sort order") @@ -44,7 +44,7 @@ func TestLazySortDesc(testEnv *testing.T) { func TestLazySortAsc(testEnv *testing.T) { inst, err := ParseCSVToInstances("../examples/datasets/iris_headers.csv", true) - as1 := GetAllAttributeSpecs(inst) + as1 := ResolveAllAttributes(inst) if isSortedAsc(inst, as1[0]) { testEnv.Error("Can't test ascending sort on something ascending already") } @@ -67,7 +67,7 @@ func TestLazySortAsc(testEnv *testing.T) { testEnv.Error(err) return } - as2 := GetAllAttributeSpecs(inst2) + as2 := ResolveAllAttributes(inst2) if !isSortedAsc(inst2, as2[0]) { testEnv.Error("This file should be sorted in ascending order") } diff --git a/base/sort_test.go b/base/sort_test.go index ea24ed0..cbddb83 100644 --- a/base/sort_test.go +++ b/base/sort_test.go @@ -44,8 +44,8 @@ func TestSortDesc(testEnv *testing.T) { return } - as1 := GetAllAttributeSpecs(inst1) - as2 := GetAllAttributeSpecs(inst2) + as1 := ResolveAllAttributes(inst1) + as2 := ResolveAllAttributes(inst2) if isSortedDesc(inst1, as1[0]) { testEnv.Error("Can't test descending sort order") @@ -71,7 +71,7 @@ func TestSortDesc(testEnv 
*testing.T) { func TestSortAsc(testEnv *testing.T) { inst, err := ParseCSVToInstances("../examples/datasets/iris_headers.csv", true) - as1 := GetAllAttributeSpecs(inst) + as1 := ResolveAllAttributes(inst) if isSortedAsc(inst, as1[0]) { testEnv.Error("Can't test ascending sort on something ascending already") } @@ -90,7 +90,7 @@ func TestSortAsc(testEnv *testing.T) { testEnv.Error(err) return } - as2 := GetAllAttributeSpecs(inst2) + as2 := ResolveAllAttributes(inst2) if !isSortedAsc(inst2, as2[0]) { testEnv.Error("This file should be sorted in ascending order") } diff --git a/base/util_attributes.go b/base/util_attributes.go index 84f789c..5259cad 100644 --- a/base/util_attributes.go +++ b/base/util_attributes.go @@ -38,9 +38,9 @@ func NonClassAttributes(d DataGrid) []Attribute { return AttributeDifferenceReferences(allAttrs, classAttrs) } -// ResolveAllAttributes returns AttributeSpecs describing +// ResolveAttributes returns AttributeSpecs describing // all of the Attributes. -func ResolveAllAttributes(d DataGrid, attrs []Attribute) []AttributeSpec { +func ResolveAttributes(d DataGrid, attrs []Attribute) []AttributeSpec { ret := make([]AttributeSpec, len(attrs)) for i, a := range attrs { spec, err := d.GetAttribute(a) @@ -52,25 +52,9 @@ func ResolveAllAttributes(d DataGrid, attrs []Attribute) []AttributeSpec { return ret } -// GetAllAttributeSpecs retrieves every Attribute specification -// from a given DataGrid. Useful in conjunction with MapOverRows. -func GetAllAttributeSpecs(from DataGrid) []AttributeSpec { - attrs := from.AllAttributes() - return GetSomeAttributeSpecs(from, attrs) -} - -// GetSomeAttributeSpecs returns a subset of Attribute specifications -// from a given DataGrid. 
-func GetSomeAttributeSpecs(from DataGrid, attrs []Attribute) []AttributeSpec { - ret := make([]AttributeSpec, len(attrs)) - for i, a := range attrs { - as, err := from.GetAttribute(a) - if err != nil { - panic(err) - } - ret[i] = as - } - return ret +// ResolveAllAttributes returns every AttributeSpec +func ResolveAllAttributes(d DataGrid) []AttributeSpec { + return ResolveAttributes(d, d.AllAttributes()) } func buildAttrSet(a []Attribute) map[Attribute]bool { diff --git a/base/util_instances.go b/base/util_instances.go index cf517c3..75c908f 100644 --- a/base/util_instances.go +++ b/base/util_instances.go @@ -144,7 +144,7 @@ func DecomposeOnAttributeValues(inst FixedDataGrid, at Attribute) map[string]Fix rowMaps := make(map[string][]int) // Build full Attribute set - fullAttrSpec := ResolveAllAttributes(inst, newAttrs) + fullAttrSpec := ResolveAttributes(inst, newAttrs) fullAttrSpec = append(fullAttrSpec, attrSpec) // Decompose diff --git a/base/view.go b/base/view.go index 0b8119d..99d59d8 100644 --- a/base/view.go +++ b/base/view.go @@ -78,7 +78,7 @@ func NewInstancesViewFromRows(src FixedDataGrid, rows map[int]int) *InstancesVie func NewInstancesViewFromVisible(src FixedDataGrid, rows []int, attrs []Attribute) *InstancesView { ret := &InstancesView{ src, - GetSomeAttributeSpecs(src, attrs), + ResolveAttributes(src, attrs), make(map[int]int), make(map[Attribute]bool), true, @@ -99,7 +99,7 @@ func NewInstancesViewFromVisible(src FixedDataGrid, rows []int, attrs []Attribut func NewInstancesViewFromAttrs(src FixedDataGrid, attrs []Attribute) *InstancesView { ret := &InstancesView{ src, - GetSomeAttributeSpecs(src, attrs), + ResolveAttributes(src, attrs), nil, make(map[Attribute]bool), false, @@ -252,7 +252,7 @@ func (v *InstancesView) String() string { maxRows := 30 // Get all Attribute information - as := GetAllAttributeSpecs(v) + as := ResolveAllAttributes(v) // Print header cols, rows := v.Size() @@ -305,7 +305,7 @@ func (v *InstancesView) String() string { // 
RowString returns a string representation of a given row. func (v *InstancesView) RowString(row int) string { var buffer bytes.Buffer - as := GetAllAttributeSpecs(v) + as := ResolveAllAttributes(v) first := true for _, a := range as { val := v.Get(a, row) diff --git a/examples/instances/instances.go b/examples/instances/instances.go index e0c3efe..3f507d7 100644 --- a/examples/instances/instances.go +++ b/examples/instances/instances.go @@ -46,7 +46,7 @@ func main() { // for doing so is not very sophisticated. // First, have to resolve Attribute Specifications - as := base.ResolveAllAttributes(rawData, rawData.AllAttributes()) + as := base.ResolveAttributes(rawData, rawData.AllAttributes()) // Attribute Specifications describe where a given column lives rawData.Set(as[0], 0, as[0].GetAttribute().GetSysValFromString("1.00")) diff --git a/filters/chimerge_test.go b/filters/chimerge_test.go index 8b57d56..94a4615 100644 --- a/filters/chimerge_test.go +++ b/filters/chimerge_test.go @@ -112,7 +112,7 @@ func TestChiMerge2(testEnv *testing.T) { // Sort the instances allAttrs := inst.AllAttributes() - sortAttrSpecs := base.ResolveAllAttributes(inst, allAttrs)[0:1] + sortAttrSpecs := base.ResolveAttributes(inst, allAttrs)[0:1] instSorted, err := base.Sort(inst, base.Ascending, sortAttrSpecs) if err != nil { panic(err) diff --git a/knn/knn.go b/knn/knn.go index cc1227b..2b7a2b5 100644 --- a/knn/knn.go +++ b/knn/knn.go @@ -65,8 +65,8 @@ func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid { ret := base.GeneratePredictionVector(what) // Resolve Attribute specifications for both - whatAttrSpecs := base.ResolveAllAttributes(what, allNumericAttrs) - trainAttrSpecs := base.ResolveAllAttributes(KNN.TrainingData, allNumericAttrs) + whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs) + trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs) // Reserve storage for most the most similar items distances := make(map[int]float64) 
diff --git a/linear_models/logistic.go b/linear_models/logistic.go index 647ca0f..d3af914 100644 --- a/linear_models/logistic.go +++ b/linear_models/logistic.go @@ -34,7 +34,7 @@ func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(X) - numericAttrSpecs := base.ResolveAllAttributes(X, numericAttrs) + numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) // Convert each row X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { @@ -66,7 +66,7 @@ func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 { _, rows := X.Size() labelVec := make([]float64, rows) // Resolve class Attribute specification - classAttrSpecs := base.ResolveAllAttributes(X, classAttrs) + classAttrSpecs := base.ResolveAttributes(X, classAttrs) X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { labelVec[rowNo] = base.UnpackBytesToFloat(row[0]) return true, nil @@ -90,10 +90,10 @@ func (lr *LogisticRegression) Predict(X base.FixedDataGrid) base.FixedDataGrid { } // Generate return structure ret := base.GeneratePredictionVector(X) - classAttrSpecs := base.ResolveAllAttributes(ret, classAttrs) + classAttrSpecs := base.ResolveAttributes(ret, classAttrs) // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(X) - numericAttrSpecs := base.ResolveAllAttributes(X, numericAttrs) + numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) // Allocate row storage row := make([]float64, len(numericAttrSpecs)) diff --git a/meta/bagging.go b/meta/bagging.go index d8f04b3..8d4b94f 100644 --- a/meta/bagging.go +++ b/meta/bagging.go @@ -112,7 +112,7 @@ func (b *BaggedModel) Predict(from base.FixedDataGrid) base.FixedDataGrid { for { // Need to resolve the voting problem incoming, ok := <-votes if ok { - cSpecs := base.ResolveAllAttributes(incoming, incoming.AllClassAttributes()) + cSpecs := 
base.ResolveAttributes(incoming, incoming.AllClassAttributes()) incoming.MapOverRows(cSpecs, func(row [][]byte, predRow int) (bool, error) { // Check if we've seen this class before... if _, ok := voting[predRow]; !ok { diff --git a/naive/bernoulli_nb.go b/naive/bernoulli_nb.go index b1f24af..7ad1e7f 100644 --- a/naive/bernoulli_nb.go +++ b/naive/bernoulli_nb.go @@ -1,8 +1,8 @@ package naive import ( - "math" - base "github.com/sjwhitworth/golearn/base" + base "github.com/sjwhitworth/golearn/base" + "math" ) // A Bernoulli Naive Bayes Classifier. Naive Bayes classifiers assumes @@ -37,91 +37,103 @@ import ( // Information Retrieval. Cambridge University Press, pp. 234-265. // http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html type BernoulliNBClassifier struct { - base.BaseEstimator - // Conditional probability for each term. This vector should be - // accessed in the following way: p(f|c) = condProb[c][f]. - // Logarithm is used in order to avoid underflow. - condProb map[string][]float64 - // Number of instances in each class. This is necessary in order to - // calculate the laplace smooth value during the Predict step. - classInstances map[string]int - // Number of instances used in training. - trainingInstances int - // Number of features in the training set - features int + base.BaseEstimator + // Conditional probability for each term. This vector should be + // accessed in the following way: p(f|c) = condProb[c][f]. + // Logarithm is used in order to avoid underflow. + condProb map[string][]float64 + // Number of instances in each class. This is necessary in order to + // calculate the laplace smooth value during the Predict step. + classInstances map[string]int + // Number of instances used in training. + trainingInstances int + // Number of features in the training set + features int } // Create a new Bernoulli Naive Bayes Classifier. The argument 'classes' // is the number of possible labels in the classification task. 
func NewBernoulliNBClassifier() *BernoulliNBClassifier { - nb := BernoulliNBClassifier{} - nb.condProb = make(map[string][]float64) - nb.features = 0 - nb.trainingInstances = 0 - return &nb + nb := BernoulliNBClassifier{} + nb.condProb = make(map[string][]float64) + nb.features = 0 + nb.trainingInstances = 0 + return &nb } // Fill data matrix with Bernoulli Naive Bayes model. All values // necessary for calculating prior probability and p(f_i) -func (nb *BernoulliNBClassifier) Fit(X *base.Instances) { +func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) { - // Number of features and instances in this training set - nb.trainingInstances = X.Rows - nb.features = 0 - if X.Rows > 0 { - nb.features = len(X.GetRowVectorWithoutClass(0)) - } + // Check that all Attributes are binary + classAttrs := X.AllClassAttributes() + allAttrs := X.AllAttributes() + featAttrs := base.AttributeDifferenceReferences(allAttrs, classAttrs) + for i := range featAttrs { + if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok { + panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i])) + } + } + featAttrSpecs := base.ResolveAttributes(X, featAttrs) - // Number of instances in class - nb.classInstances = make(map[string]int) + // Check that only one classAttribute is defined + if len(classAttrs) != 1 { + panic("Only one class Attribute can be used") + } - // Number of documents with given term (by class) - docsContainingTerm := make(map[string][]int) + // Number of features and instances in this training set + nb.features, nb.trainingInstances = X.Size() - // This algorithm could be vectorized after binarizing the data - // matrix. Since mat64 doesn't have this function, a iterative - // version is used. 
- for r := 0; r < X.Rows; r++ { - class := X.GetClass(r) - docVector := X.GetRowVectorWithoutClass(r) + // Number of instances in class + nb.classInstances = make(map[string]int) - // increment number of instances in class - t, ok := nb.classInstances[class] - if !ok { t = 0 } - nb.classInstances[class] = t + 1 + // Number of documents with given term (by class) + docsContainingTerm := make(map[string][]int) + // This algorithm could be vectorized after binarizing the data + // matrix. Since mat64 doesn't have this function, a iterative + // version is used. + X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) { + class := base.GetClass(X, r) - for feat := 0; feat < len(docVector); feat++ { - v := docVector[feat] - // In Bernoulli Naive Bayes the presence and absence of - // features are considered. All non-zero values are - // treated as presence. - if v > 0 { - // Update number of times this feature appeared within - // given label. - t, ok := docsContainingTerm[class] - if !ok { - t = make([]int, nb.features) - docsContainingTerm[class] = t - } - t[feat] += 1 - } - } - } + // increment number of instances in class + t, ok := nb.classInstances[class] + if !ok { + t = 0 + } + nb.classInstances[class] = t + 1 - // Pre-calculate conditional probabilities for each class - for c, _ := range nb.classInstances { - nb.condProb[c] = make([]float64, nb.features) - for feat := 0; feat < nb.features; feat++ { - classTerms, _ := docsContainingTerm[c] - numDocs := classTerms[feat] - docsInClass, _ := nb.classInstances[c] + for feat := 0; feat < len(docVector); feat++ { + v := docVector[feat] + // In Bernoulli Naive Bayes the presence and absence of + // features are considered. All non-zero values are + // treated as presence. + if v[0] > 0 { + // Update number of times this feature appeared within + // given label. 
+ t, ok := docsContainingTerm[class] + if !ok { + t = make([]int, nb.features) + docsContainingTerm[class] = t + } + t[feat] += 1 + } + } + }) - classCondProb, _ := nb.condProb[c] - // Calculate conditional probability with laplace smoothing - classCondProb[feat] = float64(numDocs + 1) / float64(docsInClass + 1) - } - } + // Pre-calculate conditional probabilities for each class + for c, _ := range nb.classInstances { + nb.condProb[c] = make([]float64, nb.features) + for feat := 0; feat < nb.features; feat++ { + classTerms, _ := docsContainingTerm[c] + numDocs := classTerms[feat] + docsInClass, _ := nb.classInstances[c] + + classCondProb, _ := nb.condProb[c] + // Calculate conditional probability with laplace smoothing + classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1) + } + } } // Use trained model to predict test vector's class. The following @@ -134,43 +146,43 @@ func (nb *BernoulliNBClassifier) Fit(X *base.Instances) { // IMPORTANT: PredictOne panics if Fit was not called or if the // document vector and train matrix have a different number of columns. func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string { - if nb.features == 0 { - panic("Fit should be called before predicting") - } + if nb.features == 0 { + panic("Fit should be called before predicting") + } - if len(vector) != nb.features { - panic("Different dimensions in Train and Test sets") - } + if len(vector) != nb.features { + panic("Different dimensions in Train and Test sets") + } - // Currently only the predicted class is returned. - bestScore := -math.MaxFloat64 - bestClass := "" + // Currently only the predicted class is returned. 
+ bestScore := -math.MaxFloat64 + bestClass := "" - for class, classCount := range nb.classInstances { - // Init classScore with log(prior) - classScore := math.Log((float64(classCount))/float64(nb.trainingInstances)) - for f := 0; f < nb.features; f++ { - if vector[f] > 0 { - // Test document has feature c - classScore += math.Log(nb.condProb[class][f]) - } else { - if nb.condProb[class][f] == 1.0 { - // special case when prob = 1.0, consider laplace - // smooth - classScore += math.Log(1.0 / float64(nb.classInstances[class] + 1)) - } else { - classScore += math.Log(1.0 - nb.condProb[class][f]) - } - } - } + for class, classCount := range nb.classInstances { + // Init classScore with log(prior) + classScore := math.Log((float64(classCount)) / float64(nb.trainingInstances)) + for f := 0; f < nb.features; f++ { + if vector[f] > 0 { + // Test document has feature c + classScore += math.Log(nb.condProb[class][f]) + } else { + if nb.condProb[class][f] == 1.0 { + // special case when prob = 1.0, consider laplace + // smooth + classScore += math.Log(1.0 / float64(nb.classInstances[class]+1)) + } else { + classScore += math.Log(1.0 - nb.condProb[class][f]) + } + } + } - if classScore > bestScore { - bestScore = classScore - bestClass = class - } - } + if classScore > bestScore { + bestScore = classScore + bestClass = class + } + } - return bestClass + return bestClass } // Predict is just a wrapper for the PredictOne function. @@ -178,9 +190,9 @@ func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string { // IMPORTANT: Predict panics if Fit was not called or if the // document vector and train matrix have a different number of columns. 
func (nb *BernoulliNBClassifier) Predict(what *base.Instances) *base.Instances { - ret := what.GeneratePredictionVector() - for i := 0; i < what.Rows; i++ { - ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i))) - } - return ret + ret := what.GeneratePredictionVector() + for i := 0; i < what.Rows; i++ { + ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i))) + } + return ret } diff --git a/trees/id3.go b/trees/id3.go index af59494..bfa9904 100644 --- a/trees/id3.go +++ b/trees/id3.go @@ -203,7 +203,7 @@ func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) base.FixedDataGrid { panic(err) } predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes()) - predAttrSpecs := base.ResolveAllAttributes(what, predAttrs) + predAttrSpecs := base.ResolveAttributes(what, predAttrs) what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { cur := d for {