1
0
mirror of https://github.com/sjwhitworth/golearn.git synced 2025-04-25 13:48:49 +08:00

base: Cleaned up duplicate Attribute resolution functions

This commit is contained in:
Richard Townsend 2014-08-03 12:31:26 +01:00
parent ff97065261
commit 47341b2869
15 changed files with 151 additions and 155 deletions

View File

@ -129,7 +129,7 @@ func ParseCSVBuildInstances(filepath string, hasHeaders bool, u UpdatableDataGri
rowCounter := 0
specs := ResolveAllAttributes(u, u.AllAttributes())
specs := ResolveAttributes(u, u.AllAttributes())
for {
record, err := reader.Read()

View File

@ -379,7 +379,7 @@ func (inst *DenseInstances) Size() (int, int) {
// swapRows swaps over rows i and j
func (inst *DenseInstances) swapRows(i, j int) {
as := GetAllAttributeSpecs(inst)
as := ResolveAllAttributes(inst)
for _, a := range as {
v1 := inst.Get(a, i)
v2 := inst.Get(a, j)
@ -424,7 +424,7 @@ func (inst *DenseInstances) String() string {
var buffer bytes.Buffer
// Get all Attribute information
as := GetAllAttributeSpecs(inst)
as := ResolveAllAttributes(inst)
// Print header
cols, rows := inst.Size()

View File

@ -153,7 +153,7 @@ func (l *LazilyFilteredInstances) MapOverRows(asv []AttributeSpec, mapFunc func(
func (l *LazilyFilteredInstances) RowString(row int) string {
var buffer bytes.Buffer
as := GetAllAttributeSpecs(l) // Retrieve all Attribute data
as := ResolveAllAttributes(l) // Retrieve all Attribute data
first := true // Decide whether to prefix
for _, a := range as {
@ -188,7 +188,7 @@ func (l *LazilyFilteredInstances) String() string {
}
// Get all Attribute information
as := GetAllAttributeSpecs(l)
as := ResolveAllAttributes(l)
// Print header
buffer.WriteString("Lazily filtered instances using ")

View File

@ -17,8 +17,8 @@ func TestLazySortDesc(testEnv *testing.T) {
return
}
as1 := GetAllAttributeSpecs(inst1)
as2 := GetAllAttributeSpecs(inst2)
as1 := ResolveAllAttributes(inst1)
as2 := ResolveAllAttributes(inst2)
if isSortedDesc(inst1, as1[0]) {
testEnv.Error("Can't test descending sort order")
@ -44,7 +44,7 @@ func TestLazySortDesc(testEnv *testing.T) {
func TestLazySortAsc(testEnv *testing.T) {
inst, err := ParseCSVToInstances("../examples/datasets/iris_headers.csv", true)
as1 := GetAllAttributeSpecs(inst)
as1 := ResolveAllAttributes(inst)
if isSortedAsc(inst, as1[0]) {
testEnv.Error("Can't test ascending sort on something ascending already")
}
@ -67,7 +67,7 @@ func TestLazySortAsc(testEnv *testing.T) {
testEnv.Error(err)
return
}
as2 := GetAllAttributeSpecs(inst2)
as2 := ResolveAllAttributes(inst2)
if !isSortedAsc(inst2, as2[0]) {
testEnv.Error("This file should be sorted in ascending order")
}

View File

@ -44,8 +44,8 @@ func TestSortDesc(testEnv *testing.T) {
return
}
as1 := GetAllAttributeSpecs(inst1)
as2 := GetAllAttributeSpecs(inst2)
as1 := ResolveAllAttributes(inst1)
as2 := ResolveAllAttributes(inst2)
if isSortedDesc(inst1, as1[0]) {
testEnv.Error("Can't test descending sort order")
@ -71,7 +71,7 @@ func TestSortDesc(testEnv *testing.T) {
func TestSortAsc(testEnv *testing.T) {
inst, err := ParseCSVToInstances("../examples/datasets/iris_headers.csv", true)
as1 := GetAllAttributeSpecs(inst)
as1 := ResolveAllAttributes(inst)
if isSortedAsc(inst, as1[0]) {
testEnv.Error("Can't test ascending sort on something ascending already")
}
@ -90,7 +90,7 @@ func TestSortAsc(testEnv *testing.T) {
testEnv.Error(err)
return
}
as2 := GetAllAttributeSpecs(inst2)
as2 := ResolveAllAttributes(inst2)
if !isSortedAsc(inst2, as2[0]) {
testEnv.Error("This file should be sorted in ascending order")
}

View File

@ -38,9 +38,9 @@ func NonClassAttributes(d DataGrid) []Attribute {
return AttributeDifferenceReferences(allAttrs, classAttrs)
}
// ResolveAllAttributes returns AttributeSpecs describing
// ResolveAttributes returns AttributeSpecs describing
// all of the Attributes.
func ResolveAllAttributes(d DataGrid, attrs []Attribute) []AttributeSpec {
func ResolveAttributes(d DataGrid, attrs []Attribute) []AttributeSpec {
ret := make([]AttributeSpec, len(attrs))
for i, a := range attrs {
spec, err := d.GetAttribute(a)
@ -52,25 +52,9 @@ func ResolveAllAttributes(d DataGrid, attrs []Attribute) []AttributeSpec {
return ret
}
// GetAllAttributeSpecs retrieves every Attribute specification
// from a given DataGrid. Useful in conjunction with MapOverRows.
func GetAllAttributeSpecs(from DataGrid) []AttributeSpec {
attrs := from.AllAttributes()
return GetSomeAttributeSpecs(from, attrs)
}
// GetSomeAttributeSpecs returns a subset of Attribute specifications
// from a given DataGrid.
func GetSomeAttributeSpecs(from DataGrid, attrs []Attribute) []AttributeSpec {
ret := make([]AttributeSpec, len(attrs))
for i, a := range attrs {
as, err := from.GetAttribute(a)
if err != nil {
panic(err)
}
ret[i] = as
}
return ret
// ResolveAllAttributes returns an AttributeSpec for every Attribute
// held by d, by delegating to ResolveAttributes over d.AllAttributes().
// Useful in conjunction with MapOverRows.
func ResolveAllAttributes(d DataGrid) []AttributeSpec {
return ResolveAttributes(d, d.AllAttributes())
}
func buildAttrSet(a []Attribute) map[Attribute]bool {

View File

@ -144,7 +144,7 @@ func DecomposeOnAttributeValues(inst FixedDataGrid, at Attribute) map[string]Fix
rowMaps := make(map[string][]int)
// Build full Attribute set
fullAttrSpec := ResolveAllAttributes(inst, newAttrs)
fullAttrSpec := ResolveAttributes(inst, newAttrs)
fullAttrSpec = append(fullAttrSpec, attrSpec)
// Decompose

View File

@ -78,7 +78,7 @@ func NewInstancesViewFromRows(src FixedDataGrid, rows map[int]int) *InstancesVie
func NewInstancesViewFromVisible(src FixedDataGrid, rows []int, attrs []Attribute) *InstancesView {
ret := &InstancesView{
src,
GetSomeAttributeSpecs(src, attrs),
ResolveAttributes(src, attrs),
make(map[int]int),
make(map[Attribute]bool),
true,
@ -99,7 +99,7 @@ func NewInstancesViewFromVisible(src FixedDataGrid, rows []int, attrs []Attribut
func NewInstancesViewFromAttrs(src FixedDataGrid, attrs []Attribute) *InstancesView {
ret := &InstancesView{
src,
GetSomeAttributeSpecs(src, attrs),
ResolveAttributes(src, attrs),
nil,
make(map[Attribute]bool),
false,
@ -252,7 +252,7 @@ func (v *InstancesView) String() string {
maxRows := 30
// Get all Attribute information
as := GetAllAttributeSpecs(v)
as := ResolveAllAttributes(v)
// Print header
cols, rows := v.Size()
@ -305,7 +305,7 @@ func (v *InstancesView) String() string {
// RowString returns a string representation of a given row.
func (v *InstancesView) RowString(row int) string {
var buffer bytes.Buffer
as := GetAllAttributeSpecs(v)
as := ResolveAllAttributes(v)
first := true
for _, a := range as {
val := v.Get(a, row)

View File

@ -46,7 +46,7 @@ func main() {
// for doing so is not very sophisticated.
// First, have to resolve Attribute Specifications
as := base.ResolveAllAttributes(rawData, rawData.AllAttributes())
as := base.ResolveAttributes(rawData, rawData.AllAttributes())
// Attribute Specifications describe where a given column lives
rawData.Set(as[0], 0, as[0].GetAttribute().GetSysValFromString("1.00"))

View File

@ -112,7 +112,7 @@ func TestChiMerge2(testEnv *testing.T) {
// Sort the instances
allAttrs := inst.AllAttributes()
sortAttrSpecs := base.ResolveAllAttributes(inst, allAttrs)[0:1]
sortAttrSpecs := base.ResolveAttributes(inst, allAttrs)[0:1]
instSorted, err := base.Sort(inst, base.Ascending, sortAttrSpecs)
if err != nil {
panic(err)

View File

@ -65,8 +65,8 @@ func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
ret := base.GeneratePredictionVector(what)
// Resolve Attribute specifications for both
whatAttrSpecs := base.ResolveAllAttributes(what, allNumericAttrs)
trainAttrSpecs := base.ResolveAllAttributes(KNN.TrainingData, allNumericAttrs)
whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs)
trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs)
// Reserve storage for most the most similar items
distances := make(map[int]float64)

View File

@ -34,7 +34,7 @@ func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 {
// Retrieve numeric non-class Attributes
numericAttrs := base.NonClassFloatAttributes(X)
numericAttrSpecs := base.ResolveAllAttributes(X, numericAttrs)
numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
// Convert each row
X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
@ -66,7 +66,7 @@ func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 {
_, rows := X.Size()
labelVec := make([]float64, rows)
// Resolve class Attribute specification
classAttrSpecs := base.ResolveAllAttributes(X, classAttrs)
classAttrSpecs := base.ResolveAttributes(X, classAttrs)
X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
labelVec[rowNo] = base.UnpackBytesToFloat(row[0])
return true, nil
@ -90,10 +90,10 @@ func (lr *LogisticRegression) Predict(X base.FixedDataGrid) base.FixedDataGrid {
}
// Generate return structure
ret := base.GeneratePredictionVector(X)
classAttrSpecs := base.ResolveAllAttributes(ret, classAttrs)
classAttrSpecs := base.ResolveAttributes(ret, classAttrs)
// Retrieve numeric non-class Attributes
numericAttrs := base.NonClassFloatAttributes(X)
numericAttrSpecs := base.ResolveAllAttributes(X, numericAttrs)
numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)
// Allocate row storage
row := make([]float64, len(numericAttrSpecs))

View File

@ -112,7 +112,7 @@ func (b *BaggedModel) Predict(from base.FixedDataGrid) base.FixedDataGrid {
for { // Need to resolve the voting problem
incoming, ok := <-votes
if ok {
cSpecs := base.ResolveAllAttributes(incoming, incoming.AllClassAttributes())
cSpecs := base.ResolveAttributes(incoming, incoming.AllClassAttributes())
incoming.MapOverRows(cSpecs, func(row [][]byte, predRow int) (bool, error) {
// Check if we've seen this class before...
if _, ok := voting[predRow]; !ok {

View File

@ -1,8 +1,8 @@
package naive
import (
"math"
base "github.com/sjwhitworth/golearn/base"
base "github.com/sjwhitworth/golearn/base"
"math"
)
// A Bernoulli Naive Bayes Classifier. Naive Bayes classifiers assumes
@ -37,91 +37,103 @@ import (
// Information Retrieval. Cambridge University Press, pp. 234-265.
// http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
base.BaseEstimator
// Conditional probability for each term. This vector should be
// accessed in the following way: p(f|c) = condProb[c][f].
// Logarithm is used in order to avoid underflow.
condProb map[string][]float64
// Number of instances in each class. This is necessary in order to
// calculate the laplace smooth value during the Predict step.
classInstances map[string]int
// Number of instances used in training.
trainingInstances int
// Number of features in the training set
features int
base.BaseEstimator
// Conditional probability for each term. This vector should be
// accessed in the following way: p(f|c) = condProb[c][f].
// Logarithm is used in order to avoid underflow.
condProb map[string][]float64
// Number of instances in each class. This is necessary in order to
// calculate the laplace smooth value during the Predict step.
classInstances map[string]int
// Number of instances used in training.
trainingInstances int
// Number of features in the training set
features int
}
// Create a new Bernoulli Naive Bayes Classifier. The argument 'classes'
// is the number of possible labels in the classification task.
func NewBernoulliNBClassifier() *BernoulliNBClassifier {
nb := BernoulliNBClassifier{}
nb.condProb = make(map[string][]float64)
nb.features = 0
nb.trainingInstances = 0
return &nb
nb := BernoulliNBClassifier{}
nb.condProb = make(map[string][]float64)
nb.features = 0
nb.trainingInstances = 0
return &nb
}
// Fill data matrix with Bernoulli Naive Bayes model. All values
// necessary for calculating prior probability and p(f_i)
func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) {
// Number of features and instances in this training set
nb.trainingInstances = X.Rows
nb.features = 0
if X.Rows > 0 {
nb.features = len(X.GetRowVectorWithoutClass(0))
}
// Check that all Attributes are binary
classAttrs := X.AllClassAttributes()
allAttrs := X.AllAttributes()
featAttrs := base.AttributeDifferenceReference(allAttrs, classAttrs)
for i := range featAttrs {
if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok {
panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i]))
}
}
featAttrSpecs := base.ResolveAllAttributes(featAttrs, X)
// Number of instances in class
nb.classInstances = make(map[string]int)
// Check that only one classAttribute is defined
if len(classAttrs) > 0 {
panic("Only one class Attribute can be used")
}
// Number of documents with given term (by class)
docsContainingTerm := make(map[string][]int)
// Number of features and instances in this training set
nb.features, nb.trainingInstances() = X.Size()
// This algorithm could be vectorized after binarizing the data
// matrix. Since mat64 doesn't have this function, a iterative
// version is used.
for r := 0; r < X.Rows; r++ {
class := X.GetClass(r)
docVector := X.GetRowVectorWithoutClass(r)
// Number of instances in class
nb.classInstances = make(map[string]int)
// increment number of instances in class
t, ok := nb.classInstances[class]
if !ok { t = 0 }
nb.classInstances[class] = t + 1
// Number of documents with given term (by class)
docsContainingTerm := make(map[string][]int)
// This algorithm could be vectorized after binarizing the data
// matrix. Since mat64 doesn't have this function, a iterative
// version is used.
X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) {
class := base.GetClass(X, r)
for feat := 0; feat < len(docVector); feat++ {
v := docVector[feat]
// In Bernoulli Naive Bayes the presence and absence of
// features are considered. All non-zero values are
// treated as presence.
if v > 0 {
// Update number of times this feature appeared within
// given label.
t, ok := docsContainingTerm[class]
if !ok {
t = make([]int, nb.features)
docsContainingTerm[class] = t
}
t[feat] += 1
}
}
}
// increment number of instances in class
t, ok := nb.classInstances[class]
if !ok {
t = 0
}
nb.classInstances[class] = t + 1
// Pre-calculate conditional probabilities for each class
for c, _ := range nb.classInstances {
nb.condProb[c] = make([]float64, nb.features)
for feat := 0; feat < nb.features; feat++ {
classTerms, _ := docsContainingTerm[c]
numDocs := classTerms[feat]
docsInClass, _ := nb.classInstances[c]
for feat := 0; feat < len(docVector); feat++ {
v := docVector[feat]
// In Bernoulli Naive Bayes the presence and absence of
// features are considered. All non-zero values are
// treated as presence.
if v[0] > 0 {
// Update number of times this feature appeared within
// given label.
t, ok := docsContainingTerm[class]
if !ok {
t = make([]int, nb.features)
docsContainingTerm[class] = t
}
t[feat] += 1
}
}
})
classCondProb, _ := nb.condProb[c]
// Calculate conditional probability with laplace smoothing
classCondProb[feat] = float64(numDocs + 1) / float64(docsInClass + 1)
}
}
// Pre-calculate conditional probabilities for each class
for c, _ := range nb.classInstances {
nb.condProb[c] = make([]float64, nb.features)
for feat := 0; feat < nb.features; feat++ {
classTerms, _ := docsContainingTerm[c]
numDocs := classTerms[feat]
docsInClass, _ := nb.classInstances[c]
classCondProb, _ := nb.condProb[c]
// Calculate conditional probability with laplace smoothing
classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1)
}
}
}
// Use trained model to predict test vector's class. The following
@ -134,43 +146,43 @@ func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
// IMPORTANT: PredictOne panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string {
if nb.features == 0 {
panic("Fit should be called before predicting")
}
if nb.features == 0 {
panic("Fit should be called before predicting")
}
if len(vector) != nb.features {
panic("Different dimensions in Train and Test sets")
}
if len(vector) != nb.features {
panic("Different dimensions in Train and Test sets")
}
// Currently only the predicted class is returned.
bestScore := -math.MaxFloat64
bestClass := ""
// Currently only the predicted class is returned.
bestScore := -math.MaxFloat64
bestClass := ""
for class, classCount := range nb.classInstances {
// Init classScore with log(prior)
classScore := math.Log((float64(classCount))/float64(nb.trainingInstances))
for f := 0; f < nb.features; f++ {
if vector[f] > 0 {
// Test document has feature c
classScore += math.Log(nb.condProb[class][f])
} else {
if nb.condProb[class][f] == 1.0 {
// special case when prob = 1.0, consider laplace
// smooth
classScore += math.Log(1.0 / float64(nb.classInstances[class] + 1))
} else {
classScore += math.Log(1.0 - nb.condProb[class][f])
}
}
}
for class, classCount := range nb.classInstances {
// Init classScore with log(prior)
classScore := math.Log((float64(classCount)) / float64(nb.trainingInstances))
for f := 0; f < nb.features; f++ {
if vector[f] > 0 {
// Test document has feature c
classScore += math.Log(nb.condProb[class][f])
} else {
if nb.condProb[class][f] == 1.0 {
// special case when prob = 1.0, consider laplace
// smooth
classScore += math.Log(1.0 / float64(nb.classInstances[class]+1))
} else {
classScore += math.Log(1.0 - nb.condProb[class][f])
}
}
}
if classScore > bestScore {
bestScore = classScore
bestClass = class
}
}
if classScore > bestScore {
bestScore = classScore
bestClass = class
}
}
return bestClass
return bestClass
}
// Predict is just a wrapper for the PredictOne function.
@ -178,9 +190,9 @@ func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string {
// IMPORTANT: Predict panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) Predict(what *base.Instances) *base.Instances {
ret := what.GeneratePredictionVector()
for i := 0; i < what.Rows; i++ {
ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i)))
}
return ret
ret := what.GeneratePredictionVector()
for i := 0; i < what.Rows; i++ {
ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i)))
}
return ret
}

View File

@ -203,7 +203,7 @@ func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) base.FixedDataGrid {
panic(err)
}
predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes())
predAttrSpecs := base.ResolveAllAttributes(what, predAttrs)
predAttrSpecs := base.ResolveAttributes(what, predAttrs)
what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
cur := d
for {