// golearn/naive/bernoulli_nb.go

package naive

import (
    "math"
    base "github.com/sjwhitworth/golearn/base"
)

// BernoulliNBClassifier is a Bernoulli Naive Bayes classifier. Naive
// Bayes classifiers assume that feature probabilities are independent
// given the class. To classify an instance, the probability that it
// was generated by each known class is calculated; that is, for each
// class C, the following probability is calculated:
//
// p(C|F1, F2, F3... Fn)
//
// where F1, F2... Fn are the instance features. Using Bayes' theorem
// this can be written as:
//
// \frac{p(C) \times p(F1, F2... Fn|C)}{p(F1, F2... Fn)}
//
// In Bernoulli Naive Bayes, features are considered independent
// booleans, which means that the likelihood of a document given a
// class C is given by:
//
// p(F1, F2... Fn|C) =
// \prod_{i=1}^{n}{[F_i \times p(f_i|C) + (1-F_i)(1 - p(f_i|C))]}
//
// where
//     - F_i equals 1 if the feature is present in the vector and 0
//       otherwise
//     - p(f_i|C) is the probability of class C generating the feature
//       f_i
//
// For more information:
//
// C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to
// Information Retrieval. Cambridge University Press, pp. 234-265.
// http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
    base.BaseEstimator
    // Conditional probability for each term. This vector should be
    // accessed in the following way: p(f|c) = condProb[c][f].
    // The values stored here are plain (non-logarithmic) probabilities;
    // PredictOne takes their logarithm while scoring so that the
    // product of many small probabilities does not underflow.
    condProb map[string][]float64
    // Number of training instances in each class. PredictOne uses it
    // to compute the class prior and the smoothed fallback value for
    // features whose conditional probability is exactly 1.0.
    classInstances map[string]int
    // Number of instances used in training (denominator of the class
    // prior).
    trainingInstances int
    // Number of features in the training set. PredictOne treats a
    // value of 0 as "Fit has not been called yet".
    features int
}

// NewBernoulliNBClassifier returns a pointer to a fresh, untrained
// Bernoulli Naive Bayes classifier. It takes no arguments: the
// conditional-probability table starts out allocated but empty, and
// the feature and training-instance counters start at zero.
func NewBernoulliNBClassifier() *BernoulliNBClassifier {
    return &BernoulliNBClassifier{
        condProb:          make(map[string][]float64),
        features:          0,
        trainingInstances: 0,
    }
}

// Fit trains the classifier on X, replacing any previously fitted
// model. It records the number of training instances, the number of
// features (taken from the first row), the number of instances per
// class and, for every class c and feature f, the smoothed conditional
// probability p(f|c) that PredictOne later uses.
//
// A feature counts as present in a row only when its value is
// strictly greater than zero; zero and negative values count as
// absent, matching the test applied in PredictOne.
func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {

    // Number of features and instances in this training set
    nb.trainingInstances = X.Rows
    nb.features = 0
    if X.Rows > 0 {
        nb.features = len(X.GetRowVectorWithoutClass(0))
    }

    // Start from empty tables so that a second call to Fit does not
    // keep classes from an earlier training set, and so that a
    // classifier that was not built by the constructor still has a
    // usable condProb map.
    nb.classInstances = make(map[string]int)
    nb.condProb = make(map[string][]float64)

    // Number of documents with given term (by class)
    docsContainingTerm := make(map[string][]int)

    // This algorithm could be vectorized after binarizing the data
    // matrix. Since mat64 doesn't have this function, an iterative
    // version is used.
    for r := 0; r < X.Rows; r++ {
        class := X.GetClass(r)
        docVector := X.GetRowVectorWithoutClass(r)

        // increment number of instances in class
        nb.classInstances[class]++

        // Allocate the per-class counters as soon as the class is
        // seen. Allocating them only when a present feature is found
        // would leave classes whose rows contain no present feature
        // without counters, and the probability loop below would then
        // index a nil slice and panic.
        termCounts, seen := docsContainingTerm[class]
        if !seen {
            termCounts = make([]int, nb.features)
            docsContainingTerm[class] = termCounts
        }

        for feat, v := range docVector {
            // In Bernoulli Naive Bayes only the presence or absence of
            // a feature matters, not its magnitude.
            if v > 0 {
                // Update number of documents of this class in which
                // the feature appeared.
                termCounts[feat]++
            }
        }
    }

    // Pre-calculate conditional probabilities for each class
    for class, docsInClass := range nb.classInstances {
        termCounts := docsContainingTerm[class]
        classCondProb := make([]float64, nb.features)
        for feat := range classCondProb {
            // Conditional probability with add-one smoothing. The
            // denominator is docsInClass+1, so the estimate reaches
            // 1.0 when every instance of the class has the feature;
            // PredictOne handles that value explicitly.
            classCondProb[feat] = float64(termCounts[feat]+1) / float64(docsInClass+1)
        }
        nb.condProb[class] = classCondProb
    }
}

// PredictOne uses the trained model to predict the class of a single
// feature vector. Every class seen during Fit is scored with:
//
// classScore = log(p(c)) + \sum_{f present}{log(p(f|c))}
//                        + \sum_{f absent}{log(1 - p(f|c))}
//
// where a feature is present when its value in vector is strictly
// greater than zero. When p(f|c) is exactly 1.0 and the feature is
// absent, log(1 / (instances in c + 1)) is used instead of log(0).
//
// PredictOne returns the string that represents the class with the
// highest score. If several classes reach exactly the same score, the
// lexicographically smallest class name is returned, so the result
// does not depend on map iteration order.
//
// IMPORTANT: PredictOne panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string {
    if nb.features == 0 {
        panic("Fit should be called before predicting")
    }

    if len(vector) != nb.features {
        panic("Different dimensions in Train and Test sets")
    }

    // Currently only the predicted class is returned.
    bestScore := -math.MaxFloat64
    bestClass := ""

    for class, classCount := range nb.classInstances {
        // Look the per-class probability row up once instead of on
        // every feature.
        probs := nb.condProb[class]

        // Init classScore with log(prior)
        classScore := math.Log(float64(classCount) / float64(nb.trainingInstances))
        for f, v := range vector {
            p := probs[f]
            switch {
            case v > 0:
                // Test document has feature f
                classScore += math.Log(p)
            case p == 1.0:
                // Special case when prob = 1.0: log(1 - p) would be
                // -Inf, so fall back to a smoothed value.
                classScore += math.Log(1.0 / float64(classCount+1))
            default:
                classScore += math.Log(1.0 - p)
            }
        }

        // Map iteration order is randomized, so break exact ties on
        // the class name to keep predictions reproducible.
        if classScore > bestScore || (classScore == bestScore && class < bestClass) {
            bestScore = classScore
            bestClass = class
        }
    }

    return bestClass
}

// Predict classifies every row of what by delegating to PredictOne
// and writes each predicted class into column 0 of the prediction
// vector generated from what, which it then returns.
//
// IMPORTANT: Predict panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) Predict(what *base.Instances) *base.Instances {
    predictions := what.GeneratePredictionVector()
    for row := 0; row < what.Rows; row++ {
        rowVector := what.GetRowVectorWithoutClass(row)
        label := nb.PredictOne(rowVector)
        predictions.SetAttrStr(row, 0, label)
    }
    return predictions
}
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`package naive`

			`import (`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`"math"`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`base "github.com/sjwhitworth/golearn/base"`
			`)`

			`// A Bernoulli Naive Bayes Classifier. Naive Bayes classifiers assumes`
			`// that features probabilities are independent. In order to classify an`
			`// instance, it is calculated the probability that it was generated by`
			`// each known class, that is, for each class C, the following`
			`// probability is calculated.`
			`//`
			`// p(C\|F1, F2, F3... Fn)`
			`//`
			`// Being F1, F2... Fn the instance features. Using the bayes theorem`
			`// this can be written as:`
			`//`
			`// \frac{p(C) \times p(F1, F2... Fn\|C)}{p(F1, F2... Fn)}`
			`//`
			`// In the Bernoulli Naive Bayes features are considered independent`
			`// booleans, this means that the likelihood of a document given a class`
			`// C is given by:`
			`//`
			`// p(F1, F2... Fn) =`
			`// \prod_{i=1}^{n}{[F_i \times p(f_i\|C)) + (1-F_i)(1 - p(f_i\|C)))]}`
			`//`
			`// where`
			`// - F_i equals to 1 if feature is present in vector and zero`
			`// otherwise`
			`// - p(f_i\|C) the probability of class C generating the feature`
			`// f_i`
			`//`
			`// For more information:`
			`//`
			`// C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to`
			`// Information Retrieval. Cambridge University Press, pp. 234-265.`
			`// http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`
			`type BernoulliNBClassifier struct {`
			`base.BaseEstimator`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`// Conditional probability for each term. This vector should be`
			`// accessed in the following way: p(f\|c) = condProb[c][f].`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// Logarithm is used in order to avoid underflow.`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`condProb map[string][]float64`
			`// Number of instances in each class. This is necessary in order to`
			`// calculate the laplace smooth value during the Predict step.`
			`classInstances map[string]int`
Removed class prior pre-calculation Since the number of instances in each class are stored, there is no need to keep the pre-calculated priors. 2014-06-08 00:01:42 -03:00			`// Number of instances used in training.`
			`trainingInstances int`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`// Number of features in the training set`
			`features int`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`}`

			`// Create a new Bernoulli Naive Bayes Classifier. The argument 'classes'`
			`// is the number of possible labels in the classification task.`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`func NewBernoulliNBClassifier() *BernoulliNBClassifier {`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`nb := BernoulliNBClassifier{}`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`nb.condProb = make(map[string][]float64)`
			`nb.features = 0`
Removed class prior pre-calculation Since the number of instances in each class are stored, there is no need to keep the pre-calculated priors. 2014-06-08 00:01:42 -03:00			`nb.trainingInstances = 0`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`return &nb`
			`}`

			`// Fill data matrix with Bernoulli Naive Bayes model. All values`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// necessary for calculating prior probability and p(f_i)`
			`func (nb BernoulliNBClassifier) Fit(X base.Instances) {`

Removed class prior pre-calculation Since the number of instances in each class are stored, there is no need to keep the pre-calculated priors. 2014-06-08 00:01:42 -03:00			`// Number of features and instances in this training set`
			`nb.trainingInstances = X.Rows`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`nb.features = 0`
			`if X.Rows > 0 {`
			`nb.features = len(X.GetRowVectorWithoutClass(0))`
			`}`

Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// Number of instances in class`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`nb.classInstances = make(map[string]int)`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// Number of documents with given term (by class)`
			`docsContainingTerm := make(map[string][]int)`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// This algorithm could be vectorized after binarizing the data`
			`// matrix. Since mat64 doesn't have this function, a iterative`
			`// version is used.`
			`for r := 0; r < X.Rows; r++ {`
			`class := X.GetClass(r)`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`docVector := X.GetRowVectorWithoutClass(r)`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// increment number of instances in class`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`t, ok := nb.classInstances[class]`
Removed class prior pre-calculation Since the number of instances in each class are stored, there is no need to keep the pre-calculated priors. 2014-06-08 00:01:42 -03:00			`if !ok { t = 0 }`
			`nb.classInstances[class] = t + 1`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`for feat := 0; feat < len(docVector); feat++ {`
			`v := docVector[feat]`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`// In Bernoulli Naive Bayes the presence and absence of`
			`// features are considered. All non-zero values are`
			`// treated as presence.`
			`if v > 0 {`
			`// Update number of times this feature appeared within`
			`// given label.`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`t, ok := docsContainingTerm[class]`
			`if !ok {`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`t = make([]int, nb.features)`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`docsContainingTerm[class] = t`
			`}`
			`t[feat] += 1`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`}`
			`}`
			`}`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00
			`// Pre-calculate conditional probabilities for each class`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`for c, _ := range nb.classInstances {`
			`nb.condProb[c] = make([]float64, nb.features)`
			`for feat := 0; feat < nb.features; feat++ {`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`classTerms, _ := docsContainingTerm[c]`
			`numDocs := classTerms[feat]`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`docsInClass, _ := nb.classInstances[c]`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`classCondProb, _ := nb.condProb[c]`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`// Calculate conditional probability with laplace smoothing`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`classCondProb[feat] = float64(numDocs + 1) / float64(docsInClass + 1)`
Refactoring for base.Instances Refactored the algorithm to use base.Instances. Rewrote the Fit method to pre-calculate priors and conditional probabilities. 2014-05-18 23:23:51 -03:00			`}`
			`}`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00			`}`

Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`// Use trained model to predict test vector's class. The following`
			`// operation is used in order to score each class:`
			`//`
			`// classScore = log(p(c)) + \sum_{f}{log(p(f\|c))}`
			`//`
			`// PredictOne returns the string that represents the predicted class.`
			`//`
			`// IMPORTANT: PredictOne panics if Fit was not called or if the`
			`// document vector and train matrix have a different number of columns.`
			`func (nb *BernoulliNBClassifier) PredictOne(vector []float64) string {`
			`if nb.features == 0 {`
			`panic("Fit should be called before predicting")`
			`}`

			`if len(vector) != nb.features {`
			`panic("Different dimensions in Train and Test sets")`
			`}`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`// Currently only the predicted class is returned.`
			`bestScore := -math.MaxFloat64`
			`bestClass := ""`
Bernoulli Naive Bayes: first draft This is the first draft of the bernoulli naive bayes implementation. It is missing the Fit function tests and the Predict function. 2014-05-11 21:00:28 -03:00
Removed class prior pre-calculation Since the number of instances in each class are stored, there is no need to keep the pre-calculated priors. 2014-06-08 00:01:42 -03:00			`for class, classCount := range nb.classInstances {`
			`// Init classScore with log(prior)`
			`classScore := math.Log((float64(classCount))/float64(nb.trainingInstances))`
Added Predict function Added predict function along with its test. Current interface is the same of the KNN example. In other words, only the class string is returned from the PredictOne function. 2014-05-20 22:59:03 -03:00			`for f := 0; f < nb.features; f++ {`
			`if vector[f] > 0 {`
			`// Test document has feature c`
			`classScore += math.Log(nb.condProb[class][f])`
			`} else {`
			`if nb.condProb[class][f] == 1.0 {`
			`// special case when prob = 1.0, consider laplace`
			`// smooth`
			`classScore += math.Log(1.0 / float64(nb.classInstances[class] + 1))`
			`} else {`
			`classScore += math.Log(1.0 - nb.condProb[class][f])`
			`}`
			`}`
			`}`

			`if classScore > bestScore {`
			`bestScore = classScore`
			`bestClass = class`
			`}`
			`}`

			`return bestClass`
			`}`

			`// Predict is just a wrapper for the PredictOne function.`
			`//`
			`// IMPORTANT: Predict panics if Fit was not called or if the`
			`// document vector and train matrix have a different number of columns.`
			`func (nb BernoulliNBClassifier) Predict(what base.Instances) *base.Instances {`
			`ret := what.GeneratePredictionVector()`
			`for i := 0; i < what.Rows; i++ {`
			`ret.SetAttrStr(i, 0, nb.PredictOne(what.GetRowVectorWithoutClass(i)))`
			`}`
			`return ret`
			`}`