package naive

import (
	"math"

	base "github.com/sjwhitworth/golearn/base"
)

// BernoulliNBClassifier is a Bernoulli Naive Bayes classifier. Naive
// Bayes classifiers assume that feature probabilities are independent.
// To classify an instance, the probability that it was generated by
// each known class is calculated, that is, for each class C the
// following probability is computed:
//
//     p(C|F1, F2, F3... Fn)
//
// where F1, F2... Fn are the instance features. Using Bayes' theorem
// this can be written as:
//
//     \frac{p(C) \times p(F1, F2... Fn|C)}{p(F1, F2... Fn)}
//
// In Bernoulli Naive Bayes the features are treated as independent
// booleans, which means that the likelihood of a document given a
// class C is:
//
//     p(F1, F2... Fn|C) =
//         \prod_{i=1}^{n}{[F_i \times p(f_i|C) + (1 - F_i)(1 - p(f_i|C))]}
//
// where
//     - F_i equals 1 if the feature is present in the vector and 0
//       otherwise
//     - p(f_i|C) is the probability of class C generating the feature
//       f_i
//
// For more information:
//
//     C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to
//     Information Retrieval. Cambridge University Press, pp. 234-265.
//     http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
	base.BaseEstimator
	// Logarithm of each class prior
	logClassPrior map[string]float64
	// Log of the conditional probability for each term, accessed as
	// p(f|c) = logCondProb[c][f]. Logarithms are used to avoid
	// underflow.
	logCondProb map[string][]float64
}
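
// logPosteriorSketch is an illustrative sketch, not part of the
// golearn API: it shows how the two tables above combine at
// prediction time, scoring each class as its log-prior plus the sum
// of per-feature Bernoulli log-likelihoods from the likelihood
// formula in the type documentation. The binarized feature vector
// 'doc' is a hypothetical input.
func logPosteriorSketch(nb *BernoulliNBClassifier, doc []bool) map[string]float64 {
	scores := make(map[string]float64)
	for class, prior := range nb.logClassPrior {
		s := prior
		for i, present := range doc {
			logp := nb.logCondProb[class][i] // log p(f_i|C)
			if present {
				s += logp
			} else {
				// log(1 - p(f_i|C)), recovering p from its logarithm
				s += math.Log(1 - math.Exp(logp))
			}
		}
		scores[class] = s
	}
	return scores
}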

// NewBernoulliNBClassifier creates a new Bernoulli Naive Bayes
// classifier with empty prior and conditional probability tables,
// which are filled in by Fit.
func NewBernoulliNBClassifier() *BernoulliNBClassifier {
	nb := BernoulliNBClassifier{}
	nb.logCondProb = make(map[string][]float64)
	nb.logClassPrior = make(map[string]float64)
	return &nb
}
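
// Typical usage, sketched under the assumption that the training set
// is loaded with golearn's CSV helper (the loader call and the file
// name are illustrative, not prescribed by this package):
//
//	inst, err := base.ParseCSVToInstances("train.csv", true)
//	if err != nil {
//		panic(err)
//	}
//	nb := NewBernoulliNBClassifier()
//	nb.Fit(inst)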

// Fit computes the Bernoulli Naive Bayes model from the training
// data, pre-calculating every value needed for prediction: the class
// priors and the conditional probabilities p(f_i|C).
func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
	// Number of instances in each class
	classInstances := make(map[string]int)

	// Number of documents containing each term, indexed by class
	docsContainingTerm := make(map[string][]int)

	// This algorithm could be vectorized after binarizing the data
	// matrix. Since mat64 doesn't provide that operation, an iterative
	// version is used instead.
	for r := 0; r < X.Rows; r++ {
		class := X.GetClass(r)

		// Increment the number of instances in this class. A missing
		// key yields the zero value, so no existence check is needed.
		classInstances[class]++

		for feat := 0; feat < X.Cols; feat++ {
			v := X.Get(r, feat)
			// In Bernoulli Naive Bayes both the presence and the
			// absence of a feature are informative. All non-zero
			// values are treated as presence.
			if v > 0 {
				// Update the number of documents in this class that
				// contain the feature.
				t, ok := docsContainingTerm[class]
				if !ok {
					t = make([]int, X.Cols)
					docsContainingTerm[class] = t
				}
				t[feat]++
			}
		}
	}
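
	// Worked example with hypothetical data: if the training set has
	// three rows labelled "spam" and feature 2 is non-zero in two of
	// them, the loop above leaves classInstances["spam"] == 3 and
	// docsContainingTerm["spam"][2] == 2.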

	// Pre-calculate the log-prior and the conditional probabilities
	// for each class.
	for c, docsInClass := range classInstances {
		nb.logClassPrior[c] = math.Log(float64(docsInClass) / float64(X.Rows))
		nb.logCondProb[c] = make([]float64, X.Cols)
		classTerms := docsContainingTerm[c]
		for feat := 0; feat < X.Cols; feat++ {
			// classTerms is nil when no document of this class
			// contains any term; every feature then counts as absent.
			numDocs := 0
			if classTerms != nil {
				numDocs = classTerms[feat]
			}
			// Conditional probability with Laplace smoothing
			nb.logCondProb[c][feat] = math.Log(float64(numDocs+1) / float64(docsInClass+1))
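			// Worked example: if 2 of 3 documents in class c contain
			// feature f, the smoothed estimate is
			// log((2+1)/(3+1)) = log(0.75) ≈ -0.288.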
		}
	}
}