golearn/naive/bernoulli_nb.go

package naive

import (
    "github.com/gonum/matrix/mat64"
    base "github.com/sjwhitworth/golearn/base"
)

// BernoulliNBClassifier is a Bernoulli Naive Bayes classifier. Naive
// Bayes classifiers assume that feature probabilities are independent
// of each other. To classify an instance, the probability that it was
// generated by each known class is calculated; that is, for each class
// C, the following probability is computed:
//
// p(C|F1, F2, F3... Fn)
//
// where F1, F2... Fn are the features of the instance. Using Bayes'
// theorem this can be written as:
//
// \frac{p(C) \times p(F1, F2... Fn|C)}{p(F1, F2... Fn)}
//
// In Bernoulli Naive Bayes the features are considered independent
// booleans, which means that the likelihood of a document given a
// class C is:
//
// p(F1, F2... Fn|C) =
// \prod_{i=1}^{n}{[F_i \times p(f_i|C) + (1 - F_i)(1 - p(f_i|C))]}
//
// where
//     - F_i equals 1 if the feature is present in the vector and 0
//       otherwise
//     - p(f_i|C) is the probability of class C generating the feature
//       f_i
//
// For more information:
//
// C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to
// Information Retrieval. Cambridge University Press, pp. 234-265.
// http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
    // Embedded base estimator. Fit stores the per-class feature counts
    // in its Data matrix (one row per class, one column per feature).
    base.BaseEstimator
    // Number of instances in each class, indexed by class label. Used
    // for calculating the prior probability and p(f_i|C).
    classInstances []int
}

// NewBernoulliNBClassifier returns a new, untrained Bernoulli Naive
// Bayes classifier. The argument 'classes' is the number of possible
// labels in the classification task; one instance counter is allocated
// per label, all starting at zero.
func NewBernoulliNBClassifier(classes int) *BernoulliNBClassifier {
    return &BernoulliNBClassifier{
        classInstances: make([]int, classes),
    }
}

// Fit fills the data matrix with the Bernoulli Naive Bayes model. All
// values necessary for calculating the prior probability and p(f_i|C)
// are gathered here:
//     - nb.Data is replaced by a (classes x features) matrix in which
//       entry (C, i) is the number of training instances labelled C
//       whose feature i is present
//     - nb.classInstances[C] is set to the number of training
//       instances labelled C
//
// X holds one training instance per row and one feature per column;
// y[r] is the label of row r and must lie in [0, classes), where
// classes is the value given to NewBernoulliNBClassifier.
//
// Every call trains from scratch: counts from a previous call are
// discarded. Fit panics with mat64.ErrShape if the number of rows in X
// differs from len(y), and panics if any label is out of range. In
// both cases the previously fitted model is left untouched.
func (nb *BernoulliNBClassifier) Fit(X *mat64.Dense, y []int) {
    instances, features := X.Dims()
    if instances != len(y) {
        panic(mat64.ErrShape)
    }

    // Validate every label before touching the model, so that a bad
    // label cannot leave the classifier half-updated.
    classes := len(nb.classInstances)
    for _, label := range y {
        if label < 0 || label >= classes {
            panic("naive: label out of range for BernoulliNBClassifier")
        }
    }

    // The feature counts are rebuilt below, so the per-class instance
    // counts must restart from zero as well; otherwise a second call
    // would leave the two out of sync.
    for i := range nb.classInstances {
        nb.classInstances[i] = 0
    }
    nb.Data = mat64.NewDense(classes, features, nil)

    for r, label := range y {
        nb.classInstances[label]++

        for c := 0; c < features; c++ {
            // In Bernoulli Naive Bayes only the presence or absence of
            // a feature matters. Strictly positive values are treated
            // as presence; zero and negative values as absence.
            if X.At(r, c) > 0 {
                // Update number of times this feature appeared within
                // given label.
                nb.Data.Set(label, c, nb.Data.At(label, c)+1.0)
            }
        }
    }
}