package naive

import (
	"math"

	base "github.com/sjwhitworth/golearn/base"
)

// BernoulliNBClassifier is a Bernoulli Naive Bayes classifier. Naive
// Bayes classifiers assume that feature probabilities are independent.
// To classify an instance, the probability that it was generated by
// each known class is calculated, that is, for each class C the
// following probability is computed:
//
//     p(C|F1, F2, F3... Fn)
//
// where F1, F2... Fn are the instance features. Using Bayes' theorem
// this can be written as:
//
//     \frac{p(C) \times p(F1, F2... Fn|C)}{p(F1, F2... Fn)}
//
// In Bernoulli Naive Bayes the features are treated as independent
// booleans, which means that the likelihood of a document given a
// class C is:
//
//     p(F1, F2... Fn|C) =
//         \prod_{i=1}^{n}{[F_i \times p(f_i|C) + (1 - F_i)(1 - p(f_i|C))]}
//
// where
//     - F_i equals 1 if the feature is present in the vector and 0
//       otherwise
//     - p(f_i|C) is the probability of class C generating the feature
//       f_i
//
// For more information:
//
//     C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to
//     Information Retrieval. Cambridge University Press, pp. 234-265.
//     http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
	base.BaseEstimator
	// Logarithm of each class prior
	logClassPrior map[string]float64
	// Log of the conditional probability for each term, accessed as
	// p(f|c) = logCondProb[c][f]. Logarithms are used to avoid
	// underflow.
	logCondProb map[string][]float64
}
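
// logPosteriorSketch is an illustrative sketch, not part of the
// golearn API: it shows how the two tables above combine at
// prediction time, scoring each class as its log-prior plus the sum
// of per-feature Bernoulli log-likelihoods from the likelihood
// formula in the type documentation. The binarized feature vector
// 'doc' is a hypothetical input.
func logPosteriorSketch(nb *BernoulliNBClassifier, doc []bool) map[string]float64 {
	scores := make(map[string]float64)
	for class, prior := range nb.logClassPrior {
		s := prior
		for i, present := range doc {
			logp := nb.logCondProb[class][i] // log p(f_i|C)
			if present {
				s += logp
			} else {
				// log(1 - p(f_i|C)), recovering p from its logarithm
				s += math.Log(1 - math.Exp(logp))
			}
		}
		scores[class] = s
	}
	return scores
}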

// NewBernoulliNBClassifier creates a new Bernoulli Naive Bayes
// classifier with empty prior and conditional probability tables,
// which are filled in by Fit.
func NewBernoulliNBClassifier() *BernoulliNBClassifier {
	nb := BernoulliNBClassifier{}
	nb.logCondProb = make(map[string][]float64)
	nb.logClassPrior = make(map[string]float64)
	return &nb
}
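
// Typical usage, sketched under the assumption that the training set
// is loaded with golearn's CSV helper (the loader call and the file
// name are illustrative, not prescribed by this package):
//
//	inst, err := base.ParseCSVToInstances("train.csv", true)
//	if err != nil {
//		panic(err)
//	}
//	nb := NewBernoulliNBClassifier()
//	nb.Fit(inst)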

// Fit computes the Bernoulli Naive Bayes model from the training
// data, pre-calculating every value needed for prediction: the class
// priors and the conditional probabilities p(f_i|C).
func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
	// Number of instances in each class
	classInstances := make(map[string]int)

	// Number of documents containing each term, indexed by class
	docsContainingTerm := make(map[string][]int)

	// This algorithm could be vectorized after binarizing the data
	// matrix. Since mat64 doesn't provide that operation, an iterative
	// version is used instead.
	for r := 0; r < X.Rows; r++ {
		class := X.GetClass(r)

		// Increment the number of instances in this class. A missing
		// key yields the zero value, so no existence check is needed.
		classInstances[class]++

		for feat := 0; feat < X.Cols; feat++ {
			v := X.Get(r, feat)
			// In Bernoulli Naive Bayes both the presence and the
			// absence of a feature are informative. All non-zero
			// values are treated as presence.
			if v > 0 {
				// Update the number of documents in this class that
				// contain the feature.
				t, ok := docsContainingTerm[class]
				if !ok {
					t = make([]int, X.Cols)
					docsContainingTerm[class] = t
				}
				t[feat]++
			}
		}
	}
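
	// Worked example with hypothetical data: if the training set has
	// three rows labelled "spam" and feature 2 is non-zero in two of
	// them, the loop above leaves classInstances["spam"] == 3 and
	// docsContainingTerm["spam"][2] == 2.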

	// Pre-calculate the log-prior and the conditional probabilities
	// for each class.
	for c, docsInClass := range classInstances {
		nb.logClassPrior[c] = math.Log(float64(docsInClass) / float64(X.Rows))
		nb.logCondProb[c] = make([]float64, X.Cols)
		classTerms := docsContainingTerm[c]
		for feat := 0; feat < X.Cols; feat++ {
			// classTerms is nil when no document of this class
			// contains any term; every feature then counts as absent.
			numDocs := 0
			if classTerms != nil {
				numDocs = classTerms[feat]
			}
			// Conditional probability with Laplace smoothing
			nb.logCondProb[c][feat] = math.Log(float64(numDocs+1) / float64(docsInClass+1))
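			// Worked example: if 2 of 3 documents in class c contain
			// feature f, the smoothed estimate is
			// log((2+1)/(3+1)) = log(0.75) ≈ -0.288.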
		}
	}
}