package naive

import (
	"math"

	base "github.com/sjwhitworth/golearn/base"
)

// BernoulliNBClassifier is a Bernoulli Naive Bayes classifier. Naive
// Bayes classifiers assume that feature probabilities are independent.
// To classify an instance, the probability that it was generated by
// each known class is calculated; that is, for each class C the
// following probability is computed:
//
//     p(C|F1, F2, ... Fn)
//
// where F1, F2, ... Fn are the instance features. Using Bayes' theorem
// this can be written as:
//
//     \frac{p(C) \times p(F1, F2, ... Fn|C)}{p(F1, F2, ... Fn)}
//
// In Bernoulli Naive Bayes the features are treated as independent
// booleans, so the likelihood of a document given a class C is:
//
//     p(F1, F2, ... Fn|C) =
//         \prod_{i=1}^{n} [F_i p(f_i|C) + (1 - F_i)(1 - p(f_i|C))]
//
// where
//   - F_i equals 1 if the feature is present in the vector and 0
//     otherwise
//   - p(f_i|C) is the probability of class C generating the feature
//     f_i
//
// For more information:
//
// C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to
// Information Retrieval. Cambridge University Press, pp. 234-265.
// http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
type BernoulliNBClassifier struct {
	base.BaseEstimator
	// Logarithm of each class prior.
	logClassPrior map[string]float64
	// Log of the conditional probability for each term. This slice
	// should be accessed as p(f|c) = logCondProb[c][f]. Logarithms are
	// used in order to avoid underflow.
	logCondProb map[string][]float64
}

// NewBernoulliNBClassifier creates a new Bernoulli Naive Bayes
// classifier. The class labels and the number of features are inferred
// from the training data passed to Fit.
func NewBernoulliNBClassifier() *BernoulliNBClassifier {
	nb := BernoulliNBClassifier{}
	nb.logCondProb = make(map[string][]float64)
	nb.logClassPrior = make(map[string]float64)
	return &nb
}

// Fit estimates the Bernoulli Naive Bayes model from the data matrix,
// computing all values necessary for the class priors and the
// conditional probabilities p(f_i|C).
func (nb *BernoulliNBClassifier) Fit(X *base.Instances) {
	// Number of instances in each class.
	classInstances := make(map[string]int)
	// Number of documents containing each term, by class.
	docsContainingTerm := make(map[string][]int)

	// This algorithm could be vectorized after binarizing the data
	// matrix. Since mat64 doesn't have this function, an iterative
	// version is used.
	for r := 0; r < X.Rows; r++ {
		class := X.GetClass(r)

		// Increment the number of instances in this class, and make
		// sure its term-count slice exists even if every feature in
		// the class turns out to be zero.
		classInstances[class]++
		if _, ok := docsContainingTerm[class]; !ok {
			docsContainingTerm[class] = make([]int, X.Cols)
		}

		for feat := 0; feat < X.Cols; feat++ {
			v := X.Get(r, feat)
			// Bernoulli Naive Bayes considers only the presence or
			// absence of features: all non-zero values are treated as
			// presence.
			if v > 0 {
				// Update the number of times this feature appeared
				// within the given label.
				docsContainingTerm[class][feat]++
			}
		}
	}

	// Pre-calculate the conditional probabilities for each class.
	for c := range classInstances {
		nb.logClassPrior[c] = math.Log(float64(classInstances[c]) / float64(X.Rows))
		nb.logCondProb[c] = make([]float64, X.Cols)
		for feat := 0; feat < X.Cols; feat++ {
			numDocs := docsContainingTerm[c][feat]
			docsInClass := classInstances[c]
			// Conditional probability with add-one (Laplace)
			// smoothing, as in the Bernoulli model of Manning et al.:
			// (docs containing term + 1) / (docs in class + 2). The
			// +2 keeps the estimate strictly between 0 and 1, so
			// log(1 - p(f_i|C)) is always defined at prediction time.
			nb.logCondProb[c][feat] = math.Log(float64(numDocs+1) / float64(docsInClass+2))
		}
	}
}
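
// The doc comment on BernoulliNBClassifier gives the decision rule
//
//     argmax_C { log p(C) + \sum_i log[F_i p(f_i|C) + (1 - F_i)(1 - p(f_i|C))] }
//
// but this file only covers fitting. The sketch below shows how that
// rule could be applied with the fitted log-probabilities. It is an
// illustration, not part of the golearn API: the name predictOneSketch
// and the []float64 vector representation are assumptions.
func predictOneSketch(nb *BernoulliNBClassifier, vector []float64) string {
	bestScore := math.Inf(-1)
	bestClass := ""
	for class, prior := range nb.logClassPrior {
		// Start from the log prior, log p(C).
		score := prior
		for feat, v := range vector {
			logP := nb.logCondProb[class][feat]
			if v > 0 {
				// Feature present: contribute log p(f_i|C).
				score += logP
			} else {
				// Feature absent: contribute log(1 - p(f_i|C)).
				score += math.Log(1 - math.Exp(logP))
			}
		}
		if score > bestScore {
			bestScore = score
			bestClass = class
		}
	}
	return bestClass
}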
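
// A usage sketch for Fit, assuming a CSV of binary features with a
// header row. base.ParseCSVToInstances is part of golearn's base
// package; the file name "binary_features.csv" is a placeholder:
//
//	X, err := base.ParseCSVToInstances("binary_features.csv", true)
//	if err != nil {
//		panic(err)
//	}
//	nb := NewBernoulliNBClassifier()
//	nb.Fit(X)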