From a6072ac9de4c6c74d4d8b57072529e09efdf17f6 Mon Sep 17 00:00:00 2001
From: Richard Townsend
Date: Mon, 19 May 2014 12:59:11 +0100
Subject: [PATCH] Package documentation

---
 ensemble/ensemble.go | 13 +++++++++++++
 meta/bagging.go      | 10 +++++++++-
 trees/id3.go         |  7 ++++++-
 trees/random.go      | 12 ++++++++++--
 trees/trees.go       | 28 ++++++++++++++++++++++++++--
 5 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 ensemble/ensemble.go

diff --git a/ensemble/ensemble.go b/ensemble/ensemble.go
new file mode 100644
index 0000000..a4bcd77
--- /dev/null
+++ b/ensemble/ensemble.go
@@ -0,0 +1,13 @@
+/*
+
+	Ensemble contains classifiers which combine the predictions of other classifiers.
+
+	RandomForest:
+	  Generates ForestSize bagged decision trees (currently ID3-based),
+	  each considering a fixed number of random features.
+
+	  Built on meta.Bagging
+
+*/
+
+package ensemble
\ No newline at end of file
diff --git a/meta/bagging.go b/meta/bagging.go
index 8f821cb..c6e8431 100644
--- a/meta/bagging.go
+++ b/meta/bagging.go
@@ -8,7 +8,7 @@ import (
 	"strings"
 )
 
-// BaggedModels train Classifiers on subsets of the original
+// BaggedModel trains base.Classifiers on subsets of the original
 // Instances and combine the results through voting
 type BaggedModel struct {
 	base.BaseClassifier
@@ -17,6 +17,8 @@ type BaggedModel struct {
 	RandomFeatures int
 }
 
+// generateTrainingAttrs selects RandomFeatures number of base.Attributes from
+// the provided base.Instances.
 func (b *BaggedModel) generateTrainingAttrs(model int, from *base.Instances) []base.Attribute {
 	ret := make([]base.Attribute, 0)
 	if b.RandomFeatures == 0 {
@@ -51,11 +53,17 @@ func (b *BaggedModel) generateTrainingAttrs(model int, from *base.Instances) []b
 	return ret
 }
 
+// generatePredictionInstances returns a modified version of the
+// requested base.Instances with only the base.Attributes selected
+// for training the model.
 func (b *BaggedModel) generatePredictionInstances(model int, from *base.Instances) *base.Instances {
 	selected := b.selectedAttributes[model]
 	return from.SelectAttributes(selected)
}
 
+// generateTrainingInstances generates RandomFeatures number of
+// attributes and returns a modified version of base.Instances
+// for training the model.
 func (b *BaggedModel) generateTrainingInstances(model int, from *base.Instances) *base.Instances {
 	insts := from.SampleWithReplacement(from.Rows)
 	selected := b.generateTrainingAttrs(model, from)
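The comments added to meta/bagging.go above describe two randomisation steps: bootstrap-sampling rows with replacement, then drawing a fixed-size random subset of attributes. A minimal, self-contained sketch of those two steps, independent of golearn (sampleWithReplacement and selectRandomFeatures are illustrative names, not the library's API):

    package main

    import (
    	"fmt"
    	"math/rand"
    )

    // sampleWithReplacement mirrors what from.SampleWithReplacement(from.Rows)
    // does conceptually: draw row indices, each draw allowed to repeat.
    func sampleWithReplacement(rows int) []int {
    	sample := make([]int, rows)
    	for i := range sample {
    		sample[i] = rand.Intn(rows)
    	}
    	return sample
    }

    // selectRandomFeatures mirrors generateTrainingAttrs: choose k distinct
    // attribute indices out of total (the class attribute is excluded upstream).
    func selectRandomFeatures(total, k int) []int {
    	return rand.Perm(total)[:k]
    }

    func main() {
    	fmt.Println("bootstrap rows: ", sampleWithReplacement(6))
    	fmt.Println("chosen features:", selectRandomFeatures(10, 4))
    }

Each bagged model sees a different bootstrap sample and (when RandomFeatures is non-zero) a different attribute subset, which is what decorrelates the trees before voting.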
diff --git a/trees/id3.go b/trees/id3.go
index 27f052a..0258a1d 100644
--- a/trees/id3.go
+++ b/trees/id3.go
@@ -109,6 +109,8 @@ func InferID3Tree(from *base.Instances, with RuleGenerator) *DecisionTreeNode {
 	return ret
 }
 
+// getNestedString returns the contents of node d
+// prefixed by level tab characters (also prints children)
 func (d *DecisionTreeNode) getNestedString(level int) string {
 	buf := bytes.NewBuffer(nil)
 	tmp := bytes.NewBuffer(nil)
@@ -143,6 +145,7 @@ func (d *DecisionTreeNode) String() string {
 	return d.getNestedString(0)
 }
 
+// computeAccuracy is a helper function for Prune()
 func computeAccuracy(predictions *base.Instances, from *base.Instances) float64 {
 	cf := eval.GetConfusionMatrix(from, predictions)
 	return eval.GetAccuracy(cf)
@@ -231,6 +234,8 @@ type ID3DecisionTree struct {
 	PruneSplit float64
 }
 
+// NewID3DecisionTree returns a new ID3DecisionTree with the specified
+// test-prune ratio. If the ratio is less than 0.001, the tree isn't pruned.
 func NewID3DecisionTree(prune float64) *ID3DecisionTree {
 	return &ID3DecisionTree{
 		base.BaseClassifier{},
@@ -256,7 +261,7 @@ func (t *ID3DecisionTree) Predict(what *base.Instances) *base.Instances {
 	return t.Root.Predict(what)
 }
 
-// String returns a human-readable ID3 tree
+// String returns a human-readable version of this ID3 tree
 func (t *ID3DecisionTree) String() string {
 	return fmt.Sprintf("ID3DecisionTree(%s\n)", t.Root)
 }
diff --git a/trees/random.go b/trees/random.go
index 02697ca..0a47878 100644
--- a/trees/random.go
+++ b/trees/random.go
@@ -6,13 +6,14 @@ import (
 	"math/rand"
 )
 
+// RandomTreeRuleGenerator is used to generate decision rules for Random Trees
 type RandomTreeRuleGenerator struct {
 	Attributes   int
 	internalRule InformationGainRuleGenerator
 }
 
-// So WEKA returns a couple of possible attributes and evaluates
-// the split criteria on each
+// GenerateSplitAttribute returns, out of the attributes randomly chosen,
+// the one which maximises Information Gain
 func (r *RandomTreeRuleGenerator) GenerateSplitAttribute(f *base.Instances) base.Attribute {
 
 	// First step is to generate the random attributes that we'll consider
@@ -44,12 +45,16 @@ func (r *RandomTreeRuleGenerator) GenerateSplitAttribute(f *base.Instances) base
 	return r.internalRule.GetSplitAttributeFromSelection(consideredAttributes, f)
 }
 
+// RandomTree builds a decision tree by considering a fixed number
+// of randomly-chosen attributes at each node
 type RandomTree struct {
 	base.BaseClassifier
 	Root *DecisionTreeNode
 	Rule *RandomTreeRuleGenerator
 }
 
+// NewRandomTree returns a new RandomTree which considers attrs
+// randomly-chosen attributes at each node.
 func NewRandomTree(attrs int) *RandomTree {
 	return &RandomTree{
 		base.BaseClassifier{},
@@ -71,10 +76,13 @@ func (rt *RandomTree) Predict(from *base.Instances) *base.Instances {
 	return rt.Root.Predict(from)
 }
 
+// String returns a human-readable representation of this structure
 func (rt *RandomTree) String() string {
 	return fmt.Sprintf("RandomTree(%s)", rt.Root)
 }
 
+// Prune removes nodes from the tree which are detrimental
+// to classification accuracy on the test set (with)
 func (rt *RandomTree) Prune(with *base.Instances) {
 	rt.Root.Prune(with)
 }
diff --git a/trees/trees.go b/trees/trees.go
index f847e5d..0af00aa 100644
--- a/trees/trees.go
+++ b/trees/trees.go
@@ -1,2 +1,26 @@
-// Package trees provides a number of tree based ensemble learners.
-package trees
+/*
+
+	This package implements decision trees.
+
+	ID3DecisionTree:
+	  Builds a decision tree using the ID3 algorithm
+	  by picking the Attribute which maximises
+	  Information Gain at each node.
+
+	  Attributes must be CategoricalAttributes at
+	  present, so discretise beforehand (see
+	  filters)
+
+	RandomTree:
+	  Builds a decision tree using the ID3 algorithm
+	  by picking the Attribute, amongst those
+	  randomly selected, that maximises Information
+	  Gain.
+
+	  Attributes must be CategoricalAttributes at
+	  present, so discretise beforehand (see
+	  filters)
+
+*/
+
+package trees
\ No newline at end of file
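For completeness, a hedged usage sketch of the types documented in this patch. NewID3DecisionTree, NewRandomTree, Predict and Prune appear in the hunks above; the import paths, ParseCSVToInstances, InstancesTrainTestSplit and Fit are assumptions about the surrounding golearn API at this commit and may need adjusting:

    package main

    import (
    	"fmt"

    	"github.com/sjwhitworth/golearn/base"
    	"github.com/sjwhitworth/golearn/trees"
    )

    func main() {
    	// Assumed CSV loader; any *base.Instances source would do.
    	inst, err := base.ParseCSVToInstances("datasets/iris_headers.csv", true)
    	if err != nil {
    		panic(err)
    	}
    	// Assumed helper splitting Instances into train/test portions.
    	train, test := base.InstancesTrainTestSplit(inst, 0.6)

    	// 0.6 is the test-prune ratio; below 0.001 no pruning happens.
    	id3 := trees.NewID3DecisionTree(0.6)
    	id3.Fit(train) // Fit is assumed from the base.BaseClassifier contract
    	fmt.Println(id3.Predict(test))

    	// A RandomTree considering 3 randomly-chosen attributes per node.
    	rt := trees.NewRandomTree(3)
    	rt.Fit(train)
    	rt.Prune(test) // prune against held-out data, per the Prune doc above
    	fmt.Println(rt.Predict(test))
    }

Both classifiers require CategoricalAttributes, as the trees.go package comment notes, so continuous columns should be discretised (see filters) before Fit is called.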