mirror of
https://github.com/sjwhitworth/golearn.git
synced 2025-04-25 13:48:49 +08:00

Avoid quadratic loop in getNumericAttributeEntropy: we don't need to recalculate the whole distribution for each split, just move the changed values. Also use an array of slices instead of a map of maps of strings to avoid map overhead. For our case I see time reductions from 100+ hours to 50 minutes. I've added a benchmark with synthetic data (iris.csv repeated 100 times) and it also shows a nice improvement: name old time/op new time/op delta; RandomForestFit-8 117s ± 4% → 0s ± 1%, -99.61% (p=0.001 n=5+10). The 0 is a rounding quirk of benchstat; it should be closer to 0.5s: name time/op; RandomForestFit-8 460ms ± 1%.
21 lines
491 B
Go
21 lines
491 B
Go
package trees_test
|
|
|
|
import (
|
|
"github.com/sjwhitworth/golearn/base"
|
|
"github.com/sjwhitworth/golearn/ensemble"
|
|
"testing"
|
|
)
|
|
|
|
func BenchmarkRandomForestFit(b *testing.B) {
|
|
// benchdata.csv contains ../examples/datasets/iris.csv repeated 100 times.
|
|
data, err := base.ParseCSVToInstances("benchdata.csv", true)
|
|
if err != nil {
|
|
b.Fatalf("Cannot load benchdata.csv err:\n%v", err)
|
|
}
|
|
b.ResetTimer()
|
|
tree := ensemble.NewRandomForest(20, 4)
|
|
for i := 0; i < b.N; i++ {
|
|
tree.Fit(data)
|
|
}
|
|
}
|