// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat

import (
	"math"
	"sort"

	"gonum.org/v1/gonum/floats"
)

// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
type CumulantKind int

// List of supported CumulantKind values for the Quantile function.
// Constant values should match the R nomenclature. See
// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
const (
	// Empirical treats the distribution as the actual empirical distribution.
	Empirical CumulantKind = 1

	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
	LinInterp CumulantKind = 4
)

// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by:
//
//	\sum_i \sqrt{p_i q_i}
//
// It is assumed that p and q have equal length.
func bhattacharyyaCoeff(p, q []float64) float64 {
	var bc float64
	for i, a := range p {
		bc += math.Sqrt(a * q[i])
	}
	return bc
}

// Bhattacharyya computes the distance between the probability distributions p and q given by:
//
//	-\ln ( \sum_i \sqrt{p_i q_i} )
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Bhattacharyya(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return -math.Log(bc)
}
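
// exampleBhattacharyya is an illustrative sketch, not part of the package API:
// the two distributions below are assumed values that sum to 1, chosen only to
// show how Bhattacharyya is called. Identical distributions give distance 0;
// the distance grows as the overlap \sum_i \sqrt{p_i q_i} shrinks.
func exampleBhattacharyya() float64 {
	p := []float64{0.25, 0.25, 0.25, 0.25}
	q := []float64{0.1, 0.2, 0.3, 0.4}
	return Bhattacharyya(p, q)
}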

// CDF returns the empirical cumulative distribution function value of q, that is,
// the fraction of the samples less than or equal to q. The
// exact behavior is determined by the CumulantKind. CDF is theoretically
// the inverse of the Quantile function, though it may not be the actual inverse
// for all values q and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// CDF will panic if the length of x is zero.
//
// CumulantKind behaviors:
//   - Empirical: Returns the lowest fraction for which q is greater than or equal
//     to that fraction of samples
func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if floats.HasNaN(x) {
		return math.NaN()
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	if q < x[0] {
		return 0
	}
	if q >= x[len(x)-1] {
		return 1
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}

	// Calculate the index.
	switch c {
	case Empirical:
		// Find the smallest value that is greater than that percent of the samples.
		var w float64
		for i, v := range x {
			if v > q {
				return w / sumWeights
			}
			if weights == nil {
				w++
			} else {
				w += weights[i]
			}
		}
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}
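
// exampleCDF is an illustrative sketch with assumed sample values, not taken
// from the package tests. The data must already be sorted in increasing order.
// With nil weights each sample counts once, so the empirical CDF at 2 is 3/5:
// three of the five samples are <= 2.
func exampleCDF() float64 {
	x := []float64{1, 2, 2, 4, 7}
	return CDF(2, Empirical, x, nil)
}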

// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and
// expected frequencies 'exp' given by:
//
//	\sum_i (obs_i-exp_i)^2 / exp_i
//
// The lengths of obs and exp must be equal.
func ChiSquare(obs, exp []float64) float64 {
	if len(obs) != len(exp) {
		panic("stat: slice length mismatch")
	}
	var result float64
	for i, a := range obs {
		b := exp[i]
		if a == 0 && b == 0 {
			continue
		}
		result += (a - b) * (a - b) / b
	}
	return result
}
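
// exampleChiSquare is an illustrative sketch with assumed frequency counts:
// observed die rolls compared against a uniform expectation over six faces.
// It sums (obs_i-exp_i)^2/exp_i; larger values indicate a worse fit.
func exampleChiSquare() float64 {
	obs := []float64{8, 9, 10, 12, 10, 11}
	exp := []float64{10, 10, 10, 10, 10, 10}
	return ChiSquare(obs, exp)
}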

// CircularMean returns the circular mean of the dataset.
//
//	atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i))
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func CircularMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}

	var aX, aY float64
	if weights != nil {
		for i, v := range x {
			aX += weights[i] * math.Cos(v)
			aY += weights[i] * math.Sin(v)
		}
	} else {
		for _, v := range x {
			aX += math.Cos(v)
			aY += math.Sin(v)
		}
	}

	return math.Atan2(aY, aX)
}
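
// exampleCircularMean is an illustrative sketch with assumed angles in radians.
// Angles clustered either side of 0 average to a value near 0; a plain
// arithmetic mean of raw angle values can be misleading for data near ±π,
// which is what the circular mean avoids.
func exampleCircularMean() float64 {
	angles := []float64{-0.2, 0.1, 0.3, -0.1}
	return CircularMean(angles, nil)
}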

// Correlation returns the weighted correlation between the samples of x and y.
//
//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY)
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Correlation(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	var (
		sxx           float64
		syy           float64
		sxy           float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			sxx += xd * xd
			syy += yd * yd
			sxy += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et. al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		sxx -= xcompensation * xcompensation / float64(len(x))
		syy -= ycompensation * ycompensation / float64(len(x))

		return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy)
	}

	var sumWeights float64
	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		xd := xv - xu
		wxd := w * xd
		yd := yv - yu
		wyd := w * yd
		sxx += wxd * xd
		syy += wyd * yd
		sxy += wxd * yd
		xcompensation += wxd
		ycompensation += wyd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et. al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	sxx -= xcompensation * xcompensation / sumWeights
	syy -= ycompensation * ycompensation / sumWeights

	return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy)
}
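
// exampleCorrelation is an illustrative sketch with assumed data: y is a noisy
// increasing function of x, so the correlation should be close to +1.
func exampleCorrelation() float64 {
	x := []float64{1, 2, 3, 4, 5}
	y := []float64{1.1, 1.9, 3.2, 3.9, 5.1}
	return Correlation(x, y, nil)
}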

// Kendall returns the weighted Tau-a Kendall correlation between the
// samples of x and y. The Kendall correlation measures the quantity of
// concordant and discordant pairs of numbers. If weights are specified then
// each pair is weighted by weights[i] * weights[j] and the final sum is
// normalized to stay between -1 and 1.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Kendall(x, y, weights []float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}

	var (
		cc float64 // number of concordant pairs
		dc float64 // number of discordant pairs
		n  = len(x)
	)

	if weights == nil {
		for i := 0; i < n; i++ {
			for j := i; j < n; j++ {
				if i == j {
					continue
				}
				if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
					cc++
				} else {
					dc++
				}
			}
		}
		return (cc - dc) / float64(n*(n-1)/2)
	}

	var sumWeights float64

	for i := 0; i < n; i++ {
		for j := i; j < n; j++ {
			if i == j {
				continue
			}
			weight := weights[i] * weights[j]
			if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
				cc += weight
			} else {
				dc += weight
			}
			sumWeights += weight
		}
	}
	return (cc - dc) / sumWeights
}
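
// exampleKendall is an illustrative sketch with assumed data. x and y move in
// the same direction for every pair, so every pair is concordant and the
// Tau-a correlation is exactly 1.
func exampleKendall() float64 {
	x := []float64{1, 2, 3, 4}
	y := []float64{10, 20, 30, 40}
	return Kendall(x, y, nil)
}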

// Covariance returns the weighted covariance between the samples of x and y.
//
//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Covariance(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	return covarianceMeans(x, y, weights, xu, yu)
}
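
// exampleCovariance is an illustrative sketch with assumed data. With y = 2x
// the covariance equals twice the sample variance of x.
func exampleCovariance() float64 {
	x := []float64{1, 2, 3, 4}
	y := []float64{2, 4, 6, 8}
	return Covariance(x, y, nil)
}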

// covarianceMeans returns the weighted covariance between x and y with the means
// of x and y already specified. See the documentation of Covariance for more
// information.
func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 {
	var (
		ss            float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			ss += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et. al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1)
	}

	var sumWeights float64

	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		wxd := w * (xv - xu)
		yd := yv - yu
		ss += wxd * yd
		xcompensation += wxd
		ycompensation += w * yd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et. al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1)
}

// CrossEntropy computes the cross-entropy between the two distributions specified
// in p and q.
func CrossEntropy(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var ce float64
	for i, v := range p {
		if v != 0 {
			ce -= v * math.Log(q[i])
		}
	}
	return ce
}

// Entropy computes the Shannon entropy of the distribution p. The natural
// logarithm is used.
//
//	- sum_i (p_i * log_e(p_i))
func Entropy(p []float64) float64 {
	var e float64
	for _, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			e -= v * math.Log(v)
		}
	}
	return e
}
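
// exampleEntropies is an illustrative sketch with assumed distributions that
// sum to 1. It ties the three information measures in this file together:
// CrossEntropy(p, q) = Entropy(p) + KullbackLeibler(p, q).
func exampleEntropies() (entropy, crossEntropy, kl float64) {
	p := []float64{0.5, 0.25, 0.25}
	q := []float64{0.25, 0.5, 0.25}
	return Entropy(p), CrossEntropy(p, q), KullbackLeibler(p, q)
}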

// ExKurtosis returns the population excess kurtosis of the sample.
// The kurtosis is defined as the fourth central moment divided by the squared
// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of
// the normal distribution is zero.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func ExKurtosis(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var e float64
		for _, v := range x {
			z := (v - mean) / std
			e += z * z * z * z
		}
		mul, offset := kurtosisCorrection(float64(len(x)))
		return e*mul - offset
	}

	var (
		e          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		e += weights[i] * z * z * z * z
		sumWeights += weights[i]
	}
	mul, offset := kurtosisCorrection(sumWeights)
	return e*mul - offset
}

// kurtosisCorrection returns the correction factors for the sample excess
// kurtosis, where n is the number of samples.
// See https://en.wikipedia.org/wiki/Kurtosis
func kurtosisCorrection(n float64) (mul, offset float64) {
	return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3))
}

// GeometricMean returns the weighted geometric mean of the dataset
//
//	\prod_i {x_i ^ (w_i / \sum_j w_j)}
//
// This only applies with positive x and positive weights. If weights is nil
// then all of the weights are 1. If weights is not nil, then len(x) must equal
// len(weights).
func GeometricMean(x, weights []float64) float64 {
	if weights == nil {
		var s float64
		for _, v := range x {
			s += math.Log(v)
		}
		s /= float64(len(x))
		return math.Exp(s)
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		s += weights[i] * math.Log(v)
		sumWeights += weights[i]
	}
	s /= sumWeights
	return math.Exp(s)
}

// HarmonicMean returns the weighted harmonic mean of the dataset
//
//	\sum_i {w_i} / ( sum_i {w_i / x_i} )
//
// This only applies with positive x and positive weights.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func HarmonicMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// TODO(btracey): Fix this to make it more efficient and avoid allocation.

	// This can be numerically unstable (for example if x is very small).
	// W = \sum_i {w_i}
	// hm = exp(log(W) - log(\sum_i w_i / x_i))

	logs := make([]float64, len(x))
	var W float64
	for i := range x {
		if weights == nil {
			logs[i] = -math.Log(x[i])
			W++
			continue
		}
		logs[i] = math.Log(weights[i]) - math.Log(x[i])
		W += weights[i]
	}

	// Sum all of the logs.
	v := floats.LogSumExp(logs) // This computes log(\sum_i { w_i / x_i}).
	return math.Exp(math.Log(W) - v)
}
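
// exampleMeans is an illustrative sketch with assumed positive data, contrasting
// the three Pythagorean means computed by this package. For non-constant data,
// harmonic mean < geometric mean < arithmetic mean.
func exampleMeans() (harmonic, geometric, arithmetic float64) {
	x := []float64{1, 2, 4, 8}
	return HarmonicMean(x, nil), GeometricMean(x, nil), Mean(x, nil)
}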

// Hellinger computes the distance between the probability distributions p and q given by:
//
//	\sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Hellinger(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return math.Sqrt(1 - bc)
}

// Histogram sums up the weighted number of data points in each bin.
// The weight of data point x[i] will be placed into count[j] if
// dividers[j] <= x < dividers[j+1]. The Span function in the floats package can assist
// with bin creation.
//
// The following conditions on the inputs apply:
//   - The count variable must either be nil or have length of one less than dividers.
//   - The values in dividers must be sorted (use the sort package).
//   - The x values must be sorted.
//   - If weights is nil then all of the weights are 1.
//   - If weights is not nil, then len(x) must equal len(weights).
func Histogram(count, dividers, x, weights []float64) []float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if count == nil {
		count = make([]float64, len(dividers)-1)
	}
	if len(dividers) < 2 {
		panic("histogram: fewer than two dividers")
	}
	if len(count) != len(dividers)-1 {
		panic("histogram: bin count mismatch")
	}
	if !sort.Float64sAreSorted(dividers) {
		panic("histogram: dividers are not sorted")
	}
	if !sort.Float64sAreSorted(x) {
		panic("histogram: x data are not sorted")
	}
	for i := range count {
		count[i] = 0
	}
	if len(x) == 0 {
		return count
	}
	if x[0] < dividers[0] {
		panic("histogram: minimum x value is less than lowest divider")
	}
	if dividers[len(dividers)-1] <= x[len(x)-1] {
		panic("histogram: maximum x value is greater than or equal to highest divider")
	}

	idx := 0
	comp := dividers[idx+1]
	if weights == nil {
		for _, v := range x {
			if v < comp {
				// Still in the current bucket.
				count[idx]++
				continue
			}
			// Find the next divider where v is less than the divider.
			for j := idx + 1; j < len(count); j++ {
				if v < dividers[j+1] {
					idx = j
					comp = dividers[j+1]
					break
				}
			}
			count[idx]++
		}
		return count
	}

	for i, v := range x {
		if v < comp {
			// Still in the current bucket.
			count[idx] += weights[i]
			continue
		}
		// Need to find the next divider where v is less than the divider.
		for j := idx + 1; j < len(count); j++ {
			if v < dividers[j+1] {
				idx = j
				comp = dividers[j+1]
				break
			}
		}
		count[idx] += weights[i]
	}
	return count
}
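
// exampleHistogram is an illustrative sketch with assumed sorted data.
// floats.Span fills dividers with evenly spaced values, here giving the bins
// [0,2), [2,4) and [4,6), so the returned counts are [2 2 1].
func exampleHistogram() []float64 {
	x := []float64{0.5, 1.5, 2.5, 3.0, 5.9}
	dividers := make([]float64, 4)
	floats.Span(dividers, 0, 6)
	return Histogram(nil, dividers, x, nil)
}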

// JensenShannon computes the Jensen-Shannon divergence between the distributions
// p and q. The Jensen-Shannon divergence is defined as
//
//	m = 0.5 * (p + q)
//	JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
//
// Unlike Kullback-Leibler, the Jensen-Shannon distance is symmetric. The value
// is between 0 and ln(2).
func JensenShannon(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var js float64
	for i, v := range p {
		qi := q[i]
		m := 0.5 * (v + qi)
		if v != 0 {
			// Add the KL from p to m.
			js += 0.5 * v * (math.Log(v) - math.Log(m))
		}
		if qi != 0 {
			// Add the KL from q to m.
			js += 0.5 * qi * (math.Log(qi) - math.Log(m))
		}
	}
	return js
}

// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
// Each dataset x and y consists of sample locations and counts, xWeights and
// yWeights, respectively.
//
// x and y may have different lengths, though len(x) must equal len(xWeights), and
// len(y) must equal len(yWeights). Both x and y must be sorted.
//
// Special cases are:
//
//	= 0 if len(x) == len(y) == 0
//	= 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
	if xWeights != nil && len(x) != len(xWeights) {
		panic("stat: slice length mismatch")
	}
	if yWeights != nil && len(y) != len(yWeights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 || len(y) == 0 {
		if len(x) == 0 && len(y) == 0 {
			return 0
		}
		return 1
	}

	if floats.HasNaN(x) {
		return math.NaN()
	}
	if floats.HasNaN(y) {
		return math.NaN()
	}

	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	if !sort.Float64sAreSorted(y) {
		panic("y data are not sorted")
	}

	xWeightsNil := xWeights == nil
	yWeightsNil := yWeights == nil

	var (
		maxDist    float64
		xSum, ySum float64
		xCdf, yCdf float64
		xIdx, yIdx int
	)

	if xWeightsNil {
		xSum = float64(len(x))
	} else {
		xSum = floats.Sum(xWeights)
	}

	if yWeightsNil {
		ySum = float64(len(y))
	} else {
		ySum = floats.Sum(yWeights)
	}

	xVal := x[0]
	yVal := y[0]

	// Algorithm description:
	// The goal is to find the maximum difference in the empirical CDFs for the
	// two datasets. The CDFs are piecewise-constant, and thus the distance
	// between the CDFs will only change at the values themselves.
	//
	// To find the maximum distance, step through the data in ascending order
	// of value between the two datasets. At each step, compute the empirical CDF
	// and compare the local distance with the maximum distance.
	// Due to some corner cases, equal data entries must be tallied simultaneously.
	for {
		switch {
		case xVal < yVal:
			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
		case yVal < xVal:
			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
		case xVal == yVal:
			newX := x[xIdx]
			newY := y[yIdx]
			if newX < newY {
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
			} else if newY < newX {
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			} else {
				// Update them both, they'll be equal next time and the right
				// thing will happen.
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			}
		default:
			panic("unreachable")
		}

		dist := math.Abs(xCdf - yCdf)
		if dist > maxDist {
			maxDist = dist
		}

		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
		// end of either sample list, the distance is as large as it can be.
		if xIdx == len(x) || yIdx == len(y) {
			return maxDist
		}
	}
}
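
// exampleKolmogorovSmirnov is an illustrative sketch with assumed sorted
// samples. The statistic is the largest vertical gap between the two empirical
// CDFs, so identical samples give 0 and fully disjoint samples give 1; here the
// samples are offset by one value, giving a gap of 0.25.
func exampleKolmogorovSmirnov() float64 {
	x := []float64{1, 2, 3, 4}
	y := []float64{2, 3, 4, 5}
	return KolmogorovSmirnov(x, nil, y, nil)
}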

// updateKS gets the next data point from one of the sets. In doing so, it combines
// the weight of all the data points of equal value. Upon return, val is the new
// value of the data set, newCdf is the total combined CDF up until this point,
// and newIdx is the index of the next location in that sample to examine.
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
	// Sum up all the weights of consecutive values that are equal.
	if isNil {
		newCdf = cdf + 1/sum
	} else {
		newCdf = cdf + weights[idx]/sum
	}
	newIdx = idx + 1
	for {
		if newIdx == len(values) {
			return values[newIdx-1], newCdf, newIdx
		}
		if values[newIdx-1] != values[newIdx] {
			return values[newIdx], newCdf, newIdx
		}
		if isNil {
			newCdf += 1 / sum
		} else {
			newCdf += weights[newIdx] / sum
		}
		newIdx++
	}
}

// KullbackLeibler computes the Kullback-Leibler distance between the
// distributions p and q. The natural logarithm is used.
//
//	sum_i(p_i * log(p_i / q_i))
//
// Note that the Kullback-Leibler distance is not symmetric;
// KullbackLeibler(p, q) != KullbackLeibler(q, p).
func KullbackLeibler(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var kl float64
	for i, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			kl += v * (math.Log(v) - math.Log(q[i]))
		}
	}
	return kl
}

// LinearRegression computes the best-fit line
//
//	y = alpha + beta*x
//
// to the data in x and y with the given weights. If origin is true, the
// regression is forced to pass through the origin.
//
// Specifically, LinearRegression computes the values of alpha and
// beta such that the total residual
//
//	\sum_i w[i]*(y[i] - alpha - beta*x[i])^2
//
// is minimized. If origin is true, then alpha is forced to be zero.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	if origin {
		var x2Sum, xySum float64
		for i, xi := range x {
			if weights != nil {
				w = weights[i]
			}
			yi := y[i]
			xySum += w * xi * yi
			x2Sum += w * xi * xi
		}
		beta = xySum / x2Sum

		return 0, beta
	}

	xu, xv := MeanVariance(x, weights)
	yu := Mean(y, weights)
	cov := covarianceMeans(x, y, weights, xu, yu)
	beta = cov / xv
	alpha = yu - beta*xu
	return alpha, beta
}
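
// exampleLinearRegression is an illustrative sketch with assumed noisy data
// generated around y = 1 + 2x; alpha and beta should come back near 1 and 2.
// RSquared with the fitted parameters then measures the quality of the fit.
func exampleLinearRegression() (alpha, beta, r2 float64) {
	x := []float64{0, 1, 2, 3, 4}
	y := []float64{1.1, 2.9, 5.2, 6.9, 9.1}
	alpha, beta = LinearRegression(x, y, nil, false)
	r2 = RSquared(x, y, nil, alpha, beta)
	return alpha, beta, r2
}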

// RSquared returns the coefficient of determination defined as
//
//	R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2
//
// for the line
//
//	y = alpha + beta*x
//
// and the data in x and y with the given weights.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RSquared(x, y, weights []float64, alpha, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	yMean := Mean(y, weights)
	var res, tot, d float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		yi := y[i]
		fi := alpha + beta*xi
		d = yi - fi
		res += w * d * d
		d = yi - yMean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RSquaredFrom returns the coefficient of determination defined as
//
//	R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2
//
// and the data in estimates and values with the given weights.
//
// The lengths of estimates and values must be equal. If weights is nil then
// all of the weights are 1. If weights is not nil, then len(values) must
// equal len(weights).
func RSquaredFrom(estimates, values, weights []float64) float64 {
	if len(estimates) != len(values) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(values) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	mean := Mean(values, weights)
	var res, tot, d float64
	for i, val := range values {
		if weights != nil {
			w = weights[i]
		}
		d = val - estimates[i]
		res += w * d * d
		d = val - mean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RNoughtSquared returns the coefficient of determination defined as
//
//	R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2
//
// for the line
//
//	y = beta*x
//
// and the data in x and y with the given weights. RNoughtSquared should
// only be used for best-fit lines regressed through the origin.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RNoughtSquared(x, y, weights []float64, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	var ssr, tot float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		fi := beta * xi
		ssr += w * fi * fi
		yi := y[i]
		tot += w * yi * yi
	}
	return ssr / tot
}

// Mean computes the weighted mean of the data set.
//
//	sum_i {w_i * x_i} / sum_i {w_i}
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Mean(x, weights []float64) float64 {
	if weights == nil {
		return floats.Sum(x) / float64(len(x))
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		sumValues  float64
		sumWeights float64
	)
	for i, w := range weights {
		sumValues += w * x[i]
		sumWeights += w
	}
	return sumValues / sumWeights
}
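
// exampleMean is an illustrative sketch with assumed values and weights. Each
// x[i] contributes in proportion to weights[i]; with nil weights every sample
// counts equally.
func exampleMean() float64 {
	x := []float64{8, 2, -9, 15, 4}
	w := []float64{2, 2, 6, 7, 1}
	return Mean(x, w)
}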

// Mode returns the most common value in the dataset specified by x and the
// given weights. Strict float64 equality is used when comparing values, so users
// should take caution. If several values are the mode, any of them may be returned.
func Mode(x, weights []float64) (val float64, count float64) {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		return 0, 0
	}
	m := make(map[float64]float64)
	if weights == nil {
		for _, v := range x {
			m[v]++
		}
	} else {
		for i, v := range x {
			m[v] += weights[i]
		}
	}
	var (
		maxCount float64
		max      float64
	)
	for val, count := range m {
		if count > maxCount {
			maxCount = count
			max = val
		}
	}
	return max, maxCount
}

// BivariateMoment computes the weighted mixed moment between the samples x and y.
//
//	E[(x - μ_x)^r*(y - μ_y)^s]
//
// No degrees of freedom correction is done.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func BivariateMoment(r, s float64, x, y, weights []float64) float64 {
	meanX := Mean(x, weights)
	meanY := Mean(y, weights)
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights == nil {
		var m float64
		for i, vx := range x {
			vy := y[i]
			m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		}
		return m / float64(len(x))
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, vx := range x {
		vy := y[i]
		w := weights[i]
		m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		sumWeights += w
	}
	return m / sumWeights
}

// Moment computes the weighted n^th central moment of the samples,
//
//	E[(x - μ)^N]
//
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Moment(moment float64, x, weights []float64) float64 {
	// This also checks that x and weights have the same length.
	mean := Mean(x, weights)
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		return m / float64(len(x))
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		w := weights[i]
		m += w * math.Pow(v-mean, moment)
		sumWeights += w
	}
	return m / sumWeights
}

// MomentAbout computes the weighted n^th moment of the samples about
// the given mean μ,
//
//	E[(x - μ)^N]
//
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		m /= float64(len(x))
		return m
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		m += weights[i] * math.Pow(v-mean, moment)
		sumWeights += weights[i]
	}
	return m / sumWeights
}

// Quantile returns the sample of x such that x is greater than or
// equal to the fraction p of samples. The exact behavior is determined by the
// CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically
// the inverse of the CDF function, though it may not be the actual inverse
// for all values p and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// Quantile will panic if the length of x is zero.
//
// CumulantKind behaviors:
//   - Empirical: Returns the lowest value q for which q is greater than or equal
//     to the fraction p of samples
//   - LinInterp: Returns the linearly interpolated value
func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
	if !(p >= 0 && p <= 1) {
		panic("stat: percentile out of bounds")
	}

	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if floats.HasNaN(x) {
		return math.NaN() // This is needed because the algorithm breaks otherwise.
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}
	switch c {
	case Empirical:
		return empiricalQuantile(p, x, weights, sumWeights)
	case LinInterp:
		return linInterpQuantile(p, x, weights, sumWeights)
	default:
		panic("stat: bad cumulant kind")
	}
}
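
// exampleQuantile is an illustrative sketch with assumed sorted data, contrasting
// the two CumulantKinds: Empirical always returns an actual sample value, while
// LinInterp may interpolate between neighboring samples.
func exampleQuantile() (empirical, interpolated float64) {
	x := []float64{1, 2, 3, 4, 5}
	return Quantile(0.5, Empirical, x, nil), Quantile(0.5, LinInterp, x, nil)
}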

func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			return x[i]
		}
	}
	panic("impossible")
}

func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			if i == 0 {
				return x[0]
			}
			t := cumsum - fidx
			if weights != nil {
				t /= weights[i]
			}
			return t*x[i-1] + (1-t)*x[i]
		}
	}
	panic("impossible")
}

// Skew computes the skewness of the sample data.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Skew(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var s float64
		for _, v := range x {
			z := (v - mean) / std
			s += z * z * z
		}
		return s * skewCorrection(float64(len(x)))
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		s += weights[i] * z * z * z
		sumWeights += weights[i]
	}
	return s * skewCorrection(sumWeights)
}

// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
func skewCorrection(n float64) float64 {
	return (n / (n - 1)) * (1 / (n - 2))
}

// SortWeighted rearranges the data in x along with their corresponding
// weights so that the x data are sorted. The data is sorted in place.
// Weights may be nil, but if weights is non-nil then it must have the same
// length as x.
func SortWeighted(x, weights []float64) {
	if weights == nil {
		sort.Float64s(x)
		return
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightSorter{
		x: x,
		w: weights,
	})
}
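
// exampleSortWeighted is an illustrative sketch with assumed unsorted data.
// CDF and Quantile require sorted input, so the weights are reordered together
// with x before either is called.
func exampleSortWeighted() float64 {
	x := []float64{3, 1, 2}
	w := []float64{1, 2, 1}
	SortWeighted(x, w)
	return Quantile(0.5, Empirical, x, w)
}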

type weightSorter struct {
	x []float64
	w []float64
}

func (w weightSorter) Len() int           { return len(w.x) }
func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] }
func (w weightSorter) Swap(i, j int) {
	w.x[i], w.x[j] = w.x[j], w.x[i]
	w.w[i], w.w[j] = w.w[j], w.w[i]
}

// SortWeightedLabeled rearranges the data in x along with their
// corresponding weights and boolean labels so that the x data are sorted.
// The data is sorted in place. Weights and labels may be nil; if either
// is non-nil it must have the same length as x.
func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
	if labels == nil {
		SortWeighted(x, weights)
		return
	}
	if weights == nil {
		if len(x) != len(labels) {
			panic("stat: slice length mismatch")
		}
		sort.Sort(labelSorter{
			x: x,
			l: labels,
		})
		return
	}
	if len(x) != len(labels) || len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightLabelSorter{
		x: x,
		l: labels,
		w: weights,
	})
}

type labelSorter struct {
	x []float64
	l []bool
}

func (a labelSorter) Len() int           { return len(a.x) }
func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a labelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
}

type weightLabelSorter struct {
	x []float64
	l []bool
	w []float64
}

func (a weightLabelSorter) Len() int           { return len(a.x) }
func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a weightLabelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
	a.w[i], a.w[j] = a.w[j], a.w[i]
}

// StdDev returns the sample standard deviation.
func StdDev(x, weights []float64) float64 {
	_, std := MeanStdDev(x, weights)
	return std
}

// MeanStdDev returns the sample mean and unbiased standard deviation.
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := MeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// StdErr returns the standard error in the mean with the given values.
func StdErr(std, sampleSize float64) float64 {
	return std / math.Sqrt(sampleSize)
}

// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
// with the given mean and standard deviation, i.e.
//
//	(x - mean) / std
func StdScore(x, mean, std float64) float64 {
	return (x - mean) / std
}

// Variance computes the unbiased weighted sample variance:
//
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Variance(x, weights []float64) float64 {
	_, variance := MeanVariance(x, weights)
	return variance
}

// MeanVariance computes the sample mean and unbiased variance, where the mean and variance are
//
//	\sum_i w_i * x_i / (sum_i w_i)
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
//
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / (sumWeights - 1)
}
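
// exampleMeanVariance is an illustrative sketch with assumed data, showing the
// variance family side by side: the unbiased estimate divides by (n-1) while
// the biased "population" estimate divides by n.
func exampleMeanVariance() (mean, unbiased, biased float64) {
	x := []float64{2, 4, 4, 4, 5, 5, 7, 9}
	mean, unbiased = MeanVariance(x, nil)
	biased = PopVariance(x, nil)
	return mean, unbiased, biased
}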

// PopMeanVariance computes the sample mean and biased variance (also known as
// "population variance"), where the mean and variance are
//
//	\sum_i w_i * x_i / (sum_i w_i)
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
//
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopMeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / sumWeights
}

// PopMeanStdDev returns the sample mean and biased standard deviation
// (also known as "population standard deviation").
func PopMeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := PopMeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// PopStdDev returns the population standard deviation, i.e., a square root
// of the biased variance estimate.
func PopStdDev(x, weights []float64) float64 {
	_, stDev := PopMeanStdDev(x, weights)
	return stDev
}

// PopVariance computes the biased weighted sample variance, also known as the
// population variance:
//
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopVariance(x, weights []float64) float64 {
	_, variance := PopMeanVariance(x, weights)
	return variance
}

func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) {
	// This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing
	// the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub,
	// and Randall J. LeVeque.

	// Note that this will panic if the slice lengths do not match.
	mean = Mean(x, weights)
	var (
		ss           float64
		compensation float64
	)
	if weights == nil {
		for _, v := range x {
			d := v - mean
			ss += d * d
			compensation += d
		}
		unnormalisedVariance = ss - compensation*compensation/float64(len(x))
		return mean, unnormalisedVariance, float64(len(x))
	}

	for i, v := range x {
		w := weights[i]
		d := v - mean
		wd := w * d
		ss += wd * d
		compensation += wd
		sumWeights += w
	}
	unnormalisedVariance = ss - compensation*compensation/sumWeights
	return mean, unnormalisedVariance, sumWeights
}