|
0 |
// Package histogram provides a Go implementation of BigML's histogram package
|
|
1 |
// for Clojure/Java. It is currently experimental.
|
|
2 |
package histogram
|
|
3 |
|
|
4 |
import (
|
|
5 |
"container/heap"
|
|
6 |
"math"
|
|
7 |
"sort"
|
|
8 |
)
|
|
9 |
|
|
10 |
// Bin is one bucket of the histogram. It records how many values have been
// folded into it and the running total of those values; the bucket's
// position is its mean (Sum / Count).
type Bin struct {
	// Count is the number of values merged into this bin.
	Count int
	// Sum is the total of the values merged into this bin.
	Sum float64
}
|
|
14 |
|
|
15 |
func (b *Bin) Update(x *Bin) {
|
|
16 |
b.Count += x.Count
|
|
17 |
b.Sum += x.Sum
|
|
18 |
}
|
|
19 |
|
|
20 |
func (b *Bin) Mean() float64 {
|
|
21 |
return b.Sum / float64(b.Count)
|
|
22 |
}
|
|
23 |
|
|
24 |
// Bins is a list of bin pointers. Its Less method orders bins by ascending
// mean.
type Bins []*Bin
|
|
25 |
|
|
26 |
// Len reports how many bins the list holds.
func (bs Bins) Len() int {
	return len(bs)
}
|
|
27 |
// Less reports whether the bin at index i has a strictly smaller mean than
// the bin at index j.
func (bs Bins) Less(i, j int) bool {
	left, right := bs[i].Mean(), bs[j].Mean()
	return left < right
}
|
|
28 |
// Swap exchanges the bins stored at indexes i and j.
func (bs Bins) Swap(i, j int) {
	tmp := bs[i]
	bs[i] = bs[j]
	bs[j] = tmp
}
|
|
29 |
|
|
30 |
func (bs *Bins) Push(x interface{}) {
|
|
31 |
*bs = append(*bs, x.(*Bin))
|
|
32 |
}
|
|
33 |
|
|
34 |
func (bs *Bins) Pop() interface{} {
|
|
35 |
return bs.remove(len(*bs) - 1)
|
|
36 |
}
|
|
37 |
|
|
38 |
func (bs *Bins) remove(n int) *Bin {
|
|
39 |
old := *bs
|
|
40 |
x := old[n]
|
|
41 |
h := old[0:n]
|
|
42 |
if n < len(old)-1 {
|
|
43 |
t := old[n+1:]
|
|
44 |
out := make([]*Bin, len(old)-1)
|
|
45 |
copy(out, h)
|
|
46 |
copy(out[len(h):], t)
|
|
47 |
*bs = out
|
|
48 |
} else {
|
|
49 |
*bs = h
|
|
50 |
}
|
|
51 |
return x
|
|
52 |
}
|
|
53 |
|
|
54 |
type Histogram struct {
|
|
55 |
res *reservoir
|
|
56 |
}
|
|
57 |
|
|
58 |
func New(maxBins int) *Histogram {
|
|
59 |
return &Histogram{res: newReservoir(maxBins)}
|
|
60 |
}
|
|
61 |
|
|
62 |
func (h *Histogram) Insert(f float64) {
|
|
63 |
h.res.insert(&Bin{1, f})
|
|
64 |
h.res.compress()
|
|
65 |
}
|
|
66 |
|
|
67 |
func (h *Histogram) Bins() Bins {
|
|
68 |
return h.res.bins
|
|
69 |
}
|
|
70 |
|
|
71 |
type reservoir struct {
|
|
72 |
n int
|
|
73 |
maxBins int
|
|
74 |
bins Bins
|
|
75 |
}
|
|
76 |
|
|
77 |
func newReservoir(maxBins int) *reservoir {
|
|
78 |
return &reservoir{maxBins: maxBins}
|
|
79 |
}
|
|
80 |
|
|
81 |
func (r *reservoir) insert(bin *Bin) {
|
|
82 |
r.n += bin.Count
|
|
83 |
i := sort.Search(len(r.bins), func(i int) bool {
|
|
84 |
return r.bins[i].Mean() >= bin.Mean()
|
|
85 |
})
|
|
86 |
if i < 0 || i == r.bins.Len() {
|
|
87 |
heap.Push(&r.bins, bin)
|
|
88 |
return
|
|
89 |
}
|
|
90 |
r.bins[i].Update(bin)
|
|
91 |
}
|
|
92 |
|
|
93 |
func (r *reservoir) compress() {
|
|
94 |
for r.bins.Len() > r.maxBins {
|
|
95 |
minGapIndex := -1
|
|
96 |
minGap := math.MaxFloat64
|
|
97 |
for i := 0; i < r.bins.Len()-1; i++ {
|
|
98 |
gap := gapWeight(r.bins[i], r.bins[i+1])
|
|
99 |
if minGap > gap {
|
|
100 |
minGap = gap
|
|
101 |
minGapIndex = i
|
|
102 |
}
|
|
103 |
}
|
|
104 |
prev := r.bins[minGapIndex]
|
|
105 |
next := r.bins.remove(minGapIndex + 1)
|
|
106 |
prev.Update(next)
|
|
107 |
}
|
|
108 |
}
|
|
109 |
|
|
110 |
func gapWeight(prev, next *Bin) float64 {
|
|
111 |
return next.Mean() - prev.Mean()
|
|
112 |
}
|