Codebase list golang-github-beorn7-perks / 8cb26c7
histogram Blake Mizerany 10 years ago
2 changed file(s) with 151 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 // Package histogram provides a Go implementation of BigML's histogram package
1 // for Clojure/Java. It is currently experimental.
2 package histogram
3
4 import (
5 "container/heap"
6 "math"
7 "sort"
8 )
9
// Bin is a single histogram bucket. It stores an occurrence count together
// with the running total of the values it represents.
type Bin struct {
	Count int     // number of points accumulated into this bin
	Sum   float64 // total of the accumulated values
}

// Update folds x into b by adding x's Count and Sum to b's own. x itself is
// not modified.
func (b *Bin) Update(x *Bin) {
	b.Count, b.Sum = b.Count+x.Count, b.Sum+x.Sum
}

// Mean reports Sum divided by Count. No guard is applied for Count == 0, so
// the result of that floating-point division is returned as is.
func (b *Bin) Mean() float64 {
	n := float64(b.Count)
	return b.Sum / n
}
23
24 type Bins []*Bin
25
26 func (bs Bins) Len() int { return len(bs) }
27 func (bs Bins) Less(i, j int) bool { return bs[i].Mean() < bs[j].Mean() }
28 func (bs Bins) Swap(i, j int) { bs[i], bs[j] = bs[j], bs[i] }
29
30 func (bs *Bins) Push(x interface{}) {
31 *bs = append(*bs, x.(*Bin))
32 }
33
34 func (bs *Bins) Pop() interface{} {
35 return bs.remove(len(*bs) - 1)
36 }
37
38 func (bs *Bins) remove(n int) *Bin {
39 old := *bs
40 x := old[n]
41 h := old[0:n]
42 if n < len(old)-1 {
43 t := old[n+1:]
44 out := make([]*Bin, len(old)-1)
45 copy(out, h)
46 copy(out[len(h):], t)
47 *bs = out
48 } else {
49 *bs = h
50 }
51 return x
52 }
53
54 type Histogram struct {
55 res *reservoir
56 }
57
58 func New(maxBins int) *Histogram {
59 return &Histogram{res: newReservoir(maxBins)}
60 }
61
62 func (h *Histogram) Insert(f float64) {
63 h.res.insert(&Bin{1, f})
64 h.res.compress()
65 }
66
67 func (h *Histogram) Bins() Bins {
68 return h.res.bins
69 }
70
71 type reservoir struct {
72 n int
73 maxBins int
74 bins Bins
75 }
76
77 func newReservoir(maxBins int) *reservoir {
78 return &reservoir{maxBins: maxBins}
79 }
80
81 func (r *reservoir) insert(bin *Bin) {
82 r.n += bin.Count
83 i := sort.Search(len(r.bins), func(i int) bool {
84 return r.bins[i].Mean() >= bin.Mean()
85 })
86 if i < 0 || i == r.bins.Len() {
87 heap.Push(&r.bins, bin)
88 return
89 }
90 r.bins[i].Update(bin)
91 }
92
93 func (r *reservoir) compress() {
94 for r.bins.Len() > r.maxBins {
95 minGapIndex := -1
96 minGap := math.MaxFloat64
97 for i := 0; i < r.bins.Len()-1; i++ {
98 gap := gapWeight(r.bins[i], r.bins[i+1])
99 if minGap > gap {
100 minGap = gap
101 minGapIndex = i
102 }
103 }
104 prev := r.bins[minGapIndex]
105 next := r.bins.remove(minGapIndex + 1)
106 prev.Update(next)
107 }
108 }
109
110 func gapWeight(prev, next *Bin) float64 {
111 return next.Mean() - prev.Mean()
112 }
0 package histogram
1
2 import (
3 "math/rand"
4 "testing"
5 )
6
7 func TestHistogram(t *testing.T) {
8 const numPoints = 1e6
9 const maxBins = 3
10
11 h := New(maxBins)
12 for i := 0; i < numPoints; i++ {
13 f := rand.ExpFloat64()
14 h.Insert(f)
15 }
16
17 bins := h.Bins()
18 if g := len(bins); g > maxBins {
19 t.Fatalf("got %d bins, wanted <= %d", g, maxBins)
20 }
21
22 for _, b := range bins {
23 t.Logf("%+v", b)
24 }
25
26 if g := count(h.Bins()); g != numPoints {
27 t.Fatalf("binned %d points, wanted %d", g, numPoints)
28 }
29 }
30
31 func count(bins Bins) int {
32 binCounts := 0
33 for _, b := range bins {
34 binCounts += b.Count
35 }
36 return binCounts
37 }