diff --git a/README.md b/README.md new file mode 100644 index 0000000..fc05777 --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# Perks for Go (golang.org) + +Perks contains the Go package quantile that computes approximate quantiles over +an unbounded data stream within low memory and CPU bounds. + +For more information and examples, see: +http://godoc.org/github.com/bmizerany/perks + +A very special thank you and shout out to Graham Cormode (Rutgers University), +Flip Korn (AT&T Labs–Research), S. Muthukrishnan (Rutgers University), and +Divesh Srivastava (AT&T Labs–Research) for their research and publication of +[Effective Computation of Biased Quantiles over Data Streams](http://www.cs.rutgers.edu/~muthu/bquant.pdf) + +Thank you, also: +* Armon Dadgar (@armon) +* Andrew Gerrand (@nf) +* Brad Fitzpatrick (@bradfitz) +* Keith Rarick (@kr) + +FAQ: + +Q: Why not move the quantile package into the project root? +A: I want to add more packages to perks later. + +Copyright (C) 2013 Blake Mizerany + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/histogram/bench_test.go b/histogram/bench_test.go new file mode 100644 index 0000000..56c7e55 --- /dev/null +++ b/histogram/bench_test.go @@ -0,0 +1,26 @@ +package histogram + +import ( + "math/rand" + "testing" +) + +func BenchmarkInsert10Bins(b *testing.B) { + b.StopTimer() + h := New(10) + b.StartTimer() + for i := 0; i < b.N; i++ { + f := rand.ExpFloat64() + h.Insert(f) + } +} + +func BenchmarkInsert100Bins(b *testing.B) { + b.StopTimer() + h := New(100) + b.StartTimer() + for i := 0; i < b.N; i++ { + f := rand.ExpFloat64() + h.Insert(f) + } +} diff --git a/histogram/histogram.go b/histogram/histogram.go new file mode 100644 index 0000000..bef05c7 --- /dev/null +++ b/histogram/histogram.go @@ -0,0 +1,108 @@ +// Package histogram provides a Go implementation of BigML's histogram package +// for Clojure/Java. It is currently experimental. +package histogram + +import ( + "container/heap" + "math" + "sort" +) + +type Bin struct { + Count int + Sum float64 +} + +func (b *Bin) Update(x *Bin) { + b.Count += x.Count + b.Sum += x.Sum +} + +func (b *Bin) Mean() float64 { + return b.Sum / float64(b.Count) +} + +type Bins []*Bin + +func (bs Bins) Len() int { return len(bs) } +func (bs Bins) Less(i, j int) bool { return bs[i].Mean() < bs[j].Mean() } +func (bs Bins) Swap(i, j int) { bs[i], bs[j] = bs[j], bs[i] } + +func (bs *Bins) Push(x interface{}) { + *bs = append(*bs, x.(*Bin)) +} + +func (bs *Bins) Pop() interface{} { + return bs.remove(len(*bs) - 1) +} + +func (bs *Bins) remove(n int) *Bin { + if n < 0 || len(*bs) < n { + return nil + } + x := (*bs)[n] + *bs = append((*bs)[:n], (*bs)[n+1:]...) + return x +} + +type Histogram struct { + res *reservoir +} + +func New(maxBins int) *Histogram { + return &Histogram{res: newReservoir(maxBins)} +} + +func (h *Histogram) Insert(f float64) { + h.res.insert(&Bin{1, f}) + h.res.compress() +} + +func (h *Histogram) Bins() Bins { + return h.res.bins +} + +type reservoir struct { + n int + maxBins int + bins Bins +} + +func newReservoir(maxBins int) *reservoir { + return &reservoir{maxBins: maxBins} +} + +func (r *reservoir) insert(bin *Bin) { + r.n += bin.Count + i := sort.Search(len(r.bins), func(i int) bool { + return r.bins[i].Mean() >= bin.Mean() + }) + if i < 0 || i == r.bins.Len() { + // TODO(blake): Maybe use an .insert(i, bin) instead of + // performing the extra work of a heap.Push. + heap.Push(&r.bins, bin) + return + } + r.bins[i].Update(bin) +} + +func (r *reservoir) compress() { + for r.bins.Len() > r.maxBins { + minGapIndex := -1 + minGap := math.MaxFloat64 + for i := 0; i < r.bins.Len()-1; i++ { + gap := gapWeight(r.bins[i], r.bins[i+1]) + if minGap > gap { + minGap = gap + minGapIndex = i + } + } + prev := r.bins[minGapIndex] + next := r.bins.remove(minGapIndex + 1) + prev.Update(next) + } +} + +func gapWeight(prev, next *Bin) float64 { + return next.Mean() - prev.Mean() +} diff --git a/histogram/histogram_test.go b/histogram/histogram_test.go new file mode 100644 index 0000000..0575ebe --- /dev/null +++ b/histogram/histogram_test.go @@ -0,0 +1,38 @@ +package histogram + +import ( + "math/rand" + "testing" +) + +func TestHistogram(t *testing.T) { + const numPoints = 1e6 + const maxBins = 3 + + h := New(maxBins) + for i := 0; i < numPoints; i++ { + f := rand.ExpFloat64() + h.Insert(f) + } + + bins := h.Bins() + if g := len(bins); g > maxBins { + t.Fatalf("got %d bins, wanted <= %d", g, maxBins) + } + + for _, b := range bins { + t.Logf("%+v", b) + } + + if g := count(h.Bins()); g != numPoints { + t.Fatalf("binned %d points, wanted %d", g, numPoints) + } +} + +func count(bins Bins) int { + binCounts := 0 + for _, b := range bins { + binCounts += b.Count + } + return binCounts +} diff --git a/quantile/bench_test.go b/quantile/bench_test.go new file mode 100644 index 0000000..0bd0e4e --- /dev/null +++ b/quantile/bench_test.go @@ -0,0 +1,63 @@ +package quantile + +import ( + "testing" +) + +func BenchmarkInsertTargeted(b *testing.B) { + b.ReportAllocs() + + s := NewTargeted(Targets) + b.ResetTimer() + for i := float64(0); i < float64(b.N); i++ { + s.Insert(i) + } +} + +func BenchmarkInsertTargetedSmallEpsilon(b *testing.B) { + s := NewTargeted(TargetsSmallEpsilon) + b.ResetTimer() + for i := float64(0); i < float64(b.N); i++ { + s.Insert(i) + } +} + +func BenchmarkInsertBiased(b *testing.B) { + s := NewLowBiased(0.01) + b.ResetTimer() + for i := float64(0); i < float64(b.N); i++ { + s.Insert(i) + } +} + +func BenchmarkInsertBiasedSmallEpsilon(b *testing.B) { + s := NewLowBiased(0.0001) + b.ResetTimer() + for i := float64(0); i < float64(b.N); i++ { + s.Insert(i) + } +} + +func BenchmarkQuery(b *testing.B) { + s := NewTargeted(Targets) + for i := float64(0); i < 1e6; i++ { + s.Insert(i) + } + b.ResetTimer() + n := float64(b.N) + for i := float64(0); i < n; i++ { + s.Query(i / n) + } +} + +func BenchmarkQuerySmallEpsilon(b *testing.B) { + s := NewTargeted(TargetsSmallEpsilon) + for i := float64(0); i < 1e6; i++ { + s.Insert(i) + } + b.ResetTimer() + n := float64(b.N) + for i := float64(0); i < n; i++ { + s.Query(i / n) + } +} diff --git a/quantile/example_test.go b/quantile/example_test.go new file mode 100644 index 0000000..ab3293a --- /dev/null +++ b/quantile/example_test.go @@ -0,0 +1,121 @@ +// +build go1.1 + +package quantile_test + +import ( + "bufio" + "fmt" + "log" + "os" + "strconv" + "time" + + "github.com/beorn7/perks/quantile" +) + +func Example_simple() { + ch := make(chan float64) + go sendFloats(ch) + + // Compute the 50th, 90th, and 99th percentile. + q := quantile.NewTargeted(map[float64]float64{ + 0.50: 0.005, + 0.90: 0.001, + 0.99: 0.0001, + }) + for v := range ch { + q.Insert(v) + } + + fmt.Println("perc50:", q.Query(0.50)) + fmt.Println("perc90:", q.Query(0.90)) + fmt.Println("perc99:", q.Query(0.99)) + fmt.Println("count:", q.Count()) + // Output: + // perc50: 5 + // perc90: 16 + // perc99: 223 + // count: 2388 +} + +func Example_mergeMultipleStreams() { + // Scenario: + // We have multiple database shards. On each shard, there is a process + // collecting query response times from the database logs and inserting + // them into a Stream (created via NewTargeted(0.90)), much like the + // Simple example. These processes expose a network interface for us to + // ask them to serialize and send us the results of their + // Stream.Samples so we may Merge and Query them. + // + // NOTES: + // * These sample sets are small, allowing us to get them + // across the network much faster than sending the entire list of data + // points. + // + // * For this to work correctly, we must supply the same quantiles + // a priori the process collecting the samples supplied to NewTargeted, + // even if we do not plan to query them all here. + ch := make(chan quantile.Samples) + getDBQuerySamples(ch) + q := quantile.NewTargeted(map[float64]float64{0.90: 0.001}) + for samples := range ch { + q.Merge(samples) + } + fmt.Println("perc90:", q.Query(0.90)) +} + +func Example_window() { + // Scenario: We want the 90th, 95th, and 99th percentiles for each + // minute. + + ch := make(chan float64) + go sendStreamValues(ch) + + tick := time.NewTicker(1 * time.Minute) + q := quantile.NewTargeted(map[float64]float64{ + 0.90: 0.001, + 0.95: 0.0005, + 0.99: 0.0001, + }) + for { + select { + case t := <-tick.C: + flushToDB(t, q.Samples()) + q.Reset() + case v := <-ch: + q.Insert(v) + } + } +} + +func sendStreamValues(ch chan float64) { + // Use your imagination +} + +func flushToDB(t time.Time, samples quantile.Samples) { + // Use your imagination +} + +// This is a stub for the above example. In reality this would hit the remote +// servers via http or something like it. +func getDBQuerySamples(ch chan quantile.Samples) {} + +func sendFloats(ch chan<- float64) { + f, err := os.Open("exampledata.txt") + if err != nil { + log.Fatal(err) + } + sc := bufio.NewScanner(f) + for sc.Scan() { + b := sc.Bytes() + v, err := strconv.ParseFloat(string(b), 64) + if err != nil { + log.Fatal(err) + } + ch <- v + } + if sc.Err() != nil { + log.Fatal(sc.Err()) + } + close(ch) +} diff --git a/quantile/exampledata.txt b/quantile/exampledata.txt new file mode 100644 index 0000000..1602287 --- /dev/null +++ b/quantile/exampledata.txt @@ -0,0 +1,2388 @@ +8 +5 +26 +12 +5 +235 +13 +6 +28 +30 +3 +3 +3 +3 +5 +2 +33 +7 +2 +4 +7 +12 +14 +5 +8 +3 +10 +4 +5 +3 +6 +6 +209 +20 +3 +10 +14 +3 +4 +6 +8 +5 +11 +7 +3 +2 +3 +3 +212 +5 +222 +4 +10 +10 +5 +6 +3 +8 +3 +10 +254 +220 +2 +3 +5 +24 +5 +4 +222 +7 +3 +3 +223 +8 +15 +12 +14 +14 +3 +2 +2 +3 +13 +3 +11 +4 +4 +6 +5 +7 +13 +5 +3 +5 +2 +5 +3 +5 +2 +7 +15 +17 +14 +3 +6 +6 +3 +17 +5 +4 +7 +6 +4 +4 +8 +6 +8 +3 +9 +3 +6 +3 +4 +5 +3 +3 +660 +4 +6 +10 +3 +6 +3 +2 +5 +13 +2 +4 +4 +10 +4 +8 +4 +3 +7 +9 +9 +3 +10 +37 +3 +13 +4 +12 +3 +6 +10 +8 +5 +21 +2 +3 +8 +3 +2 +3 +3 +4 +12 +2 +4 +8 +8 +4 +3 +2 +20 +1 +6 +32 +2 +11 +6 +18 +3 +8 +11 +3 +212 +3 +4 +2 +6 +7 +12 +11 +3 +2 +16 +10 +6 +4 +6 +3 +2 +7 +3 +2 +2 +2 +2 +5 +6 +4 +3 +10 +3 +4 +6 +5 +3 +4 +4 +5 +6 +4 +3 +4 +4 +5 +7 +5 +5 +3 +2 +7 +2 +4 +12 +4 +5 +6 +2 +4 +4 +8 +4 +15 +13 +7 +16 +5 +3 +23 +5 +5 +7 +3 +2 +9 +8 +7 +5 +8 +11 +4 +10 +76 +4 +47 +4 +3 +2 +7 +4 +2 +3 +37 +10 +4 +2 +20 +5 +4 +4 +10 +10 +4 +3 +7 +23 +240 +7 +13 +5 +5 +3 +3 +2 +5 +4 +2 +8 +7 +19 +2 +23 +8 +7 +2 +5 +3 +8 +3 +8 +13 +5 +5 +5 +2 +3 +23 +4 +9 +8 +4 +3 +3 +5 +220 +2 +3 +4 +6 +14 +3 +53 +6 +2 +5 +18 +6 +3 +219 +6 +5 +2 +5 +3 +6 +5 +15 +4 +3 +17 +3 +2 +4 +7 +2 +3 +3 +4 +4 +3 +2 +664 +6 +3 +23 +5 +5 +16 +5 +8 +2 +4 +2 +24 +12 +3 +2 +3 +5 +8 +3 +5 +4 +3 +14 +3 +5 +8 +2 +3 +7 +9 +4 +2 +3 +6 +8 +4 +3 +4 +6 +5 +3 +3 +6 +3 +19 +4 +4 +6 +3 +6 +3 +5 +22 +5 +4 +4 +3 +8 +11 +4 +9 +7 +6 +13 +4 +4 +4 +6 +17 +9 +3 +3 +3 +4 +3 +221 +5 +11 +3 +4 +2 +12 +6 +3 +5 +7 +5 +7 +4 +9 +7 +14 +37 +19 +217 +16 +3 +5 +2 +2 +7 +19 +7 +6 +7 +4 +24 +5 +11 +4 +7 +7 +9 +13 +3 +4 +3 +6 +28 +4 +4 +5 +5 +2 +5 +6 +4 +4 +6 +10 +5 +4 +3 +2 +3 +3 +6 +5 +5 +4 +3 +2 +3 +7 +4 +6 +18 +16 +8 +16 +4 +5 +8 +6 +9 +13 +1545 +6 +215 +6 +5 +6 +3 +45 +31 +5 +2 +2 +4 +3 +3 +2 +5 +4 +3 +5 +7 +7 +4 +5 +8 +5 +4 +749 +2 +31 +9 +11 +2 +11 +5 +4 +4 +7 +9 +11 +4 +5 +4 +7 +3 +4 +6 +2 +15 +3 +4 +3 +4 +3 +5 +2 +13 +5 +5 +3 +3 +23 +4 +4 +5 +7 +4 +13 +2 +4 +3 +4 +2 +6 +2 +7 +3 +5 +5 +3 +29 +5 +4 +4 +3 +10 +2 +3 +79 +16 +6 +6 +7 +7 +3 +5 +5 +7 +4 +3 +7 +9 +5 +6 +5 +9 +6 +3 +6 +4 +17 +2 +10 +9 +3 +6 +2 +3 +21 +22 +5 +11 +4 +2 +17 +2 +224 +2 +14 +3 +4 +4 +2 +4 +4 +4 +4 +5 +3 +4 +4 +10 +2 +6 +3 +3 +5 +7 +2 +7 +5 +6 +3 +218 +2 +2 +5 +2 +6 +3 +5 +222 +14 +6 +33 +3 +2 +5 +3 +3 +3 +9 +5 +3 +3 +2 +7 +4 +3 +4 +3 +5 +6 +5 +26 +4 +13 +9 +7 +3 +221 +3 +3 +4 +4 +4 +4 +2 +18 +5 +3 +7 +9 +6 +8 +3 +10 +3 +11 +9 +5 +4 +17 +5 +5 +6 +6 +3 +2 +4 +12 +17 +6 +7 +218 +4 +2 +4 +10 +3 +5 +15 +3 +9 +4 +3 +3 +6 +29 +3 +3 +4 +5 +5 +3 +8 +5 +6 +6 +7 +5 +3 +5 +3 +29 +2 +31 +5 +15 +24 +16 +5 +207 +4 +3 +3 +2 +15 +4 +4 +13 +5 +5 +4 +6 +10 +2 +7 +8 +4 +6 +20 +5 +3 +4 +3 +12 +12 +5 +17 +7 +3 +3 +3 +6 +10 +3 +5 +25 +80 +4 +9 +3 +2 +11 +3 +3 +2 +3 +8 +7 +5 +5 +19 +5 +3 +3 +12 +11 +2 +6 +5 +5 +5 +3 +3 +3 +4 +209 +14 +3 +2 +5 +19 +4 +4 +3 +4 +14 +5 +6 +4 +13 +9 +7 +4 +7 +10 +2 +9 +5 +7 +2 +8 +4 +6 +5 +5 +222 +8 +7 +12 +5 +216 +3 +4 +4 +6 +3 +14 +8 +7 +13 +4 +3 +3 +3 +3 +17 +5 +4 +3 +33 +6 +6 +33 +7 +5 +3 +8 +7 +5 +2 +9 +4 +2 +233 +24 +7 +4 +8 +10 +3 +4 +15 +2 +16 +3 +3 +13 +12 +7 +5 +4 +207 +4 +2 +4 +27 +15 +2 +5 +2 +25 +6 +5 +5 +6 +13 +6 +18 +6 +4 +12 +225 +10 +7 +5 +2 +2 +11 +4 +14 +21 +8 +10 +3 +5 +4 +232 +2 +5 +5 +3 +7 +17 +11 +6 +6 +23 +4 +6 +3 +5 +4 +2 +17 +3 +6 +5 +8 +3 +2 +2 +14 +9 +4 +4 +2 +5 +5 +3 +7 +6 +12 +6 +10 +3 +6 +2 +2 +19 +5 +4 +4 +9 +2 +4 +13 +3 +5 +6 +3 +6 +5 +4 +9 +6 +3 +5 +7 +3 +6 +6 +4 +3 +10 +6 +3 +221 +3 +5 +3 +6 +4 +8 +5 +3 +6 +4 +4 +2 +54 +5 +6 +11 +3 +3 +4 +4 +4 +3 +7 +3 +11 +11 +7 +10 +6 +13 +223 +213 +15 +231 +7 +3 +7 +228 +2 +3 +4 +4 +5 +6 +7 +4 +13 +3 +4 +5 +3 +6 +4 +6 +7 +2 +4 +3 +4 +3 +3 +6 +3 +7 +3 +5 +18 +5 +6 +8 +10 +3 +3 +3 +2 +4 +2 +4 +4 +5 +6 +6 +4 +10 +13 +3 +12 +5 +12 +16 +8 +4 +19 +11 +2 +4 +5 +6 +8 +5 +6 +4 +18 +10 +4 +2 +216 +6 +6 +6 +2 +4 +12 +8 +3 +11 +5 +6 +14 +5 +3 +13 +4 +5 +4 +5 +3 +28 +6 +3 +7 +219 +3 +9 +7 +3 +10 +6 +3 +4 +19 +5 +7 +11 +6 +15 +19 +4 +13 +11 +3 +7 +5 +10 +2 +8 +11 +2 +6 +4 +6 +24 +6 +3 +3 +3 +3 +6 +18 +4 +11 +4 +2 +5 +10 +8 +3 +9 +5 +3 +4 +5 +6 +2 +5 +7 +4 +4 +14 +6 +4 +4 +5 +5 +7 +2 +4 +3 +7 +3 +3 +6 +4 +5 +4 +4 +4 +3 +3 +3 +3 +8 +14 +2 +3 +5 +3 +2 +4 +5 +3 +7 +3 +3 +18 +3 +4 +4 +5 +7 +3 +3 +3 +13 +5 +4 +8 +211 +5 +5 +3 +5 +2 +5 +4 +2 +655 +6 +3 +5 +11 +2 +5 +3 +12 +9 +15 +11 +5 +12 +217 +2 +6 +17 +3 +3 +207 +5 +5 +4 +5 +9 +3 +2 +8 +5 +4 +3 +2 +5 +12 +4 +14 +5 +4 +2 +13 +5 +8 +4 +225 +4 +3 +4 +5 +4 +3 +3 +6 +23 +9 +2 +6 +7 +233 +4 +4 +6 +18 +3 +4 +6 +3 +4 +4 +2 +3 +7 +4 +13 +227 +4 +3 +5 +4 +2 +12 +9 +17 +3 +7 +14 +6 +4 +5 +21 +4 +8 +9 +2 +9 +25 +16 +3 +6 +4 +7 +8 +5 +2 +3 +5 +4 +3 +3 +5 +3 +3 +3 +2 +3 +19 +2 +4 +3 +4 +2 +3 +4 +4 +2 +4 +3 +3 +3 +2 +6 +3 +17 +5 +6 +4 +3 +13 +5 +3 +3 +3 +4 +9 +4 +2 +14 +12 +4 +5 +24 +4 +3 +37 +12 +11 +21 +3 +4 +3 +13 +4 +2 +3 +15 +4 +11 +4 +4 +3 +8 +3 +4 +4 +12 +8 +5 +3 +3 +4 +2 +220 +3 +5 +223 +3 +3 +3 +10 +3 +15 +4 +241 +9 +7 +3 +6 +6 +23 +4 +13 +7 +3 +4 +7 +4 +9 +3 +3 +4 +10 +5 +5 +1 +5 +24 +2 +4 +5 +5 +6 +14 +3 +8 +2 +3 +5 +13 +13 +3 +5 +2 +3 +15 +3 +4 +2 +10 +4 +4 +4 +5 +5 +3 +5 +3 +4 +7 +4 +27 +3 +6 +4 +15 +3 +5 +6 +6 +5 +4 +8 +3 +9 +2 +6 +3 +4 +3 +7 +4 +18 +3 +11 +3 +3 +8 +9 +7 +24 +3 +219 +7 +10 +4 +5 +9 +12 +2 +5 +4 +4 +4 +3 +3 +19 +5 +8 +16 +8 +6 +22 +3 +23 +3 +242 +9 +4 +3 +3 +5 +7 +3 +3 +5 +8 +3 +7 +5 +14 +8 +10 +3 +4 +3 +7 +4 +6 +7 +4 +10 +4 +3 +11 +3 +7 +10 +3 +13 +6 +8 +12 +10 +5 +7 +9 +3 +4 +7 +7 +10 +8 +30 +9 +19 +4 +3 +19 +15 +4 +13 +3 +215 +223 +4 +7 +4 +8 +17 +16 +3 +7 +6 +5 +5 +4 +12 +3 +7 +4 +4 +13 +4 +5 +2 +5 +6 +5 +6 +6 +7 +10 +18 +23 +9 +3 +3 +6 +5 +2 +4 +2 +7 +3 +3 +2 +5 +5 +14 +10 +224 +6 +3 +4 +3 +7 +5 +9 +3 +6 +4 +2 +5 +11 +4 +3 +3 +2 +8 +4 +7 +4 +10 +7 +3 +3 +18 +18 +17 +3 +3 +3 +4 +5 +3 +3 +4 +12 +7 +3 +11 +13 +5 +4 +7 +13 +5 +4 +11 +3 +12 +3 +6 +4 +4 +21 +4 +6 +9 +5 +3 +10 +8 +4 +6 +4 +4 +6 +5 +4 +8 +6 +4 +6 +4 +4 +5 +9 +6 +3 +4 +2 +9 +3 +18 +2 +4 +3 +13 +3 +6 +6 +8 +7 +9 +3 +2 +16 +3 +4 +6 +3 +2 +33 +22 +14 +4 +9 +12 +4 +5 +6 +3 +23 +9 +4 +3 +5 +5 +3 +4 +5 +3 +5 +3 +10 +4 +5 +5 +8 +4 +4 +6 +8 +5 +4 +3 +4 +6 +3 +3 +3 +5 +9 +12 +6 +5 +9 +3 +5 +3 +2 +2 +2 +18 +3 +2 +21 +2 +5 +4 +6 +4 +5 +10 +3 +9 +3 +2 +10 +7 +3 +6 +6 +4 +4 +8 +12 +7 +3 +7 +3 +3 +9 +3 +4 +5 +4 +4 +5 +5 +10 +15 +4 +4 +14 +6 +227 +3 +14 +5 +216 +22 +5 +4 +2 +2 +6 +3 +4 +2 +9 +9 +4 +3 +28 +13 +11 +4 +5 +3 +3 +2 +3 +3 +5 +3 +4 +3 +5 +23 +26 +3 +4 +5 +6 +4 +6 +3 +5 +5 +3 +4 +3 +2 +2 +2 +7 +14 +3 +6 +7 +17 +2 +2 +15 +14 +16 +4 +6 +7 +13 +6 +4 +5 +6 +16 +3 +3 +28 +3 +6 +15 +3 +9 +2 +4 +6 +3 +3 +22 +4 +12 +6 +7 +2 +5 +4 +10 +3 +16 +6 +9 +2 +5 +12 +7 +5 +5 +5 +5 +2 +11 +9 +17 +4 +3 +11 +7 +3 +5 +15 +4 +3 +4 +211 +8 +7 +5 +4 +7 +6 +7 +6 +3 +6 +5 +6 +5 +3 +4 +4 +26 +4 +6 +10 +4 +4 +3 +2 +3 +3 +4 +5 +9 +3 +9 +4 +4 +5 +5 +8 +2 +4 +2 +3 +8 +4 +11 +19 +5 +8 +6 +3 +5 +6 +12 +3 +2 +4 +16 +12 +3 +4 +4 +8 +6 +5 +6 +6 +219 +8 +222 +6 +16 +3 +13 +19 +5 +4 +3 +11 +6 +10 +4 +7 +7 +12 +5 +3 +3 +5 +6 +10 +3 +8 +2 +5 +4 +7 +2 +4 +4 +2 +12 +9 +6 +4 +2 +40 +2 +4 +10 +4 +223 +4 +2 +20 +6 +7 +24 +5 +4 +5 +2 +20 +16 +6 +5 +13 +2 +3 +3 +19 +3 +2 +4 +5 +6 +7 +11 +12 +5 +6 +7 +7 +3 +5 +3 +5 +3 +14 +3 +4 +4 +2 +11 +1 +7 +3 +9 +6 +11 +12 +5 +8 +6 +221 +4 +2 +12 +4 +3 +15 +4 +5 +226 +7 +218 +7 +5 +4 +5 +18 +4 +5 +9 +4 +4 +2 +9 +18 +18 +9 +5 +6 +6 +3 +3 +7 +3 +5 +4 +4 +4 +12 +3 +6 +31 +5 +4 +7 +3 +6 +5 +6 +5 +11 +2 +2 +11 +11 +6 +7 +5 +8 +7 +10 +5 +23 +7 +4 +3 +5 +34 +2 +5 +23 +7 +3 +6 +8 +4 +4 +4 +2 +5 +3 +8 +5 +4 +8 +25 +2 +3 +17 +8 +3 +4 +8 +7 +3 +15 +6 +5 +7 +21 +9 +5 +6 +6 +5 +3 +2 +3 +10 +3 +6 +3 +14 +7 +4 +4 +8 +7 +8 +2 +6 +12 +4 +213 +6 +5 +21 +8 +2 +5 +23 +3 +11 +2 +3 +6 +25 +2 +3 +6 +7 +6 +6 +4 +4 +6 +3 +17 +9 +7 +6 +4 +3 +10 +7 +2 +3 +3 +3 +11 +8 +3 +7 +6 +4 +14 +36 +3 +4 +3 +3 +22 +13 +21 +4 +2 +7 +4 +4 +17 +15 +3 +7 +11 +2 +4 +7 +6 +209 +6 +3 +2 +2 +24 +4 +9 +4 +3 +3 +3 +29 +2 +2 +4 +3 +3 +5 +4 +6 +3 +3 +2 +4 diff --git a/quantile/stream.go b/quantile/stream.go new file mode 100644 index 0000000..587b1fc --- /dev/null +++ b/quantile/stream.go @@ -0,0 +1,292 @@ +// Package quantile computes approximate quantiles over an unbounded data +// stream within low memory and CPU bounds. +// +// A small amount of accuracy is traded to achieve the above properties. +// +// Multiple streams can be merged before calling Query to generate a single set +// of results. This is meaningful when the streams represent the same type of +// data. See Merge and Samples. +// +// For more detailed information about the algorithm used, see: +// +// Effective Computation of Biased Quantiles over Data Streams +// +// http://www.cs.rutgers.edu/~muthu/bquant.pdf +package quantile + +import ( + "math" + "sort" +) + +// Sample holds an observed value and meta information for compression. JSON +// tags have been added for convenience. +type Sample struct { + Value float64 `json:",string"` + Width float64 `json:",string"` + Delta float64 `json:",string"` +} + +// Samples represents a slice of samples. It implements sort.Interface. +type Samples []Sample + +func (a Samples) Len() int { return len(a) } +func (a Samples) Less(i, j int) bool { return a[i].Value < a[j].Value } +func (a Samples) Swap(i, j int) { a[i], a[j] = a[j], a[i] } + +type invariant func(s *stream, r float64) float64 + +// NewLowBiased returns an initialized Stream for low-biased quantiles +// (e.g. 0.01, 0.1, 0.5) where the needed quantiles are not known a priori, but +// error guarantees can still be given even for the lower ranks of the data +// distribution. +// +// The provided epsilon is a relative error, i.e. the true quantile of a value +// returned by a query is guaranteed to be within (1±Epsilon)*Quantile. +// +// See http://www.cs.rutgers.edu/~muthu/bquant.pdf for time, space, and error +// properties. +func NewLowBiased(epsilon float64) *Stream { + ƒ := func(s *stream, r float64) float64 { + return 2 * epsilon * r + } + return newStream(ƒ) +} + +// NewHighBiased returns an initialized Stream for high-biased quantiles +// (e.g. 0.01, 0.1, 0.5) where the needed quantiles are not known a priori, but +// error guarantees can still be given even for the higher ranks of the data +// distribution. +// +// The provided epsilon is a relative error, i.e. the true quantile of a value +// returned by a query is guaranteed to be within 1-(1±Epsilon)*(1-Quantile). +// +// See http://www.cs.rutgers.edu/~muthu/bquant.pdf for time, space, and error +// properties. +func NewHighBiased(epsilon float64) *Stream { + ƒ := func(s *stream, r float64) float64 { + return 2 * epsilon * (s.n - r) + } + return newStream(ƒ) +} + +// NewTargeted returns an initialized Stream concerned with a particular set of +// quantile values that are supplied a priori. Knowing these a priori reduces +// space and computation time. The targets map maps the desired quantiles to +// their absolute errors, i.e. the true quantile of a value returned by a query +// is guaranteed to be within (Quantile±Epsilon). +// +// See http://www.cs.rutgers.edu/~muthu/bquant.pdf for time, space, and error properties. +func NewTargeted(targets map[float64]float64) *Stream { + ƒ := func(s *stream, r float64) float64 { + var m = math.MaxFloat64 + var f float64 + for quantile, epsilon := range targets { + if quantile*s.n <= r { + f = (2 * epsilon * r) / quantile + } else { + f = (2 * epsilon * (s.n - r)) / (1 - quantile) + } + if f < m { + m = f + } + } + return m + } + return newStream(ƒ) +} + +// Stream computes quantiles for a stream of float64s. It is not thread-safe by +// design. Take care when using across multiple goroutines. +type Stream struct { + *stream + b Samples + sorted bool +} + +func newStream(ƒ invariant) *Stream { + x := &stream{ƒ: ƒ} + return &Stream{x, make(Samples, 0, 500), true} +} + +// Insert inserts v into the stream. +func (s *Stream) Insert(v float64) { + s.insert(Sample{Value: v, Width: 1}) +} + +func (s *Stream) insert(sample Sample) { + s.b = append(s.b, sample) + s.sorted = false + if len(s.b) == cap(s.b) { + s.flush() + } +} + +// Query returns the computed qth percentiles value. If s was created with +// NewTargeted, and q is not in the set of quantiles provided a priori, Query +// will return an unspecified result. +func (s *Stream) Query(q float64) float64 { + if !s.flushed() { + // Fast path when there hasn't been enough data for a flush; + // this also yields better accuracy for small sets of data. + l := len(s.b) + if l == 0 { + return 0 + } + i := int(float64(l) * q) + if i > 0 { + i -= 1 + } + s.maybeSort() + return s.b[i].Value + } + s.flush() + return s.stream.query(q) +} + +// Merge merges samples into the underlying streams samples. This is handy when +// merging multiple streams from separate threads, database shards, etc. +// +// ATTENTION: This method is broken and does not yield correct results. The +// underlying algorithm is not capable of merging streams correctly. +func (s *Stream) Merge(samples Samples) { + sort.Sort(samples) + s.stream.merge(samples) +} + +// Reset reinitializes and clears the list reusing the samples buffer memory. +func (s *Stream) Reset() { + s.stream.reset() + s.b = s.b[:0] +} + +// Samples returns stream samples held by s. +func (s *Stream) Samples() Samples { + if !s.flushed() { + return s.b + } + s.flush() + return s.stream.samples() +} + +// Count returns the total number of samples observed in the stream +// since initialization. +func (s *Stream) Count() int { + return len(s.b) + s.stream.count() +} + +func (s *Stream) flush() { + s.maybeSort() + s.stream.merge(s.b) + s.b = s.b[:0] +} + +func (s *Stream) maybeSort() { + if !s.sorted { + s.sorted = true + sort.Sort(s.b) + } +} + +func (s *Stream) flushed() bool { + return len(s.stream.l) > 0 +} + +type stream struct { + n float64 + l []Sample + ƒ invariant +} + +func (s *stream) reset() { + s.l = s.l[:0] + s.n = 0 +} + +func (s *stream) insert(v float64) { + s.merge(Samples{{v, 1, 0}}) +} + +func (s *stream) merge(samples Samples) { + // TODO(beorn7): This tries to merge not only individual samples, but + // whole summaries. The paper doesn't mention merging summaries at + // all. Unittests show that the merging is inaccurate. Find out how to + // do merges properly. + var r float64 + i := 0 + for _, sample := range samples { + for ; i < len(s.l); i++ { + c := s.l[i] + if c.Value > sample.Value { + // Insert at position i. + s.l = append(s.l, Sample{}) + copy(s.l[i+1:], s.l[i:]) + s.l[i] = Sample{ + sample.Value, + sample.Width, + math.Max(sample.Delta, math.Floor(s.ƒ(s, r))-1), + // TODO(beorn7): How to calculate delta correctly? + } + i++ + goto inserted + } + r += c.Width + } + s.l = append(s.l, Sample{sample.Value, sample.Width, 0}) + i++ + inserted: + s.n += sample.Width + r += sample.Width + } + s.compress() +} + +func (s *stream) count() int { + return int(s.n) +} + +func (s *stream) query(q float64) float64 { + t := math.Ceil(q * s.n) + t += math.Ceil(s.ƒ(s, t) / 2) + p := s.l[0] + var r float64 + for _, c := range s.l[1:] { + r += p.Width + if r+c.Width+c.Delta > t { + return p.Value + } + p = c + } + return p.Value +} + +func (s *stream) compress() { + if len(s.l) < 2 { + return + } + x := s.l[len(s.l)-1] + xi := len(s.l) - 1 + r := s.n - 1 - x.Width + + for i := len(s.l) - 2; i >= 0; i-- { + c := s.l[i] + if c.Width+x.Width+x.Delta <= s.ƒ(s, r) { + x.Width += c.Width + s.l[xi] = x + // Remove element at i. + copy(s.l[i:], s.l[i+1:]) + s.l = s.l[:len(s.l)-1] + xi -= 1 + } else { + x = c + xi = i + } + r -= c.Width + } +} + +func (s *stream) samples() Samples { + samples := make(Samples, len(s.l)) + copy(samples, s.l) + return samples +} diff --git a/quantile/stream_test.go b/quantile/stream_test.go new file mode 100644 index 0000000..4dba054 --- /dev/null +++ b/quantile/stream_test.go @@ -0,0 +1,188 @@ +package quantile + +import ( + "math" + "math/rand" + "sort" + "testing" +) + +var ( + Targets = map[float64]float64{ + 0.01: 0.001, + 0.10: 0.01, + 0.50: 0.05, + 0.90: 0.01, + 0.99: 0.001, + } + TargetsSmallEpsilon = map[float64]float64{ + 0.01: 0.0001, + 0.10: 0.001, + 0.50: 0.005, + 0.90: 0.001, + 0.99: 0.0001, + } + LowQuantiles = []float64{0.01, 0.1, 0.5} + HighQuantiles = []float64{0.99, 0.9, 0.5} +) + +const RelativeEpsilon = 0.01 + +func verifyPercsWithAbsoluteEpsilon(t *testing.T, a []float64, s *Stream) { + sort.Float64s(a) + for quantile, epsilon := range Targets { + n := float64(len(a)) + k := int(quantile * n) + lower := int((quantile - epsilon) * n) + if lower < 1 { + lower = 1 + } + upper := int(math.Ceil((quantile + epsilon) * n)) + if upper > len(a) { + upper = len(a) + } + w, min, max := a[k-1], a[lower-1], a[upper-1] + if g := s.Query(quantile); g < min || g > max { + t.Errorf("q=%f: want %v [%f,%f], got %v", quantile, w, min, max, g) + } + } +} + +func verifyLowPercsWithRelativeEpsilon(t *testing.T, a []float64, s *Stream) { + sort.Float64s(a) + for _, qu := range LowQuantiles { + n := float64(len(a)) + k := int(qu * n) + + lowerRank := int((1 - RelativeEpsilon) * qu * n) + upperRank := int(math.Ceil((1 + RelativeEpsilon) * qu * n)) + w, min, max := a[k-1], a[lowerRank-1], a[upperRank-1] + if g := s.Query(qu); g < min || g > max { + t.Errorf("q=%f: want %v [%f,%f], got %v", qu, w, min, max, g) + } + } +} + +func verifyHighPercsWithRelativeEpsilon(t *testing.T, a []float64, s *Stream) { + sort.Float64s(a) + for _, qu := range HighQuantiles { + n := float64(len(a)) + k := int(qu * n) + + lowerRank := int((1 - (1+RelativeEpsilon)*(1-qu)) * n) + upperRank := int(math.Ceil((1 - (1-RelativeEpsilon)*(1-qu)) * n)) + w, min, max := a[k-1], a[lowerRank-1], a[upperRank-1] + if g := s.Query(qu); g < min || g > max { + t.Errorf("q=%f: want %v [%f,%f], got %v", qu, w, min, max, g) + } + } +} + +func populateStream(s *Stream) []float64 { + a := make([]float64, 0, 1e5+100) + for i := 0; i < cap(a); i++ { + v := rand.NormFloat64() + // Add 5% asymmetric outliers. + if i%20 == 0 { + v = v*v + 1 + } + s.Insert(v) + a = append(a, v) + } + return a +} + +func TestTargetedQuery(t *testing.T) { + rand.Seed(42) + s := NewTargeted(Targets) + a := populateStream(s) + verifyPercsWithAbsoluteEpsilon(t, a, s) +} + +func TestLowBiasedQuery(t *testing.T) { + rand.Seed(42) + s := NewLowBiased(RelativeEpsilon) + a := populateStream(s) + verifyLowPercsWithRelativeEpsilon(t, a, s) +} + +func TestHighBiasedQuery(t *testing.T) { + rand.Seed(42) + s := NewHighBiased(RelativeEpsilon) + a := populateStream(s) + verifyHighPercsWithRelativeEpsilon(t, a, s) +} + +// BrokenTestTargetedMerge is broken, see Merge doc comment. +func BrokenTestTargetedMerge(t *testing.T) { + rand.Seed(42) + s1 := NewTargeted(Targets) + s2 := NewTargeted(Targets) + a := populateStream(s1) + a = append(a, populateStream(s2)...) + s1.Merge(s2.Samples()) + verifyPercsWithAbsoluteEpsilon(t, a, s1) +} + +// BrokenTestLowBiasedMerge is broken, see Merge doc comment. +func BrokenTestLowBiasedMerge(t *testing.T) { + rand.Seed(42) + s1 := NewLowBiased(RelativeEpsilon) + s2 := NewLowBiased(RelativeEpsilon) + a := populateStream(s1) + a = append(a, populateStream(s2)...) + s1.Merge(s2.Samples()) + verifyLowPercsWithRelativeEpsilon(t, a, s2) +} + +// BrokenTestHighBiasedMerge is broken, see Merge doc comment. +func BrokenTestHighBiasedMerge(t *testing.T) { + rand.Seed(42) + s1 := NewHighBiased(RelativeEpsilon) + s2 := NewHighBiased(RelativeEpsilon) + a := populateStream(s1) + a = append(a, populateStream(s2)...) + s1.Merge(s2.Samples()) + verifyHighPercsWithRelativeEpsilon(t, a, s2) +} + +func TestUncompressed(t *testing.T) { + q := NewTargeted(Targets) + for i := 100; i > 0; i-- { + q.Insert(float64(i)) + } + if g := q.Count(); g != 100 { + t.Errorf("want count 100, got %d", g) + } + // Before compression, Query should have 100% accuracy. + for quantile := range Targets { + w := quantile * 100 + if g := q.Query(quantile); g != w { + t.Errorf("want %f, got %f", w, g) + } + } +} + +func TestUncompressedSamples(t *testing.T) { + q := NewTargeted(map[float64]float64{0.99: 0.001}) + for i := 1; i <= 100; i++ { + q.Insert(float64(i)) + } + if g := q.Samples().Len(); g != 100 { + t.Errorf("want count 100, got %d", g) + } +} + +func TestUncompressedOne(t *testing.T) { + q := NewTargeted(map[float64]float64{0.99: 0.01}) + q.Insert(3.14) + if g := q.Query(0.90); g != 3.14 { + t.Error("want PI, got", g) + } +} + +func TestDefaults(t *testing.T) { + if g := NewTargeted(map[float64]float64{0.99: 0.001}).Query(0.99); g != 0 { + t.Errorf("want 0, got %f", g) + } +} diff --git a/topk/topk.go b/topk/topk.go new file mode 100644 index 0000000..5ac3d99 --- /dev/null +++ b/topk/topk.go @@ -0,0 +1,90 @@ +package topk + +import ( + "sort" +) + +// http://www.cs.ucsb.edu/research/tech_reports/reports/2005-23.pdf + +type Element struct { + Value string + Count int +} + +type Samples []*Element + +func (sm Samples) Len() int { + return len(sm) +} + +func (sm Samples) Less(i, j int) bool { + return sm[i].Count < sm[j].Count +} + +func (sm Samples) Swap(i, j int) { + sm[i], sm[j] = sm[j], sm[i] +} + +type Stream struct { + k int + mon map[string]*Element + + // the minimum Element + min *Element +} + +func New(k int) *Stream { + s := new(Stream) + s.k = k + s.mon = make(map[string]*Element) + s.min = &Element{} + + // Track k+1 so that less frequenet items contended for that spot, + // resulting in k being more accurate. + return s +} + +func (s *Stream) Insert(x string) { + s.insert(&Element{x, 1}) +} + +func (s *Stream) Merge(sm Samples) { + for _, e := range sm { + s.insert(e) + } +} + +func (s *Stream) insert(in *Element) { + e := s.mon[in.Value] + if e != nil { + e.Count++ + } else { + if len(s.mon) < s.k+1 { + e = &Element{in.Value, in.Count} + s.mon[in.Value] = e + } else { + e = s.min + delete(s.mon, e.Value) + e.Value = in.Value + e.Count += in.Count + s.min = e + } + } + if e.Count < s.min.Count { + s.min = e + } +} + +func (s *Stream) Query() Samples { + var sm Samples + for _, e := range s.mon { + sm = append(sm, e) + } + sort.Sort(sort.Reverse(sm)) + + if len(sm) < s.k { + return sm + } + + return sm[:s.k] +} diff --git a/topk/topk_test.go b/topk/topk_test.go new file mode 100644 index 0000000..c24f0f7 --- /dev/null +++ b/topk/topk_test.go @@ -0,0 +1,57 @@ +package topk + +import ( + "fmt" + "math/rand" + "sort" + "testing" +) + +func TestTopK(t *testing.T) { + stream := New(10) + ss := []*Stream{New(10), New(10), New(10)} + m := make(map[string]int) + for _, s := range ss { + for i := 0; i < 1e6; i++ { + v := fmt.Sprintf("%x", int8(rand.ExpFloat64())) + s.Insert(v) + m[v]++ + } + stream.Merge(s.Query()) + } + + var sm Samples + for x, s := range m { + sm = append(sm, &Element{x, s}) + } + sort.Sort(sort.Reverse(sm)) + + g := stream.Query() + if len(g) != 10 { + t.Fatalf("got %d, want 10", len(g)) + } + for i, e := range g { + if sm[i].Value != e.Value { + t.Errorf("at %d: want %q, got %q", i, sm[i].Value, e.Value) + } + } +} + +func TestQuery(t *testing.T) { + queryTests := []struct { + value string + expected int + }{ + {"a", 1}, + {"b", 2}, + {"c", 2}, + } + + stream := New(2) + for _, tt := range queryTests { + stream.Insert(tt.value) + if n := len(stream.Query()); n != tt.expected { + t.Errorf("want %d, got %d", tt.expected, n) + } + } +}