Codebase list golang-github-beorn7-perks / multiarch-fixes/main quantile / stream.go
multiarch-fixes/main

Tree @multiarch-fixes/main (Download .tar.gz)

stream.go @multiarch-fixes/main

36f80c6
 
ac5ec5b
36f80c6
ac5ec5b
36f80c6
 
 
55ff267
a1d7964
efd0332
 
 
 
ac5ec5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a9cdc7
ac5ec5b
 
166701f
 
 
ac5ec5b
6684ad6
d5d9f25
4a53c8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6dabc69
4a53c8d
3a68842
6dabc69
3a68842
 
f4ce9b3
 
dab8af3
 
 
4a53c8d
55ff267
b848f97
 
 
 
 
 
6dabc69
4a53c8d
d5d9f25
b848f97
 
 
d5d9f25
b848f97
d5d9f25
5f34d75
 
 
d5d9f25
 
 
6dabc69
d5d9f25
 
b848f97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f6f146
 
ac5ec5b
 
da72989
 
ac5ec5b
 
c83a37f
4a53c8d
da72989
ac5ec5b
 
 
 
 
 
 
 
 
da72989
ac5ec5b
 
 
 
 
974f96d
f4ce9b3
 
ac5ec5b
e3edcea
ac5ec5b
166701f
bd1cf5e
 
 
 
1ac20f1
2918f03
 
 
da72989
2918f03
ac5ec5b
 
 
 
 
 
f4ce9b3
dab8af3
 
 
ac5ec5b
63d46f1
ac5ec5b
 
 
38e95ec
 
55ff267
ac5ec5b
 
 
2a9cdc7
ac5ec5b
 
 
 
e3edcea
ac5ec5b
 
 
c4274eb
 
 
 
 
 
ac5ec5b
da72989
ac5ec5b
 
 
 
da72989
 
 
 
 
 
 
ac5ec5b
456f18a
ac5ec5b
 
 
4a53c8d
 
 
ac5ec5b
 
55ff267
456f18a
ac5ec5b
 
 
 
b302689
ac5ec5b
 
 
4a53c8d
 
 
 
ac5ec5b
456f18a
b302689
456f18a
 
b302689
456f18a
d9a9656
456f18a
4a53c8d
 
 
 
 
 
456f18a
ac5ec5b
 
 
 
d9a9656
456f18a
ac5ec5b
b302689
4a53c8d
ac5ec5b
4a53c8d
ac5ec5b
 
c4274eb
ac5ec5b
 
 
 
 
d5d9f25
456f18a
4a53c8d
456f18a
4a53c8d
ac5ec5b
 
 
 
 
 
 
 
 
456f18a
ac5ec5b
 
456f18a
d9a9656
ac5ec5b
456f18a
 
 
d5d9f25
ac5ec5b
d9a9656
456f18a
 
 
d9a9656
ac5ec5b
 
d9a9656
ac5ec5b
 
 
 
 
 
456f18a
d9a9656
ac5ec5b
 
// Package quantile computes approximate quantiles over an unbounded data
// stream within low memory and CPU bounds.
//
// A small amount of accuracy is traded to achieve the above properties.
//
// Multiple streams can be merged before calling Query to generate a single set
// of results. This is meaningful when the streams represent the same type of
// data. See Merge and Samples.
//
// For more detailed information about the algorithm used, see:
//
// Effective Computation of Biased Quantiles over Data Streams
//
// http://www.cs.rutgers.edu/~muthu/bquant.pdf
package quantile

import (
	"math"
	"sort"
)

// Sample holds an observed value and meta information for compression. JSON
// tags have been added for convenience.
type Sample struct {
	Value float64 `json:",string"`
	Width float64 `json:",string"`
	Delta float64 `json:",string"`
}

// Samples represents a slice of samples. It implements sort.Interface.
type Samples []Sample

func (a Samples) Len() int           { return len(a) }
func (a Samples) Less(i, j int) bool { return a[i].Value < a[j].Value }
func (a Samples) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }

type invariant func(s *stream, r float64) float64

// NewLowBiased returns an initialized Stream for low-biased quantiles
// (e.g. 0.01, 0.1, 0.5) where the needed quantiles are not known a priori, but
// error guarantees can still be given even for the lower ranks of the data
// distribution.
//
// The provided epsilon is a relative error, i.e. the true quantile of a value
// returned by a query is guaranteed to be within (1±Epsilon)*Quantile.
//
// See http://www.cs.rutgers.edu/~muthu/bquant.pdf for time, space, and error
// properties.
func NewLowBiased(epsilon float64) *Stream {
	ƒ := func(s *stream, r float64) float64 {
		return 2 * epsilon * r
	}
	return newStream(ƒ)
}

// NewHighBiased returns an initialized Stream for high-biased quantiles
// (e.g. 0.01, 0.1, 0.5) where the needed quantiles are not known a priori, but
// error guarantees can still be given even for the higher ranks of the data
// distribution.
//
// The provided epsilon is a relative error, i.e. the true quantile of a value
// returned by a query is guaranteed to be within 1-(1±Epsilon)*(1-Quantile).
//
// See http://www.cs.rutgers.edu/~muthu/bquant.pdf for time, space, and error
// properties.
func NewHighBiased(epsilon float64) *Stream {
	ƒ := func(s *stream, r float64) float64 {
		return 2 * epsilon * (s.n - r)
	}
	return newStream(ƒ)
}

// NewTargeted returns an initialized Stream concerned with a particular set of
// quantile values that are supplied a priori. Knowing these a priori reduces
// space and computation time. The targets map maps the desired quantiles to
// their absolute errors, i.e. the true quantile of a value returned by a query
// is guaranteed to be within (Quantile±Epsilon).
//
// See http://www.cs.rutgers.edu/~muthu/bquant.pdf for time, space, and error properties.
func NewTargeted(targetMap map[float64]float64) *Stream {
	// Convert map to slice to avoid slow iterations on a map.
	// ƒ is called on the hot path, so converting the map to a slice
	// beforehand results in significant CPU savings.
	targets := targetMapToSlice(targetMap)

	ƒ := func(s *stream, r float64) float64 {
		var m = math.MaxFloat64
		var f float64
		for _, t := range targets {
			if t.quantile*s.n <= r {
				f = (2 * t.epsilon * r) / t.quantile
			} else {
				f = (2 * t.epsilon * (s.n - r)) / (1 - t.quantile)
			}
			if f < m {
				m = f
			}
		}
		return m
	}
	return newStream(ƒ)
}

type target struct {
	quantile float64
	epsilon  float64
}

func targetMapToSlice(targetMap map[float64]float64) []target {
	targets := make([]target, 0, len(targetMap))

	for quantile, epsilon := range targetMap {
		t := target{
			quantile: quantile,
			epsilon:  epsilon,
		}
		targets = append(targets, t)
	}

	return targets
}

// Stream computes quantiles for a stream of float64s. It is not thread-safe by
// design. Take care when using across multiple goroutines.
type Stream struct {
	*stream
	b      Samples
	sorted bool
}

func newStream(ƒ invariant) *Stream {
	x := &stream{ƒ: ƒ}
	return &Stream{x, make(Samples, 0, 500), true}
}

// Insert inserts v into the stream.
func (s *Stream) Insert(v float64) {
	s.insert(Sample{Value: v, Width: 1})
}

func (s *Stream) insert(sample Sample) {
	s.b = append(s.b, sample)
	s.sorted = false
	if len(s.b) == cap(s.b) {
		s.flush()
	}
}

// Query returns the computed qth percentiles value. If s was created with
// NewTargeted, and q is not in the set of quantiles provided a priori, Query
// will return an unspecified result.
func (s *Stream) Query(q float64) float64 {
	if !s.flushed() {
		// Fast path when there hasn't been enough data for a flush;
		// this also yields better accuracy for small sets of data.
		l := len(s.b)
		if l == 0 {
			return 0
		}
		i := int(math.Ceil(float64(l) * q))
		if i > 0 {
			i -= 1
		}
		s.maybeSort()
		return s.b[i].Value
	}
	s.flush()
	return s.stream.query(q)
}

// Merge merges samples into the underlying streams samples. This is handy when
// merging multiple streams from separate threads, database shards, etc.
//
// ATTENTION: This method is broken and does not yield correct results. The
// underlying algorithm is not capable of merging streams correctly.
func (s *Stream) Merge(samples Samples) {
	sort.Sort(samples)
	s.stream.merge(samples)
}

// Reset reinitializes and clears the list reusing the samples buffer memory.
func (s *Stream) Reset() {
	s.stream.reset()
	s.b = s.b[:0]
}

// Samples returns stream samples held by s.
func (s *Stream) Samples() Samples {
	if !s.flushed() {
		return s.b
	}
	s.flush()
	return s.stream.samples()
}

// Count returns the total number of samples observed in the stream
// since initialization.
func (s *Stream) Count() int {
	return len(s.b) + s.stream.count()
}

func (s *Stream) flush() {
	s.maybeSort()
	s.stream.merge(s.b)
	s.b = s.b[:0]
}

func (s *Stream) maybeSort() {
	if !s.sorted {
		s.sorted = true
		sort.Sort(s.b)
	}
}

func (s *Stream) flushed() bool {
	return len(s.stream.l) > 0
}

type stream struct {
	n float64
	l []Sample
	ƒ invariant
}

func (s *stream) reset() {
	s.l = s.l[:0]
	s.n = 0
}

func (s *stream) insert(v float64) {
	s.merge(Samples{{v, 1, 0}})
}

func (s *stream) merge(samples Samples) {
	// TODO(beorn7): This tries to merge not only individual samples, but
	// whole summaries. The paper doesn't mention merging summaries at
	// all. Unittests show that the merging is inaccurate. Find out how to
	// do merges properly.
	var r float64
	i := 0
	for _, sample := range samples {
		for ; i < len(s.l); i++ {
			c := s.l[i]
			if c.Value > sample.Value {
				// Insert at position i.
				s.l = append(s.l, Sample{})
				copy(s.l[i+1:], s.l[i:])
				s.l[i] = Sample{
					sample.Value,
					sample.Width,
					math.Max(sample.Delta, math.Floor(s.ƒ(s, r))-1),
					// TODO(beorn7): How to calculate delta correctly?
				}
				i++
				goto inserted
			}
			r += c.Width
		}
		s.l = append(s.l, Sample{sample.Value, sample.Width, 0})
		i++
	inserted:
		s.n += sample.Width
		r += sample.Width
	}
	s.compress()
}

func (s *stream) count() int {
	return int(s.n)
}

func (s *stream) query(q float64) float64 {
	t := math.Ceil(q * s.n)
	t += math.Ceil(s.ƒ(s, t) / 2)
	p := s.l[0]
	var r float64
	for _, c := range s.l[1:] {
		r += p.Width
		if r+c.Width+c.Delta > t {
			return p.Value
		}
		p = c
	}
	return p.Value
}

func (s *stream) compress() {
	if len(s.l) < 2 {
		return
	}
	x := s.l[len(s.l)-1]
	xi := len(s.l) - 1
	r := s.n - 1 - x.Width

	for i := len(s.l) - 2; i >= 0; i-- {
		c := s.l[i]
		if c.Width+x.Width+x.Delta <= s.ƒ(s, r) {
			x.Width += c.Width
			s.l[xi] = x
			// Remove element at i.
			copy(s.l[i:], s.l[i+1:])
			s.l = s.l[:len(s.l)-1]
			xi -= 1
		} else {
			x = c
			xi = i
		}
		r -= c.Width
	}
}

func (s *stream) samples() Samples {
	samples := make(Samples, len(s.l))
	copy(samples, s.l)
	return samples
}