Import upstream version 0.0.1+git20210216.1.fc98d27
Debian Janitor
2 years ago
1 | 1 | |
2 | 2 | This is an implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest/) in Go. |
3 | 3 | |
4 | The implementaion is based off [Derrick Burns' C++ implementation](https://github.com/derrickburns/tdigest). | |
4 | The implementation is based off [Derrick Burns' C++ implementation](https://github.com/derrickburns/tdigest). | |
5 | 5 | |
6 | 6 | ## Example |
7 | 7 | |
34 | 34 | log.Println("CDF(5) = ", td.CDF(5)) |
35 | 35 | } |
36 | 36 | ``` |
37 | ||
38 | ## TODO | |
39 | ||
40 | Only the methods for a single TDigest have been implemented. | |
41 | The methods to merge two or more existing t-digests into a single t-digest have yet to be implemented. |
42 | 42 | // CentroidList is sorted by the Mean of the centroid, ascending. |
43 | 43 | type CentroidList []Centroid |
44 | 44 | |
45 | // Clear clears the list. | |
45 | 46 | func (l *CentroidList) Clear() { |
46 | *l = (*l)[0:0] | |
47 | *l = (*l)[:0] | |
47 | 48 | } |
48 | 49 | |
49 | 50 | func (l CentroidList) Len() int { return len(l) } |
0 | module github.com/influxdata/tdigest | |
1 | ||
2 | require ( | |
3 | github.com/google/go-cmp v0.2.0 | |
4 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de | |
5 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca | |
6 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6 // indirect | |
7 | ) | |
8 | ||
9 | go 1.13 |
0 | github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= | |
1 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= | |
2 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de h1:xSjD6HQTqT0H/k60N5yYBtnN1OEkVy7WIo/DYyxKRO0= | |
3 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= | |
4 | golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= | |
5 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca h1:PupagGYwj8+I4ubCxcmcBRk3VlUWtTg5huQpZR9flmE= | |
6 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= | |
7 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6 h1:4WsZyVtkthqrHTbDCJfiTs8IWNYE4uvsSDgaV6xpp+o= | |
8 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= |
4 | 4 | "sort" |
5 | 5 | ) |
6 | 6 | |
7 | // TDigest is a data structure for accurate on-line accumulation of | |
8 | // rank-based statistics such as quantiles and trimmed means. | |
7 | 9 | type TDigest struct { |
8 | 10 | Compression float64 |
9 | 11 | |
18 | 20 | max float64 |
19 | 21 | } |
20 | 22 | |
23 | // New initializes a new distribution with a default compression. | |
21 | 24 | func New() *TDigest { |
22 | 25 | return NewWithCompression(1000) |
23 | 26 | } |
27 | ||
28 | // NewWithCompression initializes a new distribution with custom compression. | |
24 | 29 | func NewWithCompression(c float64) *TDigest { |
25 | 30 | t := &TDigest{ |
26 | 31 | Compression: c, |
27 | 32 | } |
28 | 33 | t.maxProcessed = processedSize(0, t.Compression) |
29 | 34 | t.maxUnprocessed = unprocessedSize(0, t.Compression) |
30 | t.processed = make([]Centroid, 0, t.maxProcessed) | |
31 | t.unprocessed = make([]Centroid, 0, t.maxUnprocessed+1) | |
35 | t.processed = make(CentroidList, 0, t.maxProcessed) | |
36 | t.unprocessed = make(CentroidList, 0, t.maxUnprocessed+1) | |
37 | t.Reset() | |
38 | return t | |
39 | } | |
40 | ||
41 | // ByteSizeForCompression returns the number of bytes needed for a t-digest | |
42 | // with the given compression value, computed as 40 bytes per (integer) unit of compression. | |
43 | func ByteSizeForCompression(comp float64) int { | |
44 | c := int(comp) | |
45 | // // A centroid is 2 float64s, so we need 16 bytes for each centroid | |
46 | // float_size := 8 | |
47 | // centroid_size := 2 * float_size | |
48 | ||
49 | // // Unprocessed and processed can grow up to length c | |
50 | // unprocessed_size := centroid_size * c | |
51 | // processed_size := unprocessed_size | |
52 | ||
53 | // // the cumulative field can also be of length c, but each item is a single float64 | |
54 | // cumulative_size := float_size * c // <- this could also be unprocessed_size / 2 | |
55 | ||
56 | // return unprocessed_size + processed_size + cumulative_size | |
57 | ||
58 | // // or, more succinctly: | |
59 | // return float_size * c * 5 | |
60 | ||
61 | // or even more succinctly | |
62 | return c * 40 | |
63 | } | |
64 | ||
65 | // Reset resets the distribution to its initial state. | |
66 | func (t *TDigest) Reset() { | |
67 | t.processed = t.processed[:0] | |
68 | t.unprocessed = t.unprocessed[:0] | |
69 | t.cumulative = t.cumulative[:0] | |
70 | t.processedWeight = 0 | |
71 | t.unprocessedWeight = 0 | |
32 | 72 | t.min = math.MaxFloat64 |
33 | 73 | t.max = -math.MaxFloat64 |
34 | return t | |
35 | } | |
36 | ||
74 | } | |
75 | ||
76 | // Add adds a value x with a weight w to the distribution. | |
37 | 77 | func (t *TDigest) Add(x, w float64) { |
38 | if math.IsNaN(x) { | |
78 | t.AddCentroid(Centroid{Mean: x, Weight: w}) | |
79 | } | |
80 | ||
81 | // AddCentroidList can quickly add multiple centroids. | |
82 | func (t *TDigest) AddCentroidList(c CentroidList) { | |
83 | // It's possible to optimize this by bulk-copying the slice, but this | |
84 | // yields just a 1-2% speedup (most time is in process()), so not worth | |
85 | // the complexity. | |
86 | for i := range c { | |
87 | t.AddCentroid(c[i]) | |
88 | } | |
89 | } | |
90 | ||
91 | // AddCentroid adds a single centroid. | |
92 | // Centroids with a NaN mean, or with a weight that is NaN, +Inf, or <= 0, are ignored. | |
93 | func (t *TDigest) AddCentroid(c Centroid) { | |
94 | if math.IsNaN(c.Mean) || c.Weight <= 0 || math.IsNaN(c.Weight) || math.IsInf(c.Weight, 1) { | |
39 | 95 | return |
40 | 96 | } |
41 | t.AddCentroid(Centroid{Mean: x, Weight: w}) | |
42 | } | |
43 | ||
44 | func (t *TDigest) AddCentroidList(c CentroidList) { | |
45 | l := c.Len() | |
46 | for i := 0; i < l; i++ { | |
47 | diff := l - i | |
48 | room := t.maxUnprocessed - t.unprocessed.Len() | |
49 | mid := i + diff | |
50 | if room < diff { | |
51 | mid = i + room | |
52 | } | |
53 | for i < mid { | |
54 | t.AddCentroid(c[i]) | |
55 | i++ | |
56 | } | |
57 | } | |
58 | } | |
59 | ||
60 | func (t *TDigest) AddCentroid(c Centroid) { | |
97 | ||
61 | 98 | t.unprocessed = append(t.unprocessed, c) |
62 | 99 | t.unprocessedWeight += c.Weight |
63 | 100 | |
65 | 102 | t.unprocessed.Len() > t.maxUnprocessed { |
66 | 103 | t.process() |
67 | 104 | } |
105 | } | |
106 | ||
107 | // Merge merges the supplied digest into this digest. Functionally equivalent to | |
108 | // calling t.AddCentroidList(t2.Centroids(nil)), but avoids making an extra | |
109 | // copy of the CentroidList. | |
110 | func (t *TDigest) Merge(t2 *TDigest) { | |
111 | t2.process() | |
112 | t.AddCentroidList(t2.processed) | |
68 | 113 | } |
69 | 114 | |
70 | 115 | func (t *TDigest) process() { |
97 | 142 | } |
98 | 143 | t.min = math.Min(t.min, t.processed[0].Mean) |
99 | 144 | t.max = math.Max(t.max, t.processed[t.processed.Len()-1].Mean) |
100 | t.updateCumulative() | |
101 | 145 | t.unprocessed.Clear() |
102 | 146 | } |
103 | 147 | } |
104 | 148 | |
149 | // Centroids returns a copy of processed centroids. | |
150 | // Useful when aggregating multiple t-digests. | |
151 | // | |
152 | // Centroids are appended to the passed CentroidList; if you're re-using a | |
153 | // buffer, be sure to pass cl[:0]. | |
154 | func (t *TDigest) Centroids(cl CentroidList) CentroidList { | |
155 | t.process() | |
156 | return append(cl, t.processed...) | |
157 | } | |
158 | ||
159 | func (t *TDigest) Count() float64 { | |
160 | t.process() | |
161 | ||
162 | // t.process always updates t.processedWeight to the total count of all | |
163 | // centroids, so we don't need to re-count here. | |
164 | return t.processedWeight | |
165 | } | |
166 | ||
105 | 167 | func (t *TDigest) updateCumulative() { |
106 | t.cumulative = make([]float64, t.processed.Len()+1) | |
168 | // Weight can only increase, so the final cumulative value will always be | |
169 | // either equal to, or less than, the total weight. If they are the same, | |
170 | // then nothing has changed since the last update. | |
171 | if len(t.cumulative) > 0 && t.cumulative[len(t.cumulative)-1] == t.processedWeight { | |
172 | return | |
173 | } | |
174 | ||
175 | if n := t.processed.Len() + 1; n <= cap(t.cumulative) { | |
176 | t.cumulative = t.cumulative[:n] | |
177 | } else { | |
178 | t.cumulative = make([]float64, n) | |
179 | } | |
180 | ||
107 | 181 | prev := 0.0 |
108 | 182 | for i, centroid := range t.processed { |
109 | 183 | cur := centroid.Weight |
113 | 187 | t.cumulative[t.processed.Len()] = prev |
114 | 188 | } |
115 | 189 | |
190 | // Quantile returns the (approximate) quantile of | |
191 | // the distribution. Accepted values for q are between 0.0 and 1.0. | |
192 | // Returns NaN if the digest holds no centroids or if q is outside [0, 1]. | |
116 | 193 | func (t *TDigest) Quantile(q float64) float64 { |
117 | 194 | t.process() |
195 | t.updateCumulative() | |
118 | 196 | if q < 0 || q > 1 || t.processed.Len() == 0 { |
119 | 197 | return math.NaN() |
120 | 198 | } |
141 | 219 | return weightedAverage(t.processed[t.processed.Len()-1].Mean, z1, t.max, z2) |
142 | 220 | } |
143 | 221 | |
222 | // CDF returns the cumulative distribution function for a given value x. | |
144 | 223 | func (t *TDigest) CDF(x float64) float64 { |
145 | 224 | t.process() |
225 | t.updateCumulative() | |
146 | 226 | switch t.processed.Len() { |
147 | 227 | case 0: |
148 | 228 | return 0.0 |
0 | 0 | package tdigest_test |
1 | 1 | |
2 | 2 | import ( |
3 | "math/rand" | |
3 | "fmt" | |
4 | "math" | |
5 | "reflect" | |
4 | 6 | "testing" |
5 | 7 | |
6 | "github.com/gonum/stat/distuv" | |
7 | 8 | "github.com/influxdata/tdigest" |
9 | "golang.org/x/exp/rand" | |
10 | "gonum.org/v1/gonum/stat/distuv" | |
8 | 11 | ) |
9 | 12 | |
10 | 13 | const ( |
24 | 27 | |
25 | 28 | func init() { |
26 | 29 | dist := distuv.Normal{ |
27 | Mu: Mu, | |
28 | Sigma: Sigma, | |
29 | Source: rand.New(rand.NewSource(seed)), | |
30 | Mu: Mu, | |
31 | Sigma: Sigma, | |
32 | Src: rand.New(rand.NewSource(seed)), | |
30 | 33 | } |
31 | 34 | uniform := rand.New(rand.NewSource(seed)) |
32 | 35 | |
42 | 45 | |
43 | 46 | UniformData[i] = uniform.Float64() * 100 |
44 | 47 | UniformDigest.Add(UniformData[i], 1) |
48 | } | |
49 | } | |
50 | ||
51 | // compareQuantiles compares the quantile results of two digests and returns an | |
52 | // error if the fractional difference at any checked quantile exceeds maxErr. | |
53 | // It always returns an error if the total counts differ. | |
54 | func compareQuantiles(td1, td2 *tdigest.TDigest, maxErr float64) error { | |
55 | if td1.Count() != td2.Count() { | |
56 | return fmt.Errorf("counts are not equal, %d vs %d", int64(td1.Count()), int64(td2.Count())) | |
57 | } | |
58 | for q := 0.05; q < 1; q += 0.05 { | |
59 | if math.Abs(td1.Quantile(q)-td2.Quantile(q))/td1.Quantile(q) > maxErr { | |
60 | return fmt.Errorf("quantile %g differs, %g vs %g", q, td1.Quantile(q), td2.Quantile(q)) | |
61 | } | |
62 | } | |
63 | return nil | |
64 | } | |
65 | ||
66 | // All Add methods should yield equivalent results. | |
67 | func TestTdigest_AddFuncs(t *testing.T) { | |
68 | centroids := NormalDigest.Centroids(nil) | |
69 | ||
70 | addDigest := tdigest.NewWithCompression(100) | |
71 | addCentroidDigest := tdigest.NewWithCompression(100) | |
72 | addCentroidListDigest := tdigest.NewWithCompression(100) | |
73 | ||
74 | for _, c := range centroids { | |
75 | addDigest.Add(c.Mean, c.Weight) | |
76 | addCentroidDigest.AddCentroid(c) | |
77 | } | |
78 | addCentroidListDigest.AddCentroidList(centroids) | |
79 | ||
80 | if err := compareQuantiles(addDigest, addCentroidDigest, 0.01); err != nil { | |
81 | t.Errorf("AddCentroid() differs from from Add(): %s", err.Error()) | |
82 | } | |
83 | if err := compareQuantiles(addDigest, addCentroidListDigest, 0.01); err != nil { | |
84 | t.Errorf("AddCentroidList() differs from from Add(): %s", err.Error()) | |
85 | } | |
86 | } | |
87 | ||
88 | func TestTdigest_Count(t *testing.T) { | |
89 | tests := []struct { | |
90 | name string | |
91 | data []float64 | |
92 | digest *tdigest.TDigest | |
93 | want float64 | |
94 | }{ | |
95 | { | |
96 | name: "empty", | |
97 | data: []float64{}, | |
98 | want: 0, | |
99 | }, | |
100 | { | |
101 | name: "not empty", | |
102 | data: []float64{5, 4}, | |
103 | want: 2, | |
104 | }, | |
105 | } | |
106 | ||
107 | for _, tt := range tests { | |
108 | t.Run(tt.name, func(t *testing.T) { | |
109 | td := tt.digest | |
110 | if td == nil { | |
111 | td = tdigest.NewWithCompression(1000) | |
112 | for _, x := range tt.data { | |
113 | td.Add(x, 1) | |
114 | } | |
115 | } | |
116 | got := td.Count() | |
117 | if got != tt.want { | |
118 | t.Errorf("unexpected count, got %g want %g", got, tt.want) | |
119 | } | |
120 | }) | |
121 | } | |
122 | ||
123 | got := NormalDigest.Count() | |
124 | want := float64(len(NormalData)) | |
125 | if got != want { | |
126 | t.Errorf("unexpected count for NormalDigest, got %g want %g", got, want) | |
127 | } | |
128 | ||
129 | got = UniformDigest.Count() | |
130 | want = float64(len(UniformData)) | |
131 | if got != want { | |
132 | t.Errorf("unexpected count for UniformDigest, got %g want %g", got, want) | |
45 | 133 | } |
46 | 134 | } |
47 | 135 | |
81 | 169 | name: "normal 50", |
82 | 170 | quantile: 0.5, |
83 | 171 | digest: NormalDigest, |
84 | want: 9.997821231634168, | |
172 | want: 10.000673533707138, | |
85 | 173 | }, |
86 | 174 | { |
87 | 175 | name: "normal 90", |
88 | 176 | quantile: 0.9, |
89 | 177 | digest: NormalDigest, |
90 | want: 13.843815760607427, | |
178 | want: 13.842132136909889, | |
91 | 179 | }, |
92 | 180 | { |
93 | 181 | name: "uniform 50", |
94 | 182 | quantile: 0.5, |
95 | 183 | digest: UniformDigest, |
96 | want: 50.02682856274754, | |
184 | want: 49.992502345843555, | |
97 | 185 | }, |
98 | 186 | { |
99 | 187 | name: "uniform 90", |
100 | 188 | quantile: 0.9, |
101 | 189 | digest: UniformDigest, |
102 | want: 90.02117754660424, | |
190 | want: 89.98281777095822, | |
103 | 191 | }, |
104 | 192 | { |
105 | 193 | name: "uniform 99", |
106 | 194 | quantile: 0.99, |
107 | 195 | digest: UniformDigest, |
108 | want: 99.00246731511771, | |
196 | want: 98.98503400959562, | |
109 | 197 | }, |
110 | 198 | { |
111 | 199 | name: "uniform 99.9", |
112 | 200 | quantile: 0.999, |
113 | 201 | digest: UniformDigest, |
114 | want: 99.90178495422307, | |
202 | want: 99.90103781043621, | |
115 | 203 | }, |
116 | 204 | } |
117 | 205 | for _, tt := range tests { |
161 | 249 | name: "normal mean", |
162 | 250 | cdf: 10, |
163 | 251 | data: NormalData, |
164 | want: 0.500298235578106, | |
252 | want: 0.4999156505250766, | |
165 | 253 | }, |
166 | 254 | { |
167 | 255 | name: "normal high", |
179 | 267 | name: "uniform 50", |
180 | 268 | cdf: 50, |
181 | 269 | data: UniformData, |
182 | want: 0.49972989818712815, | |
270 | want: 0.5000756133965755, | |
183 | 271 | }, |
184 | 272 | { |
185 | 273 | name: "uniform min", |
197 | 285 | name: "uniform 10", |
198 | 286 | cdf: 10, |
199 | 287 | data: UniformData, |
200 | want: 0.099715527526992, | |
288 | want: 0.09987932577650871, | |
201 | 289 | }, |
202 | 290 | { |
203 | 291 | name: "uniform 90", |
204 | 292 | cdf: 90, |
205 | 293 | data: UniformData, |
206 | want: 0.8997838903965611, | |
294 | want: 0.9001667885256108, | |
207 | 295 | }, |
208 | 296 | } |
209 | 297 | for _, tt := range tests { |
223 | 311 | } |
224 | 312 | } |
225 | 313 | |
314 | func TestTdigest_Reset(t *testing.T) { | |
315 | td := tdigest.New() | |
316 | for _, x := range NormalData { | |
317 | td.Add(x, 1) | |
318 | } | |
319 | q1 := td.Quantile(0.9) | |
320 | ||
321 | td.Reset() | |
322 | for _, x := range NormalData { | |
323 | td.Add(x, 1) | |
324 | } | |
325 | if q2 := td.Quantile(0.9); q2 != q1 { | |
326 | t.Errorf("unexpected quantile, got %g want %g", q2, q1) | |
327 | } | |
328 | } | |
329 | ||
330 | func TestTdigest_OddInputs(t *testing.T) { | |
331 | td := tdigest.New() | |
332 | td.Add(math.NaN(), 1) | |
333 | td.Add(1, math.NaN()) | |
334 | td.Add(1, 0) | |
335 | td.Add(1, -1000) | |
336 | if td.Count() != 0 { | |
337 | t.Error("invalid value was alloed to be added") | |
338 | } | |
339 | ||
340 | // Infinite values are allowed. | |
341 | td.Add(1, 1) | |
342 | td.Add(2, 1) | |
343 | td.Add(math.Inf(1), 1) | |
344 | if q := td.Quantile(0.5); q != 2 { | |
345 | t.Errorf("expected median value 2, got %f", q) | |
346 | } | |
347 | if q := td.Quantile(0.9); !math.IsInf(q, 1) { | |
348 | t.Errorf("expected median value 2, got %f", q) | |
349 | } | |
350 | } | |
351 | ||
352 | func TestTdigest_Merge(t *testing.T) { | |
353 | // Repeat merges enough times to ensure we call compress() | |
354 | numRepeats := 20 | |
355 | addDigest := tdigest.New() | |
356 | for i := 0; i < numRepeats; i++ { | |
357 | for _, c := range NormalDigest.Centroids(nil) { | |
358 | addDigest.AddCentroid(c) | |
359 | } | |
360 | for _, c := range UniformDigest.Centroids(nil) { | |
361 | addDigest.AddCentroid(c) | |
362 | } | |
363 | } | |
364 | ||
365 | mergeDigest := tdigest.New() | |
366 | for i := 0; i < numRepeats; i++ { | |
367 | mergeDigest.Merge(NormalDigest) | |
368 | mergeDigest.Merge(UniformDigest) | |
369 | } | |
370 | ||
371 | if err := compareQuantiles(addDigest, mergeDigest, 0.001); err != nil { | |
372 | t.Errorf("AddCentroid() differs from from Merge(): %s", err.Error()) | |
373 | } | |
374 | ||
375 | // Empty merge does nothing and has no effect on underlying centroids. | |
376 | c1 := addDigest.Centroids(nil) | |
377 | addDigest.Merge(tdigest.New()) | |
378 | c2 := addDigest.Centroids(nil) | |
379 | if !reflect.DeepEqual(c1, c2) { | |
380 | t.Error("Merging an empty digest altered data") | |
381 | } | |
382 | } | |
383 | ||
226 | 384 | var quantiles = []float64{0.1, 0.5, 0.9, 0.99, 0.999} |
227 | 385 | |
228 | 386 | func BenchmarkTDigest_Add(b *testing.B) { |
233 | 391 | } |
234 | 392 | } |
235 | 393 | } |
394 | ||
395 | func BenchmarkTDigest_AddCentroid(b *testing.B) { | |
396 | centroids := make(tdigest.CentroidList, len(NormalData)) | |
397 | for i := range centroids { | |
398 | centroids[i].Mean = NormalData[i] | |
399 | centroids[i].Weight = 1 | |
400 | } | |
401 | ||
402 | b.ResetTimer() | |
403 | for n := 0; n < b.N; n++ { | |
404 | td := tdigest.NewWithCompression(1000) | |
405 | for i := range centroids { | |
406 | td.AddCentroid(centroids[i]) | |
407 | } | |
408 | } | |
409 | } | |
410 | ||
411 | func BenchmarkTDigest_AddCentroidList(b *testing.B) { | |
412 | centroids := make(tdigest.CentroidList, len(NormalData)) | |
413 | for i := range centroids { | |
414 | centroids[i].Mean = NormalData[i] | |
415 | centroids[i].Weight = 1 | |
416 | } | |
417 | ||
418 | b.ResetTimer() | |
419 | for n := 0; n < b.N; n++ { | |
420 | td := tdigest.NewWithCompression(1000) | |
421 | td.AddCentroidList(centroids) | |
422 | } | |
423 | } | |
424 | ||
425 | func BenchmarkTDigest_Merge(b *testing.B) { | |
426 | b.Run("AddCentroid", func(b *testing.B) { | |
427 | var cl tdigest.CentroidList | |
428 | td := tdigest.New() | |
429 | for n := 0; n < b.N; n++ { | |
430 | cl = NormalDigest.Centroids(cl[:0]) | |
431 | for i := range cl { | |
432 | td.AddCentroid(cl[i]) | |
433 | } | |
434 | } | |
435 | }) | |
436 | b.Run("Merge", func(b *testing.B) { | |
437 | td := tdigest.New() | |
438 | for n := 0; n < b.N; n++ { | |
439 | td.Merge(NormalDigest) | |
440 | } | |
441 | }) | |
442 | } | |
443 | ||
236 | 444 | func BenchmarkTDigest_Quantile(b *testing.B) { |
237 | 445 | td := tdigest.NewWithCompression(1000) |
238 | 446 | for _, x := range NormalData { |
246 | 454 | } |
247 | 455 | } |
248 | 456 | } |
457 | ||
458 | func TestTdigest_Centroids(t *testing.T) { | |
459 | tests := []struct { | |
460 | name string | |
461 | data []float64 | |
462 | digest *tdigest.TDigest | |
463 | want tdigest.CentroidList | |
464 | }{ | |
465 | { | |
466 | name: "increasing", | |
467 | data: []float64{1, 2, 3, 4, 5}, | |
468 | want: tdigest.CentroidList{ | |
469 | tdigest.Centroid{ | |
470 | Mean: 1.0, | |
471 | Weight: 1.0, | |
472 | }, | |
473 | ||
474 | tdigest.Centroid{ | |
475 | Mean: 2.5, | |
476 | Weight: 2.0, | |
477 | }, | |
478 | ||
479 | tdigest.Centroid{ | |
480 | Mean: 4.0, | |
481 | Weight: 1.0, | |
482 | }, | |
483 | ||
484 | tdigest.Centroid{ | |
485 | Mean: 5.0, | |
486 | Weight: 1.0, | |
487 | }, | |
488 | }, | |
489 | }, | |
490 | } | |
491 | ||
492 | for _, tt := range tests { | |
493 | t.Run(tt.name, func(t *testing.T) { | |
494 | var got tdigest.CentroidList | |
495 | td := tt.digest | |
496 | if td == nil { | |
497 | td = tdigest.NewWithCompression(3) | |
498 | for _, x := range tt.data { | |
499 | td.Add(x, 1) | |
500 | } | |
501 | } | |
502 | got = td.Centroids(got[:0]) | |
503 | if !reflect.DeepEqual(got, tt.want) { | |
504 | t.Errorf("unexpected list got %g want %g", got, tt.want) | |
505 | } | |
506 | }) | |
507 | } | |
508 | } |
0 | package main | |
1 | ||
2 | import ( | |
3 | "os" | |
4 | "strconv" | |
5 | ||
6 | "golang.org/x/exp/rand" | |
7 | "gonum.org/v1/gonum/stat/distuv" | |
8 | ) | |
9 | ||
10 | const ( | |
11 | N = 1e6 | |
12 | Mu = 10 | |
13 | Sigma = 3 | |
14 | ||
15 | seed = 42 | |
16 | ) | |
17 | ||
18 | func main() { | |
19 | // Generate uniform and normal data | |
20 | uniform := rand.New(rand.NewSource(seed)) | |
21 | dist := distuv.Normal{ | |
22 | Mu: Mu, | |
23 | Sigma: Sigma, | |
24 | Src: rand.New(rand.NewSource(seed)), | |
25 | } | |
26 | ||
27 | uniformData := make([]float64, N) | |
28 | normalData := make([]float64, N) | |
29 | for i := range normalData { | |
30 | normalData[i] = dist.Rand() | |
31 | uniformData[i] = uniform.Float64() * 100 | |
32 | } | |
33 | ||
34 | smallData := []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1} | |
35 | ||
36 | writeData("uniform.dat", uniformData) | |
37 | writeData("normal.dat", normalData) | |
38 | writeData("small.dat", smallData) | |
39 | } | |
40 | ||
41 | func writeData(name string, data []float64) { | |
42 | f, err := os.Create(name) | |
43 | if err != nil { | |
44 | panic(err) | |
45 | } | |
46 | defer f.Close() | |
47 | ||
48 | buf := make([]byte, 0, 64) | |
49 | for _, x := range data { | |
50 | buf = strconv.AppendFloat(buf, x, 'f', -1, 64) | |
51 | _, err := f.Write(buf) | |
52 | if err != nil { | |
53 | panic(err) | |
54 | } | |
55 | _, err = f.Write([]byte{'\n'}) | |
56 | if err != nil { | |
57 | panic(err) | |
58 | } | |
59 | buf = buf[0:0] | |
60 | } | |
61 | } |
0 | package main | |
1 | ||
2 | import ( | |
3 | "math/rand" | |
4 | "os" | |
5 | "strconv" | |
6 | ||
7 | "github.com/gonum/stat/distuv" | |
8 | ) | |
9 | ||
10 | const ( | |
11 | N = 1e6 | |
12 | Mu = 10 | |
13 | Sigma = 3 | |
14 | ||
15 | seed = 42 | |
16 | ) | |
17 | ||
18 | func main() { | |
19 | // Generate uniform and normal data | |
20 | uniform := rand.New(rand.NewSource(seed)) | |
21 | dist := distuv.Normal{ | |
22 | Mu: Mu, | |
23 | Sigma: Sigma, | |
24 | Source: rand.New(rand.NewSource(seed)), | |
25 | } | |
26 | ||
27 | uniformData := make([]float64, N) | |
28 | normalData := make([]float64, N) | |
29 | for i := range normalData { | |
30 | normalData[i] = dist.Rand() | |
31 | uniformData[i] = uniform.Float64() * 100 | |
32 | } | |
33 | ||
34 | smallData := []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1} | |
35 | ||
36 | writeData("uniform.dat", uniformData) | |
37 | writeData("normal.dat", normalData) | |
38 | writeData("small.dat", smallData) | |
39 | } | |
40 | ||
41 | func writeData(name string, data []float64) { | |
42 | f, err := os.Create(name) | |
43 | if err != nil { | |
44 | panic(err) | |
45 | } | |
46 | defer f.Close() | |
47 | ||
48 | buf := make([]byte, 0, 64) | |
49 | for _, x := range data { | |
50 | buf = strconv.AppendFloat(buf, x, 'f', -1, 64) | |
51 | _, err := f.Write(buf) | |
52 | if err != nil { | |
53 | panic(err) | |
54 | } | |
55 | _, err = f.Write([]byte{'\n'}) | |
56 | if err != nil { | |
57 | panic(err) | |
58 | } | |
59 | buf = buf[0:0] | |
60 | } | |
61 | } |
4 | 4 | DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) |
5 | 5 | cd "$DIR" |
6 | 6 | |
7 | go run gen.go | |
7 | go run gen/main.go | |
8 | 8 | go run main.go |
9 | g++ -o cpp.test main.cpp | |
9 | g++ -std=c++11 -o cpp.test main.cpp | |
10 | 10 | ./cpp.test 2>/dev/null |
11 | 11 | rm cpp.test |
12 | 12 | |
13 | go run validate.go | |
13 | go run validate/main.go |
0 | package main | |
1 | ||
2 | import ( | |
3 | "bufio" | |
4 | "log" | |
5 | "math" | |
6 | "os" | |
7 | "strconv" | |
8 | "strings" | |
9 | ) | |
10 | ||
11 | var dataFiles = []string{ | |
12 | "small.dat", | |
13 | "uniform.dat", | |
14 | "normal.dat", | |
15 | } | |
16 | ||
17 | const ( | |
18 | cppQExt = ".cpp.quantiles" | |
19 | goQExt = ".go.quantiles" | |
20 | ||
21 | cppCDFExt = ".cpp.cdfs" | |
22 | goCDFExt = ".go.cdfs" | |
23 | ||
24 | epsilon = 1e-6 | |
25 | ) | |
26 | ||
27 | func main() { | |
28 | for _, f := range dataFiles { | |
29 | // Validate Quantiles | |
30 | cppQuantiles := loadResults(f + cppQExt) | |
31 | goQuantiles := loadResults(f + goQExt) | |
32 | if len(cppQuantiles) != len(goQuantiles) { | |
33 | log.Fatal("differing number of quantiles results") | |
34 | } | |
35 | ||
36 | for i := range cppQuantiles { | |
37 | if math.Abs(cppQuantiles[i]-goQuantiles[i]) > epsilon { | |
38 | log.Fatalf("differing quantile result go: %f cpp: %f", goQuantiles[i], cppQuantiles[i]) | |
39 | } | |
40 | } | |
41 | ||
42 | // Validate CDFs | |
43 | cppCDFs := loadResults(f + cppCDFExt) | |
44 | goCDFs := loadResults(f + goCDFExt) | |
45 | if len(cppCDFs) != len(goCDFs) { | |
46 | log.Fatal("differing number of CDFs results") | |
47 | } | |
48 | ||
49 | for i := range cppCDFs { | |
50 | if math.Abs(cppCDFs[i]-goCDFs[i]) > epsilon { | |
51 | log.Fatalf("differing CDF result go: %f cpp: %f", goCDFs[i], cppCDFs[i]) | |
52 | } | |
53 | } | |
54 | } | |
55 | } | |
56 | ||
57 | func loadResults(name string) []float64 { | |
58 | f, err := os.Open(name) | |
59 | if err != nil { | |
60 | panic(err) | |
61 | } | |
62 | defer f.Close() | |
63 | s := bufio.NewScanner(f) | |
64 | var data []float64 | |
65 | for s.Scan() { | |
66 | parts := strings.SplitN(s.Text(), " ", 2) | |
67 | x, err := strconv.ParseFloat(parts[0], 64) | |
68 | if err != nil { | |
69 | panic(err) | |
70 | } | |
71 | data = append(data, x) | |
72 | } | |
73 | return data | |
74 | } |
0 | package main | |
1 | ||
2 | import ( | |
3 | "bufio" | |
4 | "log" | |
5 | "math" | |
6 | "os" | |
7 | "strconv" | |
8 | "strings" | |
9 | ) | |
10 | ||
11 | var dataFiles = []string{ | |
12 | "small.dat", | |
13 | "uniform.dat", | |
14 | "normal.dat", | |
15 | } | |
16 | ||
17 | const ( | |
18 | cppQExt = ".cpp.quantiles" | |
19 | goQExt = ".go.quantiles" | |
20 | ||
21 | cppCDFExt = ".cpp.cdfs" | |
22 | goCDFExt = ".go.cdfs" | |
23 | ||
24 | epsilon = 1e-6 | |
25 | ) | |
26 | ||
27 | func main() { | |
28 | for _, f := range dataFiles { | |
29 | // Validate Quantiles | |
30 | cppQuantiles := loadResults(f + cppQExt) | |
31 | goQuantiles := loadResults(f + goQExt) | |
32 | if len(cppQuantiles) != len(goQuantiles) { | |
33 | log.Fatal("differing number of quantiles results") | |
34 | } | |
35 | ||
36 | for i := range cppQuantiles { | |
37 | if math.Abs(cppQuantiles[i]-goQuantiles[i]) > epsilon { | |
38 | log.Fatalf("differing quantile result go: %f cpp: %f", goQuantiles[i], cppQuantiles[i]) | |
39 | } | |
40 | } | |
41 | ||
42 | // Validate CDFs | |
43 | cppCDFs := loadResults(f + cppCDFExt) | |
44 | goCDFs := loadResults(f + goCDFExt) | |
45 | if len(cppCDFs) != len(goCDFs) { | |
46 | log.Fatal("differing number of CDFs results") | |
47 | } | |
48 | ||
49 | for i := range cppCDFs { | |
50 | if math.Abs(cppCDFs[i]-goCDFs[i]) > epsilon { | |
51 | log.Fatalf("differing CDF result go: %f cpp: %f", goCDFs[i], cppCDFs[i]) | |
52 | } | |
53 | } | |
54 | } | |
55 | } | |
56 | ||
57 | func loadResults(name string) []float64 { | |
58 | f, err := os.Open(name) | |
59 | if err != nil { | |
60 | panic(err) | |
61 | } | |
62 | defer f.Close() | |
63 | s := bufio.NewScanner(f) | |
64 | var data []float64 | |
65 | for s.Scan() { | |
66 | parts := strings.SplitN(s.Text(), " ", 2) | |
67 | x, err := strconv.ParseFloat(parts[0], 64) | |
68 | if err != nil { | |
69 | panic(err) | |
70 | } | |
71 | data = append(data, x) | |
72 | } | |
73 | return data | |
74 | } |