0 | 0 |
/*********************************************************************
|
1 | 1 |
Blosc - Blocked Shuffling and Compression Library
|
2 | 2 |
|
3 | |
Author: Francesc Alted <francesc@blosc.org>
|
4 | |
Creation date: 2009-05-20
|
|
3 |
Copyright (C) 2021 The Blosc Developers <blosc@blosc.org>
|
|
4 |
https://blosc.org
|
|
5 |
License: BSD 3-Clause (see LICENSE.txt)
|
5 | 6 |
|
6 | 7 |
See LICENSE.txt for details about copyright and rights to use.
|
7 | 8 |
**********************************************************************/
|
|
42 | 43 |
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
|
43 | 44 |
|
44 | 45 |
#ifdef BLOSC_STRICT_ALIGN
|
45 | |
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
|
|
46 |
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
|
46 | 47 |
#define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
|
47 | 48 |
#else
|
48 | |
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
|
49 | |
#define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
|
50 | |
#endif
|
51 | |
|
52 | |
#define HASH_LOG (12U)
|
|
49 |
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
|
|
50 |
#define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
|
|
51 |
#endif
|
|
52 |
|
|
53 |
#define HASH_LOG (14U)
|
|
54 |
#define HASH_LOG2 (12U)
|
53 | 55 |
|
54 | 56 |
// This is used in LZ4 and seems to work pretty well here too
|
55 | 57 |
#define HASH_FUNCTION(v, s, h) { \
|
|
60 | 62 |
#if defined(__AVX2__)
|
61 | 63 |
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
62 | 64 |
uint8_t x = ip[-1];
|
63 | |
/* safe because the outer check against ip limit */
|
64 | |
if (ip < (ip_bound - sizeof(int64_t))) {
|
65 | |
int64_t value, value2;
|
66 | |
/* Broadcast the value for every byte in a 64-bit register */
|
67 | |
memset(&value, x, 8);
|
68 | |
#if defined(BLOSC_STRICT_ALIGN)
|
69 | |
memcpy(&value2, ref, 8);
|
70 | |
#else
|
71 | |
value2 = ((int64_t*)ref)[0];
|
72 | |
#endif
|
73 | |
if (value != value2) {
|
74 | |
/* Return the byte that starts to differ */
|
75 | |
while (*ref++ == x) ip++;
|
76 | |
return ip;
|
77 | |
}
|
78 | |
else {
|
79 | |
ip += 8;
|
80 | |
ref += 8;
|
81 | |
}
|
82 | |
}
|
83 | |
if (ip < (ip_bound - sizeof(__m128i))) {
|
84 | |
__m128i value, value2, cmp;
|
85 | |
/* Broadcast the value for every byte in a 128-bit register */
|
86 | |
memset(&value, x, sizeof(__m128i));
|
87 | |
value2 = _mm_loadu_si128((__m128i *) ref);
|
88 | |
cmp = _mm_cmpeq_epi32(value, value2);
|
89 | |
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
90 | |
/* Return the byte that starts to differ */
|
91 | |
while (*ref++ == x) ip++;
|
92 | |
return ip;
|
93 | |
} else {
|
94 | |
ip += sizeof(__m128i);
|
95 | |
ref += sizeof(__m128i);
|
96 | |
}
|
97 | |
}
|
|
65 |
|
98 | 66 |
while (ip < (ip_bound - (sizeof(__m256i)))) {
|
99 | 67 |
__m256i value, value2, cmp;
|
100 | 68 |
/* Broadcast the value for every byte in a 256-bit register */
|
|
115 | 83 |
while ((ip < ip_bound) && (*ref++ == x)) ip++;
|
116 | 84 |
return ip;
|
117 | 85 |
}
|
118 | |
|
119 | |
#elif defined(__SSE2__)
|
120 | |
|
|
86 |
#endif
|
|
87 |
|
|
88 |
#if defined(__SSE2__)
|
121 | 89 |
static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
122 | 90 |
uint8_t x = ip[-1];
|
123 | 91 |
|
124 | |
if (ip < (ip_bound - sizeof(int64_t))) {
|
125 | |
int64_t value, value2;
|
126 | |
/* Broadcast the value for every byte in a 64-bit register */
|
127 | |
memset(&value, x, 8);
|
128 | |
#if defined(BLOSC_STRICT_ALIGN)
|
129 | |
memcpy(&value2, ref, 8);
|
130 | |
#else
|
131 | |
value2 = ((int64_t*)ref)[0];
|
132 | |
#endif
|
133 | |
if (value != value2) {
|
134 | |
/* Return the byte that starts to differ */
|
135 | |
while (*ref++ == x) ip++;
|
136 | |
return ip;
|
137 | |
}
|
138 | |
else {
|
139 | |
ip += 8;
|
140 | |
ref += 8;
|
141 | |
}
|
142 | |
}
|
143 | |
/* safe because the outer check against ip limit */
|
144 | 92 |
while (ip < (ip_bound - sizeof(__m128i))) {
|
145 | 93 |
__m128i value, value2, cmp;
|
146 | 94 |
/* Broadcast the value for every byte in a 128-bit register */
|
|
162 | 110 |
return ip;
|
163 | 111 |
}
|
164 | 112 |
|
165 | |
#else
|
|
113 |
#endif
|
|
114 |
|
166 | 115 |
|
167 | 116 |
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
168 | 117 |
uint8_t x = ip[-1];
|
|
191 | 140 |
return ip;
|
192 | 141 |
}
|
193 | 142 |
|
194 | |
#endif
|
195 | |
|
196 | 143 |
|
197 | 144 |
/* Return the byte that starts to differ */
|
198 | 145 |
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
219 | 166 |
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
220 | 167 |
__m128i value, value2, cmp;
|
221 | 168 |
|
222 | |
if (ip < (ip_bound - sizeof(int64_t))) {
|
223 | |
if (*(int64_t *) ref != *(int64_t *) ip) {
|
224 | |
/* Return the byte that starts to differ */
|
225 | |
while (*ref++ == *ip++) {}
|
226 | |
return ip;
|
227 | |
} else {
|
228 | |
ip += sizeof(int64_t);
|
229 | |
ref += sizeof(int64_t);
|
230 | |
}
|
231 | |
}
|
232 | 169 |
while (ip < (ip_bound - sizeof(__m128i))) {
|
233 | 170 |
value = _mm_loadu_si128((__m128i *) ip);
|
234 | 171 |
value2 = _mm_loadu_si128((__m128i *) ref);
|
235 | 172 |
cmp = _mm_cmpeq_epi32(value, value2);
|
236 | 173 |
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
237 | 174 |
/* Return the byte that starts to differ */
|
238 | |
return get_match(ip, ip_bound, ref);
|
|
175 |
while (*ref++ == *ip++) {}
|
|
176 |
return ip;
|
239 | 177 |
}
|
240 | 178 |
else {
|
241 | 179 |
ip += sizeof(__m128i);
|
|
252 | 190 |
#if defined(__AVX2__)
|
253 | 191 |
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
254 | 192 |
|
255 | |
if (ip < (ip_bound - sizeof(int64_t))) {
|
256 | |
if (*(int64_t *) ref != *(int64_t *) ip) {
|
257 | |
/* Return the byte that starts to differ */
|
258 | |
while (*ref++ == *ip++) {}
|
259 | |
return ip;
|
260 | |
} else {
|
261 | |
ip += sizeof(int64_t);
|
262 | |
ref += sizeof(int64_t);
|
263 | |
}
|
264 | |
}
|
265 | |
if (ip < (ip_bound - sizeof(__m128i))) {
|
266 | |
__m128i value, value2, cmp;
|
267 | |
value = _mm_loadu_si128((__m128i *) ip);
|
268 | |
value2 = _mm_loadu_si128((__m128i *) ref);
|
269 | |
cmp = _mm_cmpeq_epi32(value, value2);
|
270 | |
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
271 | |
/* Return the byte that starts to differ */
|
272 | |
return get_match_16(ip, ip_bound, ref);
|
273 | |
}
|
274 | |
else {
|
275 | |
ip += sizeof(__m128i);
|
276 | |
ref += sizeof(__m128i);
|
277 | |
}
|
278 | |
}
|
279 | 193 |
while (ip < (ip_bound - sizeof(__m256i))) {
|
280 | 194 |
__m256i value, value2, cmp;
|
281 | 195 |
value = _mm256_loadu_si256((__m256i *) ip);
|
|
298 | 212 |
#endif
|
299 | 213 |
|
300 | 214 |
|
301 | |
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
|
|
215 |
static uint8_t* get_run_or_match(uint8_t* ip, const uint8_t* ip_bound, const uint8_t* ref, bool run) {
|
302 | 216 |
if (BLOSCLZ_UNLIKELY(run)) {
|
303 | 217 |
#if defined(__AVX2__)
|
304 | |
ip = get_run_32(ip, ip_bound, ref);
|
|
218 |
// Extensive experiments on AMD Ryzen3 say that regular get_run is faster
|
|
219 |
// ip = get_run_32(ip, ip_bound, ref);
|
|
220 |
ip = get_run(ip, ip_bound, ref);
|
305 | 221 |
#elif defined(__SSE2__)
|
306 | |
ip = get_run_16(ip, ip_bound, ref);
|
|
222 |
// Extensive experiments on AMD Ryzen3 say that regular get_run is faster
|
|
223 |
// ip = get_run_16(ip, ip_bound, ref);
|
|
224 |
ip = get_run(ip, ip_bound, ref);
|
307 | 225 |
#else
|
308 | 226 |
ip = get_run(ip, ip_bound, ref);
|
309 | 227 |
#endif
|
310 | 228 |
}
|
311 | 229 |
else {
|
312 | 230 |
#if defined(__AVX2__)
|
313 | |
ip = get_match_32(ip, ip_bound, ref);
|
|
231 |
// Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
|
|
232 |
// ip = get_match_32(ip, ip_bound, ref);
|
|
233 |
ip = get_match_16(ip, ip_bound, ref);
|
314 | 234 |
#elif defined(__SSE2__)
|
315 | 235 |
ip = get_match_16(ip, ip_bound, ref);
|
316 | 236 |
#else
|
|
334 | 254 |
} \
|
335 | 255 |
}
|
336 | 256 |
|
337 | |
#define LITERAL2(ip, oc, anchor, copy) { \
|
|
257 |
#define LITERAL2(ip, anchor, copy) { \
|
338 | 258 |
oc++; anchor++; \
|
339 | 259 |
ip = anchor; \
|
340 | 260 |
copy++; \
|
|
344 | 264 |
} \
|
345 | 265 |
}
|
346 | 266 |
|
347 | |
#define DISTANCE_SHORT(op, op_limit, len, distance) { \
|
|
267 |
#define MATCH_SHORT(op, op_limit, len, distance) { \
|
348 | 268 |
if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) \
|
349 | 269 |
goto out; \
|
350 | 270 |
*op++ = (uint8_t)((len << 5U) + (distance >> 8U)); \
|
351 | 271 |
*op++ = (uint8_t)((distance & 255U)); \
|
352 | 272 |
}
|
353 | 273 |
|
354 | |
#define DISTANCE_LONG(op, op_limit, len, distance) { \
|
|
274 |
#define MATCH_LONG(op, op_limit, len, distance) { \
|
355 | 275 |
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) \
|
356 | 276 |
goto out; \
|
357 | 277 |
*op++ = (uint8_t)((7U << 5U) + (distance >> 8U)); \
|
|
366 | 286 |
*op++ = (uint8_t)((distance & 255U)); \
|
367 | 287 |
}
|
368 | 288 |
|
369 | |
#define DISTANCE_SHORT_FAR(op, op_limit, len, distance) { \
|
|
289 |
#define MATCH_SHORT_FAR(op, op_limit, len, distance) { \
|
370 | 290 |
if (BLOSCLZ_UNLIKELY(op + 4 > op_limit)) \
|
371 | 291 |
goto out; \
|
372 | 292 |
*op++ = (uint8_t)((len << 5U) + 31); \
|
|
375 | 295 |
*op++ = (uint8_t)(distance & 255U); \
|
376 | 296 |
}
|
377 | 297 |
|
378 | |
#define DISTANCE_LONG_FAR(op, op_limit, len, distance) { \
|
|
298 |
#define MATCH_LONG_FAR(op, op_limit, len, distance) { \
|
379 | 299 |
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) \
|
380 | 300 |
goto out; \
|
381 | 301 |
*op++ = (7U << 5U) + 31; \
|
|
393 | 313 |
}
|
394 | 314 |
|
395 | 315 |
|
396 | |
// Get the compressed size of a buffer. Useful for testing compression ratios for high clevels.
|
397 | |
static int get_csize(uint8_t* ibase, int maxlen, bool force_3b_shift) {
|
|
316 |
// Get a guess for the compressed size of a buffer
|
|
317 |
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
|
398 | 318 |
uint8_t* ip = ibase;
|
399 | 319 |
int32_t oc = 0;
|
400 | |
uint8_t* ip_bound = ibase + maxlen - 1;
|
401 | |
uint8_t* ip_limit = ibase + maxlen - 12;
|
402 | |
uint32_t htab[1U << (uint8_t)HASH_LOG];
|
|
320 |
const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
|
|
321 |
uint16_t htab[1U << (uint8_t)HASH_LOG2];
|
403 | 322 |
uint32_t hval;
|
404 | 323 |
uint32_t seq;
|
405 | 324 |
uint8_t copy;
|
|
325 |
// Make a tradeoff between testing too much and too little
|
|
326 |
uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
|
|
327 |
uint8_t* ip_bound = ibase + limit - 1;
|
|
328 |
uint8_t* ip_limit = ibase + limit - 12;
|
406 | 329 |
|
407 | 330 |
// Initialize the hash table to distances of 0
|
408 | |
for (unsigned i = 0; i < (1U << HASH_LOG); i++) {
|
409 | |
htab[i] = 0;
|
410 | |
}
|
|
331 |
memset(htab, 0, hashlen * sizeof(uint16_t));
|
411 | 332 |
|
412 | 333 |
/* we start with literal copy */
|
413 | 334 |
copy = 4;
|
|
421 | 342 |
|
422 | 343 |
/* find potential match */
|
423 | 344 |
seq = BLOSCLZ_READU32(ip);
|
424 | |
HASH_FUNCTION(hval, seq, HASH_LOG)
|
|
345 |
HASH_FUNCTION(hval, seq, HASH_LOG2)
|
425 | 346 |
ref = ibase + htab[hval];
|
426 | 347 |
|
427 | 348 |
/* calculate distance to the match */
|
428 | |
distance = anchor - ref;
|
|
349 |
distance = (unsigned int)(anchor - ref);
|
429 | 350 |
|
430 | 351 |
/* update hash table */
|
431 | |
htab[hval] = (uint32_t) (anchor - ibase);
|
|
352 |
htab[hval] = (uint16_t) (anchor - ibase);
|
432 | 353 |
|
433 | 354 |
if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
|
434 | |
LITERAL2(ip, oc, anchor, copy)
|
|
355 |
LITERAL2(ip, anchor, copy)
|
435 | 356 |
continue;
|
436 | 357 |
}
|
437 | 358 |
|
438 | 359 |
/* is this a match? check the first 4 bytes */
|
439 | |
if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
|
|
360 |
if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
|
440 | 361 |
ref += 4;
|
441 | 362 |
}
|
442 | 363 |
else {
|
443 | 364 |
/* no luck, copy as a literal */
|
444 | |
LITERAL2(ip, oc, anchor, copy)
|
|
365 |
LITERAL2(ip, anchor, copy)
|
445 | 366 |
continue;
|
446 | 367 |
}
|
447 | 368 |
|
|
454 | 375 |
/* get runs or matches; zero distance means a run */
|
455 | 376 |
ip = get_run_or_match(ip, ip_bound, ref, !distance);
|
456 | 377 |
|
457 | |
ip -= force_3b_shift ? 3 : 4;
|
|
378 |
ip -= ipshift;
|
458 | 379 |
unsigned len = (int)(ip - anchor);
|
459 | |
// If match is close, let's reduce the minimum length to encode it
|
460 | |
unsigned minlen = (distance < MAX_DISTANCE) ? 3 : 4;
|
461 | |
// Encoding short lengths is expensive during decompression
|
462 | 380 |
if (len < minlen) {
|
463 | |
LITERAL2(ip, oc, anchor, copy)
|
|
381 |
LITERAL2(ip, anchor, copy)
|
464 | 382 |
continue;
|
465 | 383 |
}
|
466 | 384 |
|
467 | |
/* if we haven't copied anything, adjust the output counter */
|
|
385 |
/* if we haven't copied anything, adjust the output counter */
|
468 | 386 |
if (!copy)
|
469 | 387 |
oc--;
|
470 | 388 |
/* reset literal counter */
|
|
487 | 405 |
|
488 | 406 |
/* update the hash at match boundary */
|
489 | 407 |
seq = BLOSCLZ_READU32(ip);
|
490 | |
HASH_FUNCTION(hval, seq, HASH_LOG)
|
491 | |
htab[hval] = (uint32_t) (ip++ - ibase);
|
492 | |
seq >>= 8U;
|
493 | |
HASH_FUNCTION(hval, seq, HASH_LOG)
|
494 | |
htab[hval] = (uint32_t) (ip++ - ibase);
|
|
408 |
HASH_FUNCTION(hval, seq, HASH_LOG2)
|
|
409 |
htab[hval] = (uint16_t)(ip++ - ibase);
|
|
410 |
ip++;
|
495 | 411 |
/* assuming literal copy */
|
496 | 412 |
oc++;
|
497 | |
|
498 | |
}
|
499 | |
|
500 | |
/* if we have copied something, adjust the copy length */
|
501 | |
if (!copy)
|
502 | |
oc--;
|
503 | |
|
504 | |
return (int)oc;
|
|
413 |
}
|
|
414 |
|
|
415 |
double ic;
|
|
416 |
ic = (double)(ip - ibase);
|
|
417 |
return ic / (double)oc;
|
505 | 418 |
}
|
506 | 419 |
|
507 | 420 |
|
508 | 421 |
int blosclz_compress(const int clevel, const void* input, int length,
|
509 | |
void* output, int maxout) {
|
|
422 |
void* output, int maxout, const int split_block) {
|
510 | 423 |
uint8_t* ibase = (uint8_t*)input;
|
511 | |
uint8_t* ip = ibase;
|
512 | |
uint8_t* ip_bound = ibase + length - 1;
|
513 | |
uint8_t* ip_limit = ibase + length - 12;
|
514 | |
uint8_t* op = (uint8_t*)output;
|
515 | |
uint8_t* op_limit;
|
516 | |
uint32_t htab[1U << (uint8_t)HASH_LOG];
|
517 | |
uint32_t hval;
|
518 | |
uint32_t seq;
|
519 | |
uint8_t copy;
|
520 | |
|
521 | |
op_limit = op + maxout;
|
522 | |
|
523 | |
// Minimum lengths for encoding
|
524 | |
unsigned minlen_[10] = {0, 12, 12, 11, 10, 9, 8, 7, 6, 5};
|
525 | |
|
526 | |
// Minimum compression ratios for initiating encoding
|
527 | |
double cratio_[10] = {0, 2, 2, 2, 2, 1.8, 1.6, 1.4, 1.2, 1.1};
|
528 | |
|
529 | |
uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
|
530 | |
HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
|
531 | |
uint8_t hashlog = hashlog_[clevel];
|
532 | |
// Initialize the hash table to distances of 0
|
533 | |
for (unsigned i = 0; i < (1U << hashlog); i++) {
|
534 | |
htab[i] = 0;
|
535 | |
}
|
536 | |
|
537 | |
/* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
|
538 | |
if (length < 16 || maxout < 66) {
|
539 | |
return 0;
|
|
424 |
|
|
425 |
// Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
|
|
426 |
int maxlen = length / 4;
|
|
427 |
// Start probing somewhere inside the buffer
|
|
428 |
int shift = length - maxlen;
|
|
429 |
// Actual entropy probing!
|
|
430 |
double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
|
|
431 |
// discard probes with small compression ratios (too expensive)
|
|
432 |
double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
|
|
433 |
if (cratio < cratio_[clevel]) {
|
|
434 |
goto out;
|
540 | 435 |
}
|
541 | 436 |
|
542 | 437 |
/* When we go back in a match (shift), we obtain quite different compression properties.
|
543 | 438 |
* It looks like 4 is more useful in combination with bitshuffle and small typesizes
|
544 | |
* (compress better and faster in e.g. `b2bench blosclz bitshuffle single 6 6291456 1 19`).
|
545 | |
* Fallback to 4 because it provides more consistent results on small itemsizes.
|
|
439 |
* Fallback to 4 because it provides more consistent results for large cratios.
|
546 | 440 |
*
|
547 | 441 |
* In this block we also check cratios for the beginning of the buffers and
|
548 | 442 |
* eventually discard those that are small (take too long to decompress).
|
549 | 443 |
* This process is called _entropy probing_.
|
550 | 444 |
*/
|
551 | |
int ipshift = 4;
|
552 | |
int maxlen; // maximum length for entropy probing
|
553 | |
int csize_3b;
|
554 | |
int csize_4b;
|
555 | |
double cratio = 0;
|
556 | |
switch (clevel) {
|
557 | |
case 1:
|
558 | |
case 2:
|
559 | |
case 3:
|
560 | |
maxlen = length / 8;
|
561 | |
csize_4b = get_csize(ibase, maxlen, false);
|
562 | |
cratio = (double)maxlen / csize_4b;
|
563 | |
break;
|
564 | |
case 4:
|
565 | |
case 5:
|
566 | |
case 6:
|
567 | |
case 7:
|
568 | |
case 8:
|
569 | |
maxlen = length / 8;
|
570 | |
csize_4b = get_csize(ibase, maxlen, false);
|
571 | |
cratio = (double)maxlen / csize_4b;
|
572 | |
break;
|
573 | |
case 9:
|
574 | |
// case 9 is special: we need to assess the optimal shift
|
575 | |
maxlen = length / 8;
|
576 | |
csize_3b = get_csize(ibase, maxlen, true);
|
577 | |
csize_4b = get_csize(ibase, maxlen, false);
|
578 | |
ipshift = (csize_3b < csize_4b) ? 3 : 4;
|
579 | |
cratio = (csize_3b < csize_4b) ? ((double)maxlen / csize_3b) : ((double)maxlen / csize_4b);
|
580 | |
break;
|
581 | |
default:
|
582 | |
break;
|
583 | |
}
|
584 | |
// discard probes with small compression ratios (too expensive)
|
585 | |
if (cratio < cratio_ [clevel]) {
|
586 | |
goto out;
|
587 | |
}
|
|
445 |
unsigned ipshift = 4;
|
|
446 |
// Compute optimal shift and minimum lengths for encoding
|
|
447 |
// Use 4 by default, except for low entropy data, where we should do a best effort
|
|
448 |
unsigned minlen = 4;
|
|
449 |
// BloscLZ works better with splits mostly, so when data is not split, do a best effort
|
|
450 |
// The choice of cratio < 4 is based on experiments with low and high entropy
|
|
451 |
if (!split_block || cratio < 4) {
|
|
452 |
ipshift = 3;
|
|
453 |
minlen = 3;
|
|
454 |
}
|
|
455 |
|
|
456 |
uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
|
|
457 |
HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
|
|
458 |
uint8_t hashlog = hashlog_[clevel];
|
|
459 |
|
|
460 |
uint8_t* ip = ibase;
|
|
461 |
const uint8_t* ip_bound = ibase + length - 1;
|
|
462 |
const uint8_t* ip_limit = ibase + length - 12;
|
|
463 |
uint8_t* op = (uint8_t*)output;
|
|
464 |
const uint8_t* op_limit = op + maxout;
|
|
465 |
|
|
466 |
/* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
|
|
467 |
if (length < 16 || maxout < 66) {
|
|
468 |
return 0;
|
|
469 |
}
|
|
470 |
|
|
471 |
// Initialize the hash table
|
|
472 |
uint32_t htab[1U << (uint8_t)HASH_LOG];
|
|
473 |
memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
|
588 | 474 |
|
589 | 475 |
/* we start with literal copy */
|
590 | |
copy = 4;
|
|
476 |
uint8_t copy = 4;
|
591 | 477 |
*op++ = MAX_COPY - 1;
|
592 | 478 |
*op++ = *ip++;
|
593 | 479 |
*op++ = *ip++;
|
|
601 | 487 |
uint8_t* anchor = ip; /* comparison starting-point */
|
602 | 488 |
|
603 | 489 |
/* find potential match */
|
604 | |
seq = BLOSCLZ_READU32(ip);
|
|
490 |
uint32_t seq = BLOSCLZ_READU32(ip);
|
|
491 |
uint32_t hval;
|
605 | 492 |
HASH_FUNCTION(hval, seq, hashlog)
|
606 | 493 |
ref = ibase + htab[hval];
|
607 | 494 |
|
608 | 495 |
/* calculate distance to the match */
|
609 | |
distance = anchor - ref;
|
|
496 |
distance = (unsigned int)(anchor - ref);
|
610 | 497 |
|
611 | 498 |
/* update hash table */
|
612 | 499 |
htab[hval] = (uint32_t) (anchor - ibase);
|
|
638 | 525 |
ip -= ipshift;
|
639 | 526 |
|
640 | 527 |
unsigned len = (int)(ip - anchor);
|
641 | |
// If match is close, let's reduce the minimum length to encode it
|
642 | |
unsigned minlen = (clevel == 9) ? ipshift : minlen_[clevel];
|
643 | 528 |
|
644 | 529 |
// Encoding short lengths is expensive during decompression
|
645 | |
// Encode only for reasonable lengths (extensive experiments done)
|
646 | 530 |
if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
|
647 | 531 |
LITERAL(ip, op, op_limit, anchor, copy)
|
648 | 532 |
continue;
|
|
661 | 545 |
/* encode the match */
|
662 | 546 |
if (distance < MAX_DISTANCE) {
|
663 | 547 |
if (len < 7) {
|
664 | |
DISTANCE_SHORT(op, op_limit, len, distance)
|
|
548 |
MATCH_SHORT(op, op_limit, len, distance)
|
665 | 549 |
} else {
|
666 | |
DISTANCE_LONG(op, op_limit, len, distance)
|
|
550 |
MATCH_LONG(op, op_limit, len, distance)
|
667 | 551 |
}
|
668 | 552 |
} else {
|
669 | 553 |
/* far away, but not yet in another galaxy... */
|
670 | 554 |
distance -= MAX_DISTANCE;
|
671 | 555 |
if (len < 7) {
|
672 | |
DISTANCE_SHORT_FAR(op, op_limit, len, distance)
|
|
556 |
MATCH_SHORT_FAR(op, op_limit, len, distance)
|
673 | 557 |
} else {
|
674 | |
DISTANCE_LONG_FAR(op, op_limit, len, distance)
|
|
558 |
MATCH_LONG_FAR(op, op_limit, len, distance)
|
675 | 559 |
}
|
676 | 560 |
}
|
677 | 561 |
|
|
679 | 563 |
seq = BLOSCLZ_READU32(ip);
|
680 | 564 |
HASH_FUNCTION(hval, seq, hashlog)
|
681 | 565 |
htab[hval] = (uint32_t) (ip++ - ibase);
|
682 | |
seq >>= 8U;
|
683 | |
HASH_FUNCTION(hval, seq, hashlog)
|
684 | |
htab[hval] = (uint32_t) (ip++ - ibase);
|
685 | |
/* assuming literal copy */
|
|
566 |
if (clevel == 9) {
|
|
567 |
// In some situations, including a second hash proves to be useful,
|
|
568 |
// but not in others. Activating here in max clevel only.
|
|
569 |
seq >>= 8U;
|
|
570 |
HASH_FUNCTION(hval, seq, hashlog)
|
|
571 |
htab[hval] = (uint32_t) (ip++ - ibase);
|
|
572 |
}
|
|
573 |
else {
|
|
574 |
ip++;
|
|
575 |
}
|
686 | 576 |
|
687 | 577 |
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
|
688 | 578 |
goto out;
|
|
579 |
|
|
580 |
/* assuming literal copy */
|
689 | 581 |
*op++ = MAX_COPY - 1;
|
690 | 582 |
}
|
691 | 583 |
|
|
716 | 608 |
}
|
717 | 609 |
|
718 | 610 |
// See https://habr.com/en/company/yandex/blog/457612/
|
719 | |
#ifdef __AVX2__
|
|
611 |
#if defined(__AVX2__)
|
720 | 612 |
|
721 | 613 |
#if defined(_MSC_VER)
|
722 | 614 |
#define ALIGNED_(x) __declspec(align(x))
|
|
852 | 744 |
}
|
853 | 745 |
else {
|
854 | 746 |
// general copy with any overlap
|
855 | |
#ifdef __AVX2__
|
|
747 |
#if defined(__AVX2__)
|
856 | 748 |
if (op - ref <= 16) {
|
857 | 749 |
// This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
|
858 | 750 |
// it is not slower either. Let's activate here for experimentation.
|
|
860 | 752 |
}
|
861 | 753 |
else {
|
862 | 754 |
#endif
|
863 | |
op = copy_match(op, ref, (unsigned) len);
|
864 | |
#ifdef __AVX2__
|
|
755 |
op = copy_match(op, ref, (unsigned) len);
|
|
756 |
#if defined(__AVX2__)
|
865 | 757 |
}
|
866 | 758 |
#endif
|
867 | 759 |
}
|