43 | 43 |
/* Upper bound for a "far" match distance: 65535 on top of MAX_DISTANCE - 1.
 * MAX_DISTANCE is defined elsewhere in this file.
 * NOTE(review): 65535 presumably reflects the two distance bytes emitted by
 * the *_FAR encoders -- confirm against the format description. */
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
|
44 | 44 |
|
45 | 45 |
/*
 * Load a 16-bit / 32-bit value starting at the byte pointer `p`.
 *
 * With BLOSC_STRICT_ALIGN defined, the value is assembled from individual
 * bytes, least-significant byte first.  Otherwise `p` is reinterpreted as a
 * uint16_t / uint32_t pointer and dereferenced directly, which reads the
 * value in the host's byte order.
 */
#ifdef BLOSC_STRICT_ALIGN
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
  /* Each byte is widened to uint32_t before shifting: a uint8_t promotes to
   * (signed) int, and shifting a value >= 0x80 left by 24 would overflow it. */
  #define BLOSCLZ_READU32(p) ((uint32_t)(p)[0] | (uint32_t)(p)[1]<<8 | \
                              (uint32_t)(p)[2]<<16 | (uint32_t)(p)[3]<<24)
#else
  /* Whole expansion parenthesized so it composes safely in any expression. */
  #define BLOSCLZ_READU16(p) (*(const uint16_t*)(p))
  #define BLOSCLZ_READU32(p) (*(const uint32_t*)(p))
#endif
|
52 | 52 |
|
53 | 53 |
/* Base-2 logarithm used for the hash table size: the per-level `hashlog_`
 * table is derived from it, and the table is cleared as (1U << hashlog)
 * 32-bit entries. */
#define HASH_LOG (14U)
|
|
55 | 55 |
|
56 | 56 |
// This is used in LZ4 and seems to work pretty well here too
|
57 | 57 |
/*
 * HASH_FUNCTION(v, s, h): multiplicative hash.
 *
 *   v  lvalue that receives the hash value
 *   s  value to hash (for a 32-bit unsigned `s` the product wraps modulo 2^32)
 *   h  number of hash bits to keep; the product is shifted right by 32 - h
 *
 * Every argument is parenthesized so that expressions such as `seq + 1` or
 * `HASH_LOG - 2` are hashed as a whole.  The expansion is a brace block and
 * is meant to be used as a statement without a trailing semicolon.
 */
#define HASH_FUNCTION(v, s, h) {                          \
  (v) = ((s) * 2654435761U) >> (32U - (h));               \
}
|
60 | 60 |
|
61 | 61 |
|
|
86 | 86 |
#endif
|
87 | 87 |
|
88 | 88 |
#if defined(__SSE2__)
|
89 | |
static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
89 |
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
90 | 90 |
uint8_t x = ip[-1];
|
91 | 91 |
|
92 | 92 |
while (ip < (ip_bound - sizeof(__m128i))) {
|
|
142 | 142 |
|
143 | 143 |
|
144 | 144 |
/* Return the byte that starts to differ */
|
145 | |
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
145 |
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
146 | 146 |
#if !defined(BLOSC_STRICT_ALIGN)
|
147 | 147 |
while (ip < (ip_bound - sizeof(int64_t))) {
|
148 | 148 |
if (*(int64_t*)ref != *(int64_t*)ip) {
|
|
212 | 212 |
#endif
|
213 | 213 |
|
214 | 214 |
|
215 | |
static uint8_t* get_run_or_match(uint8_t* ip, const uint8_t* ip_bound, const uint8_t* ref, bool run) {
|
|
215 |
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
|
216 | 216 |
if (BLOSCLZ_UNLIKELY(run)) {
|
217 | 217 |
#if defined(__AVX2__)
|
218 | 218 |
// Extensive experiments on AMD Ryzen3 say that regular get_run is faster
|
|
243 | 243 |
|
244 | 244 |
|
245 | 245 |
/*
 * LITERAL(ip, op, op_limit, anchor, copy): emit one literal byte.
 *
 *   - jumps to the `out` label of the enclosing function when fewer than
 *     2 bytes remain before `op_limit`;
 *   - copies *anchor to *op and advances both pointers;
 *   - resets `ip` to the advanced `anchor`;
 *   - increments the run counter `copy`; when it reaches MAX_COPY the counter
 *     is reset to 0 and a byte with value MAX_COPY-1 is written to the output
 *     (presumably the control byte of the next literal run -- confirm against
 *     the format description).
 *
 * All arguments must be lvalues (they are modified) except `op_limit`.
 * Relies on BLOSCLZ_UNLIKELY and MAX_COPY, defined elsewhere in this file.
 */
#define LITERAL(ip, op, op_limit, anchor, copy) {         \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = *(anchor)++;                                  \
  (ip) = (anchor);                                        \
  (copy)++;                                               \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {             \
    (copy) = 0;                                           \
    *(op)++ = MAX_COPY-1;                                 \
  }                                                       \
}
|
256 | 256 |
|
257 | 257 |
/*
 * LITERAL2(ip, anchor, copy): bookkeeping-only variant of a literal step.
 *
 * Nothing is written to an output buffer.  Instead the macro increments `oc`,
 * a counter that must be in scope at the expansion site, advances `anchor`
 * by one byte and resets `ip` to it.  The run counter `copy` is incremented;
 * when it reaches MAX_COPY it is reset to 0 and `oc` is incremented once more.
 *
 * Relies on BLOSCLZ_UNLIKELY and MAX_COPY, defined elsewhere in this file.
 */
#define LITERAL2(ip, anchor, copy) {                      \
  oc++; (anchor)++;                                       \
  (ip) = (anchor);                                        \
  (copy)++;                                               \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {             \
    (copy) = 0;                                           \
    oc++;                                                 \
  }                                                       \
}
|
266 | 266 |
|
267 | |
/*
 * MATCH_SHORT(op, op_limit, len, distance): emit a 2-byte match token.
 *
 *   byte 0: (len << 5) + (distance >> 8)
 *   byte 1: distance & 255
 *
 * Jumps to the `out` label of the enclosing function when fewer than 2 bytes
 * remain before `op_limit`.  `op` must be an lvalue; it is advanced by 2.
 * Relies on BLOSCLZ_UNLIKELY, defined elsewhere in this file.
 */
#define MATCH_SHORT(op, op_limit, len, distance) {        \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
  *(op)++ = (uint8_t)(((distance) & 255U));               \
}

/*
 * MATCH_LONG(op, op_limit, len, distance): emit a variable-length match token.
 *
 *   byte 0     : (7 << 5) + (distance >> 8)
 *   0..n bytes : 255 for every full 255 left in (len - 7)
 *   next byte  : the remaining length (< 255)
 *   last byte  : distance & 255
 *
 * `len` must be an lvalue: it is decremented in place and holds the final
 * remainder afterwards.  The available space is re-checked before every
 * write group; on shortage control jumps to the `out` label of the enclosing
 * function (bytes already written stay in the buffer).
 * Relies on BLOSCLZ_UNLIKELY, defined elsewhere in this file.
 */
#define MATCH_LONG(op, op_limit, len, distance) {         \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U));   \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {          \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
      goto out;                                           \
    *(op)++ = 255;                                        \
  }                                                       \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(len);                               \
  *(op)++ = (uint8_t)(((distance) & 255U));               \
}
|
288 | 288 |
|
289 | 289 |
/*
 * MATCH_SHORT_FAR(op, op_limit, len, distance): emit a 4-byte match token
 * carrying a two-byte distance.
 *
 *   byte 0: (len << 5) + 31
 *   byte 1: 255
 *   byte 2: distance >> 8
 *   byte 3: distance & 255
 *
 * Jumps to the `out` label of the enclosing function when fewer than 4 bytes
 * remain before `op_limit`.  `op` must be an lvalue; it is advanced by 4.
 * Relies on BLOSCLZ_UNLIKELY, defined elsewhere in this file.
 */
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {    \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                \
  *(op)++ = 255;                                          \
  *(op)++ = (uint8_t)((distance) >> 8U);                  \
  *(op)++ = (uint8_t)((distance) & 255U);                 \
}
|
297 | 297 |
|
298 | 298 |
/*
 * MATCH_LONG_FAR(op, op_limit, len, distance): emit a variable-length match
 * token carrying a two-byte distance.
 *
 *   byte 0     : (7 << 5) + 31
 *   0..n bytes : 255 for every full 255 left in (len - 7)
 *   next byte  : the remaining length (< 255)
 *   next byte  : 255
 *   last bytes : distance >> 8, then distance & 255
 *
 * `len` must be an lvalue: it is decremented in place and holds the final
 * remainder afterwards.  The available space is re-checked before every
 * write group; on shortage control jumps to the `out` label of the enclosing
 * function (bytes already written stay in the buffer).
 * Relies on BLOSCLZ_UNLIKELY, defined elsewhere in this file.
 */
#define MATCH_LONG_FAR(op, op_limit, len, distance) {     \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (7U << 5U) + 31;                              \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {          \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
      goto out;                                           \
    *(op)++ = 255;                                        \
  }                                                       \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(len);                               \
  *(op)++ = 255;                                          \
  *(op)++ = (uint8_t)((distance) >> 8U);                  \
  *(op)++ = (uint8_t)((distance) & 255U);                 \
}
|
314 | 314 |
|
315 | 315 |
|
|
376 | 376 |
ip = get_run_or_match(ip, ip_bound, ref, !distance);
|
377 | 377 |
|
378 | 378 |
ip -= ipshift;
|
379 | |
unsigned len = (int)(ip - anchor);
|
|
379 |
int len = (int)(ip - anchor);
|
380 | 380 |
if (len < minlen) {
|
381 | 381 |
LITERAL2(ip, anchor, copy)
|
382 | 382 |
continue;
|
|
412 | 412 |
oc++;
|
413 | 413 |
}
|
414 | 414 |
|
415 | |
double ic;
|
416 | |
ic = (double)(ip - ibase);
|
|
415 |
double ic = (double)(ip - ibase);
|
417 | 416 |
return ic / (double)oc;
|
418 | 417 |
}
|
419 | 418 |
|
|
431 | 430 |
// discard probes with small compression ratios (too expensive)
|
432 | 431 |
double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
|
433 | 432 |
if (cratio < cratio_[clevel]) {
|
434 | |
goto out;
|
|
433 |
goto out;
|
435 | 434 |
}
|
436 | 435 |
|
437 | 436 |
/* When we go back in a match (shift), we obtain quite different compression properties.
|
|
452 | 451 |
ipshift = 3;
|
453 | 452 |
minlen = 3;
|
454 | 453 |
}
|
|
454 |
else {
|
|
455 |
minlen = 4;
|
|
456 |
}
|
455 | 457 |
|
456 | 458 |
uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
|
457 | 459 |
HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
|
458 | 460 |
uint8_t hashlog = hashlog_[clevel];
|
459 | 461 |
|
460 | 462 |
uint8_t* ip = ibase;
|
461 | |
const uint8_t* ip_bound = ibase + length - 1;
|
462 | |
const uint8_t* ip_limit = ibase + length - 12;
|
|
463 |
uint8_t* ip_bound = ibase + length - 1;
|
|
464 |
uint8_t* ip_limit = ibase + length - 12;
|
463 | 465 |
uint8_t* op = (uint8_t*)output;
|
464 | 466 |
const uint8_t* op_limit = op + maxout;
|
|
467 |
uint32_t seq;
|
|
468 |
uint8_t copy;
|
|
469 |
uint32_t hval;
|
465 | 470 |
|
466 | 471 |
/* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
|
467 | 472 |
if (length < 16 || maxout < 66) {
|
|
473 | 478 |
memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
|
474 | 479 |
|
475 | 480 |
/* we start with literal copy */
|
476 | |
uint8_t copy = 4;
|
|
481 |
copy = 4;
|
477 | 482 |
*op++ = MAX_COPY - 1;
|
478 | 483 |
*op++ = *ip++;
|
479 | 484 |
*op++ = *ip++;
|
|
487 | 492 |
uint8_t* anchor = ip; /* comparison starting-point */
|
488 | 493 |
|
489 | 494 |
/* find potential match */
|
490 | |
uint32_t seq = BLOSCLZ_READU32(ip);
|
491 | |
uint32_t hval;
|
|
495 |
seq = BLOSCLZ_READU32(ip);
|
492 | 496 |
HASH_FUNCTION(hval, seq, hashlog)
|
493 | 497 |
ref = ibase + htab[hval];
|
494 | 498 |
|
|
685 | 689 |
while (1) {
|
686 | 690 |
if (ctrl >= 32) {
|
687 | 691 |
// match
|
688 | |
int32_t len = (ctrl >> 5U) - 1 ;
|
689 | |
int32_t ofs = (ctrl & 31U) << 8U;
|
|
692 |
int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
|
|
693 |
int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
|
690 | 694 |
uint8_t code;
|
691 | 695 |
const uint8_t* ref = op - ofs;
|
692 | 696 |
|
|
752 | 756 |
}
|
753 | 757 |
else {
|
754 | 758 |
#endif
|
755 | |
op = copy_match(op, ref, (unsigned) len);
|
|
759 |
op = copy_match(op, ref, (unsigned) len);
|
756 | 760 |
#if defined(__AVX2__)
|
757 | 761 |
}
|
758 | 762 |
#endif
|