894 lines
26 KiB
C
894 lines
26 KiB
C
/*********************************************************************
|
|
Blosc - Blocked Shuffling and Compression Library
|
|
|
|
Author: Francesc Alted <francesc@blosc.org>
|
|
Creation date: 2009-05-20
|
|
|
|
See LICENSE.txt for details about copyright and rights to use.
|
|
**********************************************************************/
|
|
|
|
/*********************************************************************
|
|
The code in this file is heavily based on FastLZ, a lightning-fast
|
|
lossless compression library. See LICENSES/FASTLZ.txt for details.
|
|
**********************************************************************/
|
|
|
|
|
|
#include <stdio.h>
|
|
#include <stdbool.h>
|
|
#include "blosclz.h"
|
|
#include "fastcopy.h"
|
|
#include "blosc-common.h"
|
|
|
|
|
|
/*
|
|
* Give hints to the compiler for branch prediction optimization.
|
|
*/
|
|
#if defined(__GNUC__) && (__GNUC__ > 2)
|
|
#define BLOSCLZ_LIKELY(c) (__builtin_expect((c), 1))
|
|
#define BLOSCLZ_UNLIKELY(c) (__builtin_expect((c), 0))
|
|
#else
|
|
#define BLOSCLZ_LIKELY(c) (c)
|
|
#define BLOSCLZ_UNLIKELY(c) (c)
|
|
#endif
|
|
|
|
/*
|
|
* Use inlined functions for supported systems.
|
|
*/
|
|
#if defined(_MSC_VER) && !defined(__cplusplus) /* Visual Studio */
|
|
#define inline __inline /* Visual C is not C99, but supports some kind of inline */
|
|
#endif
|
|
|
|
#define MAX_COPY 32U
|
|
#define MAX_DISTANCE 8191
|
|
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
|
|
|
|
#ifdef BLOSC_STRICT_ALIGN
|
|
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
|
|
#define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
|
|
#else
|
|
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
|
|
#define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
|
|
#endif
|
|
|
|
#define HASH_LOG (12U)
|
|
|
|
// This is used in LZ4 and seems to work pretty well here too
|
|
#define HASH_FUNCTION(v, s, h) { \
|
|
v = (s * 2654435761U) >> (32U - h); \
|
|
}
|
|
|
|
|
|
#if defined(__AVX2__)
|
|
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
uint8_t x = ip[-1];
|
|
/* safe because the outer check against ip limit */
|
|
if (ip < (ip_bound - sizeof(int64_t))) {
|
|
int64_t value, value2;
|
|
/* Broadcast the value for every byte in a 64-bit register */
|
|
memset(&value, x, 8);
|
|
#if defined(BLOSC_STRICT_ALIGN)
|
|
memcpy(&value2, ref, 8);
|
|
#else
|
|
value2 = ((int64_t*)ref)[0];
|
|
#endif
|
|
if (value != value2) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == x) ip++;
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += 8;
|
|
ref += 8;
|
|
}
|
|
}
|
|
if (ip < (ip_bound - sizeof(__m128i))) {
|
|
__m128i value, value2, cmp;
|
|
/* Broadcast the value for every byte in a 128-bit register */
|
|
memset(&value, x, sizeof(__m128i));
|
|
value2 = _mm_loadu_si128((__m128i *) ref);
|
|
cmp = _mm_cmpeq_epi32(value, value2);
|
|
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == x) ip++;
|
|
return ip;
|
|
} else {
|
|
ip += sizeof(__m128i);
|
|
ref += sizeof(__m128i);
|
|
}
|
|
}
|
|
while (ip < (ip_bound - (sizeof(__m256i)))) {
|
|
__m256i value, value2, cmp;
|
|
/* Broadcast the value for every byte in a 256-bit register */
|
|
memset(&value, x, sizeof(__m256i));
|
|
value2 = _mm256_loadu_si256((__m256i *)ref);
|
|
cmp = _mm256_cmpeq_epi64(value, value2);
|
|
if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == x) ip++;
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += sizeof(__m256i);
|
|
ref += sizeof(__m256i);
|
|
}
|
|
}
|
|
/* Look into the remainder */
|
|
while ((ip < ip_bound) && (*ref++ == x)) ip++;
|
|
return ip;
|
|
}
|
|
|
|
#elif defined(__SSE2__)
|
|
|
|
static uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
uint8_t x = ip[-1];
|
|
|
|
if (ip < (ip_bound - sizeof(int64_t))) {
|
|
int64_t value, value2;
|
|
/* Broadcast the value for every byte in a 64-bit register */
|
|
memset(&value, x, 8);
|
|
#if defined(BLOSC_STRICT_ALIGN)
|
|
memcpy(&value2, ref, 8);
|
|
#else
|
|
value2 = ((int64_t*)ref)[0];
|
|
#endif
|
|
if (value != value2) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == x) ip++;
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += 8;
|
|
ref += 8;
|
|
}
|
|
}
|
|
/* safe because the outer check against ip limit */
|
|
while (ip < (ip_bound - sizeof(__m128i))) {
|
|
__m128i value, value2, cmp;
|
|
/* Broadcast the value for every byte in a 128-bit register */
|
|
memset(&value, x, sizeof(__m128i));
|
|
value2 = _mm_loadu_si128((__m128i *)ref);
|
|
cmp = _mm_cmpeq_epi32(value, value2);
|
|
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == x) ip++;
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += sizeof(__m128i);
|
|
ref += sizeof(__m128i);
|
|
}
|
|
}
|
|
/* Look into the remainder */
|
|
while ((ip < ip_bound) && (*ref++ == x)) ip++;
|
|
return ip;
|
|
}
|
|
|
|
#else
|
|
|
|
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
uint8_t x = ip[-1];
|
|
int64_t value, value2;
|
|
/* Broadcast the value for every byte in a 64-bit register */
|
|
memset(&value, x, 8);
|
|
/* safe because the outer check against ip limit */
|
|
while (ip < (ip_bound - sizeof(int64_t))) {
|
|
#if defined(BLOSC_STRICT_ALIGN)
|
|
memcpy(&value2, ref, 8);
|
|
#else
|
|
value2 = ((int64_t*)ref)[0];
|
|
#endif
|
|
if (value != value2) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == x) ip++;
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += 8;
|
|
ref += 8;
|
|
}
|
|
}
|
|
/* Look into the remainder */
|
|
while ((ip < ip_bound) && (*ref++ == x)) ip++;
|
|
return ip;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
/* Return the byte that starts to differ */
|
|
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
#if !defined(BLOSC_STRICT_ALIGN)
|
|
while (ip < (ip_bound - sizeof(int64_t))) {
|
|
if (*(int64_t*)ref != *(int64_t*)ip) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == *ip++) {}
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += sizeof(int64_t);
|
|
ref += sizeof(int64_t);
|
|
}
|
|
}
|
|
#endif
|
|
/* Look into the remainder */
|
|
while ((ip < ip_bound) && (*ref++ == *ip++)) {}
|
|
return ip;
|
|
}
|
|
|
|
|
|
#if defined(__SSE2__)
|
|
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
__m128i value, value2, cmp;
|
|
|
|
if (ip < (ip_bound - sizeof(int64_t))) {
|
|
if (*(int64_t *) ref != *(int64_t *) ip) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == *ip++) {}
|
|
return ip;
|
|
} else {
|
|
ip += sizeof(int64_t);
|
|
ref += sizeof(int64_t);
|
|
}
|
|
}
|
|
while (ip < (ip_bound - sizeof(__m128i))) {
|
|
value = _mm_loadu_si128((__m128i *) ip);
|
|
value2 = _mm_loadu_si128((__m128i *) ref);
|
|
cmp = _mm_cmpeq_epi32(value, value2);
|
|
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
|
/* Return the byte that starts to differ */
|
|
return get_match(ip, ip_bound, ref);
|
|
}
|
|
else {
|
|
ip += sizeof(__m128i);
|
|
ref += sizeof(__m128i);
|
|
}
|
|
}
|
|
/* Look into the remainder */
|
|
while ((ip < ip_bound) && (*ref++ == *ip++)) {}
|
|
return ip;
|
|
}
|
|
#endif
|
|
|
|
|
|
#if defined(__AVX2__)
|
|
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
|
|
|
|
if (ip < (ip_bound - sizeof(int64_t))) {
|
|
if (*(int64_t *) ref != *(int64_t *) ip) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == *ip++) {}
|
|
return ip;
|
|
} else {
|
|
ip += sizeof(int64_t);
|
|
ref += sizeof(int64_t);
|
|
}
|
|
}
|
|
if (ip < (ip_bound - sizeof(__m128i))) {
|
|
__m128i value, value2, cmp;
|
|
value = _mm_loadu_si128((__m128i *) ip);
|
|
value2 = _mm_loadu_si128((__m128i *) ref);
|
|
cmp = _mm_cmpeq_epi32(value, value2);
|
|
if (_mm_movemask_epi8(cmp) != 0xFFFF) {
|
|
/* Return the byte that starts to differ */
|
|
return get_match_16(ip, ip_bound, ref);
|
|
}
|
|
else {
|
|
ip += sizeof(__m128i);
|
|
ref += sizeof(__m128i);
|
|
}
|
|
}
|
|
while (ip < (ip_bound - sizeof(__m256i))) {
|
|
__m256i value, value2, cmp;
|
|
value = _mm256_loadu_si256((__m256i *) ip);
|
|
value2 = _mm256_loadu_si256((__m256i *)ref);
|
|
cmp = _mm256_cmpeq_epi64(value, value2);
|
|
if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
|
|
/* Return the byte that starts to differ */
|
|
while (*ref++ == *ip++) {}
|
|
return ip;
|
|
}
|
|
else {
|
|
ip += sizeof(__m256i);
|
|
ref += sizeof(__m256i);
|
|
}
|
|
}
|
|
/* Look into the remainder */
|
|
while ((ip < ip_bound) && (*ref++ == *ip++)) {}
|
|
return ip;
|
|
}
|
|
#endif
|
|
|
|
|
|
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
|
|
if (BLOSCLZ_UNLIKELY(run)) {
|
|
#if defined(__AVX2__)
|
|
ip = get_run_32(ip, ip_bound, ref);
|
|
#elif defined(__SSE2__)
|
|
ip = get_run_16(ip, ip_bound, ref);
|
|
#else
|
|
ip = get_run(ip, ip_bound, ref);
|
|
#endif
|
|
}
|
|
else {
|
|
#if defined(__AVX2__)
|
|
ip = get_match_32(ip, ip_bound, ref);
|
|
#elif defined(__SSE2__)
|
|
ip = get_match_16(ip, ip_bound, ref);
|
|
#else
|
|
ip = get_match(ip, ip_bound, ref);
|
|
#endif
|
|
}
|
|
|
|
return ip;
|
|
}
|
|
|
|
|
|
#define LITERAL(ip, op, op_limit, anchor, copy) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) \
|
|
goto out; \
|
|
*op++ = *anchor++; \
|
|
ip = anchor; \
|
|
copy++; \
|
|
if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) { \
|
|
copy = 0; \
|
|
*op++ = MAX_COPY-1; \
|
|
} \
|
|
}
|
|
|
|
#define LITERAL2(ip, oc, anchor, copy) { \
|
|
oc++; anchor++; \
|
|
ip = anchor; \
|
|
copy++; \
|
|
if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) { \
|
|
copy = 0; \
|
|
oc++; \
|
|
} \
|
|
}
|
|
|
|
#define DISTANCE_SHORT(op, op_limit, len, distance) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) \
|
|
goto out; \
|
|
*op++ = (uint8_t)((len << 5U) + (distance >> 8U)); \
|
|
*op++ = (uint8_t)((distance & 255U)); \
|
|
}
|
|
|
|
#define DISTANCE_LONG(op, op_limit, len, distance) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) \
|
|
goto out; \
|
|
*op++ = (uint8_t)((7U << 5U) + (distance >> 8U)); \
|
|
for (len -= 7; len >= 255; len -= 255) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) \
|
|
goto out; \
|
|
*op++ = 255; \
|
|
} \
|
|
if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) \
|
|
goto out; \
|
|
*op++ = (uint8_t)len; \
|
|
*op++ = (uint8_t)((distance & 255U)); \
|
|
}
|
|
|
|
#define DISTANCE_SHORT_FAR(op, op_limit, len, distance) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 4 > op_limit)) \
|
|
goto out; \
|
|
*op++ = (uint8_t)((len << 5U) + 31); \
|
|
*op++ = 255; \
|
|
*op++ = (uint8_t)(distance >> 8U); \
|
|
*op++ = (uint8_t)(distance & 255U); \
|
|
}
|
|
|
|
#define DISTANCE_LONG_FAR(op, op_limit, len, distance) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) \
|
|
goto out; \
|
|
*op++ = (7U << 5U) + 31; \
|
|
for (len -= 7; len >= 255; len -= 255) { \
|
|
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) \
|
|
goto out; \
|
|
*op++ = 255; \
|
|
} \
|
|
if (BLOSCLZ_UNLIKELY(op + 4 > op_limit)) \
|
|
goto out; \
|
|
*op++ = (uint8_t)len; \
|
|
*op++ = 255; \
|
|
*op++ = (uint8_t)(distance >> 8U); \
|
|
*op++ = (uint8_t)(distance & 255U); \
|
|
}
|
|
|
|
|
|
// Get the compressed size of a buffer. Useful for testing compression ratios for high clevels.
|
|
static int get_csize(uint8_t* ibase, int maxlen, bool force_3b_shift) {
|
|
uint8_t* ip = ibase;
|
|
int32_t oc = 0;
|
|
uint8_t* ip_bound = ibase + maxlen - 1;
|
|
uint8_t* ip_limit = ibase + maxlen - 12;
|
|
uint32_t htab[1U << (uint8_t)HASH_LOG];
|
|
uint32_t hval;
|
|
uint32_t seq;
|
|
uint8_t copy;
|
|
|
|
// Initialize the hash table to distances of 0
|
|
for (unsigned i = 0; i < (1U << HASH_LOG); i++) {
|
|
htab[i] = 0;
|
|
}
|
|
|
|
/* we start with literal copy */
|
|
copy = 4;
|
|
oc += 5;
|
|
|
|
/* main loop */
|
|
while (BLOSCLZ_LIKELY(ip < ip_limit)) {
|
|
const uint8_t* ref;
|
|
unsigned distance;
|
|
uint8_t* anchor = ip; /* comparison starting-point */
|
|
|
|
/* find potential match */
|
|
seq = BLOSCLZ_READU32(ip);
|
|
HASH_FUNCTION(hval, seq, HASH_LOG)
|
|
ref = ibase + htab[hval];
|
|
|
|
/* calculate distance to the match */
|
|
distance = anchor - ref;
|
|
|
|
/* update hash table */
|
|
htab[hval] = (uint32_t) (anchor - ibase);
|
|
|
|
if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
|
|
LITERAL2(ip, oc, anchor, copy)
|
|
continue;
|
|
}
|
|
|
|
/* is this a match? check the first 4 bytes */
|
|
if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
|
|
ref += 4;
|
|
}
|
|
else {
|
|
/* no luck, copy as a literal */
|
|
LITERAL2(ip, oc, anchor, copy)
|
|
continue;
|
|
}
|
|
|
|
/* last matched byte */
|
|
ip = anchor + 4;
|
|
|
|
/* distance is biased */
|
|
distance--;
|
|
|
|
/* get runs or matches; zero distance means a run */
|
|
ip = get_run_or_match(ip, ip_bound, ref, !distance);
|
|
|
|
ip -= force_3b_shift ? 3 : 4;
|
|
unsigned len = (int)(ip - anchor);
|
|
// If match is close, let's reduce the minimum length to encode it
|
|
unsigned minlen = (distance < MAX_DISTANCE) ? 3 : 4;
|
|
// Encoding short lengths is expensive during decompression
|
|
if (len < minlen) {
|
|
LITERAL2(ip, oc, anchor, copy)
|
|
continue;
|
|
}
|
|
|
|
/* if we have'nt copied anything, adjust the output counter */
|
|
if (!copy)
|
|
oc--;
|
|
/* reset literal counter */
|
|
copy = 0;
|
|
|
|
/* encode the match */
|
|
if (distance < MAX_DISTANCE) {
|
|
if (len >= 7) {
|
|
oc += ((len - 7) / 255) + 1;
|
|
}
|
|
oc += 2;
|
|
}
|
|
else {
|
|
/* far away, but not yet in the another galaxy... */
|
|
if (len >= 7) {
|
|
oc += ((len - 7) / 255) + 1;
|
|
}
|
|
oc += 4;
|
|
}
|
|
|
|
/* update the hash at match boundary */
|
|
seq = BLOSCLZ_READU32(ip);
|
|
HASH_FUNCTION(hval, seq, HASH_LOG)
|
|
htab[hval] = (uint32_t) (ip++ - ibase);
|
|
seq >>= 8U;
|
|
HASH_FUNCTION(hval, seq, HASH_LOG)
|
|
htab[hval] = (uint32_t) (ip++ - ibase);
|
|
/* assuming literal copy */
|
|
oc++;
|
|
|
|
}
|
|
|
|
/* if we have copied something, adjust the copy length */
|
|
if (!copy)
|
|
oc--;
|
|
|
|
return (int)oc;
|
|
}
|
|
|
|
|
|
int blosclz_compress(const int clevel, const void* input, int length,
|
|
void* output, int maxout) {
|
|
uint8_t* ibase = (uint8_t*)input;
|
|
uint8_t* ip = ibase;
|
|
uint8_t* ip_bound = ibase + length - 1;
|
|
uint8_t* ip_limit = ibase + length - 12;
|
|
uint8_t* op = (uint8_t*)output;
|
|
uint8_t* op_limit;
|
|
uint32_t htab[1U << (uint8_t)HASH_LOG];
|
|
uint32_t hval;
|
|
uint32_t seq;
|
|
uint8_t copy;
|
|
|
|
op_limit = op + maxout;
|
|
|
|
// Minimum lengths for encoding
|
|
unsigned minlen_[10] = {0, 12, 12, 11, 10, 9, 8, 7, 6, 5};
|
|
|
|
// Minimum compression ratios for initiate encoding
|
|
double cratio_[10] = {0, 2, 2, 2, 2, 1.8, 1.6, 1.4, 1.2, 1.1};
|
|
|
|
uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
|
|
HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
|
|
uint8_t hashlog = hashlog_[clevel];
|
|
// Initialize the hash table to distances of 0
|
|
for (unsigned i = 0; i < (1U << hashlog); i++) {
|
|
htab[i] = 0;
|
|
}
|
|
|
|
/* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
|
|
if (length < 16 || maxout < 66) {
|
|
return 0;
|
|
}
|
|
|
|
/* When we go back in a match (shift), we obtain quite different compression properties.
|
|
* It looks like 4 is more useful in combination with bitshuffle and small typesizes
|
|
* (compress better and faster in e.g. `b2bench blosclz bitshuffle single 6 6291456 1 19`).
|
|
* Fallback to 4 because it provides more consistent results on small itemsizes.
|
|
*
|
|
* In this block we also check cratios for the beginning of the buffers and
|
|
* eventually discard those that are small (take too long to decompress).
|
|
* This process is called _entropy probing_.
|
|
*/
|
|
int ipshift = 4;
|
|
int maxlen; // maximum length for entropy probing
|
|
int csize_3b;
|
|
int csize_4b;
|
|
double cratio = 0;
|
|
switch (clevel) {
|
|
case 1:
|
|
case 2:
|
|
case 3:
|
|
maxlen = length / 8;
|
|
csize_4b = get_csize(ibase, maxlen, false);
|
|
cratio = (double)maxlen / csize_4b;
|
|
break;
|
|
case 4:
|
|
case 5:
|
|
case 6:
|
|
case 7:
|
|
case 8:
|
|
maxlen = length / 8;
|
|
csize_4b = get_csize(ibase, maxlen, false);
|
|
cratio = (double)maxlen / csize_4b;
|
|
break;
|
|
case 9:
|
|
// case 9 is special. we need to asses the optimal shift
|
|
maxlen = length / 8;
|
|
csize_3b = get_csize(ibase, maxlen, true);
|
|
csize_4b = get_csize(ibase, maxlen, false);
|
|
ipshift = (csize_3b < csize_4b) ? 3 : 4;
|
|
cratio = (csize_3b < csize_4b) ? ((double)maxlen / csize_3b) : ((double)maxlen / csize_4b);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// discard probes with small compression ratios (too expensive)
|
|
if (cratio < cratio_ [clevel]) {
|
|
goto out;
|
|
}
|
|
|
|
/* we start with literal copy */
|
|
copy = 4;
|
|
*op++ = MAX_COPY - 1;
|
|
*op++ = *ip++;
|
|
*op++ = *ip++;
|
|
*op++ = *ip++;
|
|
*op++ = *ip++;
|
|
|
|
/* main loop */
|
|
while (BLOSCLZ_LIKELY(ip < ip_limit)) {
|
|
const uint8_t* ref;
|
|
unsigned distance;
|
|
uint8_t* anchor = ip; /* comparison starting-point */
|
|
|
|
/* find potential match */
|
|
seq = BLOSCLZ_READU32(ip);
|
|
HASH_FUNCTION(hval, seq, hashlog)
|
|
ref = ibase + htab[hval];
|
|
|
|
/* calculate distance to the match */
|
|
distance = anchor - ref;
|
|
|
|
/* update hash table */
|
|
htab[hval] = (uint32_t) (anchor - ibase);
|
|
|
|
if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
|
|
LITERAL(ip, op, op_limit, anchor, copy)
|
|
continue;
|
|
}
|
|
|
|
/* is this a match? check the first 4 bytes */
|
|
if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
|
|
ref += 4;
|
|
} else {
|
|
/* no luck, copy as a literal */
|
|
LITERAL(ip, op, op_limit, anchor, copy)
|
|
continue;
|
|
}
|
|
|
|
/* last matched byte */
|
|
ip = anchor + 4;
|
|
|
|
/* distance is biased */
|
|
distance--;
|
|
|
|
/* get runs or matches; zero distance means a run */
|
|
ip = get_run_or_match(ip, ip_bound, ref, !distance);
|
|
|
|
/* length is biased, '1' means a match of 3 bytes */
|
|
ip -= ipshift;
|
|
|
|
unsigned len = (int)(ip - anchor);
|
|
// If match is close, let's reduce the minimum length to encode it
|
|
unsigned minlen = (clevel == 9) ? ipshift : minlen_[clevel];
|
|
|
|
// Encoding short lengths is expensive during decompression
|
|
// Encode only for reasonable lengths (extensive experiments done)
|
|
if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
|
|
LITERAL(ip, op, op_limit, anchor, copy)
|
|
continue;
|
|
}
|
|
|
|
/* if we have copied something, adjust the copy count */
|
|
if (copy)
|
|
/* copy is biased, '0' means 1 byte copy */
|
|
*(op - copy - 1) = (uint8_t)(copy - 1);
|
|
else
|
|
/* back, to overwrite the copy count */
|
|
op--;
|
|
/* reset literal counter */
|
|
copy = 0;
|
|
|
|
/* encode the match */
|
|
if (distance < MAX_DISTANCE) {
|
|
if (len < 7) {
|
|
DISTANCE_SHORT(op, op_limit, len, distance)
|
|
} else {
|
|
DISTANCE_LONG(op, op_limit, len, distance)
|
|
}
|
|
} else {
|
|
/* far away, but not yet in the another galaxy... */
|
|
distance -= MAX_DISTANCE;
|
|
if (len < 7) {
|
|
DISTANCE_SHORT_FAR(op, op_limit, len, distance)
|
|
} else {
|
|
DISTANCE_LONG_FAR(op, op_limit, len, distance)
|
|
}
|
|
}
|
|
|
|
/* update the hash at match boundary */
|
|
seq = BLOSCLZ_READU32(ip);
|
|
HASH_FUNCTION(hval, seq, hashlog)
|
|
htab[hval] = (uint32_t) (ip++ - ibase);
|
|
seq >>= 8U;
|
|
HASH_FUNCTION(hval, seq, hashlog)
|
|
htab[hval] = (uint32_t) (ip++ - ibase);
|
|
/* assuming literal copy */
|
|
|
|
if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
|
|
goto out;
|
|
*op++ = MAX_COPY - 1;
|
|
}
|
|
|
|
/* left-over as literal copy */
|
|
while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
|
|
if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
|
|
*op++ = *ip++;
|
|
copy++;
|
|
if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
|
|
copy = 0;
|
|
*op++ = MAX_COPY - 1;
|
|
}
|
|
}
|
|
|
|
/* if we have copied something, adjust the copy length */
|
|
if (copy)
|
|
*(op - copy - 1) = (uint8_t)(copy - 1);
|
|
else
|
|
op--;
|
|
|
|
/* marker for blosclz */
|
|
*(uint8_t*)output |= (1U << 5U);
|
|
|
|
return (int)(op - (uint8_t*)output);
|
|
|
|
out:
|
|
return 0;
|
|
}
|
|
|
|
// See https://habr.com/en/company/yandex/blog/457612/
|
|
#ifdef __AVX2__
|
|
|
|
#if defined(_MSC_VER)
|
|
#define ALIGNED_(x) __declspec(align(x))
|
|
#else
|
|
#if defined(__GNUC__)
|
|
#define ALIGNED_(x) __attribute__ ((aligned(x)))
|
|
#endif
|
|
#endif
|
|
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
|
|
|
|
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
|
|
{
|
|
size_t offset = op - match;
|
|
while (len >= 16) {
|
|
|
|
static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
|
|
{
|
|
0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, // offset = 0, not used as mask, but for shift
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // offset = 1
|
|
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
|
|
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
|
|
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
|
|
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
|
|
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
|
|
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // offset = 16
|
|
};
|
|
|
|
_mm_storeu_si128((__m128i *)(op),
|
|
_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
|
|
_mm_load_si128((const __m128i *)(masks) + offset)));
|
|
|
|
match += masks[offset];
|
|
|
|
op += 16;
|
|
len -= 16;
|
|
}
|
|
// Deal with remainders
|
|
for (; len > 0; len--) {
|
|
*op++ = *match++;
|
|
}
|
|
return op;
|
|
}
|
|
#endif
|
|
|
|
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
|
|
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
|
|
uint8_t* d = out;
|
|
const uint8_t* s = from;
|
|
uint8_t* const e = end;
|
|
|
|
do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
|
|
}
|
|
|
|
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
|
|
const uint8_t* ip = (const uint8_t*)input;
|
|
const uint8_t* ip_limit = ip + length;
|
|
uint8_t* op = (uint8_t*)output;
|
|
uint32_t ctrl;
|
|
uint8_t* op_limit = op + maxout;
|
|
if (BLOSCLZ_UNLIKELY(length == 0)) {
|
|
return 0;
|
|
}
|
|
ctrl = (*ip++) & 31U;
|
|
|
|
while (1) {
|
|
if (ctrl >= 32) {
|
|
// match
|
|
int32_t len = (ctrl >> 5U) - 1 ;
|
|
int32_t ofs = (ctrl & 31U) << 8U;
|
|
uint8_t code;
|
|
const uint8_t* ref = op - ofs;
|
|
|
|
if (len == 7 - 1) {
|
|
do {
|
|
if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
|
|
return 0;
|
|
}
|
|
code = *ip++;
|
|
len += code;
|
|
} while (code == 255);
|
|
}
|
|
else {
|
|
if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
|
|
return 0;
|
|
}
|
|
}
|
|
code = *ip++;
|
|
len += 3;
|
|
ref -= code;
|
|
|
|
/* match from 16-bit distance */
|
|
if (BLOSCLZ_UNLIKELY(code == 255)) {
|
|
if (ofs == (31U << 8U)) {
|
|
if (ip + 1 >= ip_limit) {
|
|
return 0;
|
|
}
|
|
ofs = (*ip++) << 8U;
|
|
ofs += *ip++;
|
|
ref = op - ofs - MAX_DISTANCE;
|
|
}
|
|
}
|
|
|
|
if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
|
|
return 0;
|
|
}
|
|
|
|
if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
|
|
return 0;
|
|
}
|
|
|
|
if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
|
|
ctrl = *ip++;
|
|
|
|
ref--;
|
|
if (ref == op - 1) {
|
|
/* optimized copy for a run */
|
|
memset(op, *ref, len);
|
|
op += len;
|
|
}
|
|
else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
|
|
// copy with an overlap not larger than 8
|
|
wild_copy(op, ref, op + len);
|
|
op += len;
|
|
}
|
|
else {
|
|
// general copy with any overlap
|
|
#ifdef __AVX2__
|
|
if (op - ref <= 16) {
|
|
// This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
|
|
// it is not slower either. Let's activate here for experimentation.
|
|
op = copy_match_16(op, ref, len);
|
|
}
|
|
else {
|
|
#endif
|
|
op = copy_match(op, ref, (unsigned) len);
|
|
#ifdef __AVX2__
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
else {
|
|
// literal
|
|
ctrl++;
|
|
if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
|
|
return 0;
|
|
}
|
|
if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
|
|
return 0;
|
|
}
|
|
|
|
memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
|
|
// On GCC-6, fastcopy this is still faster than plain memcpy
|
|
// However, using recent CLANG/LLVM 9.0, there is almost no difference
|
|
// in performance.
|
|
// And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
|
|
// op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
|
|
|
|
if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
|
|
ctrl = *ip++;
|
|
}
|
|
}
|
|
|
|
return (int)(op - (uint8_t*)output);
|
|
}
|