Compress-Stream-Zstd
view release on metacpan or search on metacpan
ext/zstd/lib/decompress/huf_decompress.c view on Meta::CPAN
}
#if HUF_NEED_BMI2_FUNCTION
/* Variant of the 4X1 DTable decoder tagged with BMI2_TARGET_ATTRIBUTE.
 * Passes every argument through unchanged to
 * HUF_decompress4X1_usingDTable_internal_body() and returns whatever that
 * call returns.
 */
static BMI2_TARGET_ATTRIBUTE
size_t HUF_decompress4X1_usingDTable_internal_bmi2(
        void* dst, size_t dstSize,
        void const* cSrc, size_t cSrcSize,
        HUF_DTable const* DTable)
{
    size_t const result =
        HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
    return result;
}
#endif
/* Default variant of the 4X1 DTable decoder (no target attribute).
 * Passes every argument through unchanged to
 * HUF_decompress4X1_usingDTable_internal_body() and returns whatever that
 * call returns.
 */
static
size_t HUF_decompress4X1_usingDTable_internal_default(
        void* dst, size_t dstSize,
        void const* cSrc, size_t cSrcSize,
        HUF_DTable const* DTable)
{
    size_t const result =
        HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
    return result;
}
#if ZSTD_ENABLE_ASM_X86_64_BMI2
/* Prototype of the 4X1 fast loop variant that, judging by its name and the
 * HUF_ASM_DECL marker, is implemented in assembly. It takes the same
 * HUF_DecompressFastArgs state as
 * HUF_decompress4X1_usingDTable_internal_fast_c_loop().
 * NOTE(review): the definition is not visible here - presumably it lives in a
 * separate assembly source; confirm.
 */
HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
#endif
/**
 * C implementation of the fast 4-stream, single-symbol (4X1) Huffman
 * decoding loop.
 *
 * The four bit containers, input pointers and output pointers are copied out
 * of @p args into locals, advanced by the loop, and written back to @p args
 * on every exit path.
 *
 * Each inner iteration decodes 5 symbols for each of the 4 streams and then
 * reloads the 4 bit containers. The number of inner iterations that are safe
 * to run without bounds checks is recomputed by the outer loop from the
 * remaining output of stream 3 and the remaining input of stream 0.
 *
 * The loop stops when
 *  - fewer than 20 symbols per stream can still be decoded without
 *    re-checking bounds, or
 *  - an input pointer has dropped below the input pointer of the previous
 *    stream, which indicates corrupted input.
 *
 * Asserts a little-endian, non-32-bit target.
 *
 * @param args  Decoder state. Reads dt, oend, ilimit, bits, ip and op;
 *              writes back bits, ip and op.
 */
static HUF_FAST_BMI2_ATTRS
void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
{
    U64 bits[4];
    BYTE const* ip[4];
    BYTE* op[4];
    U16 const* const dtable = (U16 const*)args->dt;
    BYTE* const oend = args->oend;
    BYTE const* const ilimit = args->ilimit;

    /* Copy the arguments to local variables */
    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
    ZSTD_memcpy(&op, &args->op, sizeof(op));

    assert(MEM_isLittleEndian());
    assert(!MEM_32bits());

    for (;;) {
        BYTE* olimit;
        int stream;
        int symbol;

        /* Assert loop preconditions */
#ifndef NDEBUG
        for (stream = 0; stream < 4; ++stream) {
            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
            assert(ip[stream] >= ilimit);
        }
#endif
        /* Compute olimit */
        {
            /* Each iteration produces 5 output symbols per stream */
            size_t const oiters = (size_t)(oend - op[3]) / 5;
            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
             * per stream.
             */
            size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
            /* We can safely run iters iterations before running bounds checks */
            size_t const iters = MIN(oiters, iiters);
            size_t const symbols = iters * 5;

            /* We can simply check that op[3] < olimit, instead of checking all
             * of our bounds, since we can't hit the other bounds until we've run
             * iters iterations, which only happens when op[3] == olimit.
             */
            olimit = op[3] + symbols;

            /* Exit fast decoding loop once we get close to the end.
             * The symbol count is compared directly instead of forming
             * op[3] + 20: when fewer than 20 bytes of output remain that
             * pointer would lie more than one past the end of the output
             * buffer, which is undefined behavior even if it is never
             * dereferenced. (op[3] + 20 > op[3] + symbols <=> symbols < 20.)
             */
            if (symbols < 20)
                break;

            /* Exit the decoding loop if any input pointer has crossed the
             * previous one. This indicates corruption, and a precondition
             * to our loop is that ip[i] >= ip[0].
             */
            for (stream = 1; stream < 4; ++stream) {
                if (ip[stream] < ip[stream - 1])
                    goto _out;
            }
        }

#ifndef NDEBUG
        for (stream = 1; stream < 4; ++stream) {
            assert(ip[stream] >= ip[stream - 1]);
        }
#endif

        do {
            /* Decode 5 symbols in each of the 4 streams */
            for (symbol = 0; symbol < 5; ++symbol) {
                for (stream = 0; stream < 4; ++stream) {
                    /* The top 11 bits of the container index the table. */
                    int const index = (int)(bits[stream] >> 53);
                    int const entry = (int)dtable[index];
                    /* Low bits of the entry: shift amount; bits 8..15: the
                     * decoded byte.
                     */
                    bits[stream] <<= (entry & 63);
                    op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
                }
            }
            /* Reload the bitstreams */
            for (stream = 0; stream < 4; ++stream) {
                /* The lowest set bit is the marker OR'ed in at the previous
                 * reload; its position splits into whole bytes to step the
                 * input pointer back by and leftover bits to shift out.
                 */
                int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
                int const nbBits = ctz & 7;
                int const nbBytes = ctz >> 3;
                op[stream] += 5;
                ip[stream] -= nbBytes;
                bits[stream] = MEM_read64(ip[stream]) | 1;
                bits[stream] <<= nbBits;
            }
        } while (op[3] < olimit);
    }

_out:

    /* Save the final values of each of the state variables back to args. */
    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
    ZSTD_memcpy(&args->op, &op, sizeof(op));
}
/**
* @returns @p dstSize on success (>= 6)
* 0 if the fallback implementation should be used
* An error if an error occurred
*/
static HUF_FAST_BMI2_ATTRS
ext/zstd/lib/decompress/huf_decompress.c view on Meta::CPAN
/* Default variant of the 4X2 DTable decoder (no target attribute).
 * Passes every argument through unchanged to
 * HUF_decompress4X2_usingDTable_internal_body() and returns whatever that
 * call returns.
 */
static
size_t HUF_decompress4X2_usingDTable_internal_default(
        void* dst, size_t dstSize,
        void const* cSrc, size_t cSrcSize,
        HUF_DTable const* DTable)
{
    size_t const result =
        HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
    return result;
}
#if ZSTD_ENABLE_ASM_X86_64_BMI2
/* Prototype of the 4X2 fast loop variant that, judging by its name and the
 * HUF_ASM_DECL marker, is implemented in assembly. It takes the same
 * HUF_DecompressFastArgs state as
 * HUF_decompress4X2_usingDTable_internal_fast_c_loop().
 * NOTE(review): the definition is not visible here - presumably it lives in a
 * separate assembly source; confirm.
 */
HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
#endif
static HUF_FAST_BMI2_ATTRS
void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
{
U64 bits[4];
BYTE const* ip[4];
BYTE* op[4];
BYTE* oend[4];
HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
BYTE const* const ilimit = args->ilimit;
/* Copy the arguments to local registers. */
ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
ZSTD_memcpy(&op, &args->op, sizeof(op));
oend[0] = op[1];
oend[1] = op[2];
oend[2] = op[3];
oend[3] = args->oend;
assert(MEM_isLittleEndian());
assert(!MEM_32bits());
for (;;) {
BYTE* olimit;
int stream;
int symbol;
/* Assert loop preconditions */
#ifndef NDEBUG
for (stream = 0; stream < 4; ++stream) {
assert(op[stream] <= oend[stream]);
assert(ip[stream] >= ilimit);
}
#endif
/* Compute olimit */
{
/* Each loop does 5 table lookups for each of the 4 streams.
* Each table lookup consumes up to 11 bits of input, and produces
* up to 2 bytes of output.
*/
/* We can consume up to 7 bytes of input per iteration per stream.
* We also know that each input pointer is >= ip[0]. So we can run
* iters loops before running out of input.
*/
size_t iters = (size_t)(ip[0] - ilimit) / 7;
/* Each iteration can produce up to 10 bytes of output per stream.
* Each output stream may advance at different rates. So take the
* minimum number of safe iterations among all the output streams.
*/
for (stream = 0; stream < 4; ++stream) {
size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
iters = MIN(iters, oiters);
}
/* Each iteration produces at least 5 output symbols. So until
* op[3] crosses olimit, we know we haven't executed iters
* iterations yet. This saves us maintaining an iters counter,
* at the expense of computing the remaining # of iterations
* more frequently.
*/
olimit = op[3] + (iters * 5);
/* Exit the fast decoding loop if we are too close to the end. */
if (op[3] + 10 > olimit)
break;
/* Exit the decoding loop if any input pointer has crossed the
* previous one. This indicates corruption, and a precondition
* to our loop is that ip[i] >= ip[0].
*/
for (stream = 1; stream < 4; ++stream) {
if (ip[stream] < ip[stream - 1])
goto _out;
}
}
#ifndef NDEBUG
for (stream = 1; stream < 4; ++stream) {
assert(ip[stream] >= ip[stream - 1]);
}
#endif
do {
/* Do 5 table lookups for each of the first 3 streams */
for (symbol = 0; symbol < 5; ++symbol) {
for (stream = 0; stream < 3; ++stream) {
int const index = (int)(bits[stream] >> 53);
HUF_DEltX2 const entry = dtable[index];
MEM_write16(op[stream], entry.sequence);
bits[stream] <<= (entry.nbBits);
op[stream] += (entry.length);
}
}
/* Do 1 table lookup from the final stream */
{
int const index = (int)(bits[3] >> 53);
HUF_DEltX2 const entry = dtable[index];
MEM_write16(op[3], entry.sequence);
bits[3] <<= (entry.nbBits);
op[3] += (entry.length);
}
/* Do 4 table lookups from the final stream & reload bitstreams */
for (stream = 0; stream < 4; ++stream) {
/* Do a table lookup from the final stream.
* This is interleaved with the reloading to reduce register
* pressure. This shouldn't be necessary, but compilers can
* struggle with codegen with high register pressure.
*/
{
int const index = (int)(bits[3] >> 53);
HUF_DEltX2 const entry = dtable[index];
MEM_write16(op[3], entry.sequence);
bits[3] <<= (entry.nbBits);
op[3] += (entry.length);
}
/* Reload the bitstreams. The final bitstream must be reloaded
* after the 5th symbol was decoded.
*/
( run in 0.567 second using v1.01-cache-2.11-cpan-71847e10f99 )