Compress-Stream-Zstd

 view release on metacpan or  search on metacpan

ext/zstd/lib/decompress/huf_decompress.c  view on Meta::CPAN

}

#if HUF_NEED_BMI2_FUNCTION
static BMI2_TARGET_ATTRIBUTE
size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}
#endif

static
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}

#if ZSTD_ENABLE_ASM_X86_64_BMI2

HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

#endif

static HUF_FAST_BMI2_ATTRS
void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
{
    U64 bits[4];
    BYTE const* ip[4];
    BYTE* op[4];
    U16 const* const dtable = (U16 const*)args->dt;
    BYTE* const oend = args->oend;
    BYTE const* const ilimit = args->ilimit;

    /* Copy the arguments to local variables */
    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
    ZSTD_memcpy(&op, &args->op, sizeof(op));

    assert(MEM_isLittleEndian());
    assert(!MEM_32bits());

    for (;;) {
        BYTE* olimit;
        int stream;
        int symbol;

        /* Assert loop preconditions */
#ifndef NDEBUG
        for (stream = 0; stream < 4; ++stream) {
            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
            assert(ip[stream] >= ilimit);
        }
#endif
        /* Compute olimit */
        {
            /* Each iteration produces 5 output symbols per stream */
            size_t const oiters = (size_t)(oend - op[3]) / 5;
            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
             * per stream.
             */
            size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
            /* We can safely run iters iterations before running bounds checks */
            size_t const iters = MIN(oiters, iiters);
            size_t const symbols = iters * 5;

            /* We can simply check that op[3] < olimit, instead of checking all
             * of our bounds, since we can't hit the other bounds until we've run
             * iters iterations, which only happens when op[3] == olimit.
             */
            olimit = op[3] + symbols;

            /* Exit fast decoding loop once we get close to the end. */
            if (op[3] + 20 > olimit)
                break;

            /* Exit the decoding loop if any input pointer has crossed the
             * previous one. This indicates corruption, and a precondition
             * to our loop is that ip[i] >= ip[0].
             */
            for (stream = 1; stream < 4; ++stream) {
                if (ip[stream] < ip[stream - 1])
                    goto _out;
            }
        }

#ifndef NDEBUG
        for (stream = 1; stream < 4; ++stream) {
            assert(ip[stream] >= ip[stream - 1]);
        }
#endif

        do {
            /* Decode 5 symbols in each of the 4 streams */
            for (symbol = 0; symbol < 5; ++symbol) {
                for (stream = 0; stream < 4; ++stream) {
                    int const index = (int)(bits[stream] >> 53);
                    int const entry = (int)dtable[index];
                    bits[stream] <<= (entry & 63);
                    op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
                }
            }
            /* Reload the bitstreams */
            for (stream = 0; stream < 4; ++stream) {
                int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
                int const nbBits = ctz & 7;
                int const nbBytes = ctz >> 3;
                op[stream] += 5;
                ip[stream] -= nbBytes;
                bits[stream] = MEM_read64(ip[stream]) | 1;
                bits[stream] <<= nbBits;
            }
        } while (op[3] < olimit);
    }

_out:

    /* Save the final values of each of the state variables back to args. */
    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
    ZSTD_memcpy(&args->op, &op, sizeof(op));
}

/**
 * @returns @p dstSize on success (>= 6)
 *          0 if the fallback implementation should be used
 *          An error if an error occurred
 */
static HUF_FAST_BMI2_ATTRS

ext/zstd/lib/decompress/huf_decompress.c  view on Meta::CPAN

static
size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}

#if ZSTD_ENABLE_ASM_X86_64_BMI2

HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

#endif

static HUF_FAST_BMI2_ATTRS
void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
{
    U64 bits[4];
    BYTE const* ip[4];
    BYTE* op[4];
    BYTE* oend[4];
    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
    BYTE const* const ilimit = args->ilimit;

    /* Copy the arguments to local registers. */
    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
    ZSTD_memcpy(&op, &args->op, sizeof(op));

    oend[0] = op[1];
    oend[1] = op[2];
    oend[2] = op[3];
    oend[3] = args->oend;

    assert(MEM_isLittleEndian());
    assert(!MEM_32bits());

    for (;;) {
        BYTE* olimit;
        int stream;
        int symbol;

        /* Assert loop preconditions */
#ifndef NDEBUG
        for (stream = 0; stream < 4; ++stream) {
            assert(op[stream] <= oend[stream]);
            assert(ip[stream] >= ilimit);
        }
#endif
        /* Compute olimit */
        {
            /* Each loop does 5 table lookups for each of the 4 streams.
             * Each table lookup consumes up to 11 bits of input, and produces
             * up to 2 bytes of output.
             */
            /* We can consume up to 7 bytes of input per iteration per stream.
             * We also know that each input pointer is >= ip[0]. So we can run
             * iters loops before running out of input.
             */
            size_t iters = (size_t)(ip[0] - ilimit) / 7;
            /* Each iteration can produce up to 10 bytes of output per stream.
             * Each output stream my advance at different rates. So take the
             * minimum number of safe iterations among all the output streams.
             */
            for (stream = 0; stream < 4; ++stream) {
                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
                iters = MIN(iters, oiters);
            }

            /* Each iteration produces at least 5 output symbols. So until
             * op[3] crosses olimit, we know we haven't executed iters
             * iterations yet. This saves us maintaining an iters counter,
             * at the expense of computing the remaining # of iterations
             * more frequently.
             */
            olimit = op[3] + (iters * 5);

            /* Exit the fast decoding loop if we are too close to the end. */
            if (op[3] + 10 > olimit)
                break;

            /* Exit the decoding loop if any input pointer has crossed the
             * previous one. This indicates corruption, and a precondition
             * to our loop is that ip[i] >= ip[0].
             */
            for (stream = 1; stream < 4; ++stream) {
                if (ip[stream] < ip[stream - 1])
                    goto _out;
            }
        }

#ifndef NDEBUG
        for (stream = 1; stream < 4; ++stream) {
            assert(ip[stream] >= ip[stream - 1]);
        }
#endif

        do {
            /* Do 5 table lookups for each of the first 3 streams */
            for (symbol = 0; symbol < 5; ++symbol) {
                for (stream = 0; stream < 3; ++stream) {
                    int const index = (int)(bits[stream] >> 53);
                    HUF_DEltX2 const entry = dtable[index];
                    MEM_write16(op[stream], entry.sequence);
                    bits[stream] <<= (entry.nbBits);
                    op[stream] += (entry.length);
                }
            }
            /* Do 1 table lookup from the final stream */
            {
                int const index = (int)(bits[3] >> 53);
                HUF_DEltX2 const entry = dtable[index];
                MEM_write16(op[3], entry.sequence);
                bits[3] <<= (entry.nbBits);
                op[3] += (entry.length);
            }
            /* Do 4 table lookups from the final stream & reload bitstreams */
            for (stream = 0; stream < 4; ++stream) {
                /* Do a table lookup from the final stream.
                 * This is interleaved with the reloading to reduce register
                 * pressure. This shouldn't be necessary, but compilers can
                 * struggle with codegen with high register pressure.
                 */
                {
                    int const index = (int)(bits[3] >> 53);
                    HUF_DEltX2 const entry = dtable[index];
                    MEM_write16(op[3], entry.sequence);
                    bits[3] <<= (entry.nbBits);
                    op[3] += (entry.length);
                }
                /* Reload the bistreams. The final bitstream must be reloaded
                 * after the 5th symbol was decoded.
                 */



( run in 0.567 second using v1.01-cache-2.11-cpan-71847e10f99 )