iterations results from the CPAN

Compress-Stream-Zstd

/* Applies the one-argument macro OP to every stream index, in order.
 * Expands to: OP(0); OP(1); OP(2); OP(3)
 * No separator is emitted after the final OP(3).
 */
#define FOR_EACH_STREAM(OP) \
    OP(0); OP(1);           \
    OP(2); OP(3)

/* Applies the two-argument macro OP to every stream index, in order,
 * forwarding the same idx to each invocation.
 * Expands to: OP(0, idx); OP(1, idx); OP(2, idx); OP(3, idx)
 * No separator is emitted after the final invocation.
 */
#define FOR_EACH_STREAM_WITH_INDEX(OP, idx) \
    OP(0, idx); OP(1, idx);                 \
    OP(2, idx); OP(3, idx)

/* Define both _HUF_* & HUF_* symbols because MacOS
 * C symbols are prefixed with '_' & Linux symbols aren't.
 */
_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
    ZSTD_CET_ENDBRANCH
    /* Save all registers - even if they are callee saved for simplicity. */
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    /* Read the HUF_DecompressAsmArgs* fields: the pointer arrives in %rdi and is copied to %rax */
    movq %rdi, %rax
    movq  0(%rax), %ip0
    movq  8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax      /* argument */
    push 104(%rax) /* ilimit */
    push 112(%rax) /* oend */
    push %olimit   /* olimit space */

    subq $24, %rsp

.L_4X1_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rbx, rdx must be saved
     * op3 & ip0 mustn't be clobbered
     */
    movq %rbx, 0(%rsp)
    movq %rdx, 8(%rsp)

    movq 32(%rsp), %rax /* rax = oend */
    subq %op3,    %rax  /* rax = oend - op3 */

    /* r15 = (oend - op3) / 5 */
    movabsq $-3689348814741910323, %rdx
    mulq %rdx
    movq %rdx, %r15
    shrq $2, %r15

    movq %ip0,     %rax /* rax = ip0 */
    movq 40(%rsp), %rdx /* rdx = ilimit */
    subq %rdx,     %rax /* rax = ip0 - ilimit */
    movq %rax,     %rbx /* rbx = ip0 - ilimit */

    /* rdx = (ip0 - ilimit) / 7 */
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %rbx
    shrq %rbx
    addq %rbx, %rdx
    shrq $2, %rdx

    /* r15 = min(%rdx, %r15) */
    cmpq %rdx, %r15
    cmova %rdx, %r15

    /* r15 = r15 * 5 */
    leaq (%r15, %r15, 4), %r15

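    /* olimit = op3 + r15
     * NOTE(review): the add below only yields this if %olimit names the same
     * register as %r15 -- confirm against the register #defines. */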
    addq %op3, %olimit

    movq 8(%rsp), %rdx
    movq 0(%rsp), %rbx

    /* If (op3 + 20 > olimit) go to exit */
    movq %op3, %rax    /* rax = op3 */
    addq $20,  %rax    /* rax = op3 + 20 */
    cmpq %rax, %olimit /* op3 + 20 > olimit */
    jb .L_4X1_exit

    /* If (ip1 < ip0) go to exit */
    cmpq %ip0, %ip1
    jb .L_4X1_exit

    /* If (ip2 < ip1) go to exit */
    cmpq %ip1, %ip2
    jb .L_4X1_exit

    /* If (ip3 < ip2) go to exit */
    cmpq %ip2, %ip3
    jb .L_4X1_exit

ext/zstd/lib/decompress/huf_decompress_amd64.S view on Meta::CPAN

    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
    ZSTD_CET_ENDBRANCH
    /* Save all registers - even if they are callee saved for simplicity. */
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    movq %rdi, %rax
    movq  0(%rax), %ip0
    movq  8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax      /* argument */
    push %rax      /* olimit */
    push 104(%rax) /* ilimit */

    movq 112(%rax), %rax
    push %rax /* oend3 */

    movq %op3, %rax
    push %rax /* oend2 */

    movq %op2, %rax
    push %rax /* oend1 */

    movq %op1, %rax
    push %rax /* oend0 */

    /* Scratch space */
    subq $8, %rsp

.L_4X2_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rdx must be saved
     * op[0,1,2,3] & ip0 mustn't be clobbered
     */
    movq %rdx, 0(%rsp)

    /* We can consume up to 7 input bytes each iteration. */
    movq %ip0,     %rax  /* rax = ip0 */
    movq 40(%rsp), %rdx  /* rdx = ilimit */
    subq %rdx,     %rax  /* rax = ip0 - ilimit */
    movq %rax,    %r15   /* r15 = ip0 - ilimit */

    /* rdx = rax / 7 */
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %r15
    shrq %r15
    addq %r15, %rdx
    shrq $2, %rdx

    /* r15 = (ip0 - ilimit) / 7 */
    movq %rdx, %r15

    /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
    movq 8(%rsp),  %rax /* rax = oend0 */
    subq %op0,     %rax /* rax = oend0 - op0 */
    movq 16(%rsp), %rdx /* rdx = oend1 */
    subq %op1,     %rdx /* rdx = oend1 - op1 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movq 24(%rsp), %rax /* rax = oend2 */
    subq %op2,     %rax /* rax = oend2 - op2 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movq 32(%rsp), %rax /* rax = oend3 */
    subq %op3,     %rax /* rax = oend3 - op3 */

    cmpq  %rax,    %rdx
    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */

    movabsq $-3689348814741910323, %rax
    mulq %rdx
    shrq $3,       %rdx /* rdx = rdx / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq  %rdx, %r15
    cmova %rdx, %r15

    /* olimit = op3 + 5 * r15 */
    movq %r15, %rax
    leaq (%op3, %rax, 4), %olimit
    addq %rax, %olimit

    movq 0(%rsp), %rdx

    /* If (op3 + 10 > olimit) */

( run in 1.537 second using v1.01-cache-2.11-cpan-71847e10f99 )