Compress-Stream-Zstd
view release on metacpan or search on metacpan
ext/zstd/lib/decompress/huf_decompress_amd64.S view on Meta::CPAN
.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.text
/* Sets up register mappings for clarity.
* op[], bits[], dtable & ip[0] each get their own register.
* ip[1,2,3] & olimit alias var[].
* %rax is a scratch register.
*/
#define op0 rsi
#define op1 rbx
#define op2 rcx
#define op3 rdi
#define ip0 r8
#define ip1 r9
#define ip2 r10
#define ip3 r11
#define bits0 rbp
#define bits1 rdx
#define bits2 r12
#define bits3 r13
#define dtable r14
#define olimit r15
/* var[] aliases ip[1,2,3] & olimit
* ip[1,2,3] are saved every iteration.
* olimit is only used in compute_olimit.
*/
#define var0 r15
#define var1 r9
#define var2 r10
#define var3 r11
/* 32-bit var registers */
#define vard0 r15d
#define vard1 r9d
#define vard2 r10d
#define vard3 r11d
/* Calls X(N) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM(X) \
X(0); \
X(1); \
X(2); \
X(3)
/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
X(0, idx); \
X(1, idx); \
X(2, idx); \
X(3, idx)
/* Define both _HUF_* & HUF_* symbols because MacOS
* C symbols are prefixed with '_' & Linux symbols aren't.
*/
_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
ZSTD_CET_ENDBRANCH
/* Save all registers - even if they are callee saved for simplicity. */
push %rax
push %rbx
push %rcx
push %rdx
push %rbp
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
/* Read HUF_DecompressAsmArgs* args from %rax */
movq %rdi, %rax
movq 0(%rax), %ip0
movq 8(%rax), %ip1
movq 16(%rax), %ip2
movq 24(%rax), %ip3
movq 32(%rax), %op0
movq 40(%rax), %op1
movq 48(%rax), %op2
movq 56(%rax), %op3
movq 64(%rax), %bits0
movq 72(%rax), %bits1
movq 80(%rax), %bits2
movq 88(%rax), %bits3
movq 96(%rax), %dtable
push %rax /* argument */
push 104(%rax) /* ilimit */
push 112(%rax) /* oend */
push %olimit /* olimit space */
subq $24, %rsp
.L_4X1_compute_olimit:
/* Computes how many iterations we can do safely
* %r15, %rax may be clobbered
* rbx, rdx must be saved
* op3 & ip0 mustn't be clobbered
*/
movq %rbx, 0(%rsp)
movq %rdx, 8(%rsp)
movq 32(%rsp), %rax /* rax = oend */
subq %op3, %rax /* rax = oend - op3 */
/* r15 = (oend - op3) / 5 */
movabsq $-3689348814741910323, %rdx
mulq %rdx
movq %rdx, %r15
shrq $2, %r15
movq %ip0, %rax /* rax = ip0 */
movq 40(%rsp), %rdx /* rdx = ilimit */
subq %rdx, %rax /* rax = ip0 - ilimit */
ext/zstd/lib/decompress/huf_decompress_amd64.S view on Meta::CPAN
/* If op3 < olimit: continue the loop */
cmp %op3, 24(%rsp)
ja .L_4X1_loop_body
/* Reload ip[1,2,3] from stack */
movq 0(%rsp), %ip1
movq 8(%rsp), %ip2
movq 16(%rsp), %ip3
/* Re-compute olimit */
jmp .L_4X1_compute_olimit
#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE
#undef RELOAD_BITS
.L_4X1_exit:
addq $24, %rsp
/* Restore stack (oend & olimit) */
pop %rax /* olimit */
pop %rax /* oend */
pop %rax /* ilimit */
pop %rax /* arg */
/* Save ip / op / bits */
movq %ip0, 0(%rax)
movq %ip1, 8(%rax)
movq %ip2, 16(%rax)
movq %ip3, 24(%rax)
movq %op0, 32(%rax)
movq %op1, 40(%rax)
movq %op2, 48(%rax)
movq %op3, 56(%rax)
movq %bits0, 64(%rax)
movq %bits1, 72(%rax)
movq %bits2, 80(%rax)
movq %bits3, 88(%rax)
/* Restore registers */
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rbp
pop %rdx
pop %rcx
pop %rbx
pop %rax
ret
_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
ZSTD_CET_ENDBRANCH
/* Save all registers - even if they are callee saved for simplicity. */
push %rax
push %rbx
push %rcx
push %rdx
push %rbp
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
movq %rdi, %rax
movq 0(%rax), %ip0
movq 8(%rax), %ip1
movq 16(%rax), %ip2
movq 24(%rax), %ip3
movq 32(%rax), %op0
movq 40(%rax), %op1
movq 48(%rax), %op2
movq 56(%rax), %op3
movq 64(%rax), %bits0
movq 72(%rax), %bits1
movq 80(%rax), %bits2
movq 88(%rax), %bits3
movq 96(%rax), %dtable
push %rax /* argument */
push %rax /* olimit */
push 104(%rax) /* ilimit */
movq 112(%rax), %rax
push %rax /* oend3 */
movq %op3, %rax
push %rax /* oend2 */
movq %op2, %rax
push %rax /* oend1 */
movq %op1, %rax
push %rax /* oend0 */
/* Scratch space */
subq $8, %rsp
.L_4X2_compute_olimit:
/* Computes how many iterations we can do safely
* %r15, %rax may be clobbered
* rdx must be saved
* op[1,2,3,4] & ip0 mustn't be clobbered
*/
movq %rdx, 0(%rsp)
/* We can consume up to 7 input bytes each iteration. */
movq %ip0, %rax /* rax = ip0 */
( run in 2.809 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )