ISAL-Crypto

 view release on metacpan or  search on metacpan

isa-l_crypto/sha256_mb/sha256_opt_x1.asm  view on Meta::CPAN


; XMM register aliases.
; XTMP0..XTMP4 and XFER use %xdefine, so each alias is bound to its
; register name at the point of definition.
%xdefine XTMP0 xmm0
%xdefine XTMP1 xmm1
%xdefine XTMP2 xmm2
%xdefine XTMP3 xmm3
%xdefine XTMP4 xmm8
; NOTE(review): the name suggests XFER stages data for the 16-byte _XFER
; stack slot -- confirm in the round macros.
%xdefine XFER xmm9

; Registers reserved for constant masks. The comments describe the
; intended shuffles; the mask values themselves are loaded by code
; outside this section.
%define SHUF_00BA xmm10      ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11      ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12 ; presumably the mask operand for COPY_XMM_AND_BSWAP -- confirm

; Argument names here are numbered from 0 (arg0, arg1), whereas
; mgr_flush/submit number their arguments from 1.
; arg0/arg1 are defined outside this section.
%define MGR	arg0	; rdi or rcx
%define NBLK	arg1	; rsi or rdx
%define IDX	r8	; local variable -- consistent with caller
%define NLANX4	r10	; consistent with caller, should be r10

; TMGR, INP, SRND and TMP are four names for the same register (r9), so
; only one of them can hold a live value at any time. Per the comments
; below, the TMGR and INP values are kept in the _TMGR and _INP stack
; slots so that r9 can be reused.
%define TMGR r9	; data pointer stored in stack named _TMGR
%define INP r9	; data pointer stored in stack named _INP
%define SRND r9	; clobbers INP
%define TMP r9	; local variable -- assistant to address digest

; General-purpose register aliases for the round code.
; All are %xdefine, so ROTATE_ARGS can re-point a..h to different
; registers between rounds without emitting any instructions.
; a..h and y0..y2 are 32-bit register views; TBL is the full 64-bit rbp.
; NOTE(review): c/d/e/f use ecx/esi/edx/edi, which are the 32-bit views of
; the registers listed in the MGR/NBLK comments (rdi/rsi or rcx/rdx), so
; those arguments presumably must be saved before a..h are loaded --
; confirm against the function body.
%xdefine TBL rbp
%xdefine c ecx
%xdefine d esi
%xdefine e edx
%xdefine a eax
%xdefine b ebx

%xdefine f edi
%xdefine g r12d
%xdefine h r11d

; Scratch registers used inside the round macros.
%xdefine y0 r13d
%xdefine y1 r14d
%xdefine y2 r15d


;; Stack frame layout: per-slot sizes in bytes, followed by the offset of
;; each slot from the start of the frame. With the sizes below the
;; offsets resolve to:
;;   _STACK_ALIGN = 0, _INP_END = 8, _INP = 16, _TMGR = 24, _XFER = 32,
;;   _XMM_SAVE = _GPR_SAVE = 48, STACK_SIZE = 120 (= 15 * 8)
;; FRAMESZ plus pushes must be an odd multiple of 8
;; NOTE(review): no FRAMESZ symbol is defined in this section; the rule
;; presumably refers to STACK_SIZE -- confirm.
%define _STACK_ALIGN_SIZE 8	; 0 or 8, depending on the number of pushes
%define _INP_END_SIZE 8
%define _INP_SIZE 8
%define _TMGR_SIZE 8
%define _XFER_SIZE 16
%define _XMM_SAVE_SIZE 0	; no XMM save area, so _XMM_SAVE and _GPR_SAVE coincide
%define _GPR_SAVE_SIZE 8*9	;rbx, rdx, rbp, (rdi, rsi), r12~r15

;; Slot offsets: each slot starts where the previous one ends.
%define _STACK_ALIGN 0
%define _INP_END (_STACK_ALIGN  + _STACK_ALIGN_SIZE)
%define _INP (_INP_END  + _INP_END_SIZE)
%define _TMGR (_INP + _INP_SIZE)
%define _XFER (_TMGR + _TMGR_SIZE)
%define _XMM_SAVE (_XFER + _XFER_SIZE)
%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE)
%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE)

;; Input buffers are assumed to be unaligned, so MOVDQ maps to the
;; unaligned 128-bit move (movdqu). COPY_XMM_AND_BSWAP loads through it.
%define    MOVDQ movdqu

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add the memory operand into reg, then store the sum back to memory.
;   param 1: memory operand (read, then overwritten with the sum)
;   param 2: register operand (also left holding the sum)
; Side effects: reg is modified and the flags are set by the add.
%macro addm 2
        add     %2, %1 ; reg = reg + [mem]
        mov     %1, %2 ; [mem] = reg, i.e. the sum
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load 16 bytes from mem into xmm, then shuffle its bytes with pshufb
; using byte_flip_mask; with a byte-flip mask this swaps the byte order
; of each dword.
;   param 1: destination xmm register
;   param 2: source memory operand, loaded via MOVDQ (movdqu, so no
;            alignment requirement)
;   param 3: pshufb control mask (xmm register or memory)
%macro COPY_XMM_AND_BSWAP 3
        MOVDQ %1, %2 ; unaligned 16-byte load into the destination
        pshufb %1, %3 ; reorder the bytes in place according to the mask
%endmacro

; rotate_Xs
; Rotate values of symbols X0...X3
; Preprocessor-only: no instructions are emitted. After the rotation
;   X0 = old X1, X1 = old X2, X2 = old X3, X3 = old X0.
; %xdefine expands its right-hand side immediately, so X_ captures the
; register currently named X0 before X0 is redefined.
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endmacro

; ROTATE_ARGS
; Rotate values of symbols a...h
; Preprocessor-only: no instructions are emitted. After the rotation
;   a = old h, b = old a, c = old b, d = old c,
;   e = old d, f = old e, g = old f, h = old g.
; %xdefine expands its right-hand side immediately, so TMP_ captures the
; register currently named h before h is redefined.
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endmacro

%macro FOUR_ROUNDS_AND_SCHED 0
	;; compute s0 four at a time and s1 two at a time
	;; compute W[-16] + W[-7] 4 at a time
	movdqa  XTMP0, X3
	mov     y0, e 			; y0 = e
	ror     y0, (25-11)             ; y0 = e >> (25-11)
	mov     y1, a                   ; y1 = a
	palignr XTMP0, X2, 4            ; XTMP0 = W[-7]
	ror     y1, (22-13)             ; y1 = a >> (22-13)
	xor     y0, e                   ; y0 = e ^ (e >> (25-11))
	mov     y2, f                   ; y2 = f
	ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa  XTMP1, X1
	xor     y1, a                   ; y1 = a ^ (a >> (22-13)
	xor     y2, g                   ; y2 = f^g
	paddd   XTMP0, X0               ; XTMP0 = W[-7] + W[-16]
	xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     y2, e                   ; y2 = (f^g)&e
	ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))



( run in 0.724 second using v1.01-cache-2.11-cpan-5b529ec07f3 )