ISAL-Crypto

 view release on metacpan or  search on metacpan

isa-l_crypto/aes/gcm_avx_gen2.asm  view on Meta::CPAN

; TLen:
;       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; Throughout the code, one-tab and two-tab indentations are used: one tab for the GHASH part, two tabs for the AES part.
;

%include "reg_sizes.asm"
%include "gcm_defines.asm"

; ---------------------------------------------------------------------------
; Key-size selection.
; The including file / command line is expected to define one of
; GCM128_MODE, GCM192_MODE or GCM256_MODE. Assembly is aborted with an error
; when none of them is defined.
; ---------------------------------------------------------------------------
%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_avx_gen2.asm!"
%endif
%endif
%endif

; FUNCT_EXTENSION is an optional suffix appended to every generated function
; name; it defaults to empty when the includer does not define it.
%ifndef FUNCT_EXTENSION
%define FUNCT_EXTENSION
%endif

; For the selected mode:
;   FN_NAME(x,y) pastes together the symbol
;                aes_gcm_<x>_<keybits><y>avx_gen2<FUNCT_EXTENSION>
;   NROUNDS      is 9 / 11 / 13 for 128 / 192 / 256-bit keys, i.e. one less than
;                the total AES round count for that key size (presumably the
;                rounds performed before the final one -- the code that uses
;                NROUNDS is outside this view).
; NOTE(review): the three blocks are not mutually exclusive; if more than one
; mode macro is defined, the last matching block below overrides the earlier
; definitions.
%ifdef GCM128_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
%define NROUNDS 9
%endif

%ifdef GCM192_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
%define NROUNDS 11
%endif

%ifdef GCM256_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
%define NROUNDS 13
%endif

; Make memory references to symbols RIP-relative unless stated otherwise.
default rel
; STACK_OFFSET: bytes taken by the 5 registers (8 bytes each) that are pushed
; on the stack to be preserved. The push/pop code is outside this view.
%define STACK_OFFSET 8*5

; Offsets of the 16-byte scratch slots in the local stack area.
; Seven slots (TMP2..TMP8) are laid out back to back starting at offset 0.
%define	TMP2	16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
%define	TMP3	16*1    ; Temporary storage for AES State 3
%define	TMP4	16*2    ; Temporary storage for AES State 4
%define	TMP5	16*3    ; Temporary storage for AES State 5
%define	TMP6	16*4    ; Temporary storage for AES State 6
%define	TMP7	16*5    ; Temporary storage for AES State 7
%define	TMP8	16*6    ; Temporary storage for AES State 8

; Total size of the seven slots above.
%define	LOCAL_STORAGE	16*7

; Extra room for ten 16-byte XMM registers, reserved only when assembling for
; win64 (presumably to save callee-saved xmm6-xmm15 -- the save/restore code is
; outside this view; confirm there).
%ifidn __OUTPUT_FORMAT__, win64
	%define	XMM_STORAGE	16*10
%else
	%define	XMM_STORAGE	0
%endif

; Size of the whole local frame: scratch slots plus the optional XMM save area.
; NOTE(review): this and the other expression defines above are not wrapped in
; parentheses, so they are only safe in purely additive contexts such as
; "rsp + X"; using them after '-' or next to '*' would expand incorrectly.
%define	VARIABLE_OFFSET	LOCAL_STORAGE + XMM_STORAGE

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly, (i.e. >>1 )
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GHASH_MUL  7
%define %%GH %1         ; 16 Bytes
%define %%HK %2         ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Karatsuba
        vpshufd         %%T2, %%GH, 01001110b
        vpshufd         %%T3, %%HK, 01001110b
        vpxor           %%T2, %%T2, %%GH                ; %%T2 = (a1+a0)
        vpxor           %%T3, %%T3, %%HK                ; %%T3 = (b1+b0)

        vpclmulqdq      %%T1, %%GH, %%HK, 0x11          ; %%T1 = a1*b1
        vpclmulqdq      %%GH, %%HK, 0x00                ; %%GH = a0*b0
        vpclmulqdq      %%T2, %%T3, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
        vpxor           %%T2, %%T2, %%GH
        vpxor           %%T2, %%T2, %%T1                ; %%T2 = a0*b1+a1*b0

        vpslldq         %%T3, %%T2, 8                   ; shift-L %%T3 2 DWs
        vpsrldq         %%T2, %%T2, 8                   ; shift-R %%T2 2 DWs
        vpxor           %%GH, %%GH, %%T3
        vpxor           %%T1, %%T1, %%T2                ; <%%T1:%%GH> = %%GH x %%HK

        ;first phase of the reduction
        vpslld  %%T2, %%GH, 31                          ; packed right shifting << 31
        vpslld  %%T3, %%GH, 30                          ; packed right shifting shift << 30
        vpslld  %%T4, %%GH, 25                          ; packed right shifting shift << 25

        vpxor   %%T2, %%T2, %%T3                        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpsrldq %%T5, %%T2, 4                           ; shift-R %%T5 1 DW

        vpslldq %%T2, %%T2, 12                          ; shift-L %%T2 3 DWs
        vpxor   %%GH, %%GH, %%T2                        ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction

        vpsrld  %%T2,%%GH,1                             ; packed left shifting >> 1
        vpsrld  %%T3,%%GH,2                             ; packed left shifting >> 2
        vpsrld  %%T4,%%GH,7                             ; packed left shifting >> 7
        vpxor   %%T2, %%T2, %%T3                        ; xor the shifted versions
        vpxor   %%T2, %%T2, %%T4

        vpxor   %%T2, %%T2, %%T5
        vpxor   %%GH, %%GH, %%T2
        vpxor   %%GH, %%GH, %%T1                        ; the result is in %%GH

isa-l_crypto/aes/gcm_avx_gen2.asm  view on Meta::CPAN


	mov	%%T1, %%A_IN		; T1 = AAD
	mov	%%T2, %%A_LEN		; T2 = aadLen
	vpxor	%%AAD_HASH, %%AAD_HASH

	cmp	%%T2, 16
	jl	%%_get_small_AAD_block

%%_get_AAD_loop16:

	vmovdqu	%%XTMP1, [%%T1]
	;byte-reflect the AAD data
	vpshufb	%%XTMP1, [SHUF_MASK]
	vpxor	%%AAD_HASH, %%XTMP1
	GHASH_MUL	%%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

	sub	%%T2, 16
	je	%%_CALC_AAD_done

	add	%%T1, 16
	cmp	%%T2, 16
	jge	%%_get_AAD_loop16

%%_get_small_AAD_block:
	READ_SMALL_DATA_INPUT	%%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
	;byte-reflect the AAD data
	vpshufb	%%XTMP1, [SHUF_MASK]
	vpxor	%%AAD_HASH, %%XTMP1
	GHASH_MUL	%%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

%%_CALC_AAD_done:

%endmacro ; CALC_AAD_HASH



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input:
;  GDATA_KEY - struct gcm_key_data *
;  GDATA_CTX - struct gcm_context_data *
;  PLAIN_CYPH_IN - input text
;  PLAIN_CYPH_LEN - input text length
;  DATA_OFFSET - the current data offset
;  ENC_DEC - whether encoding or decoding
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK	8
%define	%%GDATA_KEY		%1
%define	%%GDATA_CTX		%2
%define	%%CYPH_PLAIN_OUT	%3
%define	%%PLAIN_CYPH_IN		%4
%define	%%PLAIN_CYPH_LEN	%5
%define	%%DATA_OFFSET		%6
%define	%%AAD_HASH		%7
%define	%%ENC_DEC		%8
	mov	r13, [%%GDATA_CTX + PBlockLen]
	cmp	r13, 0
	je	%%_partial_block_done		;Leave Macro if no partial blocks

	cmp	%%PLAIN_CYPH_LEN, 16		;Read in input data without over reading
	jl	%%_fewer_than_16_bytes
	VXLDR	xmm1, [%%PLAIN_CYPH_IN]		;If more than 16 bytes of data, just fill the xmm register
	jmp	%%_data_read

%%_fewer_than_16_bytes:
	lea	r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
	READ_SMALL_DATA_INPUT	xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15

%%_data_read:				;Finished reading in data


	vmovdqu	xmm9, [%%GDATA_CTX + PBlockEncKey]	;xmm9 = my_ctx_data.partial_block_enc_key
	vmovdqu	xmm13, [%%GDATA_KEY + HashKey]

	lea	r12, [SHIFT_MASK]

	cmp	r13, rax
	add	r12, r13			; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
	vmovdqu	xmm2, [r12]			; get the appropriate shuffle mask
	vpshufb	xmm9, xmm2			;shift right r13 bytes

%ifidn	%%ENC_DEC, DEC
	vmovdqa	xmm3, xmm1
	vpxor	xmm9, xmm1			; Cyphertext XOR E(K, Yn)

	mov	r15, %%PLAIN_CYPH_LEN
	add	r15, r13
	sub	r15, 16				;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
	jge	%%_no_extra_mask_1		;Determine if if partial block is not being filled and shift mask accordingly
	sub	r12, r15
%%_no_extra_mask_1:

	vmovdqu	xmm1, [r12 + ALL_F-SHIFT_MASK]	; get the appropriate mask to mask out bottom r13 bytes of xmm9
	vpand	xmm9, xmm1			; mask out bottom r13 bytes of xmm9

	vpand	xmm3, xmm1
	vpshufb	xmm3, [SHUF_MASK]
	vpshufb	xmm3, xmm2
	vpxor	%%AAD_HASH, xmm3


	cmp	r15,0
	jl	%%_partial_incomplete_1

	GHASH_MUL	%%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6	;GHASH computation for the last <16 Byte block
	xor	rax,rax
	mov	[%%GDATA_CTX + PBlockLen], rax
	jmp	%%_dec_done
%%_partial_incomplete_1:
	add	[%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%%_dec_done:
	vmovdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH

%else
	vpxor	xmm9, xmm1	; Plaintext XOR E(K, Yn)

	mov	r15, %%PLAIN_CYPH_LEN
	add	r15, r13

isa-l_crypto/aes/gcm_avx_gen2.asm  view on Meta::CPAN

; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input:
;   GDATA_KEY - struct gcm_key_data *; HashKey is loaded from it
;   GDATA_CTX - struct gcm_context_data *; the fields listed under Output are written
;   IV        - pointer to the IV; exactly 12 bytes are read from it
;   A_IN      - Additional Authentication Data
;   A_LEN     - length of A_IN in bytes
; Output: GDATA_CTX updated:
;   AadHash      = hash of A_IN as produced by CALC_AAD_HASH
;   AadLen       = A_LEN
;   InLen        = 0
;   PBlockLen    = 0
;   PBlockEncKey = 0
;   OrigIV       = the 12 IV bytes merged into the ONEf constant
;   CurCount     = OrigIV shuffled through SHUF_MASK
; Note: GDATA_CTX, IV and A_LEN are read again after CALC_AAD_HASH has run, so
;       they must not live in the scratch registers handed to it (r10-r13, rax).
; Clobbers rax, r10-r13, xmm0-xmm6 and flags
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro	GCM_INIT 	5
%define	%%GDATA_KEY	%1
%define	%%GDATA_CTX	%2
%define	%%IV		%3
%define	%%A_IN		%4
%define	%%A_LEN		%5
%define	%%AAD_HASH	xmm0
%define	%%SUBHASH	xmm1


	vmovdqu	%%SUBHASH, [%%GDATA_KEY + HashKey]

	; Hash the AAD into %%AAD_HASH. xmm2-xmm6 and r10-r13/rax are scratch
	; for the macro and hold leftovers of the GHASH computation afterwards.
	CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax

	; Really zero xmm2 before it is stored as PBlockEncKey. The previous
	; "vpxor xmm2, xmm3" only mixed two scratch registers, so GHASH
	; intermediates ended up in the context instead of the documented 0.
	vpxor	xmm2, xmm2, xmm2
	mov	r10, %%A_LEN

	vmovdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH	; ctx_data.aad hash = aad_hash
	mov	[%%GDATA_CTX + AadLen], r10		; ctx_data.aad_length = aad_length
	xor	r10, r10
	mov	[%%GDATA_CTX + InLen], r10		; ctx_data.in_length = 0
	mov	[%%GDATA_CTX + PBlockLen], r10		; ctx_data.partial_block_length = 0
	vmovdqu	[%%GDATA_CTX + PBlockEncKey], xmm2	; ctx_data.partial_block_enc_key = 0

	; Build the initial counter block: start from the ONEf constant and
	; overwrite its low 12 bytes with the IV (qword 0, then dword 2).
	mov	r10, %%IV
	vmovdqa	xmm2, [rel ONEf]			; read 12 IV bytes and pad with 0x00000001
	vpinsrq	xmm2, [r10], 0				; IV bytes 0..7
	vpinsrd	xmm2, [r10+8], 2			; IV bytes 8..11
	vmovdqu	[%%GDATA_CTX + OrigIV], xmm2		; ctx_data.orig_IV = iv

	vpshufb	xmm2, [SHUF_MASK]			; byte-shuffle before storing as the running counter

	vmovdqu	[%%GDATA_CTX + CurCount], xmm2		; ctx_data.current_counter = iv
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT
; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
;        input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
; and whether encoding or decoding (ENC_DEC)
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro	GCM_ENC_DEC 		6
%define	%%GDATA_KEY		%1
%define	%%GDATA_CTX		%2
%define	%%CYPH_PLAIN_OUT	%3
%define	%%PLAIN_CYPH_IN		%4
%define	%%PLAIN_CYPH_LEN	%5
%define	%%ENC_DEC		%6
%define	%%DATA_OFFSET		r11

; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
	cmp	%%PLAIN_CYPH_LEN, 0
	je	%%_multiple_of_16_bytes

	xor %%DATA_OFFSET, %%DATA_OFFSET
	add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN       ; Update length of data processed
	vmovdqu  xmm13, [%%GDATA_KEY + HashKey]         ; xmm13 = HashKey
	vmovdqu xmm8, [%%GDATA_CTX + AadHash]


	PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC


	mov	r13, %%PLAIN_CYPH_LEN
	sub	r13, %%DATA_OFFSET
	mov	r10, r13				; save the amount of data left to process in r10
	and     r13, -16                                ; r13 = r13 - (r13 mod 16)

        mov     r12, r13
        shr     r12, 4
        and     r12, 7

        jz      %%_initial_num_blocks_is_0

        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*7
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*6
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*5
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*4



( run in 1.026 second using v1.01-cache-2.11-cpan-df04353d9ac )