ISAL-Crypto

 view release on metacpan or  search on metacpan

isa-l_crypto/aes/gcm_sse.asm  view on Meta::CPAN

;       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; Throughout the code, one-tab and two-tab indentation is used: one tab for the GHASH part, two tabs for the AES part.
;

%include "reg_sizes.asm"
%include "gcm_defines.asm"

; Build-time key-size selection.
; The including build must define at least one of GCM128_MODE, GCM192_MODE
; or GCM256_MODE; assembly is aborted with an error when none is defined.
; NOTE(review): nothing here rejects more than one mode being defined at once.
%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_sse.asm!"
%endif
%endif
%endif

; FUNCT_EXTENSION is an optional suffix appended to every generated symbol
; name; it defaults to empty when the build does not supply one.
%ifndef FUNCT_EXTENSION
%define FUNCT_EXTENSION
%endif

; FN_NAME(x,y) pastes together the symbol name
;   aes_gcm_ <x> _<keybits> <y> sse <FUNCT_EXTENSION>
; so one source file yields distinct symbols per key size.
; NROUNDS is the per-key-size AES round count used by the cipher code
; (presumably the number of full rounds before the final one, i.e. total
; rounds minus one -- confirm against the round loops outside this view).

; 128-bit key variant.
%ifdef GCM128_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION
%define NROUNDS 9
%endif

; 192-bit key variant.
%ifdef GCM192_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION
%define NROUNDS 11
%endif

; 256-bit key variant.
%ifdef GCM256_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION
%define NROUNDS 13
%endif


default rel

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Stack frame layout
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; STACK_OFFSET: bytes occupied by the five 8-byte registers that are pushed
; on the stack to be preserved (the push sequence is outside this view).
%define STACK_OFFSET 8*5

; Offsets of the 16-byte scratch slots that hold AES states 2..8
; (State 1 is kept in an XMM register, so it has no slot).
%define	TMP2	16*0    ; Temporary storage for AES State 2
%define	TMP3	16*1    ; Temporary storage for AES State 3
%define	TMP4	16*2    ; Temporary storage for AES State 4
%define	TMP5	16*3    ; Temporary storage for AES State 5
%define	TMP6	16*4    ; Temporary storage for AES State 6
%define	TMP7	16*5    ; Temporary storage for AES State 7
%define	TMP8	16*6    ; Temporary storage for AES State 8

; Total size of the seven scratch slots above.
%define	LOCAL_STORAGE	16*7

; Extra room for ten 16-byte XMM registers on win64 builds only
; (presumably xmm6-xmm15, which the Microsoft x64 ABI treats as callee-saved;
; the save/restore code is outside this view).
%ifidn __OUTPUT_FORMAT__, win64
	%define	XMM_STORAGE	16*10
%else
	%define	XMM_STORAGE	0
%endif

; Total stack bytes reserved for locals. Parenthesized so the sum expands
; as a single term when used inside a larger expression
; (e.g. 2*VARIABLE_OFFSET or x - VARIABLE_OFFSET).
%define	VARIABLE_OFFSET	(LOCAL_STORAGE + XMM_STORAGE)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly, (i.e. >>1 )
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GHASH_MUL  7
%define %%GH %1         ; 16 Bytes
%define %%HK %2         ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
        ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; Karatsuba Method
        movdqa  %%T1, %%GH
        pshufd  %%T2, %%GH, 01001110b
        pshufd  %%T3, %%HK, 01001110b
        pxor    %%T2, %%GH                              ; %%T2 = (a1+a0)
        pxor    %%T3, %%HK                              ; %%T3 = (b1+b0)

        pclmulqdq       %%T1, %%HK, 0x11                ; %%T1 = a1*b1
        pclmulqdq       %%GH, %%HK, 0x00                ; %%GH = a0*b0
        pclmulqdq       %%T2, %%T3, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
        pxor    %%T2, %%GH
        pxor    %%T2, %%T1                              ; %%T2 = a0*b1+a1*b0

        movdqa  %%T3, %%T2
        pslldq  %%T3, 8                                 ; shift-L %%T3 2 DWs
        psrldq  %%T2, 8                                 ; shift-R %%T2 2 DWs
        pxor    %%GH, %%T3
        pxor    %%T1, %%T2                              ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK


        ;first phase of the reduction
        movdqa  %%T2, %%GH
        movdqa  %%T3, %%GH
        movdqa  %%T4, %%GH                              ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently

        pslld   %%T2, 31                                ; packed right shifting << 31
        pslld   %%T3, 30                                ; packed right shifting shift << 30
        pslld   %%T4, 25                                ; packed right shifting shift << 25
        pxor    %%T2, %%T3                              ; xor the shifted versions
        pxor    %%T2, %%T4

        movdqa  %%T5, %%T2
        psrldq  %%T5, 4                                 ; shift-R %%T5 1 DW

        pslldq  %%T2, 12                                ; shift-L %%T2 3 DWs
        pxor    %%GH, %%T2                              ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        movdqa  %%T2,%%GH                               ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations
        movdqa  %%T3,%%GH

isa-l_crypto/aes/gcm_sse.asm  view on Meta::CPAN

%define	%%T3		%12
%define	%%T4		%13
%define	%%T5		%14	; temp reg 5


	mov	%%T1, %%A_IN		; T1 = AAD
	mov	%%T2, %%A_LEN		; T2 = aadLen
	pxor	%%AAD_HASH, %%AAD_HASH

	cmp	%%T2, 16
	jl	%%_get_small_AAD_block

%%_get_AAD_loop16:

	movdqu	%%XTMP1, [%%T1]
	;byte-reflect the AAD data
	pshufb	%%XTMP1, [SHUF_MASK]
	pxor	%%AAD_HASH, %%XTMP1
	GHASH_MUL	%%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

	sub	%%T2, 16
	je	%%_CALC_AAD_done

	add	%%T1, 16
	cmp	%%T2, 16
	jge	%%_get_AAD_loop16

%%_get_small_AAD_block:
	READ_SMALL_DATA_INPUT	%%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
	;byte-reflect the AAD data
	pshufb	%%XTMP1, [SHUF_MASK]
	pxor	%%AAD_HASH, %%XTMP1
	GHASH_MUL	%%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

%%_CALC_AAD_done:

%endmacro ; CALC_AAD_HASH



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
; and whether encoding or decoding (ENC_DEC).
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK	8
%define	%%GDATA_KEY		%1
%define	%%GDATA_CTX		%2
%define	%%CYPH_PLAIN_OUT	%3
%define	%%PLAIN_CYPH_IN		%4
%define	%%PLAIN_CYPH_LEN	%5
%define	%%DATA_OFFSET		%6
%define	%%AAD_HASH		%7
%define	%%ENC_DEC		%8
	mov	r13, [%%GDATA_CTX + PBlockLen]
	cmp	r13, 0
	je	%%_partial_block_done		;Leave Macro if no partial blocks

	cmp	%%PLAIN_CYPH_LEN, 16		;Read in input data without over reading
	jl	%%_fewer_than_16_bytes
	XLDR	xmm1, [%%PLAIN_CYPH_IN]		;If more than 16 bytes of data, just fill the xmm register
	jmp	%%_data_read

%%_fewer_than_16_bytes:
	lea	r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
	READ_SMALL_DATA_INPUT	xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
	mov	r13, [%%GDATA_CTX + PBlockLen]

%%_data_read:				;Finished reading in data


	movdqu	xmm9, [%%GDATA_CTX + PBlockEncKey]	;xmm9 = ctx_data.partial_block_enc_key
	movdqu	xmm13, [%%GDATA_KEY + HashKey]

	lea	r12, [SHIFT_MASK]

	add	r12, r13			; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
	movdqu	xmm2, [r12]			; get the appropriate shuffle mask
	pshufb	xmm9, xmm2			;shift right r13 bytes

%ifidn	%%ENC_DEC, DEC
	movdqa	xmm3, xmm1
	pxor	xmm9, xmm1			; Cyphertext XOR E(K, Yn)

	mov	r15, %%PLAIN_CYPH_LEN
	add	r15, r13
	sub	r15, 16				;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
	jge	%%_no_extra_mask_1		;Determine if if partial block is not being filled and shift mask accordingly
	sub	r12, r15
%%_no_extra_mask_1:

	movdqu	xmm1, [r12 + ALL_F-SHIFT_MASK]	; get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	xmm9, xmm1			; mask out bottom r13 bytes of xmm9

	pand	xmm3, xmm1
	pshufb	xmm3, [SHUF_MASK]
	pshufb	xmm3, xmm2
	pxor	%%AAD_HASH, xmm3


	cmp	r15,0
	jl	%%_partial_incomplete_1

	GHASH_MUL	%%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6	;GHASH computation for the last <16 Byte block
	xor	rax,rax
	mov	[%%GDATA_CTX + PBlockLen], rax
	jmp	%%_dec_done
%%_partial_incomplete_1:
	add	[%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%%_dec_done:
	movdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH

%else
	pxor	xmm9, xmm1	; Plaintext XOR E(K, Yn)

	mov	r15, %%PLAIN_CYPH_LEN
	add	r15, r13

isa-l_crypto/aes/gcm_sse.asm  view on Meta::CPAN


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX:
;   AadHash      = hash of A_IN as produced by CALC_AAD_HASH
;   AadLen       = A_LEN
;   InLen        = 0
;   PBlockLen    = 0
;   PBlockEncKey = 0
;   OrigIV       = the 12 IV bytes followed by the top dword of ONEf
;   CurCount     = OrigIV shuffled through SHUF_MASK
; Clobbers rax, r10-r13 and xmm0-xmm6
; The arguments are read again after CALC_AAD_HASH has been handed r10-r13 and
; rax as scratch, so they must not be passed in any of those registers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT 	5
%define %%GDATA_KEY	%1
%define %%GDATA_CTX	%2
%define %%IV		%3
%define %%A_IN		%4
%define %%A_LEN		%5
%define %%AAD_HASH	xmm0
%define %%SUBHASH	xmm1

	; Hash the additional authenticated data with the stored hash key.
	movdqu	%%SUBHASH, [%%GDATA_KEY + HashKey]
	CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax

	; xmm2 was scratch for CALC_AAD_HASH; clear it so that the value stored
	; to PBlockEncKey below really is zero rather than leftover GHASH state.
	pxor	xmm2, xmm2
	mov	r10, %%A_LEN

	movdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH	; ctx_data.aad hash = aad_hash
	mov	[%%GDATA_CTX + AadLen], r10		; ctx_data.aad_length = aad_length
	xor	r10, r10
	mov	[%%GDATA_CTX + InLen], r10		; ctx_data.in_length = 0
	mov	[%%GDATA_CTX + PBlockLen], r10		; ctx_data.partial_block_length = 0
	movdqu	[%%GDATA_CTX + PBlockEncKey], xmm2	; ctx_data.partial_block_enc_key = 0

	; Build the initial counter block: start from the ONEf constant, then
	; overwrite its low 12 bytes with the IV (8 bytes + 4 bytes).
	; NOTE(review): movdqa needs ONEf to be 16-byte aligned; its definition
	; is outside this view -- confirm.
	mov	r10, %%IV
	movdqa	xmm2, [rel ONEf]			; read 12 IV bytes and pad with 0x00000001
	pinsrq	xmm2, [r10], 0				; IV bytes 0..7  -> qword 0
	pinsrd	xmm2, [r10+8], 2			; IV bytes 8..11 -> dword 2
	movdqu	[%%GDATA_CTX + OrigIV], xmm2		; ctx_data.orig_IV = iv

	pshufb	xmm2, [SHUF_MASK]			; byte-shuffle before storing as the running counter

	movdqu	[%%GDATA_CTX + CurCount], xmm2		; ctx_data.current_counter = iv
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
; struct has been initialized by GCM_INIT.
; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro	GCM_ENC_DEC		6
%define	%%GDATA_KEY		%1
%define	%%GDATA_CTX		%2
%define	%%CYPH_PLAIN_OUT	%3
%define	%%PLAIN_CYPH_IN		%4
%define	%%PLAIN_CYPH_LEN	%5
%define	%%ENC_DEC		%6
%define	%%DATA_OFFSET		r11

; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'

	cmp	%%PLAIN_CYPH_LEN, 0
	je	%%_multiple_of_16_bytes

	xor	%%DATA_OFFSET, %%DATA_OFFSET
	add	[%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
	movdqu	xmm13, [%%GDATA_KEY + HashKey]                 ; xmm13 = HashKey
	movdqu	xmm8, [%%GDATA_CTX + AadHash]


	PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC

        mov     r13, %%PLAIN_CYPH_LEN                               ; save the number of bytes of plaintext/ciphertext
	sub	r13, %%DATA_OFFSET
	mov	r10, r13	;save the amount of data left to process in r10
        and     r13, -16                                ; r13 = r13 - (r13 mod 16)

        mov     r12, r13
        shr     r12, 4
        and     r12, 7
        jz      %%_initial_num_blocks_is_0

        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*7
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*6
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*5
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*4
        jmp     %%_initial_blocks_encrypted



( run in 2.460 seconds using v1.01-cache-2.11-cpan-ceb78f64989 )