ISAL-Crypto
view release on metacpan or search on metacpan
isa-l_crypto/aes/gcm_sse.asm view on Meta::CPAN
; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; Throughout the code, one-tab and two-tab indentation is used: one tab marks the GHASH part, two tabs mark the AES part.
;
%include "reg_sizes.asm"
%include "gcm_defines.asm"
; ---------------------------------------------------------------------------
; Build-time mode selection.
; The key size is chosen by defining GCM128_MODE, GCM192_MODE or GCM256_MODE
; before this file is assembled. Assembly is aborted when none is defined.
; NOTE(review): nothing here rejects more than one mode being defined; in that
; case the later FN_NAME/NROUNDS definitions below replace the earlier ones.
; ---------------------------------------------------------------------------
%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_sse.asm!"
%endif
%endif
%endif
; Optional suffix appended to every generated function name; empty by default.
%ifndef FUNCT_EXTENSION
%define FUNCT_EXTENSION
%endif
; FN_NAME(x,y) pastes together a symbol name of the form
;   aes_gcm_<x>_<keybits><y>sse<FUNCT_EXTENSION>
; NROUNDS is 9/11/13 for 128/192/256-bit keys.
; NOTE(review): AES-128/192/256 use 10/12/14 rounds in total, so NROUNDS
; presumably excludes the final round - confirm against the round loops.
%ifdef GCM128_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION
%define NROUNDS 9
%endif
%ifdef GCM192_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION
%define NROUNDS 11
%endif
%ifdef GCM256_MODE
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION
%define NROUNDS 13
%endif
; Make memory references RIP-relative by default.
default rel
; Five 8-byte registers are pushed onto the stack to preserve them;
; STACK_OFFSET is the total size of those pushes.
; NOTE(review): presumably used to reach stack-passed arguments - confirm at the use sites.
%define STACK_OFFSET 8*5
; Byte offsets of the 16-byte scratch slots used to spill AES states 2..8.
%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
%define TMP3 16*1 ; Temporary storage for AES State 3
%define TMP4 16*2 ; Temporary storage for AES State 4
%define TMP5 16*3 ; Temporary storage for AES State 5
%define TMP6 16*4 ; Temporary storage for AES State 6
%define TMP7 16*5 ; Temporary storage for AES State 7
%define TMP8 16*6 ; Temporary storage for AES State 8
; Total size of the seven scratch slots above.
%define LOCAL_STORAGE 16*7
; win64 builds reserve an additional 16*10 bytes.
; NOTE(review): presumably the save area for xmm6-xmm15, which are callee-saved
; under the Microsoft x64 ABI - confirm against the function prologue.
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_STORAGE 16*10
%else
%define XMM_STORAGE 0
%endif
; Combined size of the scratch slots and the (optional) XMM save area.
; NOTE(review): the expansion is not parenthesized, so only use it where a bare
; "a + b" binds as intended (e.g. not as an operand of '*').
%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly, (i.e. >>1 )
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GHASH_MUL 7
%define %%GH %1 ; 16 Bytes
%define %%HK %2 ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Karatsuba Method
movdqa %%T1, %%GH
pshufd %%T2, %%GH, 01001110b
pshufd %%T3, %%HK, 01001110b
pxor %%T2, %%GH ; %%T2 = (a1+a0)
pxor %%T3, %%HK ; %%T3 = (b1+b0)
pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
pxor %%T2, %%GH
pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
movdqa %%T3, %%T2
pslldq %%T3, 8 ; shift-L %%T3 2 DWs
psrldq %%T2, 8 ; shift-R %%T2 2 DWs
pxor %%GH, %%T3
pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
;first phase of the reduction
movdqa %%T2, %%GH
movdqa %%T3, %%GH
movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
pslld %%T2, 31 ; packed right shifting << 31
pslld %%T3, 30 ; packed right shifting shift << 30
pslld %%T4, 25 ; packed right shifting shift << 25
pxor %%T2, %%T3 ; xor the shifted versions
pxor %%T2, %%T4
movdqa %%T5, %%T2
psrldq %%T5, 4 ; shift-R %%T5 1 DW
pslldq %%T2, 12 ; shift-L %%T2 3 DWs
pxor %%GH, %%T2 ; first phase of the reduction complete
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;second phase of the reduction
movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations
movdqa %%T3,%%GH
isa-l_crypto/aes/gcm_sse.asm view on Meta::CPAN
%define %%T3 %12
%define %%T4 %13
%define %%T5 %14 ; temp reg 5
mov %%T1, %%A_IN ; T1 = AAD
mov %%T2, %%A_LEN ; T2 = aadLen
pxor %%AAD_HASH, %%AAD_HASH
cmp %%T2, 16
jl %%_get_small_AAD_block
%%_get_AAD_loop16:
movdqu %%XTMP1, [%%T1]
;byte-reflect the AAD data
pshufb %%XTMP1, [SHUF_MASK]
pxor %%AAD_HASH, %%XTMP1
GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
sub %%T2, 16
je %%_CALC_AAD_done
add %%T1, 16
cmp %%T2, 16
jge %%_get_AAD_loop16
%%_get_small_AAD_block:
READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
;byte-reflect the AAD data
pshufb %%XTMP1, [SHUF_MASK]
pxor %%AAD_HASH, %%XTMP1
GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
%%_CALC_AAD_done:
%endmacro ; CALC_AAD_HASH
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles encryption/decryption and the tag update for partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
; and whether encoding or decoding (ENC_DEC).
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK 8
%define %%GDATA_KEY %1
%define %%GDATA_CTX %2
%define %%CYPH_PLAIN_OUT %3
%define %%PLAIN_CYPH_IN %4
%define %%PLAIN_CYPH_LEN %5
%define %%DATA_OFFSET %6
%define %%AAD_HASH %7
%define %%ENC_DEC %8
mov r13, [%%GDATA_CTX + PBlockLen]
cmp r13, 0
je %%_partial_block_done ;Leave Macro if no partial blocks
cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
jl %%_fewer_than_16_bytes
XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
jmp %%_data_read
%%_fewer_than_16_bytes:
lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
mov r13, [%%GDATA_CTX + PBlockLen]
%%_data_read: ;Finished reading in data
movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key
movdqu xmm13, [%%GDATA_KEY + HashKey]
lea r12, [SHIFT_MASK]
add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
movdqu xmm2, [r12] ; get the appropriate shuffle mask
pshufb xmm9, xmm2 ;shift right r13 bytes
%ifidn %%ENC_DEC, DEC
movdqa xmm3, xmm1
pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
mov r15, %%PLAIN_CYPH_LEN
add r15, r13
sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly
sub r12, r15
%%_no_extra_mask_1:
movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
pand xmm3, xmm1
pshufb xmm3, [SHUF_MASK]
pshufb xmm3, xmm2
pxor %%AAD_HASH, xmm3
cmp r15,0
jl %%_partial_incomplete_1
GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
xor rax,rax
mov [%%GDATA_CTX + PBlockLen], rax
jmp %%_dec_done
%%_partial_incomplete_1:
add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%%_dec_done:
movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
%else
pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
mov r15, %%PLAIN_CYPH_LEN
add r15, r13
isa-l_crypto/aes/gcm_sse.asm view on Meta::CPAN
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input:
;   %1 GDATA_KEY - gcm_key_data *; HashKey is loaded from it
;   %2 GDATA_CTX - gcm_context_data *; every field listed under Output is written
;   %3 IV        - pointer to the IV; exactly 12 bytes are read (8 + 4)
;   %4 A_IN      - pointer to the Additional Authentication Data
;   %5 A_LEN     - length of the Additional Authentication Data
; Output: Updated GDATA_CTX:
;   AadHash      = hash of A_IN as produced by CALC_AAD_HASH
;   AadLen       = A_LEN
;   InLen        = 0
;   PBlockLen    = 0
;   PBlockEncKey = 0
;   OrigIV       = the 12 IV bytes followed by the last 4 bytes of ONEf
;   CurCount     = OrigIV shuffled through SHUF_MASK
; Clobbers rax, r10-r13 and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_INIT 5
%define %%GDATA_KEY %1
%define %%GDATA_CTX %2
%define %%IV %3
%define %%A_IN %4
%define %%A_LEN %5
%define %%AAD_HASH xmm0
%define %%SUBHASH xmm1
        movdqu  %%SUBHASH, [%%GDATA_KEY + HashKey]

        ; Hash the AAD into %%AAD_HASH; xmm2-xmm6, r10-r13 and rax are scratch.
        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax

        ; Produce the zero that is stored into PBlockEncKey below.
        ; xmm2 and xmm3 are distinct scratch registers left over from
        ; CALC_AAD_HASH, so XORing one with the other does not yield zero;
        ; XOR xmm2 with itself instead.
        pxor    xmm2, xmm2
        mov     r10, %%A_LEN

        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key = 0

        ; Build the initial counter block: start from the ONEf constant, then
        ; overwrite bytes 0-7 and 8-11 with the IV, leaving bytes 12-15 from ONEf.
        mov     r10, %%IV
        movdqa  xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        pinsrq  xmm2, [r10], 0                          ; IV bytes 0-7  -> qword 0
        pinsrd  xmm2, [r10+8], 2                        ; IV bytes 8-11 -> dword 2
        movdqu  [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        ; Same shuffle mask that is used to byte-reflect the AAD blocks.
        pshufb  xmm2, [SHUF_MASK]

        movdqu  [%%GDATA_CTX + CurCount], xmm2          ; ctx_data.current_counter = iv
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
; struct has been initialized by GCM_INIT.
; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_ENC_DEC 6
%define %%GDATA_KEY %1
%define %%GDATA_CTX %2
%define %%CYPH_PLAIN_OUT %3
%define %%PLAIN_CYPH_IN %4
%define %%PLAIN_CYPH_LEN %5
%define %%ENC_DEC %6
%define %%DATA_OFFSET r11
; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of fewer than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
cmp %%PLAIN_CYPH_LEN, 0
je %%_multiple_of_16_bytes
xor %%DATA_OFFSET, %%DATA_OFFSET
add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
movdqu xmm8, [%%GDATA_CTX + AadHash]
PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
sub r13, %%DATA_OFFSET
mov r10, r13 ;save the amount of data left to process in r10
and r13, -16 ; r13 = r13 - (r13 mod 16)
mov r12, r13
shr r12, 4
and r12, 7
jz %%_initial_num_blocks_is_0
cmp r12, 7
je %%_initial_num_blocks_is_7
cmp r12, 6
je %%_initial_num_blocks_is_6
cmp r12, 5
je %%_initial_num_blocks_is_5
cmp r12, 4
je %%_initial_num_blocks_is_4
cmp r12, 3
je %%_initial_num_blocks_is_3
cmp r12, 2
je %%_initial_num_blocks_is_2
jmp %%_initial_num_blocks_is_1
%%_initial_num_blocks_is_7:
INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
sub r13, 16*7
jmp %%_initial_blocks_encrypted
%%_initial_num_blocks_is_6:
INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
sub r13, 16*6
jmp %%_initial_blocks_encrypted
%%_initial_num_blocks_is_5:
INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
sub r13, 16*5
jmp %%_initial_blocks_encrypted
%%_initial_num_blocks_is_4:
INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
sub r13, 16*4
jmp %%_initial_blocks_encrypted
( run in 2.460 seconds using v1.01-cache-2.11-cpan-ceb78f64989 )