Alien-libsecp256k1
view release on metacpan or search on metacpan
libsecp256k1/src/asm/field_10x26_arm.s view on Meta::CPAN
@ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
/***********************************************************************
* Copyright (c) 2014 Wladimir J. van der Laan *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
***********************************************************************/
/*
ARM implementation of field_10x26 inner loops.
Note:
- To avoid unnecessary loads and make use of available registers, two
'passes' have every time been interleaved, with the odd passes accumulating c' and d'
which will be added to c and d respectively in the even passes
*/
.syntax unified
@ eabi attributes - see readelf -A
.eabi_attribute 24, 1 @ Tag_ABI_align_needed = 8-byte
.eabi_attribute 25, 1 @ Tag_ABI_align_preserved = 8-byte, except leaf SP
.text
@ Field constants
.set field_R0, 0x3d10
.set field_R1, 0x400
.set field_not_M, 0xfc000000 @ ~M = ~0x3ffffff
.align 2
.global secp256k1_fe_mul_inner
.type secp256k1_fe_mul_inner, %function
.hidden secp256k1_fe_mul_inner
@ Arguments:
@ r0 r Restrict: can overlap with a, not with b
@ r1 a
@ r2 b
@ Stack (total 4+10*4 = 44)
@ sp + #0 saved 'r' pointer
@ sp + #4 + 4*X t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
secp256k1_fe_mul_inner:
stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
sub sp, sp, #48 @ frame=44 + alignment
str r0, [sp, #0] @ save result address, we need it only at the end
/******************************************
* Main computation code.
******************************************
Allocation:
r0,r14,r7,r8 scratch
r1 a (pointer)
r2 b (pointer)
r3:r4 c
r5:r6 d
r11:r12 c'
r9:r10 d'
Note: do not write to r[] here, it may overlap with a[]
*/
/* A - interleaved with B */
ldr r7, [r1, #0*4] @ a[0]
ldr r8, [r2, #9*4] @ b[9]
ldr r0, [r1, #1*4] @ a[1]
umull r5, r6, r7, r8 @ d = a[0] * b[9]
ldr r14, [r2, #8*4] @ b[8]
umull r9, r10, r0, r8 @ d' = a[1] * b[9]
ldr r7, [r1, #2*4] @ a[2]
umlal r5, r6, r0, r14 @ d += a[1] * b[8]
ldr r8, [r2, #7*4] @ b[7]
umlal r9, r10, r7, r14 @ d' += a[2] * b[8]
ldr r0, [r1, #3*4] @ a[3]
umlal r5, r6, r7, r8 @ d += a[2] * b[7]
ldr r14, [r2, #6*4] @ b[6]
umlal r9, r10, r0, r8 @ d' += a[3] * b[7]
ldr r7, [r1, #4*4] @ a[4]
umlal r5, r6, r0, r14 @ d += a[3] * b[6]
ldr r8, [r2, #5*4] @ b[5]
umlal r9, r10, r7, r14 @ d' += a[4] * b[6]
ldr r0, [r1, #5*4] @ a[5]
umlal r5, r6, r7, r8 @ d += a[4] * b[5]
ldr r14, [r2, #4*4] @ b[4]
umlal r9, r10, r0, r8 @ d' += a[5] * b[5]
ldr r7, [r1, #6*4] @ a[6]
umlal r5, r6, r0, r14 @ d += a[5] * b[4]
ldr r8, [r2, #3*4] @ b[3]
umlal r9, r10, r7, r14 @ d' += a[6] * b[4]
ldr r0, [r1, #7*4] @ a[7]
umlal r5, r6, r7, r8 @ d += a[6] * b[3]
ldr r14, [r2, #2*4] @ b[2]
libsecp256k1/src/asm/field_10x26_arm.s view on Meta::CPAN
add r1, r0, #3*4
stmia r1, {r2,r7,r8,r9,r10}
bic r2, r3, field_not_M @ r[8] = c & M
str r2, [r0, #8*4]
mov r3, r3, lsr #26 @ c >>= 26
orr r3, r3, r4, asl #6
mov r4, r4, lsr #26
mov r14, field_R1 @ c += u8 * R1
umlal r3, r4, r11, r14
movw r14, field_R0 @ c += d * R0
umlal r3, r4, r5, r14
adds r3, r3, r12 @ c += t9
adc r4, r4, #0
add r1, sp, #4 + 0*4 @ r7,r8,r9 = t0,t1,t2
ldmia r1, {r7,r8,r9}
ubfx r2, r3, #0, #22 @ r[9] = c & (M >> 4)
str r2, [r0, #9*4]
mov r3, r3, lsr #22 @ c >>= 22
orr r3, r3, r4, asl #10
mov r4, r4, lsr #22
movw r14, field_R1 << 4 @ c += d * (R1 << 4)
umlal r3, r4, r5, r14
movw r14, field_R0 >> 4 @ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
umull r5, r6, r3, r14 @ d = c.lo * (R0 >> 4)
adds r5, r5, r7 @ d.lo += t0
mla r6, r14, r4, r6 @ d.hi += c.hi * (R0 >> 4)
adc r6, r6, 0 @ d.hi += carry
bic r2, r5, field_not_M @ r[0] = d & M
str r2, [r0, #0*4]
mov r5, r5, lsr #26 @ d >>= 26
orr r5, r5, r6, asl #6
mov r6, r6, lsr #26
movw r14, field_R1 >> 4 @ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
umull r1, r2, r3, r14 @ tmp = c.lo * (R1 >> 4)
adds r5, r5, r8 @ d.lo += t1
adc r6, r6, #0 @ d.hi += carry
adds r5, r5, r1 @ d.lo += tmp.lo
mla r2, r14, r4, r2 @ tmp.hi += c.hi * (R1 >> 4)
adc r6, r6, r2 @ d.hi += carry + tmp.hi
bic r2, r5, field_not_M @ r[1] = d & M
str r2, [r0, #1*4]
mov r5, r5, lsr #26 @ d >>= 26 (ignore hi)
orr r5, r5, r6, asl #6
add r5, r5, r9 @ d += t2
str r5, [r0, #2*4] @ r[2] = d
add sp, sp, #48
ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size secp256k1_fe_mul_inner, .-secp256k1_fe_mul_inner
.align 2
.global secp256k1_fe_sqr_inner
.type secp256k1_fe_sqr_inner, %function
.hidden secp256k1_fe_sqr_inner
@ Arguments:
@ r0 r Can overlap with a
@ r1 a
@ Stack (total 4+10*4 = 44)
@ sp + #0 saved 'r' pointer
@ sp + #4 + 4*X t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
secp256k1_fe_sqr_inner:
stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
sub sp, sp, #48 @ frame=44 + alignment
str r0, [sp, #0] @ save result address, we need it only at the end
/******************************************
* Main computation code.
******************************************
Allocation:
r0,r14,r2,r7,r8 scratch
r1 a (pointer)
r3:r4 c
r5:r6 d
r11:r12 c'
r9:r10 d'
Note: do not write to r[] here, it may overlap with a[]
*/
/* A interleaved with B */
ldr r0, [r1, #1*4] @ a[1]*2
ldr r7, [r1, #0*4] @ a[0]
mov r0, r0, asl #1
ldr r14, [r1, #9*4] @ a[9]
umull r3, r4, r7, r7 @ c = a[0] * a[0]
ldr r8, [r1, #8*4] @ a[8]
mov r7, r7, asl #1
umull r5, r6, r7, r14 @ d = a[0]*2 * a[9]
ldr r7, [r1, #2*4] @ a[2]*2
umull r9, r10, r0, r14 @ d' = a[1]*2 * a[9]
ldr r14, [r1, #7*4] @ a[7]
umlal r5, r6, r0, r8 @ d += a[1]*2 * a[8]
mov r7, r7, asl #1
ldr r0, [r1, #3*4] @ a[3]*2
umlal r9, r10, r7, r8 @ d' += a[2]*2 * a[8]
ldr r8, [r1, #6*4] @ a[6]
umlal r5, r6, r7, r14 @ d += a[2]*2 * a[7]
mov r0, r0, asl #1
ldr r7, [r1, #4*4] @ a[4]*2
umlal r9, r10, r0, r14 @ d' += a[3]*2 * a[7]
ldr r14, [r1, #5*4] @ a[5]
mov r7, r7, asl #1
umlal r5, r6, r0, r8 @ d += a[3]*2 * a[6]
umlal r9, r10, r7, r8 @ d' += a[4]*2 * a[6]
umlal r5, r6, r7, r14 @ d += a[4]*2 * a[5]
umlal r9, r10, r14, r14 @ d' += a[5] * a[5]
bic r0, r5, field_not_M @ t9 = d & M
str r0, [sp, #4 + 9*4]
mov r5, r5, lsr #26 @ d >>= 26
orr r5, r5, r6, asl #6
mov r6, r6, lsr #26
( run in 0.478 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )