Crypt-PQClean-Sign
view release on metacpan or search on metacpan
pqclean/crypto_sign/falcon-1024/aarch64/macrous.h view on Meta::CPAN
/*
* Macros for signed/unsigned integer operations
*
* =============================================================================
* Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
* ECE Department, George Mason University
* Fairfax, VA, U.S.A.
* Author: Duc Tri Nguyen
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================================
* @author Duc Tri Nguyen <dnguye69@gmu.edu>, <cothannguyen@gmail.com>
*/
#include <arm_neon.h>
/*
 * Widening signed 16-bit multiplies. Inputs are int16x8_t vectors; the
 * destination (and the accumulator, where present) is an int32x4_t.
 * Each macro expands to one complete assignment statement, including the
 * terminating ';'.
 */
/* out = low four lanes of x times low four lanes of y. */
#define vmull_lo(out, x, y) \
    out = vmull_s16(vget_low_s16(x), vget_low_s16(y));
/* out = high four lanes of x times high four lanes of y. */
#define vmull_hi(out, x, y) \
    out = vmull_high_s16(x, y);
/* out = acc + (low four lanes of x times low four lanes of y). */
#define vmulla_lo(out, acc, x, y) \
    out = vmlal_s16(acc, vget_low_s16(x), vget_low_s16(y));
/* out = acc + (high four lanes of x times high four lanes of y). */
#define vmulla_hi(out, acc, x, y) \
    out = vmlal_high_s16(acc, x, y);
/*
 * Unsigned 32-bit helpers on uint32x4_t vectors. Each macro expands to one
 * complete assignment statement, including the terminating ';'.
 */
/* out = x + y, lane by lane. */
#define vadd(out, x, y) \
    out = vaddq_u32(x, y);
/* out = sum of all lanes of x (scalar result). */
#define vaddv(out, x) \
    out = vaddvq_u32(x);
/* out = x | y, lane by lane. */
#define vor(out, x, y) \
    out = vorrq_u32(x, y);
// Load/store wrappers used by the NTT code. All operate on signed 16-bit lanes.
// Each macro expands to one complete statement, including the terminating ';'.

// dst = four vectors loaded from ptr with 4-way de-interleaving (vld4q).
#define vload_s16_4(dst, ptr) \
    dst = vld4q_s16(ptr);
// dst = two consecutive vectors loaded from ptr.
#define vload_s16_x2(dst, ptr) \
    dst = vld1q_s16_x2(ptr);
// dst = four consecutive vectors loaded from ptr.
#define vload_s16_x4(dst, ptr) \
    dst = vld1q_s16_x4(ptr);
// Store the four vectors of src consecutively at ptr.
#define vstore_s16_x4(ptr, src) \
    vst1q_s16_x4(ptr, src);
// Store the two vectors of src consecutively at ptr.
#define vstore_s16_x2(ptr, src) \
    vst1q_s16_x2(ptr, src);
// Store the four vectors of src at ptr with 4-way interleaving (vst4q).
#define vstore_s16_4(ptr, src) \
    vst4q_s16(ptr, src);
/*
* Strategy for NTT:
* - Forward and Inverse NTT multiply by a constant, so they use either Barrett or Montgomery *Rounding* arithmetic
* - Pointwise multiplication must use Montgomery *Doubling* arithmetic
*
* Rounding because:
*
* - Montgomery needs one coefficient to be *odd*, so it only works with a precomputed coefficient
* => Tried this approach; it is very strict on the coefficient input range.
* => E.g a*b: a in [-R/2, R/2]. b in [-Q/2, Q/2] then c in [-2Q, 2Q]
*
* - Barrett multiplication seems to work better, with no restriction
* => Proved to be good. E.g c=a*b, a in [-R, R], b in [-Q/2, Q/2] then c in [-3Q/2, 3Q/2]
* However, the output bound varies depending on the input bound. Using this knowledge, we can further
* optimize Barrett point by carefully checking the output bound according to the input bound.
*
* - Barrett reduction with c = a % Q. a in [-R, R] then c in [-Q/2, Q/2]
*
*
* Doubling because
* - Montgomery Doubling works with two unknown coefficients, no constraint at all
* => c = a*b. a,b in [-R, R] c in [-Q, Q]
*/
// ------------ Forward NTT and Inverse NTT ------------
/*
 * GS butterfly with Barrett *Rounding* reduction.
 *
 * Per 16-bit lane the macro performs, in this order:
 *   t = a - b
 *   a = a + b                                  (sum is not reduced here)
 *   b = vqrdmulh(t, zh)                        (rounding doubling multiply, high half)
 *   t = t * zl                                 (low 16 bits of the product)
 *   b = t - b * QMVQ[lane 0]
 *
 * Parameters:
 *   a, b : int16x8_t lvalues; both are overwritten (a = sum, b = reduced product).
 *   zl   : int16x8_t twiddle factor w.
 *   zh   : int16x8_t precomputed companion of w (precomp_w) for the high multiply.
 *   QMVQ : int16x8_t constant vector; only lane 0 is read here
 *          (presumably the modulus Q -- confirm against the caller).
 *   t    : int16x8_t scratch lvalue; clobbered (holds t * zl on exit).
 *
 * Bounds stated by the original author: input in [-R, R],
 * reduced product in [-3Q/2, 3Q/2].
 *
 * The body is wrapped in do { } while (0) so that all five statements stay
 * together when the macro is used as the body of an unbraced if/for/while.
 * The trailing ';' is kept on purpose so that existing call sites written
 * without their own semicolon keep compiling.
 */
#define gsbf_br(a, b, zl, zh, QMVQ, t)          \
    do {                                        \
        t = vsubq_s16(a, b);                    \
        a = vaddq_s16(a, b);                    \
        b = vqrdmulhq_s16(t, zh);               \
        t = vmulq_s16(t, zl);                   \
        b = vmlsq_laneq_s16(t, b, QMVQ, 0);     \
    } while (0);
/*
 * GS butterfly with Barrett *Rounding* reduction, twiddle selected by lane.
 *
 * Identical in structure to gsbf_br, except that the twiddle is a single
 * lane `i` of zl / zh broadcast to every lane of the multiply. Per 16-bit
 * lane the macro performs, in this order:
 *   t = a - b
 *   a = a + b                                  (sum is not reduced here)
 *   b = vqrdmulh(t, zh[i])                     (rounding doubling multiply, high half)
 *   t = t * zl[i]                              (low 16 bits of the product)
 *   b = t - b * QMVQ[lane 0]
 *
 * Parameters:
 *   a, b : int16x8_t lvalues; both are overwritten (a = sum, b = reduced product).
 *   zl   : int16x8_t vector of twiddle factors.
 *   zh   : int16x8_t vector of the matching precomputed high-multiply constants.
 *   i    : lane index into zl and zh; the lane intrinsics require a
 *          compile-time constant.
 *   QMVQ : int16x8_t constant vector; only lane 0 is read here
 *          (presumably the modulus Q -- confirm against the caller).
 *   t    : int16x8_t scratch lvalue; clobbered (holds t * zl[i] on exit).
 *
 * The body is wrapped in do { } while (0) so that all five statements stay
 * together when the macro is used as the body of an unbraced if/for/while.
 * The trailing ';' is kept on purpose so that existing call sites written
 * without their own semicolon keep compiling.
 */
#define gsbf_bri(a, b, zl, zh, i, QMVQ, t)      \
    do {                                        \
        t = vsubq_s16(a, b);                    \
        a = vaddq_s16(a, b);                    \
        b = vqrdmulhq_laneq_s16(t, zh, i);      \
        t = vmulq_laneq_s16(t, zl, i);          \
        b = vmlsq_laneq_s16(t, b, QMVQ, 0);     \
    } while (0);
#define gsbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
( run in 0.489 second using v1.01-cache-2.11-cpan-5b529ec07f3 )