Macro results from the CPAN

Crypt-PQClean-Sign
/*
 * Macros for signed/unsigned integer operations
 *
 * =============================================================================
 * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
 * ECE Department, George Mason University
 * Fairfax, VA, U.S.A.
 * Author: Duc Tri Nguyen
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 * @author   Duc Tri Nguyen <dnguye69@gmu.edu>, <cothannguyen@gmail.com>
 */

#include <arm_neon.h>

/*
 * Widening signed multiply of the low halves: the four low 16-bit lanes of
 * x and y are multiplied into four 32-bit products stored in prod.
 * The expansion already ends in ';'.
 */
#define vmull_lo(prod, x, y) \
    prod = vmull_s16(vget_low_s16(x), vget_low_s16(y));

/*
 * Widening signed multiply of the high halves: the four high 16-bit lanes of
 * x and y are multiplied into four 32-bit products stored in prod.
 * The expansion already ends in ';'.
 */
#define vmull_hi(prod, x, y) \
    prod = vmull_high_s16(x, y);

/*
 * Widening signed multiply-accumulate on the low halves:
 * out = acc + low(x) * low(y), lane-wise on four 32-bit lanes.
 * The expansion already ends in ';'.
 */
#define vmulla_lo(out, acc, x, y) \
    out = vmlal_s16(acc, vget_low_s16(x), vget_low_s16(y));

/*
 * Widening signed multiply-accumulate on the high halves:
 * out = acc + high(x) * high(y), lane-wise on four 32-bit lanes.
 * The expansion already ends in ';'.
 */
#define vmulla_hi(out, acc, x, y) \
    out = vmlal_high_s16(acc, x, y);

/*
 * Lane-wise addition of two vectors of four unsigned 32-bit lanes:
 * sum = lhs + rhs. The expansion already ends in ';'.
 */
#define vadd(sum, lhs, rhs) \
    sum = vaddq_u32(lhs, rhs);

/*
 * Horizontal add: total receives the scalar sum of the four unsigned 32-bit
 * lanes of vec. The expansion already ends in ';'.
 */
#define vaddv(total, vec) \
    total = vaddvq_u32(vec);

/*
 * Lane-wise bitwise OR of two vectors of four unsigned 32-bit lanes:
 * out = lhs | rhs. The expansion already ends in ';'.
 */
#define vor(out, lhs, rhs) \
    out = vorrq_u32(lhs, rhs);

// Macro for NTT operation. Using signed 16-bit.
/*
 * Loads of signed 16-bit values; each expansion already ends in ';'.
 *
 *   vload_s16_4  - vld4q_s16: 4-way de-interleaving load into an int16x8x4_t
 *   vload_s16_x2 - vld1q_s16_x2: two consecutive vectors into an int16x8x2_t
 *   vload_s16_x4 - vld1q_s16_x4: four consecutive vectors into an int16x8x4_t
 */
#define vload_s16_4(vecs, ptr) \
    vecs = vld4q_s16(ptr);
#define vload_s16_x2(vecs, ptr) \
    vecs = vld1q_s16_x2(ptr);
#define vload_s16_x4(vecs, ptr) \
    vecs = vld1q_s16_x4(ptr);

/*
 * Stores of signed 16-bit values, the counterparts of the vload_s16_* macros;
 * each expansion already ends in ';'.
 *
 *   vstore_s16_x4 - vst1q_s16_x4: four consecutive vectors from an int16x8x4_t
 *   vstore_s16_x2 - vst1q_s16_x2: two consecutive vectors from an int16x8x2_t
 *   vstore_s16_4  - vst4q_s16: 4-way interleaving store of an int16x8x4_t
 */
#define vstore_s16_x4(ptr, vecs) \
    vst1q_s16_x4(ptr, vecs);
#define vstore_s16_x2(ptr, vecs) \
    vst1q_s16_x2(ptr, vecs);
#define vstore_s16_4(ptr, vecs) \
    vst4q_s16(ptr, vecs);

/*
 * Strategy for NTT:
 * - Forward and Inverse NTT multiply with constant, use either Barrett or Montgomery *Rounding* arithmetic
 * - Pointwise multiplication must use Montgomery *Doubling* arithmetic
 *
 * Rounding because:
 *
 * - Montgomery needs one coefficient to be *odd*, so it only works with a precomputed coefficient
 * => Tried this approach; it is very strict on the coefficient input range.
 * => E.g. a*b: a in [-R/2, R/2], b in [-Q/2, Q/2], then c in [-2Q, 2Q]
 *
 *  - Barrett multiplication seems to work better, with no restriction
 * => Proved to be good. E.g. c=a*b, a in [-R, R], b in [-Q/2, Q/2], then c in [-3Q/2, 3Q/2]
 * However, the output bound varies depending on the input bound. Using this knowledge, we can further
 * optimize Barrett point by carefully checking the output bound according to the input bound.
 *
 * - Barrett reduction with c = a % Q. a in [-R, R] then c in [-Q/2, Q/2]
 *
 *
 * Doubling because:
 * - Montgomery Doubling works with two unknown coefficients, no constraint at all
 * => c = a*b. a,b in [-R, R], then c in [-Q, Q]
 */

// ------------ Forward NTT and Inverse NTT ------------
/*
 * GS butterfly with Barrett *Rounding* multiplication by a twiddle factor.
 *
 * Parameters (int16x8_t; each is expanded more than once, so pass plain
 * variables without side effects):
 *   a, b  - butterfly inputs; both are overwritten
 *   zl    - twiddle factor w
 *   zh    - precomputed Barrett companion of w (precomp_w)
 *   QMVQ  - constant vector; only lane 0 is read
 *           (presumably the modulus Q - confirm against the caller)
 *   t     - scratch vector, clobbered
 *
 * Effect, lane-wise on 16-bit lanes:
 *   t = a - b
 *   a = a + b
 *   b = t * zl - vqrdmulh(t, zh) * QMVQ[0]
 *
 * Bound stated by the original author: input in [-R, R] gives a reduced
 * product in [-3Q/2, 3Q/2].
 *
 * The body is wrapped in do { } while (0) so the macro remains one statement
 * when used under an unbraced if/for/while. The trailing ';' is kept on
 * purpose so that call sites written without their own semicolon still compile.
 */
#define gsbf_br(a, b, zl, zh, QMVQ, t)      \
    do {                                    \
        t = vsubq_s16(a, b);                \
        a = vaddq_s16(a, b);                \
        b = vqrdmulhq_s16(t, zh);           \
        t = vmulq_s16(t, zl);               \
        b = vmlsq_laneq_s16(t, b, QMVQ, 0); \
    } while (0);

/*
 * GS butterfly with Barrett *Rounding* multiplication, taking the twiddle
 * factor from lane i of the twiddle vectors.
 *
 * Parameters (each is expanded more than once, so pass plain variables
 * without side effects):
 *   a, b   - int16x8_t butterfly inputs; both are overwritten
 *   zl, zh - int16x8_t vectors; lane i of zl is used as the multiplier and
 *            lane i of zh as its rounding-doubling-multiply-high companion
 *   i      - compile-time constant lane index into zl/zh
 *   QMVQ   - constant vector; only lane 0 is read
 *            (presumably the modulus Q - confirm against the caller)
 *   t      - int16x8_t scratch vector, clobbered
 *
 * Effect, lane-wise on 16-bit lanes:
 *   t = a - b
 *   a = a + b
 *   b = t * zl[i] - vqrdmulh(t, zh[i]) * QMVQ[0]
 *
 * The body is wrapped in do { } while (0) so the macro remains one statement
 * when used under an unbraced if/for/while. The trailing ';' is kept on
 * purpose so that call sites written without their own semicolon still compile.
 */
#define gsbf_bri(a, b, zl, zh, i, QMVQ, t)  \
    do {                                    \
        t = vsubq_s16(a, b);                \
        a = vaddq_s16(a, b);                \
        b = vqrdmulhq_laneq_s16(t, zh, i);  \
        t = vmulq_laneq_s16(t, zl, i);      \
        b = vmlsq_laneq_s16(t, b, QMVQ, 0); \
    } while (0);

#define gsbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t)   \
    t.val[0] = vsubq_s16(a.val[0], b.val[0]);                \
    t.val[1] = vsubq_s16(a.val[1], b.val[1]);                \
    t.val[2] = vsubq_s16(a.val[2], b.val[2]);                \
    t.val[3] = vsubq_s16(a.val[3], b.val[3]);                \
    a.val[0] = vaddq_s16(a.val[0], b.val[0]);                \
    a.val[1] = vaddq_s16(a.val[1], b.val[1]);                \
    a.val[2] = vaddq_s16(a.val[2], b.val[2]);                \
( run in 0.489 second using v1.01-cache-2.11-cpan-5b529ec07f3 )