nix results from the CPAN

Alien-libsecp256k1
     *
     *     2^0 * (table(0, d>>0 & mask(0)) + ... + table(COMB_BLOCKS-1, d>>0 & mask(COMP_BLOCKS-1)))
     *   + 2^1 * (table(0, d>>1 & mask(0)) + ... + table(COMB_BLOCKS-1, d>>1 & mask(COMP_BLOCKS-1)))
     *   + 2^2 * (table(0, d>>2 & mask(0)) + ... + table(COMB_BLOCKS-1, d>>2 & mask(COMP_BLOCKS-1)))
     *   + ...
     *   + 2^(COMB_SPACING-1) * (table(0, d>>(COMB_SPACING-1) & mask(0)) + ...)
     *
     * Or more generically as
     *
     *   sum(2^i * sum(table(b, d>>i & mask(b)), b=0..COMB_BLOCKS-1), i=0..COMB_SPACING-1)
     *
     * This is implemented using an outer loop that runs in reverse order over the lines of this
     * equation, which in each iteration runs an inner loop that adds the terms of that line and
     * then doubles the result before proceeding to the next line.
     *
     * In pseudocode:
     *   c = infinity
     *   for comb_off in range(COMB_SPACING - 1, -1, -1):
     *     for block in range(COMB_BLOCKS):
     *       c += table(block, (d >> comb_off) & mask(block))
     *     if comb_off > 0:
     *       c = 2*c
     *   return c
     *
     * This computes c = comb(d, G/2), and thus finally R = c + ctx->ge_offset. Note that it would
     * be possible to apply an initial offset instead of a final offset (moving ge_offset to take
     * the place of infinity above), but the chosen approach allows using (in a future improvement)
     * an incomplete addition formula for most of the multiplication.
     *
     * The last question is how to implement the table(b, m) function. For any value of b,
     * m=(d & mask(b)) can only take on at most 2^COMB_TEETH possible values (the last one may have
     * fewer as there mask(b) may exceed the curve order). So we could create COMB_BLOCK tables
     * which contain a value for each such m value.
     *
     * Now note that if m=(d & mask(b)), then flipping the relevant bits of m results in negating
     * the result of table(b, m). This is because table(b,m XOR mask(b)) = table(b, mask(b) - m) =
     * (mask(b) - m - mask(b)/2)*G = (-m + mask(b)/2)*G = -(m - mask(b)/2)*G = -table(b, m).
     * Because of this it suffices to only store the first half of the m values for every b. If an
     * entry from the second half is needed, we look up its bit-flipped version instead, and negate
     * it.
     *
     * secp256k1_ecmult_gen_prec_table[b][index] stores the table(b, m) entries. Index
     * is the relevant mask(b) bits of m packed together without gaps. */

    /* Outer loop: iterate over comb_off from COMB_SPACING - 1 down to 0. */
    comb_off = COMB_SPACING - 1;
    while (1) {
        uint32_t block;
        uint32_t bit_pos = comb_off;
        /* Inner loop: for each block, add table entries to the result. */
        for (block = 0; block < COMB_BLOCKS; ++block) {
            /* Gather the mask(block)-selected bits of d into bits. They're packed:
             * bits[tooth] = d[(block*COMB_TEETH + tooth)*COMB_SPACING + comb_off]. */
            uint32_t bits = 0, sign, abs, index, tooth;
            /* Instead of reading individual bits here to construct the bits variable,
             * build up the result by xoring rotated reads together. In every iteration,
             * one additional bit is made correct, starting at the bottom. The bits
             * above that contain junk. This reduces leakage by avoiding computations
             * on variables that can have only a low number of possible values (e.g.,
             * just two values when reading a single bit into a variable.) See:
             * https://www.usenix.org/system/files/conference/usenixsecurity18/sec18-alam.pdf
             */
            for (tooth = 0; tooth < COMB_TEETH; ++tooth) {
                /* Construct bitdata s.t. the bottom bit is the bit we'd like to read.
                 *
                 * We could just set bitdata = recoded[bit_pos >> 5] >> (bit_pos & 0x1f)
                 * but this would simply discard the bits that fall off at the bottom,
                 * and thus, for example, bitdata could still have only two values if we
                 * happen to shift by exactly 31 positions. We use a rotation instead,
                 * which ensures that bitdata doesn't loose entropy. This relies on the
                 * rotation being atomic, i.e., the compiler emitting an actual rot
                 * instruction. */
                uint32_t bitdata = secp256k1_rotr32(recoded[bit_pos >> 5], bit_pos & 0x1f);

                /* Clear the bit at position tooth, but sssh, don't tell clang. */
                uint32_t volatile vmask = ~(1 << tooth);
                bits &= vmask;

                /* Write the bit into position tooth (and junk into higher bits). */
                bits ^= bitdata << tooth;
                bit_pos += COMB_SPACING;
            }

            /* If the top bit of bits is 1, flip them all (corresponding to looking up
             * the negated table value), and remember to negate the result in sign. */
            sign = (bits >> (COMB_TEETH - 1)) & 1;
            abs = (bits ^ -sign) & (COMB_POINTS - 1);
            VERIFY_CHECK(sign == 0 || sign == 1);
            VERIFY_CHECK(abs < COMB_POINTS);

            /** This uses a conditional move to avoid any secret data in array indexes.
             *   _Any_ use of secret indexes has been demonstrated to result in timing
             *   sidechannels, even when the cache-line access patterns are uniform.
             *  See also:
             *   "A word of warning", CHES 2013 Rump Session, by Daniel J. Bernstein and Peter Schwabe
             *    (https://cryptojedi.org/peter/data/chesrump-20130822.pdf) and
             *   "Cache Attacks and Countermeasures: the Case of AES", RSA 2006,
             *    by Dag Arne Osvik, Adi Shamir, and Eran Tromer
             *    (https://www.tau.ac.il/~tromer/papers/cache.pdf)
             */
            for (index = 0; index < COMB_POINTS; ++index) {
                secp256k1_ge_storage_cmov(&adds, &secp256k1_ecmult_gen_prec_table[block][index], index == abs);
            }

            /* Set add=adds or add=-adds, in constant time, based on sign. */
            secp256k1_ge_from_storage(&add, &adds);
            secp256k1_fe_negate(&neg, &add.y, 1);
            secp256k1_fe_cmov(&add.y, &neg, sign);

            /* Add the looked up and conditionally negated value to r. */
            if (EXPECT(first, 0)) {
                /* If this is the first table lookup, we can skip addition. */
                secp256k1_gej_set_ge(r, &add);
                /* Give the entry a random Z coordinate to blind intermediary results. */
                secp256k1_gej_rescale(r, &ctx->proj_blind);
                first = 0;
            } else {
                secp256k1_gej_add_ge(r, r, &add);
            }
        }
( run in 1.171 second using v1.01-cache-2.11-cpan-39bf76dae61 )