map results from the CPAN

Affix
    if (mod == 0x40)
        emit_byte(buf, (uint8_t)offset);
    else if (mod == 0x80)
        emit_int32(buf, offset);
}
/**
 * @internal
 * @brief Emits `movups [base + offset], xmm`, storing 128 unaligned bits to memory.
 * @details Opcode format: 0F 11 /r
 *
 * The shortest addressing form is selected: no displacement when `offset` is 0
 * (unless the low three bits of the base equal RBP_REG, which has no
 * displacement-free form), an 8-bit displacement when `offset` fits in
 * [-128, 127], and a 32-bit displacement otherwise.
 *
 * @param buf       Code buffer receiving the encoded bytes.
 * @param dest_base General-purpose register holding the base address.
 * @param offset    Signed byte displacement added to the base.
 * @param src       XMM register whose contents are stored.
 */
INFIX_INTERNAL void emit_movups_mem_xmm(code_buffer * buf, x64_gpr dest_base, int32_t offset, x64_xmm src) {
    // A REX prefix is only needed when one of the operands is an extended register.
    const uint8_t rex_bits = (uint8_t)((src >= XMM8_REG ? REX_R : 0) | (dest_base >= R8_REG ? REX_B : 0));
    if (rex_bits != 0) {
        emit_byte(buf, 0x40 | rex_bits);
    }

    EMIT_BYTES(buf, 0x0F, 0x11);

    // ModRM.mod: 0 = no displacement, 1 = disp8, 2 = disp32.
    const uint8_t rm = (uint8_t)(dest_base % 8);
    uint8_t mod_field;
    if (offset == 0 && rm != RBP_REG) {
        mod_field = 0;
    }
    else if (offset >= -128 && offset <= 127) {
        mod_field = 1;
    }
    else {
        mod_field = 2;
    }
    emit_modrm(buf, mod_field, src % 8, rm);

    // An r/m field equal to RSP_REG needs a SIB byte; 0x24 selects the base alone.
    if (rm == RSP_REG) {
        emit_byte(buf, 0x24);
    }

    switch (mod_field) {
    case 1:
        emit_byte(buf, (uint8_t)offset);
        break;
    case 2:
        emit_int32(buf, offset);
        break;
    default:
        break;
    }
}
/**
 * @internal
 * @brief Emits a VEX prefix for an AVX instruction.
 * @details Chooses between the 2-byte (C5) and 3-byte (C4) VEX encodings.
 *
 * The compact C5 form is used only when X, B and W are all clear, the opcode
 * map is 1, and the operation is 128-bit (`l` clear). Every other combination
 * is emitted with the C4 form.
 *
 * NOTE(review): the C5 payload built here does carry an L bit, so limiting the
 * short form to `l == 0` looks like a conservative policy rather than an
 * encoding requirement. Confirm against the Intel SDM before relaxing it.
 *
 * @param buf Code buffer receiving the prefix bytes.
 * @param r   ModRM.reg extension; stored inverted.
 * @param x   SIB.index extension; stored inverted (C4 form only).
 * @param b   ModRM.rm / SIB.base extension; stored inverted (C4 form only).
 * @param m   Opcode map selector (low 5 bits are used in the C4 form).
 * @param w   W bit (C4 form only).
 * @param v   Value for the vvvv field; its low 4 bits are stored inverted.
 * @param l   Vector length bit.
 * @param p   Implied-prefix selector (low 2 bits).
 */
INFIX_INTERNAL void emit_vex_prefix(
    code_buffer * buf, bool r, bool x, bool b, uint8_t m, bool w, uint8_t v, bool l, uint8_t p) {
    // Both forms end with the same "vvvv | L | pp" bit group and share the inverted R bit.
    const uint8_t inverted_r = (uint8_t)((!r) << 7);
    const uint8_t vvvv_l_pp = (uint8_t)(((~v & 0xF) << 3) | ((l & 1) << 2) | (p & 3));

    const bool use_short_form = (m == 1) && !(b || x || w || l);
    if (use_short_form) {
        // 2-byte form: C5, then R̄ vvvv L pp.
        emit_byte(buf, 0xC5);
        emit_byte(buf, (uint8_t)(inverted_r | vvvv_l_pp));
        return;
    }

    // 3-byte form: C4, then R̄ X̄ B̄ m-mmmm, then W vvvv L pp.
    emit_byte(buf, 0xC4);
    emit_byte(buf, (uint8_t)(inverted_r | ((!x) << 6) | ((!b) << 5) | (m & 0x1F)));
    emit_byte(buf, (uint8_t)(((w & 1) << 7) | vvvv_l_pp));
}
/**
 * @internal
 * @brief Emits a 4-byte EVEX prefix (0x62 followed by P0, P1, P2) for an AVX-512 instruction.
 * @details Byte layout produced, most significant bit first:
 *  - P0: R̄ X̄ B̄ R̄' followed by the low 4 bits of `map` (R, X, B, R' are stored inverted).
 *  - P1: W, inverted low 4 bits of `vvvv`, a fixed 1 bit, then `pp`.
 *  - P2: z, L', L, b, V', then `aaa`.
 *
 * The V' bit is copied verbatim (not inverted) from bit 4 of `vvvv`. The callers
 * visible in this file pass 16 to obtain vvvv = 1111b together with V' = 1, which
 * is the "no register" encoding.
 *
 * NOTE(review): because V' is not inverted here while the low nibble is, passing a
 * real register index in `vvvv` would not follow the same convention for bit 4.
 * Confirm against the Intel SDM before using this for a three-operand instruction.
 */
INFIX_INTERNAL void emit_evex_prefix(code_buffer * buf,
                                     uint8_t map,   // Opcode map: 1 for 0F, 2 for 0F38, 3 for 0F3A
                                     uint8_t pp,    // Implied prefix: 00=none, 01=66, 10=F3, 11=F2
                                     bool W,        // Operand-size / opcode-extension bit
                                     bool R,        // Register extension bits (stored inverted)
                                     bool X,
                                     bool B,
                                     bool R_prime,
                                     uint8_t vvvv,  // Bits 0-3: register specifier (stored inverted); bit 4: V' (stored as-is)
                                     bool L,        // Vector length, low bit
                                     bool L_prime,  // Vector length, high bit
                                     bool z,        // Zeroing-masking control
                                     bool b,        // Broadcast / rounding control
                                     uint8_t aaa)   // Opmask register selector
{
    // P0: inverted R, X, B, R' in bits 7..4, opcode map in the low nibble.
    const uint8_t payload0 = (uint8_t)((R ? 0x00 : 0x80) | (X ? 0x00 : 0x40) | (B ? 0x00 : 0x20) |
                                       (R_prime ? 0x00 : 0x10) | (map & 0x0F));

    // P1: W in bit 7, inverted vvvv in bits 6..3, bit 2 always set, pp in bits 1..0.
    const uint8_t payload1 = (uint8_t)((W ? 0x80 : 0x00) | ((~vvvv & 0xF) << 3) | 0x04 | (pp & 0x03));

    // P2: z, L', L, b in bits 7..4, V' (bit 4 of vvvv, uninverted) in bit 3, aaa in bits 2..0.
    const uint8_t payload2 = (uint8_t)((z ? 0x80 : 0x00) | (L_prime ? 0x40 : 0x00) | (L ? 0x20 : 0x00) |
                                       (b ? 0x10 : 0x00) | (((vvvv >> 4) & 1) << 3) | (aaa & 0x07));

    emit_byte(buf, 0x62);
    emit_byte(buf, payload0);
    emit_byte(buf, payload1);
    emit_byte(buf, payload2);
}
/**
 * @internal
 * @brief Emits `vmovupd ymm, [base + offset]`, loading 256 unaligned bits (AVX).
 * @details Instruction format: VEX.256.66.0F.WIG 10 /r
 *
 * @param buf      Code buffer receiving the encoded bytes.
 * @param dest     Destination vector register.
 * @param src_base General-purpose register holding the base address.
 * @param offset   Signed byte displacement added to the base.
 */
INFIX_INTERNAL void emit_vmovupd_ymm_mem(code_buffer * buf, x64_xmm dest, x64_gpr src_base, int32_t offset) {
    // VEX fields: map 1 (0F), W=0, vvvv unused (passed as 0), L=1 (256-bit), pp=1 (66).
    const bool dest_is_extended = dest >= XMM8_REG;
    const bool base_is_extended = src_base >= R8_REG;
    emit_vex_prefix(buf, dest_is_extended, false, base_is_extended, 1, false, 0, true, 1);
    emit_byte(buf, 0x10);  // MOVUPD load opcode

    // ModRM.mod: 0 = no displacement, 1 = disp8, 2 = disp32.
    // A base whose low three bits equal RBP_REG always carries a displacement.
    const uint8_t base_low = (uint8_t)(src_base % 8);
    const bool no_disp = (offset == 0) && (base_low != RBP_REG);
    const bool fits_disp8 = (offset >= -128) && (offset <= 127);
    const uint8_t mod_field = no_disp ? 0 : (fits_disp8 ? 1 : 2);

    emit_modrm(buf, mod_field, dest % 8, base_low);
    if (base_low == RSP_REG) {
        emit_byte(buf, 0x24);  // SIB byte required for this r/m value
    }

    if (mod_field == 1) {
        emit_byte(buf, (uint8_t)offset);
    }
    else if (mod_field == 2) {
        emit_int32(buf, offset);
    }
}
/**
 * @internal
 * @brief Emits `vmovupd [base + offset], ymm`, storing 256 unaligned bits (AVX).
 * @details Instruction format: VEX.256.66.0F.WIG 11 /r
 *
 * @param buf       Code buffer receiving the encoded bytes.
 * @param dest_base General-purpose register holding the base address.
 * @param offset    Signed byte displacement added to the base.
 * @param src       Vector register whose contents are stored.
 */
INFIX_INTERNAL void emit_vmovupd_mem_ymm(code_buffer * buf, x64_gpr dest_base, int32_t offset, x64_xmm src) {
    // VEX fields: map 1 (0F), W=0, vvvv unused for a store (passed as 0), L=1, pp=1 (66).
    emit_vex_prefix(buf,
                    src >= XMM8_REG,      // R: extended source register
                    false,                // X: no index register
                    dest_base >= R8_REG,  // B: extended base register
                    1,
                    false,
                    0,
                    true,
                    1);
    emit_byte(buf, 0x11);  // MOVUPD store opcode

    const uint8_t rm = (uint8_t)(dest_base % 8);

    // Pick the ModRM.mod value (0 = none, 1 = disp8, 2 = disp32).
    uint8_t mod_field = 2;
    if (offset == 0 && rm != RBP_REG) {
        mod_field = 0;
    }
    else if (offset >= -128 && offset <= 127) {
        mod_field = 1;
    }

    emit_modrm(buf, mod_field, src % 8, rm);
    if (rm == RSP_REG) {
        emit_byte(buf, 0x24);  // SIB byte required for this r/m value
    }

    switch (mod_field) {
    case 1:
        emit_byte(buf, (uint8_t)offset);
        break;
    case 2:
        emit_int32(buf, offset);
        break;
    default:
        break;
    }
}
/**
 * @internal
 * @brief Emits `vmovupd zmm, [base + offset]` to load a 512-bit unaligned value (AVX-512).
 * @details Instruction format: EVEX.512.66.0F.W1 10 /r
 *
 * EVEX-encoded instructions use a compressed 8-bit displacement (disp8*N): the
 * encoded byte is multiplied by N before use, and N is 64 for a full 512-bit
 * vector access. The short form is therefore used only when `offset` is a
 * multiple of 64 whose quotient fits in a signed byte; every other non-zero
 * offset is emitted as a plain 32-bit displacement.
 *
 * @param buf      Code buffer receiving the encoded bytes.
 * @param dest     Destination vector register (0..31; assumes the enum value is
 *                 the hardware register number, as the existing `% 8` already does).
 * @param src_base General-purpose register holding the base address.
 * @param offset   Signed byte displacement added to the base.
 */
INFIX_INTERNAL void emit_vmovupd_zmm_mem(code_buffer * buf, x64_xmm dest, x64_gpr src_base, int32_t offset) {
    // Scale factor N applied by the CPU to an EVEX disp8 for a full 64-byte vector access.
    const int32_t disp8_scale = 64;

    emit_evex_prefix(buf,
                     1,     // map: 0F
                     1,     // pp: 66
                     true,  // W=1 for double-precision
                     // R and R' are bits 3 and 4 of the register number. Testing the bits
                     // (rather than `dest >= XMM8_REG`) keeps R clear for registers 16..23.
                     (dest & 8) != 0,
                     false,  // X: no index register
                     src_base >= R8_REG,
                     (dest & 16) != 0,
                     // vvvv is unused for a memory source: it must read 1111b with V' = 1.
                     // Passing 16 yields exactly that from the prefix emitter.
                     16,
                     false,  // L=0 for 512-bit
                     true,   // L'=1 for 512-bit
                     false,  // z=0
                     false,  // b=0
                     0);     // aaa=0
    emit_byte(buf, 0x10);    // Opcode for MOVUPD (load)

    // ModRM.mod: 0 = no displacement, 1 = disp8*N, 2 = disp32.
    const uint8_t rm = (uint8_t)(src_base % 8);
    uint8_t mod_field;
    if (offset == 0 && rm != RBP_REG) {
        mod_field = 0;
    }
    else if (offset % disp8_scale == 0 && offset / disp8_scale >= -128 && offset / disp8_scale <= 127) {
        mod_field = 1;
    }
    else {
        mod_field = 2;
    }

    emit_modrm(buf, mod_field, dest % 8, rm);
    if (rm == RSP_REG) {
        emit_byte(buf, 0x24);  // SIB byte required for this r/m value
    }

    if (mod_field == 1) {
        // Store the scaled-down displacement; the CPU multiplies it back by 64.
        emit_byte(buf, (uint8_t)(offset / disp8_scale));
    }
    else if (mod_field == 2) {
        emit_int32(buf, offset);
    }
}
/**
 * @internal
 * @brief Emits `vmovupd [base + offset], zmm` to store a 512-bit unaligned value (AVX-512).
 * @details Instruction format: EVEX.512.66.0F.W1 11 /r
 *
 * EVEX-encoded instructions use a compressed 8-bit displacement (disp8*N): the
 * encoded byte is multiplied by N before use, and N is 64 for a full 512-bit
 * vector access. The short form is therefore used only when `offset` is a
 * multiple of 64 whose quotient fits in a signed byte; every other non-zero
 * offset is emitted as a plain 32-bit displacement.
 *
 * @param buf       Code buffer receiving the encoded bytes.
 * @param dest_base General-purpose register holding the base address.
 * @param offset    Signed byte displacement added to the base.
 * @param src       Source vector register (0..31; assumes the enum value is the
 *                  hardware register number, as the existing `% 8` already does).
 */
INFIX_INTERNAL void emit_vmovupd_mem_zmm(code_buffer * buf, x64_gpr dest_base, int32_t offset, x64_xmm src) {
    // Scale factor N applied by the CPU to an EVEX disp8 for a full 64-byte vector access.
    const int32_t disp8_scale = 64;

    // The source register goes in ModRM.reg, extended by EVEX.R and EVEX.R'.
    // vvvv is unused for a store: it must read 1111b with V' = 1, which the
    // prefix emitter produces when given 16.
    emit_evex_prefix(buf,
                     1,     // map: 0F
                     1,     // pp: 66
                     true,  // W: 1 (double-precision)
                     // R and R' are bits 3 and 4 of the register number. Testing the bits
                     // (rather than `src >= XMM8_REG`) keeps R clear for registers 16..23.
                     (src & 8) != 0,
                     false,  // X: no index register
                     dest_base >= R8_REG,
                     (src & 16) != 0,
                     16,     // vvvv = 1111b, V' = 1 (no second source register)
                     false,  // L=0 for 512-bit
                     true,   // L'=1 for 512-bit
                     false,  // z=0
                     false,  // b=0
                     0);     // aaa=0
    emit_byte(buf, 0x11);    // Opcode for MOVUPD (store)

    // ModRM.mod: 0 = no displacement, 1 = disp8*N, 2 = disp32.
    const uint8_t rm = (uint8_t)(dest_base % 8);
    uint8_t mod_field;
    if (offset == 0 && rm != RBP_REG) {
        mod_field = 0;
    }
    else if (offset % disp8_scale == 0 && offset / disp8_scale >= -128 && offset / disp8_scale <= 127) {
        mod_field = 1;
    }
    else {
        mod_field = 2;
    }

    emit_modrm(buf, mod_field, src % 8, rm);
    if (rm == RSP_REG) {
        emit_byte(buf, 0x24);  // SIB byte required for this r/m value
    }

    if (mod_field == 1) {
        // Store the scaled-down displacement; the CPU multiplies it back by 64.
        emit_byte(buf, (uint8_t)(offset / disp8_scale));
    }
    else if (mod_field == 2) {
        emit_int32(buf, offset);
    }
}
/**
 * @internal
 * @brief Emits `vzeroupper`, which clears the upper bits of all YMM/ZMM registers.
 * @details Opcode format: VEX.128.0F.77, i.e. the byte sequence C5 F8 77.
 *
 * @param buf Code buffer receiving the encoded bytes.
 */
INFIX_INTERNAL void emit_vzeroupper(code_buffer * buf) {
    // No operands: R/X/B clear, map 1 (0F), W=0, vvvv passed as 0 (encoded as 1111b),
    // L=0 (128-bit), pp=0 (no implied prefix). This selects the 2-byte C5 form.
    emit_vex_prefix(buf, false, false, false, 1, false, 0, false, 0);
    emit_byte(buf, 0x77);
}
/**
 * @internal
 * @brief Emits `cvtsd2ss xmm1, xmm2`, converting a double in `src` to a float in `dest`.
 * @details Opcode format: F2 0F 5A /r (register-direct form, ModRM.mod = 3)
 *
 * @param buf  Code buffer receiving the encoded bytes.
 * @param dest Destination XMM register.
 * @param src  Source XMM register.
 */
INFIX_INTERNAL void emit_cvtsd2ss_xmm_xmm(code_buffer * buf, x64_xmm dest, x64_xmm src) {
    // The F2 prefix is emitted first, ahead of any REX byte.
    emit_byte(buf, 0xF2);

    // REX is only required when either register index is 8 or higher.
    const uint8_t rex_bits = (uint8_t)((dest >= XMM8_REG ? REX_R : 0) | (src >= XMM8_REG ? REX_B : 0));
    if (rex_bits != 0) {
        emit_byte(buf, 0x40 | rex_bits);
    }

    EMIT_BYTES(buf, 0x0F, 0x5A);
    emit_modrm(buf, 3, dest % 8, src % 8);
}

/**
 * @internal
 * @brief Emits `cvtsd2ss xmm, [base + offset]`: loads a double from memory, converts it
 *        to a float, and places the result in `dest`.
 * @details Opcode format: F2 0F 5A /r
 *
 * @param buf      Code buffer receiving the encoded bytes.
 * @param dest     Destination XMM register.
 * @param src_base General-purpose register holding the base address.
 * @param offset   Signed byte displacement added to the base.
 */
INFIX_INTERNAL void emit_cvtsd2ss_xmm_mem(code_buffer * buf, x64_xmm dest, x64_gpr src_base, int32_t offset) {
    // F2 selects the scalar-double variant and is emitted ahead of any REX byte.
    emit_byte(buf, 0xF2);

    const uint8_t rex_bits = (uint8_t)((dest >= XMM8_REG ? REX_R : 0) | (src_base >= R8_REG ? REX_B : 0));
    if (rex_bits != 0) {
        emit_byte(buf, 0x40 | rex_bits);
    }

    EMIT_BYTES(buf, 0x0F, 0x5A);  // CVTSD2SS opcode

    // ModRM.mod: 0 = no displacement, 1 = disp8, 2 = disp32.
    // A base whose low three bits equal RBP_REG always carries a displacement.
    const uint8_t base_low = (uint8_t)(src_base % 8);
    const bool fits_disp8 = (offset >= -128) && (offset <= 127);
    uint8_t mod_field = fits_disp8 ? 1 : 2;
    if (offset == 0 && base_low != RBP_REG) {
        mod_field = 0;
    }

    emit_modrm(buf, mod_field, dest % 8, base_low);

    if (base_low == RSP_REG) {
        emit_byte(buf, 0x24);  // SIB byte for RSP base
    }

    switch (mod_field) {
    case 1:
        emit_byte(buf, (uint8_t)offset);
        break;
    case 2:
        emit_int32(buf, offset);
        break;
    default:
        break;
    }
}

/**
 * @internal
 * @brief Emits `movaps xmm1, xmm2/m128` to move 128 bits between XMM registers.
 * @details Opcode format: 0F 28 /r
 */
INFIX_INTERNAL void emit_movaps_xmm_xmm(code_buffer * buf, x64_xmm dest, x64_xmm src) {
    uint8_t rex = 0;
( run in 2.277 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )