Affix

 view release on metacpan or  search on metacpan

infix/src/arch/x64/abi_win_x64.c  view on Meta::CPAN

/**
 * Copyright (c) 2025 Sanko Robinson
 *
 * This source code is dual-licensed under the Artistic License 2.0 or the MIT License.
 * You may choose to use this code under the terms of either license.
 *
 * SPDX-License-Identifier: (Artistic-2.0 OR MIT)
 *
 * The documentation blocks within this file are licensed under the
 * Creative Commons Attribution 4.0 International License (CC BY 4.0).
 *
 * SPDX-License-Identifier: CC-BY-4.0
 */
/**
 * @file abi_win_x64.c
 * @brief Implements the FFI logic for the Windows x64 calling convention.
 * @ingroup internal_abi_x64
 *
 * @internal
 * This file provides the concrete implementation of the ABI spec for the Microsoft
 * x64 calling convention, used on all 64-bit versions of Windows.
 *
 * Key features and differences from the System V ABI implemented here:
 *
 * - **Register "Slots":** The first four arguments are passed in registers, but the
 *   slots are shared. RCX/XMM0 is the first slot, RDX/XMM1 is the second, etc.
 *   An `int` followed by a `float` would use RCX and XMM1.
 *
 * - **Shadow Space:** The caller must allocate a 32-byte "shadow space" on the stack
 *   for the callee.
 *
 * - **By-Reference Passing:** Aggregates (structs/unions) are passed by reference
 *   if their size is not a power of two (1, 2, 4, or 8 bytes), or if they have
 *   special constructors.
 *   **Crucially, all SIMD vectors of 16 bytes or more (__m128, __m256, __m512) are
 *   passed by reference (as a pointer in a GPR slot or on the stack).**
 *
 * - **Return Values:**
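 *   - Scalars and __m128 are returned in registers (RAX/XMM0).
 *   - **__m256 and __m512 are returned in registers (YMM0/ZMM0)**, except when built
 *     with MinGW GCC, where vectors larger than 16 bytes are returned via a hidden pointer.
 *   - Aggregates whose size is not 1, 2, 4, or 8 bytes (excluding vectors) are returned
 *     via a hidden pointer.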
 * @endinternal
 */
// This file performs many safe conversions from size_t to int32_t for instruction
// offsets. The library's internal limits (INFIX_MAX_STACK_ALLOC) ensure these
// conversions do not lose data. We disable the warning to produce a clean build.
#if defined(INFIX_COMPILER_MSVC)
#pragma warning(push)
#pragma warning(disable : 4267)  // conversion from 'size_t' to 'int32_t'
#endif
#include "arch/x64/abi_x64_common.h"
#include "arch/x64/abi_x64_emitters.h"
#include "common/infix_internals.h"
#include "common/utility.h"
#include <stdbool.h>
#include <stdlib.h>
/**
 * GPRs used for passing the first four integer/pointer arguments.
 * Indexed by argument slot (the `reg_index` stored in an argument location), so
 * slot 0 is RCX, slot 1 is RDX, slot 2 is R8 and slot 3 is R9.
 */
static const x64_gpr GPR_ARGS[] = {RCX_REG, RDX_REG, R8_REG, R9_REG};
/**
 * XMM registers used for passing the first four floating-point arguments.
 * Indexed by the same slot number as `GPR_ARGS`: slot N is either GPR_ARGS[N] or
 * XMM_ARGS[N], never both a separate GPR and a separate XMM position.
 */
static const x64_xmm XMM_ARGS[] = {XMM0_REG, XMM1_REG, XMM2_REG, XMM3_REG};
/** The number of register "slots" available for arguments (length of `GPR_ARGS`). */
#define NUM_GPR_ARGS 4
/** The number of XMM registers used for arguments (length of `XMM_ARGS`). */
#define NUM_XMM_ARGS 4
/**
 * The size in bytes of the mandatory stack space reserved by the caller for the callee.
 * Stack-passed arguments are laid out starting at this offset.
 */
#define SHADOW_SPACE 32

/*
 * Prototypes for the Windows x64 forward-trampoline stage functions. They are
 * declared ahead of their definitions so the v-table below can reference them.
 */
static infix_status prepare_forward_call_frame_win_x64(infix_arena_t * arena,
                                                       infix_call_frame_layout ** out_layout,
                                                       infix_type * ret_type,
                                                       infix_type ** arg_types,
                                                       size_t num_args,
                                                       size_t num_fixed_args,
                                                       void * target_fn);
static infix_status generate_forward_prologue_win_x64(code_buffer * buf, infix_call_frame_layout * layout);
static infix_status generate_forward_argument_moves_win_x64(code_buffer * buf,
                                                            infix_call_frame_layout * layout,
                                                            infix_type ** arg_types,
                                                            size_t num_args,
                                                            size_t num_fixed_args);
static infix_status generate_forward_call_instruction_win_x64(code_buffer *, infix_call_frame_layout *);
static infix_status generate_forward_epilogue_win_x64(code_buffer * buf,
                                                      infix_call_frame_layout * layout,
                                                      infix_type * ret_type);
/**
 * @brief The v-table of Windows x64 functions for generating forward trampolines.
 * @details Has external linkage (not `static`); each member points at the matching
 *          file-local stage function declared above.
 */
const infix_forward_abi_spec g_win_x64_forward_spec = {
    .prepare_forward_call_frame = prepare_forward_call_frame_win_x64,
    .generate_forward_prologue = generate_forward_prologue_win_x64,
    .generate_forward_argument_moves = generate_forward_argument_moves_win_x64,
    .generate_forward_call_instruction = generate_forward_call_instruction_win_x64,
    .generate_forward_epilogue = generate_forward_epilogue_win_x64};
/*
 * Prototypes for the Windows x64 reverse-trampoline (callback) stage functions,
 * declared ahead of the v-table that references them.
 */
static infix_status prepare_reverse_call_frame_win_x64(infix_arena_t * arena,
                                                       infix_reverse_call_frame_layout ** out_layout,
                                                       infix_reverse_t * context);
static infix_status generate_reverse_prologue_win_x64(code_buffer * buf, infix_reverse_call_frame_layout * layout);
static infix_status generate_reverse_argument_marshalling_win_x64(code_buffer * buf,
                                                                  infix_reverse_call_frame_layout * layout,
                                                                  infix_reverse_t * context);
static infix_status generate_reverse_dispatcher_call_win_x64(code_buffer * buf,
                                                             infix_reverse_call_frame_layout * layout,
                                                             infix_reverse_t * context);
static infix_status generate_reverse_epilogue_win_x64(code_buffer * buf,
                                                      infix_reverse_call_frame_layout * layout,
                                                      infix_reverse_t * context);
/**
 * @brief The v-table of Windows x64 functions for generating reverse trampolines.
 * @details Has external linkage (not `static`); each member points at the matching
 *          file-local stage function declared above.
 */
const infix_reverse_abi_spec g_win_x64_reverse_spec = {
    .prepare_reverse_call_frame = prepare_reverse_call_frame_win_x64,
    .generate_reverse_prologue = generate_reverse_prologue_win_x64,
    .generate_reverse_argument_marshalling = generate_reverse_argument_marshalling_win_x64,
    .generate_reverse_dispatcher_call = generate_reverse_dispatcher_call_win_x64,
    .generate_reverse_epilogue = generate_reverse_epilogue_win_x64};

/*
 * Prototypes for the Windows x64 "direct marshalling" forward stage functions,
 * declared ahead of the v-table that references them. Unlike the regular forward
 * path, the prepare stage additionally receives an array of per-argument handlers.
 */
static infix_status prepare_direct_forward_call_frame_win_x64(infix_arena_t * arena,
                                                              infix_direct_call_frame_layout ** out_layout,
                                                              infix_type * ret_type,
                                                              infix_type ** arg_types,
                                                              size_t num_args,
                                                              infix_direct_arg_handler_t * handlers,
                                                              void * target_fn);
static infix_status generate_direct_forward_prologue_win_x64(code_buffer * buf,
                                                             infix_direct_call_frame_layout * layout);
static infix_status generate_direct_forward_argument_moves_win_x64(code_buffer * buf,
                                                                   infix_direct_call_frame_layout * layout);
static infix_status generate_direct_forward_call_instruction_win_x64(code_buffer * buf,
                                                                     infix_direct_call_frame_layout * layout);
static infix_status generate_direct_forward_epilogue_win_x64(code_buffer * buf,
                                                             infix_direct_call_frame_layout * layout,
                                                             infix_type * ret_type);
/**
 * @brief The v-table for the new Direct Marshalling ABI.
 * @details Has external linkage (not `static`); each member points at the matching
 *          file-local stage function declared above.
 */
const infix_direct_forward_abi_spec g_win_x64_direct_forward_spec = {
    .prepare_direct_forward_call_frame = prepare_direct_forward_call_frame_win_x64,
    .generate_direct_forward_prologue = generate_direct_forward_prologue_win_x64,
    .generate_direct_forward_argument_moves = generate_direct_forward_argument_moves_win_x64,
    .generate_direct_forward_call_instruction = generate_direct_forward_call_instruction_win_x64,
    .generate_direct_forward_epilogue = generate_direct_forward_epilogue_win_x64};

/**
 * @internal
 * @brief Decides whether a return value travels through a hidden pointer instead of a register.
 * @details Rules applied, in order:
 * - Vectors are returned in registers. The single exception is a MinGW GCC build
 *   (GCC, not Clang), where vectors larger than 16 bytes are returned via hidden pointer.
 * - Structs, unions, arrays and complex types are returned in a register only when
 *   their size is exactly 1, 2, 4 or 8 bytes; every other size uses the hidden pointer.
 * - Primitives of 8 bytes or less are returned in a register.
 * - On GCC-family builds, `long double` is returned via hidden pointer.
 * - Everything else is returned in a register.
 *
 * @param type The return type to classify. Must not be null (it is dereferenced unconditionally).
 * @return `true` if the caller must supply a hidden return buffer pointer, `false` otherwise.
 */
static bool return_value_is_by_reference(const infix_type * type) {
    switch (type->category) {
    case INFIX_TYPE_VECTOR:
// MSVC and Clang return every vector in XMM0/YMM0/ZMM0. MinGW GCC diverges for
// __m256/__m512 and hands them back through a hidden pointer, so only that exact
// toolchain (GCC, not Clang, MinGW environment) takes the size-dependent path.
#if defined(INFIX_COMPILER_GCC) && !defined(INFIX_COMPILER_CLANG) && defined(INFIX_ENV_MINGW)
        return type->size > 16;
#else
        return false;
#endif
    case INFIX_TYPE_STRUCT:
    case INFIX_TYPE_UNION:
    case INFIX_TYPE_ARRAY:
    case INFIX_TYPE_COMPLEX:
        // Only register-sized aggregates (1, 2, 4 or 8 bytes) come back in RAX.
        return !(type->size == 1 || type->size == 2 || type->size == 4 || type->size == 8);
    default:
        break;
    }

#if defined(INFIX_COMPILER_GCC)
    // Small scalar primitives (including float16) never need the long double check.
    // GCC/Clang return long double by reference on Windows.
    const bool is_small_primitive = (type->category == INFIX_TYPE_PRIMITIVE && type->size <= 8);
    if (!is_small_primitive && is_long_double(type))
        return true;
#endif
    return false;
}
/**
 * @internal
 * @brief Decides whether an argument is passed as a pointer to its data on Windows x64.
 * @details
 * - A null type is treated as "by value".
 * - Arrays always decay to a pointer.
 * - Vectors of 16 bytes or more (__m128, __m256, __m512) are passed by reference;
 *   smaller vectors are passed by value.
 * - Primitives of 8 bytes or less (including float16) are passed by value.
 * - Anything else is passed by value only when its size is exactly 1, 2, 4 or 8 bytes.
 *
 * @param type The argument type to classify; may be null.
 * @return `true` if the argument's address is passed, `false` if its value is passed.
 */
static bool is_passed_by_reference(const infix_type * type) {
    if (type == nullptr)
        return false;

    switch (type->category) {
    case INFIX_TYPE_ARRAY:
        // Array arguments decay to pointers, so the address is what gets passed.
        return true;
    case INFIX_TYPE_VECTOR:
        // 128-bit vectors are passed by reference in some environments (like MinGW GCC);
        // 256-bit and 512-bit vectors always are.
        return type->size >= 16;
    case INFIX_TYPE_PRIMITIVE:
        if (type->size <= 8)
            return false;
        break;  // Larger primitives fall through to the generic size rule.
    default:
        break;
    }

    // Generic rule: only register-sized objects are passed by value.
    switch (type->size) {
    case 1:
    case 2:
    case 4:
    case 8:
        return false;
    default:
        return true;
    }
}

/**
 * @internal
 * @brief Stage 1 (Forward): Analyzes a signature and creates a call frame layout for Windows x64.
 * @details Assigns each argument to one of the four shared register "slots" or, once
 *          those are used up, to the stack area that starts right after the 32-byte
 *          shadow space. If the return value is passed by reference, the hidden return
 *          pointer consumes the first slot (RCX).
 *
 *          Placement rules:
 *          - Fixed float16/float/double arguments and by-value vectors take the XMM
 *            register of their slot; everything else (including by-reference and
 *            variadic arguments) takes the GPR of its slot.
 *          - A by-reference argument on the stack occupies one 8-byte pointer slot.
 *            The slot is aligned as a pointer, not to the alignment of the object it
 *            points to, so over-aligned types do not open gaps between stack slots.
 *          - A by-value argument on the stack occupies its size rounded up to 8 bytes.
 *
 * @param arena The temporary arena for allocations.
 * @param out_layout Receives the created layout blueprint; set to null on every failure path.
 * @param ret_type The function's return type. Must not be null.
 * @param arg_types Array of argument types. May be null only when `num_args` is 0;
 *                  no element may be null.
 * @param num_args Total number of arguments.
 * @param num_fixed_args Number of non-variadic arguments.
 * @param target_fn The target function address (null for an unbound trampoline).
 * @return `INFIX_SUCCESS` on success, `INFIX_ERROR_INVALID_ARGUMENT` for a null
 *         `out_layout`, `ret_type`, `arg_types` (with `num_args > 0`) or argument type,
 *         `INFIX_ERROR_ALLOCATION_FAILED` if the arena cannot satisfy a request, and
 *         `INFIX_ERROR_LAYOUT_FAILED` if the required stack space exceeds the library limits.
 */
static infix_status prepare_forward_call_frame_win_x64(infix_arena_t * arena,
                                                       infix_call_frame_layout ** out_layout,
                                                       infix_type * ret_type,
                                                       infix_type ** arg_types,
                                                       size_t num_args,
                                                       size_t num_fixed_args,
                                                       void * target_fn) {
    if (out_layout == nullptr)
        return INFIX_ERROR_INVALID_ARGUMENT;
    // The signature is dereferenced unconditionally below, so reject missing pieces up front.
    if (ret_type == nullptr || (arg_types == nullptr && num_args > 0)) {
        *out_layout = nullptr;
        return INFIX_ERROR_INVALID_ARGUMENT;
    }
    infix_call_frame_layout * layout =
        infix_arena_calloc(arena, 1, sizeof(infix_call_frame_layout), _Alignof(infix_call_frame_layout));
    if (layout == nullptr) {
        *out_layout = nullptr;
        return INFIX_ERROR_ALLOCATION_FAILED;
    }
    layout->is_variadic = num_args > num_fixed_args;
    layout->target_fn = target_fn;
    INFIX_DEBUG_PRINTF("Allocating %llu bytes for arg_locations in temp_arena",
                       (unsigned long long)(num_args * sizeof(infix_arg_location)));
    layout->arg_locations =
        infix_arena_calloc(arena, num_args, sizeof(infix_arg_location), _Alignof(infix_arg_location));
    if (layout->arg_locations == nullptr && num_args > 0) {
        *out_layout = nullptr;
        return INFIX_ERROR_ALLOCATION_FAILED;
    }
    layout->return_value_in_memory = return_value_is_by_reference(ret_type);
    size_t arg_position = 0;
    if (layout->return_value_in_memory)
        arg_position++;  // The hidden return pointer consumes the first slot (RCX).
    size_t current_stack_offset = SHADOW_SPACE;
    size_t max_align = 16;
    layout->num_stack_args = 0;
    for (size_t i = 0; i < num_args; ++i) {
        infix_type * current_type = arg_types[i];
        if (current_type == nullptr) {
            *out_layout = nullptr;
            return INFIX_ERROR_INVALID_ARGUMENT;
        }
        if (current_type->alignment > max_align)
            max_align = current_type->alignment;
        // Vectors count as floating-point so that by-value vectors land in an XMM slot.
        bool is_fp = is_float16(current_type) || is_float(current_type) || is_double(current_type) ||
            (current_type->category == INFIX_TYPE_VECTOR);
        // By-reference arguments are never treated as FP register arguments in the slot
        // assignment below: only their address is passed, so they go to GPR slots.
        bool is_ref = is_passed_by_reference(current_type);
        bool is_variadic_arg = (i >= num_fixed_args);

        if (arg_position < 4) {
            // Slots are shared: slot N is either GPR N or XMM N, and the position
            // advances by one no matter which register class is chosen.
            bool use_xmm = is_fp && !is_ref && !is_variadic_arg;
            layout->arg_locations[i].type = use_xmm ? ARG_LOCATION_XMM : ARG_LOCATION_GPR;
            layout->arg_locations[i].reg_index = (uint8_t)arg_position++;
        }
        else {
            layout->arg_locations[i].type = ARG_LOCATION_STACK;
            // A by-reference argument only stores a pointer on the stack, so its slot
            // must be pointer-aligned. Aligning it to the pointee's alignment would
            // shift this and all following stack arguments away from the contiguous
            // 8-byte slots the callee reads.
            size_t slot_align = is_ref ? 8 : current_type->alignment;
            current_stack_offset = _infix_align_up(current_stack_offset, slot_align);
            layout->arg_locations[i].stack_offset = (uint32_t)current_stack_offset;
            layout->num_stack_args++;
            // Calculate space needed on the stack for this argument.
            // By-reference types (including arrays/vectors) are just a pointer (8 bytes).
            size_t arg_stack_space = is_ref ? 8 : ((current_type->size + 7) & ~7);
            current_stack_offset += arg_stack_space;
            // Bail out before the running offset can grow past the library's limit.
            if (current_stack_offset > INFIX_MAX_ARG_SIZE) {
                *out_layout = nullptr;
                return INFIX_ERROR_LAYOUT_FAILED;
            }
        }
    }
    if (ret_type->alignment > max_align)
        max_align = ret_type->alignment;

    size_t total_stack_arg_size = current_stack_offset;
    // Total allocation must include shadow space and be aligned to max_align.
    layout->total_stack_alloc = (uint32_t)_infix_align_up(total_stack_arg_size, max_align);
    layout->max_align = (uint32_t)max_align;
    // Prevent integer overflow and excessive stack allocation.
    if (layout->total_stack_alloc > INFIX_MAX_STACK_ALLOC) {
        fprintf(stderr, "Error: Calculated stack allocation exceeds safe limit of %d bytes.\n", INFIX_MAX_STACK_ALLOC);
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }
    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Forward): Generates the function prologue for the Windows x64 trampoline.
 * @details The emitted instruction sequence is, in order:
 *          1.  `push rbp`, then `push r12` .. `push r15`: saves the registers the
 *              trampoline uses to hold its own context.
 *          2.  `mov rbp, rsp`: RBP is captured *after* all pushes, so it points at the
 *              saved R15 and can later be used to restore RSP directly.
 *          3.  `and rsp, -16`: forces 16-byte stack alignment.
 *          4.  `mov r12/r13/r14, ...`: moves the trampoline's own arguments into the
 *              preserved registers (layout depends on bound vs. unbound trampolines).
 *          5.  `sub rsp, imm32`: allocates the stack space computed by the prepare stage.
 *
 *          The buffer size after step 2 is recorded in `layout->prologue_size`.
 *
 * @param buf The code buffer to write the assembly into.
 * @param layout The call frame layout; supplies `target_fn` and `total_stack_alloc`,
 *               and receives `prologue_size`.
 * @return `INFIX_SUCCESS` on successful code generation.
 */
static infix_status generate_forward_prologue_win_x64(code_buffer * buf, infix_call_frame_layout * layout) {
    // Preserve the frame pointer and the four callee-saved context registers.
    emit_push_reg(buf, RBP_REG);
    emit_push_reg(buf, R12_REG);  // target function address
    emit_push_reg(buf, R13_REG);  // return value pointer
    emit_push_reg(buf, R14_REG);  // argument pointers array
    emit_push_reg(buf, R15_REG);  // scratch register for data moves
    // Anchor RBP below the saved registers so the epilogue can restore RSP from it.
    emit_mov_reg_reg(buf, RBP_REG, RSP_REG);

    layout->prologue_size = (uint32_t)buf->size;

    // and rsp, -16 : guarantee 16-byte alignment regardless of the incoming RSP.
    emit_and_reg_imm8(buf, RSP_REG, -16);

    // Stash the trampoline's incoming arguments in non-volatile registers.
    const bool is_bound = (layout->target_fn != nullptr);
    if (is_bound) {
        // Bound signature: (ret_ptr, args_ptr) arrive in RCX, RDX.
        emit_mov_reg_reg(buf, R13_REG, RCX_REG);
        emit_mov_reg_reg(buf, R14_REG, RDX_REG);
    }
    else {
        // Unbound signature: (target_fn, ret_ptr, args_ptr) arrive in RCX, RDX, R8.
        emit_mov_reg_reg(buf, R12_REG, RCX_REG);
        emit_mov_reg_reg(buf, R13_REG, RDX_REG);
        emit_mov_reg_reg(buf, R14_REG, R8_REG);
    }

    // Reserve the shadow space plus any stack-passed argument area.
    if (layout->total_stack_alloc > 0) {
        emit_sub_reg_imm32(buf, RSP_REG, (int32_t)layout->total_stack_alloc);
    }

    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3 (Forward): Generates code to move arguments into their native locations.
 * @details This function iterates through the layout blueprint and emits `mov` instructions
 *          to place each argument into its assigned register or stack slot.
 *
 *          Key behaviors implemented:
 *          - **Register Arguments:** Loads data into the correct GPR or XMM register.
 *          - **Sign-Extension:** Uses `movsxd` for signed integers smaller than 64 bits.
 *          - **By-Reference Arguments:** Loads the pointer directly into the GPR.
 *          - **Stack Arguments:** Copies data to the stack, past the 32-byte shadow space.
 *          - **Variadic Floats:** Correctly passes float/double arguments in both the
 *            appropriate GPR and XMM register for variadic functions.
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @param arg_types The array of argument types.
 * @param num_args Total number of arguments.
 * @param num_fixed_args Number of fixed arguments.
 * @return `INFIX_SUCCESS` on success.
 */
static infix_status generate_forward_argument_moves_win_x64(code_buffer * buf,
                                                            infix_call_frame_layout * layout,
                                                            infix_type ** arg_types,
                                                            size_t num_args,
                                                            size_t num_fixed_args) {
    // If returning a large struct, the hidden pointer (stored in r13) must be moved to RCX.
    if (layout->return_value_in_memory)
        emit_mov_reg_reg(buf, GPR_ARGS[0], R13_REG);
    // Marshall Register Arguments
    for (size_t i = 0; i < num_args; ++i) {
        infix_arg_location * loc = &layout->arg_locations[i];
        if (loc->type == ARG_LOCATION_STACK)
            continue;  // Handle stack args later.
        infix_type * current_type = arg_types[i];
        bool is_variadic_arg = (i >= num_fixed_args);
        // R15 = pointer to the current argument's data from the args_array.
        emit_mov_reg_mem(buf, R15_REG, R14_REG, (int32_t)(i * sizeof(void *)));
        if (loc->type == ARG_LOCATION_GPR) {
            if (is_passed_by_reference(current_type))

infix/src/arch/x64/abi_win_x64.c  view on Meta::CPAN

                    if (current_type->size == 1)
                        emit_movzx_reg64_mem8(buf, GPR_ARGS[loc->reg_index], R15_REG, 0);
                    else if (current_type->size == 2 || is_float16(current_type))
                        emit_movzx_reg64_mem16(buf, GPR_ARGS[loc->reg_index], R15_REG, 0);
                    else if (current_type->size == 4)
                        emit_mov_reg32_mem(buf, GPR_ARGS[loc->reg_index], R15_REG, 0);
                    else
                        emit_mov_reg_mem(buf, GPR_ARGS[loc->reg_index], R15_REG, 0);
                }
            }
        }
        else {  // ARG_LOCATION_XMM
            if (is_float16(current_type)) {
                // Half-precision is passed in the low 16 bits of the XMM register.
                // We use movzx to load exactly 16 bits from memory to avoid over-reading local variables.
                emit_movzx_reg64_mem16(buf, RAX_REG, R15_REG, 0);
                emit_movq_xmm_gpr(buf, XMM_ARGS[loc->reg_index], RAX_REG);
            }
            else if (is_float(current_type))
                emit_movss_xmm_mem(buf, XMM_ARGS[loc->reg_index], R15_REG, 0);
            else if (current_type->category == INFIX_TYPE_VECTOR)
                emit_movups_xmm_mem(buf, XMM_ARGS[loc->reg_index], R15_REG, 0);
            else
                emit_movsd_xmm_mem(buf, XMM_ARGS[loc->reg_index], R15_REG, 0);
        }
    }
    // Marshall Stack Arguments
    for (size_t i = 0; i < num_args; ++i) {
        if (layout->arg_locations[i].type != ARG_LOCATION_STACK)
            continue;
        infix_type * current_type = arg_types[i];
        infix_arg_location * loc = &layout->arg_locations[i];
        // R15 = pointer to the argument's data.
        emit_mov_reg_mem(buf, R15_REG, R14_REG, i * sizeof(void *));
        if (is_passed_by_reference(current_type))
            // Arrays/By-Ref: R15 IS the pointer. Store it on the stack.
            emit_mov_mem_reg(buf, RSP_REG, loc->stack_offset, R15_REG);
        else {
            // Copy the argument data from the user's buffer to the stack, 8 bytes at a time.
            for (size_t offset = 0; offset < current_type->size; offset += 8) {
                emit_mov_reg_mem(buf, RAX_REG, R15_REG, offset);                      // Load 8 bytes into scratch reg
                emit_mov_mem_reg(buf, RSP_REG, loc->stack_offset + offset, RAX_REG);  // Store to stack
            }
        }
    }
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3.5 (Forward): Generates the null-check and call instruction.
 * @details The target address always ends up in R12: a bound trampoline loads the
 *          hardcoded address here, while an unbound trampoline relies on R12 having
 *          been filled from its first argument by the prologue. The emitted code then
 *          tests R12 and executes `ud2` instead of the call when it is zero.
 * @param buf The code buffer.
 * @param layout The call frame layout; `target_fn` selects bound vs. unbound behaviour.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_forward_call_instruction_win_x64(code_buffer * buf,
                                                              c23_maybe_unused infix_call_frame_layout * layout) {
    // Bound trampoline: materialise the known target address in R12.
    if (layout->target_fn != nullptr)
        emit_mov_reg_imm64(buf, R12_REG, (uint64_t)layout->target_fn);

    // Guard against calling through a null pointer:
    //   test r12, r12
    //   jnz  +2        ; non-null -> hop over the trap
    //   ud2            ; null -> raise an invalid-opcode fault
    //   call r12
    emit_test_reg_reg(buf, R12_REG, R12_REG);
    emit_jnz_short(buf, 2);
    emit_ud2(buf);
    emit_call_reg(buf, R12_REG);

    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 4 (Forward): Generates the function epilogue for the Windows x64 trampoline.
 * @details Emits the code that copies the native return value into the caller-supplied
 *          return buffer (whose address lives in R13) and then tears down the frame.
 *
 *          Return value handling (skipped for `void` and for values returned via
 *          hidden pointer):
 *          - float16: low 16 bits of XMM0, moved through RAX.
 *          - float / double: `movss` / `movsd` from XMM0.
 *          - 16-byte primitives and 16-byte vectors: `movups` from XMM0.
 *          - 32-byte / 64-byte vectors: `vmovupd` from YMM0 / ZMM0.
 *          - everything else: RAX, stored with a 1/2/4/8-byte wide move.
 *
 *          Teardown: the prologue set RBP to RSP *after* pushing RBP and R12-R15, so
 *          `mov rsp, rbp` puts RSP back on the saved R15 regardless of the alignment
 *          and allocation done afterwards. The registers are then popped in reverse
 *          push order (R15, R14, R13, R12, RBP) before `ret`.
 *
 *          The buffer size on entry is recorded in `layout->epilogue_offset`.
 *
 * @param buf The code buffer.
 * @param layout The call frame layout.
 * @param ret_type The `infix_type` of the function's return value.
 * @return `INFIX_SUCCESS` on successful code generation.
 */
static infix_status generate_forward_epilogue_win_x64(code_buffer * buf,
                                                      infix_call_frame_layout * layout,
                                                      infix_type * ret_type) {
    layout->epilogue_offset = (uint32_t)buf->size;

    // Nothing to store for void, and by-reference results were already written by the callee.
    const bool result_in_register = !(ret_type->category == INFIX_TYPE_VOID || layout->return_value_in_memory);
    if (result_in_register) {
        if (is_float16(ret_type)) {
            // Half-precision comes back in the low 16 bits of XMM0: route it through RAX
            // and store just those 16 bits.
            emit_movq_gpr_xmm(buf, RAX_REG, XMM0_REG);
            emit_mov_mem_reg16(buf, R13_REG, 0, RAX_REG);
        }
        else if (is_float(ret_type)) {
            emit_movss_mem_xmm(buf, R13_REG, 0, XMM0_REG);
        }
        else if (is_double(ret_type)) {
            emit_movsd_mem_xmm(buf, R13_REG, 0, XMM0_REG);
        }
        else {
            const bool is_vector = (ret_type->category == INFIX_TYPE_VECTOR);
            const bool is_primitive = (ret_type->category == INFIX_TYPE_PRIMITIVE);

            if (ret_type->size == 16 && (is_primitive || is_vector)) {
                // `__int128_t` (on GCC/Clang) and 16-byte vectors are returned in XMM0.
                emit_movups_mem_xmm(buf, R13_REG, 0, XMM0_REG);
            }
            else if (is_vector && ret_type->size == 32) {
                emit_vmovupd_mem_ymm(buf, R13_REG, 0, XMM0_REG);
            }
            else if (is_vector && ret_type->size == 64) {
                emit_vmovupd_mem_zmm(buf, R13_REG, 0, XMM0_REG);
            }
            // Remaining by-value types live in RAX; pick the store matching their width.
            else if (ret_type->size == 1) {
                emit_mov_mem_reg8(buf, R13_REG, 0, RAX_REG);
            }
            else if (ret_type->size == 2) {
                emit_mov_mem_reg16(buf, R13_REG, 0, RAX_REG);
            }
            else if (ret_type->size == 4) {
                emit_mov_mem_reg32(buf, R13_REG, 0, RAX_REG);
            }
            else if (ret_type->size == 8) {
                emit_mov_mem_reg(buf, R13_REG, 0, RAX_REG);
            }
            // Any other size is not expected here and emits no store.
        }
    }

    // Signatures involving 32-byte (or wider) alignment get a vzeroupper before returning.
    if (layout->max_align >= 32) {
        emit_vzeroupper(buf);
    }

    // mov rsp, rbp : RBP was captured right after the register pushes.
    emit_mov_reg_reg(buf, RSP_REG, RBP_REG);

    // Undo the pushes in reverse order and return to the caller.
    emit_pop_reg(buf, R15_REG);
    emit_pop_reg(buf, R14_REG);
    emit_pop_reg(buf, R13_REG);
    emit_pop_reg(buf, R12_REG);
    emit_pop_reg(buf, RBP_REG);
    emit_ret(buf);

    return INFIX_SUCCESS;
}
/**
 * @internal

infix/src/arch/x64/abi_win_x64.c  view on Meta::CPAN

    for (size_t i = 0; i < context->num_args; ++i) {
        if (context->arg_types[i] == nullptr) {
            *out_layout = nullptr;
            _infix_set_error(INFIX_CATEGORY_ABI, INFIX_CODE_INVALID_MEMBER_TYPE, 0);
            return INFIX_ERROR_INVALID_ARGUMENT;
        }
        size_t align = context->arg_types[i]->alignment;
        if (align < 8)
            align = 8;
        if (align > max_align)
            max_align = align;

        if (!is_passed_by_reference(context->arg_types[i])) {
            saved_args_data_size = _infix_align_up(saved_args_data_size, align);
            saved_args_data_size += context->arg_types[i]->size;
        }
    }
    // Security: Check against excessively large argument data size.
    if (saved_args_data_size > INFIX_MAX_ARG_SIZE) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }

    // The total space needed includes all local data plus the shadow space for the call to the C dispatcher.
    size_t total_local_space = return_size + args_array_size + saved_args_data_size + gpr_reg_save_area_size +
        xmm_reg_save_area_size + SHADOW_SPACE;

    // Add max_align to account for potential internal padding
    total_local_space += max_align;

    // Prevent integer overflow from fuzzer-provided types that are impractically large by ensuring the total required
    // stack space is within a safe limit.
    if (total_local_space > INFIX_MAX_STACK_ALLOC) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }

    // The total allocation for the stack frame must be aligned to the maximum required alignment.
    layout->total_stack_alloc = (uint32_t)_infix_align_up(total_local_space, max_align);

    // Define the layout of our local stack variables relative to RSP after allocation.
    // [ shadow space (32) | return_buffer | gpr_save | xmm_save | args_array | (padding) | saved_args_data ]
    layout->return_buffer_offset = (int32_t)_infix_align_up(SHADOW_SPACE, max_align);
    layout->gpr_save_area_offset = layout->return_buffer_offset + (int32_t)_infix_align_up(return_size, max_align);
    layout->xmm_save_area_offset =
        layout->gpr_save_area_offset + (int32_t)_infix_align_up(gpr_reg_save_area_size, max_align);
    layout->args_array_offset =
        layout->xmm_save_area_offset + (int32_t)_infix_align_up(xmm_reg_save_area_size, max_align);

    // Ensure proper alignment for the saved arguments area.
    layout->saved_args_offset =
        (int32_t)_infix_align_up((size_t)(layout->args_array_offset + args_array_size), max_align);

    layout->max_align = (uint32_t)max_align;

    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Reverse): Generates the prologue for the reverse trampoline stub.
 * @details Emits the standard Windows x64 function entry code. This involves:
 *          1. Creating a standard stack frame (`push rbp; mov rbp, rsp`).
 *          2. Saving any non-volatile registers that the stub will use as scratch space
 *             (RSI and RDI in this implementation).
 *          3. **Forcing stack alignment** (`and rsp, -16`).
 *          4. Allocating all necessary local stack space for the stub's internal
 *             data structures, as calculated in the `prepare` stage.
 *
 * @param buf The code buffer to write the assembly into.
 * @param layout The blueprint containing the total stack space to allocate.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_prologue_win_x64(code_buffer * buf, infix_reverse_call_frame_layout * layout) {
    // Preserve the caller's frame pointer and the two non-volatile registers the stub
    // borrows as scratch (RSI, RDI), then anchor RBP at the resulting stack top.
    emit_push_reg(buf, RBP_REG);
    emit_push_reg(buf, RSI_REG);
    emit_push_reg(buf, RDI_REG);
    emit_mov_reg_reg(buf, RBP_REG, RSP_REG);

    // Record the buffer size reached after the frame setup as the prologue size.
    layout->prologue_size = (uint32_t)buf->size;

    // Round RSP down to the strictest alignment the signature needs (16, 32, or 64)
    // by AND-ing it with the negated alignment.
    emit_and_reg_imm8(buf, RSP_REG, -(int8_t)layout->max_align);

    // An empty frame needs no SUB at all.
    if (layout->total_stack_alloc == 0) {
        return INFIX_SUCCESS;
    }

    // Reserve everything the prepare stage budgeted: shadow space, return buffer,
    // register save areas and the args_array.
    emit_sub_reg_imm32(buf, RSP_REG, (int32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3 (Reverse): Generates code to marshal arguments into the generic `void**` array.
 * @details This function performs the "un-marshalling" of arguments from their native
 *          locations into the generic format expected by the C dispatcher.
 *
 *          The process is as follows:
 *          1.  **Save All Argument Registers:** It first saves all four potential integer
 *              argument registers (RCX, RDX, R8, R9) and all four potential floating-point
 *              registers (XMM0-3) to a dedicated save area on the local stack. This
 *              captures all register-based arguments in one place.
 *
 *          2.  **Populate `args_array`:** It then iterates through the function's expected
 *              arguments and generates code to populate the `args_array`. For each argument:
 *              a. It determines if the argument was passed in a register or on the stack.
 *              b. If passed by reference, it gets the pointer directly from the register
 *                 save area or the caller's stack.
 *              c. If passed by value, it gets a pointer *to the saved copy* of the value.
 *              d. This pointer is then stored in the correct slot of the `args_array`.
 *
 * @param buf The code buffer.
 * @param layout The blueprint containing stack offsets for the save areas and `args_array`.
 * @param context The context containing the argument type information for the callback.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_argument_marshalling_win_x64(code_buffer * buf,
                                                                  infix_reverse_call_frame_layout * layout,
                                                                  infix_reverse_t * context) {
    // Step 1: Save all potential incoming argument registers to our local stack.
    // Use 64-byte offsets to support AVX-512 in the stack layout.
    emit_mov_mem_reg(buf, RSP_REG, layout->gpr_save_area_offset + 0 * 8, RCX_REG);
    emit_mov_mem_reg(buf, RSP_REG, layout->gpr_save_area_offset + 1 * 8, RDX_REG);
    emit_mov_mem_reg(buf, RSP_REG, layout->gpr_save_area_offset + 2 * 8, R8_REG);
    emit_mov_mem_reg(buf, RSP_REG, layout->gpr_save_area_offset + 3 * 8, R9_REG);

    if (layout->max_align >= 32) {
        // AVX enabled: Save full 256 bits
        emit_vmovupd_mem_ymm(buf, RSP_REG, layout->xmm_save_area_offset + 0 * 64, XMM0_REG);
        emit_vmovupd_mem_ymm(buf, RSP_REG, layout->xmm_save_area_offset + 1 * 64, XMM1_REG);
        emit_vmovupd_mem_ymm(buf, RSP_REG, layout->xmm_save_area_offset + 2 * 64, XMM2_REG);
        emit_vmovupd_mem_ymm(buf, RSP_REG, layout->xmm_save_area_offset + 3 * 64, XMM3_REG);
    }
    else {
        // SSE only: Save 128 bits
        emit_movups_mem_xmm(buf, RSP_REG, layout->xmm_save_area_offset + 0 * 64, XMM0_REG);
        emit_movups_mem_xmm(buf, RSP_REG, layout->xmm_save_area_offset + 1 * 64, XMM1_REG);
        emit_movups_mem_xmm(buf, RSP_REG, layout->xmm_save_area_offset + 2 * 64, XMM2_REG);
        emit_movups_mem_xmm(buf, RSP_REG, layout->xmm_save_area_offset + 3 * 64, XMM3_REG);

infix/src/arch/x64/abi_win_x64.c  view on Meta::CPAN

            else
                emit_lea_reg_mem(buf, RAX_REG, RBP_REG, caller_stack_offset);
            emit_mov_mem_reg(buf, RSP_REG, layout->args_array_offset + (int32_t)i * sizeof(void *), RAX_REG);
            stack_slot_offset += (passed_by_ref ? 8 : (current_type->size + 7)) / 8;
        }
        if (!passed_by_ref)
            current_saved_data_offset += current_type->size;
    }
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 4 (Reverse): Generates the code to call the high-level C dispatcher function.
 * @details This function emits the instructions to load the three arguments for the C
 *          dispatcher into the correct registers according to the Windows x64 ABI,
 *          then calls the dispatcher.
 *
 *          The C dispatcher's signature is:
 *          `void fn(infix_reverse_t* context, void* return_value_ptr, void** args_array)`
 *
 *          The generated code performs the following argument setup:
 *          1. `RCX` (Arg 1): The `context` pointer (a 64-bit immediate).
 *          2. `RDX` (Arg 2): The pointer to the return value buffer. This is either a
 *             pointer to local stack space, or the original pointer passed by the
 *             caller in RCX if the function returns a large struct by reference.
 *          3. `R8` (Arg 3): The pointer to the `args_array` on the local stack.
 *          4. The address of the dispatcher function itself is loaded into `R9`,
 *             which is then called.
 * @param buf The code buffer.
 * @param layout The blueprint containing stack offsets.
 * @param context The context, containing the dispatcher's address.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_dispatcher_call_win_x64(code_buffer * buf,
                                                             infix_reverse_call_frame_layout * layout,
                                                             infix_reverse_t * context) {
    // RCX (arg 1): the `context` pointer, embedded in the stub as a 64-bit immediate.
    emit_mov_reg_imm64(buf, RCX_REG, (uint64_t)context);

    // RDX (arg 2): the location the dispatcher must write the return value to.
    if (!return_value_is_by_reference(context->return_type)) {
        // By-value return: pass the address of the stub's own return buffer.
        emit_lea_reg_mem(buf, RDX_REG, RSP_REG, layout->return_buffer_offset);
    }
    else {
        // By-reference return: the caller's hidden destination pointer arrived in RCX and
        // was spilled to the first slot of the GPR save area during marshalling; reload it.
        emit_mov_reg_mem(buf, RDX_REG, RSP_REG, layout->gpr_save_area_offset);
    }

    // R8 (arg 3): the address of the `args_array` inside the stub's frame.
    emit_lea_reg_mem(buf, R8_REG, RSP_REG, layout->args_array_offset);

    // Signatures whose maximum alignment is 32 or more get a VZEROUPPER before control
    // passes to the C dispatcher.
    // NOTE(review): presumably this avoids AVX/SSE transition penalties in the callee - confirm.
    if (layout->max_align >= 32) {
        emit_vzeroupper(buf);
    }

    // R9 is free at this point (only three arguments are passed), so it carries the
    // dispatcher's absolute address for the indirect call.
    emit_mov_reg_imm64(buf, R9_REG, (uint64_t)context->internal_dispatcher);
    emit_call_reg(buf, R9_REG);
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 5 (Reverse): Generates the epilogue for the reverse trampoline stub.
 * @details After the C dispatcher returns, this code is responsible for the final steps
 *          of the reverse trampoline. It retrieves the return value from the buffer on
 *          the stub's local stack and places it into the correct native return register
 *          (`RAX`, `XMM0`, or `YMM0`/`ZMM0` for 32/64-byte vectors) as required by the
 *          Windows x64 ABI.
 *
 *          It then restores the stack pointer using `MOV RSP, RBP` (RBP was set after all
 *          register pushes in the prologue) to undo the dynamic alignment and the local
 *          allocation, pops the saved registers (RDI, RSI, RBP), and returns.
 *
 * @param buf The code buffer.
 * @param layout The blueprint containing stack offsets.
 * @param context The context containing the return type information.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_epilogue_win_x64(code_buffer * buf,
                                                      infix_reverse_call_frame_layout * layout,
                                                      infix_reverse_t * context) {
    // Classify the return type once; these answers drive both the VZEROUPPER decision
    // and the choice of return register further down.
    const bool ret_by_ref = return_value_is_by_reference(context->return_type);
    const bool ret_is_vector = (context->return_type->category == INFIX_TYPE_VECTOR);

    // GCC/Clang on Windows return 128-bit integers and long double in XMM0; this case
    // is compiled out entirely for MSVC builds.
    bool ret_is_xmm_primitive = false;
#if !defined(INFIX_COMPILER_MSVC)
    ret_is_xmm_primitive =
        (context->return_type->size == 16 && context->return_type->category == INFIX_TYPE_PRIMITIVE);
#endif

    if (layout->max_align >= 32) {
        // VZEROUPPER would wipe the upper part of a result travelling in YMM0/ZMM0, so it
        // is skipped exactly when a 32-byte-or-larger vector is returned in a register.
        const bool ymm_zmm_result = ret_is_vector && context->return_type->size >= 32 && !ret_by_ref;
        if (!ymm_zmm_result) {
            emit_vzeroupper(buf);
        }
    }

    // Move the dispatcher's result into the register the native caller expects.
    if (context->return_type->category != INFIX_TYPE_VOID) {
        const int32_t ret_buf = layout->return_buffer_offset;

        if (ret_by_ref) {
            // The value was already written through the hidden pointer; the ABI wants that
            // same pointer (originally in RCX, spilled to GPR save slot 0) back in RAX.
            emit_mov_reg_mem(buf, RAX_REG, RSP_REG, layout->gpr_save_area_offset);
        }
        else if (ret_is_xmm_primitive) {
            emit_movups_xmm_mem(buf, XMM0_REG, RSP_REG, ret_buf);
        }
        else if (ret_is_vector) {
            // Pick the load width from the vector size; anything other than 64 or 32
            // bytes is loaded as a 16-byte XMM value.
            switch (context->return_type->size) {
            case 64:
                emit_vmovupd_zmm_mem(buf, XMM0_REG, RSP_REG, ret_buf);
                break;
            case 32:
                emit_vmovupd_ymm_mem(buf, XMM0_REG, RSP_REG, ret_buf);
                break;
            default:
                emit_movups_xmm_mem(buf, XMM0_REG, RSP_REG, ret_buf);
                break;
            }
        }
        else if (is_float16(context->return_type)) {
            // Half-precision travels in the low 16 bits of XMM0: zero-extend the 16-bit
            // value into RAX, then transfer RAX to XMM0.
            emit_movzx_reg64_mem16(buf, RAX_REG, RSP_REG, ret_buf);
            emit_movq_xmm_gpr(buf, XMM0_REG, RAX_REG);
        }
        else if (is_float(context->return_type)) {
            emit_movss_xmm_mem(buf, XMM0_REG, RSP_REG, ret_buf);
        }
        else if (is_double(context->return_type)) {
            emit_movsd_xmm_mem(buf, XMM0_REG, RSP_REG, ret_buf);
        }
        else {
            // Integers, pointers and small by-value structs all come back in RAX.
            emit_mov_reg_mem(buf, RAX_REG, RSP_REG, ret_buf);
        }
    }

    // Tear the frame down. RBP was captured after the three pushes in the prologue, so
    // `mov rsp, rbp` discards both the local allocation and the alignment padding and
    // leaves RSP pointing at the saved registers, which are popped in reverse push order.
    emit_mov_reg_reg(buf, RSP_REG, RBP_REG);
    emit_pop_reg(buf, RDI_REG);
    emit_pop_reg(buf, RSI_REG);
    emit_pop_reg(buf, RBP_REG);

    emit_ret(buf);
    return INFIX_SUCCESS;
}

infix/src/arch/x64/abi_win_x64.c  view on Meta::CPAN

    size_t outgoing_stack_offset = SHADOW_SPACE;
    size_t temp_space_offset = 0;

    // First pass: Classify ABI locations and calculate required temporary space.
    for (size_t i = 0; i < num_args; ++i) {
        const infix_type * type = arg_types[i];
        layout->args[i].type = type;
        layout->args[i].handler = &handlers[i];

        // Allocate temporary space for this argument's marshalled result.
        if (handlers[i].aggregate_marshaller) {
            temp_space_offset = _infix_align_up(temp_space_offset, type->alignment);
            layout->args[i].location.num_regs = (uint32_t)temp_space_offset;  // Store temp offset in num_regs
            temp_space_offset += type->size;
        }
        else if (handlers[i].scalar_marshaller) {
            temp_space_offset = _infix_align_up(temp_space_offset, 16);
            layout->args[i].location.num_regs = (uint32_t)temp_space_offset;  // Temp save slot for scalar
            temp_space_offset += 16;
        }
        else if (handlers[i].writeback_handler) {  // For out-only parameters
            const infix_type * pointee =
                (type->category == INFIX_TYPE_POINTER) ? type->meta.pointer_info.pointee_type : type;
            temp_space_offset = _infix_align_up(temp_space_offset, pointee->alignment);
            layout->args[i].location.num_regs = (uint32_t)temp_space_offset;
            temp_space_offset += pointee->size;
        }

        // Determine final ABI location.
        bool is_fp = is_float16(type) || is_float(type) || is_double(type) || (type->category == INFIX_TYPE_VECTOR);
        bool by_ref =
            is_passed_by_reference(type) || (type->category == INFIX_TYPE_POINTER && handlers[i].aggregate_marshaller);

        if (arg_position < 4) {
            if (is_fp && !by_ref) {
                layout->args[i].location.type = ARG_LOCATION_XMM;
                layout->args[i].location.reg_index = (uint8_t)arg_position++;
            }
            else {
                layout->args[i].location.type = ARG_LOCATION_GPR;
                layout->args[i].location.reg_index = (uint8_t)arg_position++;
            }
        }
        else
            layout->args[i].location.type = ARG_LOCATION_STACK;
    }

    // Second pass: Calculate final outgoing stack offsets and total allocation size.
    for (size_t i = 0; i < num_args; ++i) {
        if (layout->args[i].location.type == ARG_LOCATION_STACK) {
            const infix_type * type = arg_types[i];
            bool by_ref = is_passed_by_reference(type) ||
                (type->category == INFIX_TYPE_POINTER && handlers[i].aggregate_marshaller);
            // Overwrite the temp offset with the final outgoing offset.
            layout->args[i].location.stack_offset = (uint32_t)outgoing_stack_offset;
            size_t size_on_stack = by_ref ? 8 : ((type->size + 7) & ~7);
            outgoing_stack_offset += size_on_stack;
        }
    }

    // Ensure the base of the scratch area is 16-byte aligned, matching Generate phase logic.
    size_t scratch_base_offset = (outgoing_stack_offset + 15) & ~15;

    size_t total_needed = scratch_base_offset + temp_space_offset;
    layout->total_stack_alloc = (total_needed + 15) & ~15;

    // Final pass: Adjust temp/scratch offsets to be relative to RSP after allocation.
    size_t temp_base_offset = scratch_base_offset;
    for (size_t i = 0; i < num_args; ++i) {
        if (layout->args[i].handler->aggregate_marshaller || layout->args[i].handler->scalar_marshaller ||
            layout->args[i].handler->writeback_handler) {
            layout->args[i].location.num_regs += temp_base_offset;
        }
    }

    if (layout->total_stack_alloc > INFIX_MAX_STACK_ALLOC) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }
    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Direct): Generates the function prologue.
 * @details Establishes a stack frame, saves callee-saved registers (R12-R15) for context,
 * moves the direct CIF arguments (`ret_ptr`, `lang_args`) into them, and allocates all
 * stack space required for outgoing arguments, shadow space, and local marshalling buffers.
 *
 * This version uses **forced 64-byte stack alignment** via `AND RSP, -64`.
 */
static infix_status generate_direct_forward_prologue_win_x64(code_buffer * buf,
                                                             infix_direct_call_frame_layout * layout) {
    // Save the caller's RBP and the four non-volatile registers this trampoline claims:
    //   R12 - scratch data
    //   R13 - return value pointer
    //   R14 - language objects array pointer
    //   R15 - target function address
    emit_push_reg(buf, RBP_REG);
    emit_push_reg(buf, R12_REG);
    emit_push_reg(buf, R13_REG);
    emit_push_reg(buf, R14_REG);
    emit_push_reg(buf, R15_REG);

    // Anchor RBP below the pushed registers so the epilogue can rewind to this point.
    emit_mov_reg_reg(buf, RBP_REG, RSP_REG);

    // Record the buffer size reached after the frame setup as the prologue size.
    layout->prologue_size = (uint32_t)buf->size;

    // Force 64-byte alignment: `and rsp, -64` (encoded as 48 83 E4 C0).
    // NOTE(review): the prepare stage rounds total_stack_alloc to 16 bytes only, so the
    // SUB below keeps 64-byte alignment only when the allocation happens to be a multiple
    // of 64 - confirm that 16-byte alignment is all the later stages rely on.
    emit_and_reg_imm8(buf, RSP_REG, -64);

    // The direct CIF receives (ret_ptr, lang_args) in RCX and RDX; park both in
    // non-volatile registers so they survive the marshaller and target calls.
    emit_mov_reg_reg(buf, R13_REG, RCX_REG);  // r13 = ret_ptr
    emit_mov_reg_reg(buf, R14_REG, RDX_REG);  // r14 = lang_objects array

    // An empty frame needs no SUB at all.
    if (layout->total_stack_alloc == 0) {
        return INFIX_SUCCESS;
    }

    // Reserve the whole frame (outgoing arguments, shadow space, marshalling scratch).
    emit_sub_reg_imm32(buf, RSP_REG, (int32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 3 (Direct): Generates code to call marshallers and move arguments for Windows x64.
 * @details This corrected implementation uses a two-phase approach for each argument:
 * 1. MARSHALL: Call the user's handler to get the C value into a temporary location
 *    (RAX/XMM0 for scalars, a local stack buffer for aggregates).
 * 2. PLACE: Move the value from its temporary location to its final destination
 *    (the register or stack slot required by the ABI for the target C call).
 * This separation prevents register clobbering and ensures correctness.
 */
static infix_status generate_direct_forward_argument_moves_win_x64(code_buffer * buf,
                                                                   infix_direct_call_frame_layout * layout) {
    // PHASE 1: MARSHALL & SAVE
    for (size_t i = 0; i < layout->num_args; ++i) {
        const infix_direct_arg_layout * arg_layout = &layout->args[i];
        int32_t temp_offset = (int32_t)arg_layout->location.num_regs;

        if (arg_layout->handler->scalar_marshaller || arg_layout->handler->aggregate_marshaller) {

            // Arg 1 (RCX) for marshaller: the language object pointer.
            emit_mov_reg_mem(buf, RCX_REG, R14_REG, i * sizeof(void *));

            if (arg_layout->handler->scalar_marshaller) {
                emit_mov_reg_imm64(buf, R10_REG, (uint64_t)arg_layout->handler->scalar_marshaller);
#if INFIX_SANITY_CHECK_ENABLE
                emit_mov_reg_reg(buf, R12_REG, RSP_REG);  // Save RSP to non-volatile R12
#endif
                emit_call_reg(buf, R10_REG);  // Result is now in RAX or XMM0.
#if INFIX_SANITY_CHECK_ENABLE
                emit_cmp_reg_reg(buf, R12_REG, RSP_REG);  // Verify RSP balance
                emit_je_short(buf, 2);
                emit_ud2(buf);  // Crash if marshaller clobbered the stack
#endif

                emit_mov_mem_reg(buf, RSP_REG, temp_offset, RAX_REG);
            }
            else if (arg_layout->handler->aggregate_marshaller) {
                // Arg 2 (RDX): Pointer to our stack buffer for the aggregate.
                emit_lea_reg_mem(buf, RDX_REG, RSP_REG, temp_offset);

                // Arg 3 (R8): The infix_type*.
                emit_mov_reg_imm64(buf, R8_REG, (uint64_t)arg_layout->type);
                emit_mov_reg_imm64(buf, R10_REG, (uint64_t)arg_layout->handler->aggregate_marshaller);
#if INFIX_SANITY_CHECK_ENABLE

infix/src/arch/x64/abi_win_x64.c  view on Meta::CPAN

        case ARG_LOCATION_XMM:
            {
                // Load the marshalled double-precision value from our temp slot into a scratch register (XMM15).
                emit_movsd_xmm_mem(buf, XMM15_REG, RSP_REG, temp_offset);

                // If the target C type is actually a float, we must convert the double to a float.
                if (is_float(arg_layout->type))
                    // `cvtsd2ss xmm15, xmm15` : Convert Scalar Double to Scalar Single in place.
                    emit_cvtsd2ss_xmm_xmm(buf, XMM15_REG, XMM15_REG);

                // Now move the correctly-sized value from XMM15 to the final destination register.
                emit_movaps_xmm_xmm(buf, XMM_ARGS[arg_layout->location.reg_index], XMM15_REG);
            }
            break;

        case ARG_LOCATION_STACK:
            {
                int32_t out_stack_offset = (int32_t)arg_layout->location.stack_offset;

                if (is_float(arg_layout->type)) {
                    // Load the double from the temp slot, convert it, then store the single.
                    emit_movsd_xmm_mem(buf, XMM15_REG, RSP_REG, temp_offset);
                    emit_cvtsd2ss_xmm_xmm(buf, XMM15_REG, XMM15_REG);
                    emit_movss_mem_xmm(buf, RSP_REG, out_stack_offset, XMM15_REG);
                }
                else if (!arg_layout->handler->scalar_marshaller && !arg_layout->handler->aggregate_marshaller) {
                    emit_lea_reg_mem(buf, RAX_REG, RSP_REG, temp_offset);
                    emit_mov_mem_reg(buf, RSP_REG, out_stack_offset, RAX_REG);
                }
                else {
                    // All other stack arguments are passed by value (even large ones, by copying).
                    for (size_t offset = 0; offset < arg_layout->type->size; offset += 8) {
                        emit_mov_reg_mem(buf, RAX_REG, RSP_REG, temp_offset + offset);
                        emit_mov_mem_reg(buf, RSP_REG, out_stack_offset + offset, RAX_REG);
                    }
                }
            }
            break;
        default:
            break;
        }
    }
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3.5 (Direct): Generates the call instruction.
 */
static infix_status generate_direct_forward_call_instruction_win_x64(code_buffer * buf,
                                                                     infix_direct_call_frame_layout * layout) {
    // R15 <- absolute address of the target C function, baked in as a 64-bit immediate.
    const uint64_t target_address = (uint64_t)layout->target_fn;
    emit_mov_reg_imm64(buf, R15_REG, target_address);

    // Null guard: `test r15, r15` sets ZF for a null target. When the address is
    // non-zero, `jnz +2` hops over the two-byte `ud2`; otherwise `ud2` traps instead of
    // letting the stub call through a null pointer.
    emit_test_reg_reg(buf, R15_REG, R15_REG);
    emit_jnz_short(buf, 2);
    emit_ud2(buf);

    // Indirect call through R15.
    emit_call_reg(buf, R15_REG);
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 4 (Direct): Generates the epilogue, including write-back calls.
 *
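 * Uses `MOV RSP, RBP` to restore the stack pointer; RBP was set after all register
 * pushes in the prologue, so the saved registers can be popped directly afterwards.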
 */
static infix_status generate_direct_forward_epilogue_win_x64(code_buffer * buf,
                                                             infix_direct_call_frame_layout * layout,
                                                             infix_type * ret_type) {
    // Layout of the small frame opened around every write-back call: 32 bytes of shadow
    // space for the callee, then one 8-byte slot each for the preserved RAX and XMM0.
    enum { WB_FRAME_BYTES = 48, WB_RAX_SLOT = 32, WB_XMM0_SLOT = 40 };

    layout->epilogue_offset = (uint32_t)buf->size;

    const bool ret_is_vector = (ret_type->category == INFIX_TYPE_VECTOR);
    const bool ret_in_register = (ret_type->category != INFIX_TYPE_VOID) && !layout->return_value_in_memory;

    // Store the C function's register result through R13 (the caller-supplied ret_ptr).
    if (ret_in_register) {
        if (is_float16(ret_type)) {
            // Half-precision sits in the low 16 bits of XMM0: copy XMM0 to RAX, then
            // store only the low word.
            emit_movq_gpr_xmm(buf, RAX_REG, XMM0_REG);
            emit_mov_mem_reg16(buf, R13_REG, 0, RAX_REG);
        }
        else if (is_float(ret_type)) {
            emit_movss_mem_xmm(buf, R13_REG, 0, XMM0_REG);
        }
        else if (is_double(ret_type)) {
            emit_movsd_mem_xmm(buf, R13_REG, 0, XMM0_REG);
        }
        else if (ret_type->size == 16 && (ret_type->category == INFIX_TYPE_PRIMITIVE || ret_is_vector)) {
            // `__int128_t` (on GCC/Clang) and 16-byte vectors are returned in XMM0.
            emit_movups_mem_xmm(buf, R13_REG, 0, XMM0_REG);
        }
        else if (ret_type->size == 32 && ret_is_vector) {
            emit_vmovupd_mem_ymm(buf, R13_REG, 0, XMM0_REG);
        }
        else if (ret_type->size == 64 && ret_is_vector) {
            emit_vmovupd_mem_zmm(buf, R13_REG, 0, XMM0_REG);
        }
        // Everything else is returned in RAX; use a store of exactly the type's width.
        // Sizes other than 1, 2, 4 or 8 emit nothing.
        else if (ret_type->size == 1) {
            emit_mov_mem_reg8(buf, R13_REG, 0, RAX_REG);
        }
        else if (ret_type->size == 2) {
            emit_mov_mem_reg16(buf, R13_REG, 0, RAX_REG);
        }
        else if (ret_type->size == 4) {
            emit_mov_mem_reg32(buf, R13_REG, 0, RAX_REG);
        }
        else if (ret_type->size == 8) {
            emit_mov_mem_reg(buf, R13_REG, 0, RAX_REG);
        }
    }

    // Invoke every registered write-back handler as handler(lang_object, scratch_ptr, type).
    for (size_t idx = 0; idx < layout->num_args; ++idx) {
        const infix_direct_arg_layout * arg = &layout->args[idx];
        if (!arg->handler->writeback_handler) {
            continue;
        }

        // Open the temporary frame and preserve RAX and the low 64 bits of XMM0 across the call.
        emit_sub_reg_imm32(buf, RSP_REG, WB_FRAME_BYTES);
        emit_mov_mem_reg(buf, RSP_REG, WB_RAX_SLOT, RAX_REG);
        emit_movsd_mem_xmm(buf, RSP_REG, WB_XMM0_SLOT, XMM0_REG);

        // RCX: this argument's language object, read from the array held in R14.
        emit_mov_reg_mem(buf, RCX_REG, R14_REG, idx * sizeof(void *));

        // RDX: address of the argument's scratch slot. The stored offset is relative to
        // the pre-SUB RSP, hence the frame-size correction.
        const int32_t scratch_disp = (int32_t)arg->location.num_regs + WB_FRAME_BYTES;
        emit_lea_reg_mem(buf, RDX_REG, RSP_REG, scratch_disp);

        // R8: the argument's infix_type descriptor.
        emit_mov_reg_imm64(buf, R8_REG, (uint64_t)arg->type);

        emit_mov_reg_imm64(buf, R10_REG, (uint64_t)arg->handler->writeback_handler);
        emit_call_reg(buf, R10_REG);

        // Restore the preserved registers and close the temporary frame.
        emit_mov_reg_mem(buf, RAX_REG, RSP_REG, WB_RAX_SLOT);
        emit_movsd_xmm_mem(buf, XMM0_REG, RSP_REG, WB_XMM0_SLOT);
        emit_add_reg_imm32(buf, RSP_REG, WB_FRAME_BYTES);
    }

    // Emit VZEROUPPER when the signature involves any vector of 32 bytes or more, either
    // as the return type or as an argument (a heuristic for "AVX may have been used").
    bool wide_vector_seen = ret_is_vector && ret_type->size >= 32;
    for (size_t idx = 0; !wide_vector_seen && idx < layout->num_args; ++idx) {
        const infix_type * arg_type = layout->args[idx].type;
        wide_vector_seen = (arg_type->category == INFIX_TYPE_VECTOR && arg_type->size >= 32);
    }
    if (wide_vector_seen) {
        emit_vzeroupper(buf);
    }

    // Tear the frame down. RBP was captured after the five pushes in the prologue, so
    // `mov rsp, rbp` leaves RSP pointing at the saved registers, which are popped in
    // reverse push order.
    emit_mov_reg_reg(buf, RSP_REG, RBP_REG);
    emit_pop_reg(buf, R15_REG);
    emit_pop_reg(buf, R14_REG);
    emit_pop_reg(buf, R13_REG);
    emit_pop_reg(buf, R12_REG);
    emit_pop_reg(buf, RBP_REG);

    emit_ret(buf);

    return INFIX_SUCCESS;
}

#ifdef _MSC_VER
#pragma warning(pop)
#endif



( run in 1.406 second using v1.01-cache-2.11-cpan-97f6503c9c8 )