Affix

 view release on metacpan or  search on metacpan

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

/**
 * Copyright (c) 2025 Sanko Robinson
 *
 * This source code is dual-licensed under the Artistic License 2.0 or the MIT License.
 * You may choose to use this code under the terms of either license.
 *
 * SPDX-License-Identifier: (Artistic-2.0 OR MIT)
 *
 * The documentation blocks within this file are licensed under the
 * Creative Commons Attribution 4.0 International License (CC BY 4.0).
 *
 * SPDX-License-Identifier: CC-BY-4.0
 */
/**
 * @file abi_arm64.c
 * @brief Implements the FFI logic for the AArch64 (ARM64) architecture.
 * @ingroup internal_abi_aarch64
 *
 * @internal
 * This file provides the concrete implementation of the `infix_forward_abi_spec`
 * and `infix_reverse_abi_spec` for the ARM64 architecture. It primarily follows
 * the standard "Procedure Call Standard for the ARM 64-bit Architecture" (AAPCS64),
 * but also contains critical conditional logic to handle deviations for specific
 * platforms like Apple macOS and Windows on ARM.
 *
 * @section aapcs64_rules Key AAPCS64 Rules Implemented
 *
 * - **Register Usage:**
 *   - The first 8 integer/pointer arguments are passed in GPRs (X0-X7).
 *   - The first 8 floating-point/vector arguments are passed in VPRs (V0-V7).
 *
 * - **Homogeneous Floating-point Aggregates (HFAs):** Structs or arrays composed
 *   entirely of 1 to 4 identical floating-point types (`float` or `double`) are
 *   passed in consecutive VPRs.
 *
 * - **Return Values:**
 *   - Aggregates up to 16 bytes are returned in registers (GPRs and/or VPRs).
 *   - Larger aggregates are returned via a hidden pointer passed by the caller
 *     in the dedicated "indirect result location register", `X8`.
 *
 * @section platform_deviations Platform-Specific Deviations
 *
 * - **Variadic Calls (Apple macOS):** All variadic arguments are passed on the
 *   stack. Arguments smaller than 8 bytes are promoted to fill 8-byte stack slots.
 *
 * - **Variadic Calls (Windows on ARM):** The HFA rule is disabled for variadic
 *   arguments. Floating-point scalars are passed in GPRs, not VPRs.
 *
 * - **16-Byte Argument Alignment:**
 *   - **Standard/macOS:** 16-byte aggregates passed in GPRs must start in an
 *     even-numbered register (X0, X2, X4, X6).
 *   - **macOS Exception:** `__int128_t` does NOT require even-GPR alignment.
 *   - **Windows Exception:** Variadic 16-byte aggregates do NOT require even-GPR alignment.
 * @endinternal
 */
#include "arch/aarch64/abi_arm64_common.h"
#include "arch/aarch64/abi_arm64_emitters.h"
#include "common/infix_internals.h"
#include "common/utility.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/** @internal The General-Purpose Registers used for the first 8 integer/pointer arguments. */
static const arm64_gpr GPR_ARGS[] = {X0_REG, X1_REG, X2_REG, X3_REG, X4_REG, X5_REG, X6_REG, X7_REG};
/** @internal The SIMD/Floating-Point Registers used for the first 8 float/double/vector arguments. */
static const arm64_vpr VPR_ARGS[] = {V0_REG, V1_REG, V2_REG, V3_REG, V4_REG, V5_REG, V6_REG, V7_REG};
/** @internal The total number of GPRs available for argument passing. */
#define NUM_GPR_ARGS 8
/** @internal The total number of VPRs available for argument passing. */
#define NUM_VPR_ARGS 8
/** @internal A safe limit on the number of fields to classify to prevent DoS from exponential complexity. */
#define MAX_AGGREGATE_FIELDS_TO_CLASSIFY 32

//
static bool is_hfa(const infix_type * type, const infix_type ** base_type);

/** @internal The v-table of AArch64 functions for generating forward trampolines. */
static infix_status prepare_forward_call_frame_arm64(infix_arena_t * arena,
                                                     infix_call_frame_layout ** out_layout,
                                                     infix_type * ret_type,
                                                     infix_type ** arg_types,
                                                     size_t num_args,
                                                     size_t num_fixed_args,
                                                     void * target_fn);
static infix_status generate_forward_prologue_arm64(code_buffer * buf, infix_call_frame_layout * layout);
static infix_status generate_forward_argument_moves_arm64(code_buffer * buf,
                                                          infix_call_frame_layout * layout,
                                                          infix_type ** arg_types,
                                                          size_t num_args,
                                                          c23_maybe_unused size_t num_fixed_args);
static infix_status generate_forward_call_instruction_arm64(code_buffer *, infix_call_frame_layout *);
static infix_status generate_forward_epilogue_arm64(code_buffer * buf,
                                                    infix_call_frame_layout * layout,
                                                    infix_type * ret_type);
const infix_forward_abi_spec g_arm64_forward_spec = {
    .prepare_forward_call_frame = prepare_forward_call_frame_arm64,
    .generate_forward_prologue = generate_forward_prologue_arm64,
    .generate_forward_argument_moves = generate_forward_argument_moves_arm64,
    .generate_forward_call_instruction = generate_forward_call_instruction_arm64,
    .generate_forward_epilogue = generate_forward_epilogue_arm64};

/** @internal The v-table of AArch64 functions for generating reverse trampolines. */
static infix_status prepare_reverse_call_frame_arm64(infix_arena_t * arena,
                                                     infix_reverse_call_frame_layout ** out_layout,
                                                     infix_reverse_t * context);
static infix_status generate_reverse_prologue_arm64(code_buffer * buf, infix_reverse_call_frame_layout * layout);
static infix_status generate_reverse_argument_marshalling_arm64(code_buffer * buf,
                                                                infix_reverse_call_frame_layout * layout,
                                                                infix_reverse_t * context);
static infix_status generate_reverse_dispatcher_call_arm64(code_buffer * buf,
                                                           infix_reverse_call_frame_layout * layout,
                                                           infix_reverse_t * context);
static infix_status generate_reverse_epilogue_arm64(code_buffer * buf,
                                                    infix_reverse_call_frame_layout * layout,
                                                    infix_reverse_t * context);
const infix_reverse_abi_spec g_arm64_reverse_spec = {
    .prepare_reverse_call_frame = prepare_reverse_call_frame_arm64,
    .generate_reverse_prologue = generate_reverse_prologue_arm64,
    .generate_reverse_argument_marshalling = generate_reverse_argument_marshalling_arm64,
    .generate_reverse_dispatcher_call = generate_reverse_dispatcher_call_arm64,
    .generate_reverse_epilogue = generate_reverse_epilogue_arm64};

/** @internal The v-table for the new Direct Marshalling ABI. */
static infix_status prepare_direct_forward_call_frame_arm64(infix_arena_t * arena,
                                                            infix_direct_call_frame_layout ** out_layout,
                                                            infix_type * ret_type,
                                                            infix_type ** arg_types,
                                                            size_t num_args,
                                                            infix_direct_arg_handler_t * handlers,
                                                            void * target_fn);
static infix_status generate_direct_forward_prologue_arm64(code_buffer * buf, infix_direct_call_frame_layout * layout);
static infix_status generate_direct_forward_argument_moves_arm64(code_buffer * buf,
                                                                 infix_direct_call_frame_layout * layout);
static infix_status generate_direct_forward_call_instruction_arm64(code_buffer * buf,
                                                                   infix_direct_call_frame_layout * layout);
static infix_status generate_direct_forward_epilogue_arm64(code_buffer * buf,
                                                           infix_direct_call_frame_layout * layout,
                                                           infix_type * ret_type);
const infix_direct_forward_abi_spec g_arm64_direct_forward_spec = {
    .prepare_direct_forward_call_frame = prepare_direct_forward_call_frame_arm64,
    .generate_direct_forward_prologue = generate_direct_forward_prologue_arm64,
    .generate_direct_forward_argument_moves = generate_direct_forward_argument_moves_arm64,
    .generate_direct_forward_call_instruction = generate_direct_forward_call_instruction_arm64,
    .generate_direct_forward_epilogue = generate_direct_forward_epilogue_arm64};

/**
 * @internal
 * @brief Recursively finds the first primitive floating-point type in a potential HFA.
 * @details This function performs a depth-first search to find the very first `float`
 *          or `double` primitive within an aggregate. This becomes the candidate
 *          "base type" that all other members of the aggregate will be compared against.
 * @param type The type to search within.
 * @return A pointer to the `infix_type` of the base element, or `nullptr` if not found.
 */
static const infix_type * get_hfa_base_type(const infix_type * type) {
    if (type == nullptr)
        return nullptr;
    // Base case: we've found a primitive float or double.
    if (is_float(type) || is_double(type))
        return type;
    // Recursive step for arrays.
    if (type->category == INFIX_TYPE_ARRAY)
        return get_hfa_base_type(type->meta.array_info.element_type);
    // Recursive step for structs: check the first member.
    if (type->category == INFIX_TYPE_STRUCT && type->meta.aggregate_info.num_members > 0)
        return get_hfa_base_type(type->meta.aggregate_info.members[0].type);
    // Recursive step for _Complex.
    if (type->category == INFIX_TYPE_COMPLEX)
        return get_hfa_base_type(type->meta.complex_info.base_type);
    return nullptr;  // Not a float-based type.
}
/**
 * @internal
 * @brief Recursively verifies that all primitive members of a type are identical to a given base type.
 * @details After `get_hfa_base_type` finds a potential base type, this function traverses
 *          the entire aggregate to ensure every single primitive member is of that exact same type.
 * @param type The current type/member being checked.
 * @param base_type The required base type (e.g., `float`) to check against.
 * @param field_count A counter to prevent stack overflow/DoS from excessively complex types.
 * @return `true` if all constituent members of `type` are of `base_type`, `false` otherwise.
 */
static bool is_hfa_recursive_check(const infix_type * type, const infix_type * base_type, size_t * field_count) {
    if (type == nullptr)
        return false;
    // Abort if the type is excessively complex.
    if (*field_count > MAX_AGGREGATE_FIELDS_TO_CLASSIFY)
        return false;
    // Base case: A primitive must match the base type.
    if (is_float(type) || is_double(type)) {
        (*field_count)++;
        return type == base_type;
    }
    // Recursive step for _Complex: both parts must match the base type.
    if (type->category == INFIX_TYPE_COMPLEX)
        return type->meta.complex_info.base_type == base_type;
    // Recursive step for arrays: check the element type.
    if (type->category == INFIX_TYPE_ARRAY)
        return is_hfa_recursive_check(type->meta.array_info.element_type, base_type, field_count);
    // Recursive step for structs: check every member.
    if (type->category == INFIX_TYPE_STRUCT) {
        if (type->meta.aggregate_info.num_members == 0)
            return false;
        for (size_t i = 0; i < type->meta.aggregate_info.num_members; ++i)
            if (!is_hfa_recursive_check(type->meta.aggregate_info.members[i].type, base_type, field_count))
                return false;
        return true;
    }
    // If it's not a float, complex, array, or struct, it cannot be part of an HFA.
    return false;
}
/**
 * @internal
 * @brief Determines if a type is a Homogeneous Floating-point Aggregate (HFA).
 * @details An HFA is a struct or array containing 1 to 4 elements of the same, single
 *          floating-point type (`float` or `double`), including in nested aggregates.
 *
 * @param type The `infix_type` to check.
 * @param[out] out_base_type If the type is an HFA, this is set to its base `float` or `double` type.
 * @return `true` if the type is a valid HFA, `false` otherwise.
 */
static bool is_hfa(const infix_type * type, const infix_type ** out_base_type) {
    if (type->category != INFIX_TYPE_STRUCT && type->category != INFIX_TYPE_ARRAY &&
        type->category != INFIX_TYPE_COMPLEX)
        return false;
    // HFAs cannot be excessively large.
    if (type->size == 0 || type->size > 64)  // Max HFA size is 4 * sizeof(double) = 32 on standard, 4*16=64 on others
        return false;
    // Find the base float/double type of the first primitive element.
    const infix_type * base = get_hfa_base_type(type);
    if (base == nullptr)
        return false;
    // Check that the total size is a multiple of the base type, with 1 to 4 elements.
    size_t num_elements = type->size / base->size;
    if (num_elements < 1 || num_elements > 4 || type->size != num_elements * base->size)
        return false;
    // Verify that ALL members recursively conform to this single base type.
    size_t field_count = 0;
    if (!is_hfa_recursive_check(type, base, &field_count))
        return false;
    if (out_base_type)
        *out_base_type = base;
    return true;
}
/**
 * @internal
 * @brief Stage 1 (Forward): Analyzes a signature and creates a call frame layout for AAPCS64.
 * @details This function assigns each argument to a location (GPR, VPR, or Stack) according
 *          to the AAPCS64 rules. It contains extensive conditional logic to handle ABI
 *          deviations on Apple and Windows platforms, especially for variadic arguments
 *          and 16-byte aggregate alignment.
 *
 * @param arena The temporary arena for allocations.
 * @param out_layout Receives the created layout blueprint.
 * @param ret_type The function's return type.
 * @param arg_types Array of argument types.
 * @param num_args Total number of arguments.
 * @param num_fixed_args Number of non-variadic arguments.
 * @param target_fn The target function address.
 * @return `INFIX_SUCCESS` on success, or an error code on failure.
 */
static infix_status prepare_forward_call_frame_arm64(infix_arena_t * arena,
                                                     infix_call_frame_layout ** out_layout,
                                                     infix_type * ret_type,
                                                     infix_type ** arg_types,
                                                     size_t num_args,
                                                     size_t num_fixed_args,
                                                     void * target_fn) {
    if (out_layout == nullptr)
        return INFIX_ERROR_INVALID_ARGUMENT;
    infix_call_frame_layout * layout =
        infix_arena_calloc(arena, 1, sizeof(infix_call_frame_layout), _Alignof(infix_call_frame_layout));
    if (layout == nullptr) {
        *out_layout = nullptr;
        return INFIX_ERROR_ALLOCATION_FAILED;
    }
    layout->arg_locations =
        infix_arena_calloc(arena, num_args, sizeof(infix_arg_location), _Alignof(infix_arg_location));
    if (layout->arg_locations == nullptr && num_args > 0) {
        *out_layout = nullptr;
        return INFIX_ERROR_ALLOCATION_FAILED;
    }
    size_t gpr_count = 0, vpr_count = 0, stack_offset = 0;
    layout->is_variadic = (num_fixed_args < num_args);
    layout->target_fn = target_fn;
    layout->num_args = num_args;
    layout->num_stack_args = 0;
    // Determine if the return value is passed by reference (via hidden pointer in X8).
    // This is true for aggregates larger than 16 bytes.
    bool ret_is_aggregate = (ret_type->category == INFIX_TYPE_STRUCT || ret_type->category == INFIX_TYPE_UNION ||
                             ret_type->category == INFIX_TYPE_ARRAY || ret_type->category == INFIX_TYPE_COMPLEX);
    layout->return_value_in_memory = (ret_is_aggregate && ret_type->size > 16);
    // Main Argument Classification Loop
    for (size_t i = 0; i < num_args; ++i) {
        infix_type * type = arg_types[i];
        // Security: Reject excessively large types.
        if (type->size > INFIX_MAX_ARG_SIZE) {
            *out_layout = nullptr;
            return INFIX_ERROR_LAYOUT_FAILED;
        }
        bool placed_in_register = false;
        c23_maybe_unused bool is_variadic_arg = (i >= num_fixed_args);

        // Arrays decay to pointers. Always treat as a GPR argument (8 bytes).
        if (type->category == INFIX_TYPE_ARRAY) {
            if (gpr_count < NUM_GPR_ARGS) {
                layout->arg_locations[i].type = ARG_LOCATION_GPR;
                layout->arg_locations[i].reg_index = (uint8_t)gpr_count++;
                placed_in_register = true;

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

        else {                     // Integers, pointers, small aggregates, and variadic floats on Windows.
            if (type->size > 8) {  // Types > 8 and <= 16 bytes are passed in a pair of GPRs.
                bool needs_alignment = true;
#if defined(INFIX_OS_MACOS)
                // macOS Deviation: `__int128_t` does not require even-GPR alignment.
                if (type->category == INFIX_TYPE_PRIMITIVE)
                    needs_alignment = false;
#elif defined(INFIX_OS_WINDOWS)
                // Windows Deviation: Variadic 16-byte arguments do not require even-GPR alignment.
                if (is_variadic_arg)
                    needs_alignment = false;
#endif
                // Standard rule: 16-byte args must start in an even-numbered GPR.
                if (needs_alignment && (gpr_count % 2 != 0))
                    gpr_count++;
                if (gpr_count + 1 < NUM_GPR_ARGS) {
                    layout->arg_locations[i].type = ARG_LOCATION_GPR_PAIR;
                    layout->arg_locations[i].reg_index = (uint8_t)gpr_count;
                    gpr_count += 2;
                    placed_in_register = true;
                }
            }
            else {  // Types <= 8 bytes passed in a single GPR.
                if (gpr_count < NUM_GPR_ARGS) {
                    layout->arg_locations[i].type = ARG_LOCATION_GPR;
                    layout->arg_locations[i].reg_index = (uint8_t)gpr_count++;
                    placed_in_register = true;
                }
            }
        }
        // If it couldn't be placed in a register, it must go on the stack.
        if (!placed_in_register) {
            layout->arg_locations[i].type = ARG_LOCATION_STACK;

            // Enforce natural alignment for stack arguments on ARM64
            size_t align = type->alignment;
            if (align < 8)
                align = 8;

            // Align the current stack offset
            stack_offset = (stack_offset + (align - 1)) & ~(align - 1);
            layout->arg_locations[i].stack_offset = (uint32_t)stack_offset;
            stack_offset += (type->size + 7) & ~7;  // Stack slots are 8-byte aligned.
            layout->num_stack_args++;
        }
    }
    // The total stack space for arguments must be 16-byte aligned before the call.
    layout->total_stack_alloc = (stack_offset + 15) & ~15;
    layout->num_gpr_args = (uint8_t)gpr_count;
    layout->num_vpr_args = (uint8_t)vpr_count;
    // Security: Prevent excessive stack allocation.
    if (layout->total_stack_alloc > INFIX_MAX_STACK_ALLOC) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }
    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Forward): Generates the function prologue for the AArch64 trampoline.
 * @details Sets up the stack frame by saving the frame pointer (X29) and link register (X30),
 *          saves callee-saved registers (X19-X22) that will be used to hold the trampoline's
 *          context, moves the trampoline's arguments into those preserved registers, and
 *          allocates the necessary stack space for stack-passed arguments.
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_forward_prologue_arm64(code_buffer * buf, infix_call_frame_layout * layout) {
    // `stp x29, x30, [sp, #-16]!` : Push Frame Pointer and Link Register to the stack, pre-decrementing SP.
    emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
    // `stp x19, x20, [sp, #-16]!` : Save callee-saved registers that we will use for our context.
    emit_arm64_stp_pre_index(buf, true, X19_REG, X20_REG, SP_REG, -16);
    // `stp x21, x22, [sp, #-16]!`
    emit_arm64_stp_pre_index(buf, true, X21_REG, X22_REG, SP_REG, -16);
    // `mov x29, sp` : Establish the new Frame Pointer after all registers are pushed.
    emit_arm64_mov_reg(buf, true, X29_FP_REG, SP_REG);

    layout->prologue_size = (uint32_t)buf->size;

    // Move the trampoline's own arguments into these now-safe callee-saved registers.
    if (layout->target_fn == nullptr) {  // Unbound trampoline args: (target_fn, ret_ptr, args_ptr) in X0, X1, X2.
        emit_arm64_mov_reg(buf, true, X19_REG, X0_REG);  // mov x19, x0 (x19 will hold target_fn)
        emit_arm64_mov_reg(buf, true, X20_REG, X1_REG);  // mov x20, x1 (x20 will hold ret_ptr)
        emit_arm64_mov_reg(buf, true, X21_REG, X2_REG);  // mov x21, x2 (x21 will hold args_ptr)
    }
    else {                                               // Bound trampoline args: (ret_ptr, args_ptr) in X0, X1.
        emit_arm64_mov_reg(buf, true, X20_REG, X0_REG);  // mov x20, x0 (x20 = ret_ptr)
        emit_arm64_mov_reg(buf, true, X21_REG, X1_REG);  // mov x21, x1 (x21 = args_ptr)
    }
    // Allocate stack space for arguments that will be passed on the stack.
    if (layout->total_stack_alloc > 0)
        emit_arm64_sub_imm(buf, true, false, SP_REG, SP_REG, (uint32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3 (Forward): Generates code to move arguments into their native locations.
 * @details This function marshals arguments from the generic `void**` array (pointed to by X21)
 *          into the correct GPRs, VPRs, or stack slots, respecting HFA rules and platform-specific
 *          variadic conventions like Apple's stack-only approach.
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @param arg_types The array of argument types.
 * @param num_args The total number of arguments.
 * @param num_fixed_args The number of fixed (non-variadic) arguments.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_forward_argument_moves_arm64(code_buffer * buf,
                                                          infix_call_frame_layout * layout,
                                                          infix_type ** arg_types,
                                                          size_t num_args,
                                                          c23_maybe_unused size_t num_fixed_args) {
    // If returning a large struct, the ABI requires the hidden pointer (our return buffer, in X20)
    // to be passed in the indirect result location register, x8.
    if (layout->return_value_in_memory)
        emit_arm64_mov_reg(buf, true, X8_REG, X20_REG);  // mov x8, x20
    // Standard AAPCS64 Quirk: For variadic calls, a GPR must contain the number of VPRs used.
    // This rule does NOT apply to Apple's ABI, so we exclude it for macOS.
#if !defined(INFIX_OS_MACOS)
    else if (layout->is_variadic)
        // Since we don't know the types of variadic arguments at compile time, the ABI
        // states the safest value is 0. A callee like printf will use this to determine
        // how to process its va_list. We use x8 as it's a volatile register.
        // A safe default is 0. Callee (like printf) uses this to interpret its va_list.
        emit_arm64_load_u64_immediate(buf, X8_REG, 0);  // mov x8, #0
#endif
    // Main argument marshalling loop.
    for (size_t i = 0; i < num_args; ++i) {
        infix_arg_location * loc = &layout->arg_locations[i];
        infix_type * type = arg_types[i];
        // Load the pointer to the current argument's data into scratch register x9.
        // x21 holds the base of the void** args_array.
        emit_arm64_ldr_imm(buf, true, X9_REG, X21_REG, (int32_t)(i * sizeof(void *)));  // ldr x9, [x21, #offset]
        switch (loc->type) {
        case ARG_LOCATION_GPR:
            {
                // Arrays passed by pointer. The data at X9 IS the pointer. Move X9 to dest reg.
                if (type->category == INFIX_TYPE_ARRAY) {

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

                            if (type->size >= 8)  // 64-bit integers and pointers
                                emit_arm64_ldr_imm(buf, true, X10_REG, X9_REG, 0);
                            else if (is_signed_lt_64)  // Signed types < 64-bit
                                emit_arm64_ldrsw_imm(buf, X10_REG, X9_REG, 0);
                            else  // Unsigned types < 64-bit
                                emit_arm64_ldr_imm(buf, false, X10_REG, X9_REG, 0);
                            // Store the promoted 64-bit value.
                            if (loc->stack_offset < (unsigned)max_imm_offset)
                                emit_arm64_str_imm(buf, true, X10_REG, SP_REG, loc->stack_offset);
                            else {
                                emit_arm64_add_imm(buf, true, false, X11_REG, SP_REG, loc->stack_offset);
                                emit_arm64_str_imm(buf, true, X10_REG, X11_REG, 0);
                            }
                        }
                        // This primitive/pointer has been handled, so break from the switch.
                        break;
                    }
                    // If it's a struct, fall through to the generic copy loop.
                }
#endif
                // Generic stack argument handling (for non-macOS, or for structs on macOS)
                // If it's an array passed on the stack, it's a pointer (8 bytes).
                if (type->category == INFIX_TYPE_ARRAY) {
                    emit_arm64_str_imm(buf, true, X9_REG, SP_REG, (int32_t)loc->stack_offset);
                    break;
                }

                const int32_t max_imm_offset = 0xFFF * 8;
                for (size_t offset = 0; offset < type->size; offset += 8) {
                    emit_arm64_ldr_imm(buf, true, X10_REG, X9_REG, (int32_t)offset);
                    int32_t current_stack_offset = (int32_t)(loc->stack_offset + offset);
                    if (current_stack_offset >= 0 && current_stack_offset < max_imm_offset &&
                        (current_stack_offset % 8 == 0))
                        emit_arm64_str_imm(buf, true, X10_REG, SP_REG, current_stack_offset);
                    else {
                        emit_arm64_add_imm(buf, true, false, X11_REG, SP_REG, current_stack_offset);
                        emit_arm64_str_imm(buf, true, X10_REG, X11_REG, 0);
                    }
                }
                break;
            }
        }
    }
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3.5 (Forward): Generates the call instruction.
 * @details Emits a null-check on the target function pointer followed by a
 *          `BLR` (Branch with Link to Register) instruction. If the pointer
 *          is null, a `BRK` instruction is executed to crash safely.
 * @param buf The code buffer.
 * @param layout The call frame layout.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_forward_call_instruction_arm64(code_buffer * buf,
                                                            c23_maybe_unused infix_call_frame_layout * layout) {
    if (layout->target_fn)
        // For a bound trampoline, the target is hardcoded. Load it into X19.
        emit_arm64_load_u64_immediate(buf, X19_REG, (uint64_t)layout->target_fn);
    // For an unbound trampoline, X19 was already loaded from the first argument in the prologue.
    // `cbnz x19, #8` : If the target function pointer in x19 is not zero, branch 8 bytes forward.
    emit_arm64_cbnz(buf, true, X19_REG, 8);
    // `brk #0` : If the pointer was null, execute a breakpoint instruction to cause a deliberate crash.
    emit_arm64_brk(buf, 0);
    // `blr x19` : Branch with link to the target function address in x19.
    emit_arm64_blr_reg(buf, X19_REG);
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 4 (Forward): Generates the function epilogue.
 * @details Emits code to handle the return value (from X0/X1 or V0-V3), deallocates
 *          the stack frame, restores callee-saved registers, and returns to the caller.
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @param ret_type The function's return type.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_forward_epilogue_arm64(code_buffer * buf,
                                                    infix_call_frame_layout * layout,
                                                    infix_type * ret_type) {
    layout->epilogue_offset = (uint32_t)buf->size;
    // If the function returns a value and it wasn't returned via hidden pointer...
    if (ret_type->category != INFIX_TYPE_VOID && !layout->return_value_in_memory) {
        // ...copy the result from the appropriate return register(s) into the user's return buffer (pointer in X20).
        const infix_type * hfa_base = nullptr;

        // The order of these checks is critical. Handle the most specific cases first.
        // On Apple Silicon, long double is 8 bytes. Only emit 128-bit store if size is actually 16.
        if ((is_long_double(ret_type) && ret_type->size == 16) ||
            (ret_type->category == INFIX_TYPE_VECTOR && ret_type->size == 16))
            emit_arm64_str_q_imm(buf, V0_REG, X20_REG, 0);  // str q0, [x20]
        else if (is_hfa(ret_type, &hfa_base)) {
            size_t num_elements = ret_type->size / hfa_base->size;
            for (size_t i = 0; i < num_elements; ++i)
                emit_arm64_str_vpr(buf,
                                   hfa_base->size,
                                   VPR_ARGS[i],
                                   X20_REG,
                                   (int32_t)(i * hfa_base->size));  // Explicit cast
        }
        else if (is_float16(ret_type))
            emit_arm64_str_vpr(buf, 2, V0_REG, X20_REG, 0);  // str h0, [x20]
        else if (is_float(ret_type))
            emit_arm64_str_vpr(buf, 4, V0_REG, X20_REG, 0);  // str s0, [x20]
        // Handle standard double OR 8-byte long double (macOS)
        else if (is_double(ret_type) || (is_long_double(ret_type) && ret_type->size == 8))
            emit_arm64_str_vpr(buf, 8, V0_REG, X20_REG, 0);  // str d0, [x20]
        else {
            // Integer, pointer, or small aggregate return.
            switch (ret_type->size) {
            case 1:
                emit_arm64_strb_imm(buf, X0_REG, X20_REG, 0);
                break;
            case 2:
                emit_arm64_strh_imm(buf, X0_REG, X20_REG, 0);
                break;
            case 4:
                emit_arm64_str_imm(buf, false, X0_REG, X20_REG, 0);
                break;
            case 8:
                emit_arm64_str_imm(buf, true, X0_REG, X20_REG, 0);
                break;
            case 16:  // For __int128_t or small structs
                emit_arm64_str_imm(buf, true, X0_REG, X20_REG, 0);
                emit_arm64_str_imm(buf, true, X1_REG, X20_REG, 8);
                break;
            default:
                break;
            }
        }
    }
    // Deallocate stack space and restore registers.
    // X29 was set to SP after all pushes.
    // mov sp, x29
    emit_arm64_mov_reg(buf, true, SP_REG, X29_FP_REG);

    emit_arm64_ldp_post_index(buf, true, X21_REG, X22_REG, SP_REG, 16);        // ldp x21, x22, [sp], #16
    emit_arm64_ldp_post_index(buf, true, X19_REG, X20_REG, SP_REG, 16);        // ldp x19, x20, [sp], #16
    emit_arm64_ldp_post_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, 16);  // ldp x29, x30, [sp], #16
    emit_arm64_ret(buf, X30_LR_REG);                                           // ret
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 1 (Reverse): Calculates the stack layout for a reverse trampoline stub.
 * @details This function determines the total stack space the JIT-compiled stub will need
 *          for its local variables. This space includes:
 *          1. A buffer to store the return value before it's placed in registers.
 *          2. An array of `void*` pointers (`args_array`) to pass to the C dispatcher.
 *          3. A contiguous data area where the contents of all incoming arguments
 *             (from registers or the caller's stack) will be saved.
 *
 * @param arena The temporary arena for allocations.
 * @param[out] out_layout The resulting reverse call frame layout blueprint, populated with offsets.
 * @param context The reverse trampoline context with full signature information.
 * @return `INFIX_SUCCESS` on success, or an error code on failure.
 */
static infix_status prepare_reverse_call_frame_arm64(infix_arena_t * arena,
                                                     infix_reverse_call_frame_layout ** out_layout,
                                                     infix_reverse_t * context) {
    infix_reverse_call_frame_layout * layout = infix_arena_calloc(
        arena, 1, sizeof(infix_reverse_call_frame_layout), _Alignof(infix_reverse_call_frame_layout));
    if (!layout)
        return INFIX_ERROR_ALLOCATION_FAILED;
    // The return buffer must be large enough and aligned for any type.
    size_t return_size = (context->return_type->size + 15) & ~15;
    // The array of pointers that will be passed to the C dispatcher.
    size_t args_array_size = (context->num_args * sizeof(void *) + 15) & ~15;
    // The contiguous block where we will save the actual argument data.
    size_t saved_args_data_size = 0;
    for (size_t i = 0; i < context->num_args; ++i) {
        if (context->arg_types[i]->size > INFIX_MAX_ARG_SIZE) {
            *out_layout = nullptr;
            return INFIX_ERROR_LAYOUT_FAILED;
        }
        // Ensure each saved argument slot is 16-byte aligned for simplicity and correctness.
        saved_args_data_size += (context->arg_types[i]->size + 15) & ~15;
    }
    // Security check against excessively large aggregate argument data size.
    if (saved_args_data_size > INFIX_MAX_ARG_SIZE) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }
    size_t total_local_space = return_size + args_array_size + saved_args_data_size;
    // The total stack allocation for the frame must be 16-byte aligned.
    if (total_local_space > INFIX_MAX_STACK_ALLOC) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }
    layout->total_stack_alloc = (total_local_space + 15) & ~15;
    // Local variables are accessed via positive offsets from the stack pointer (SP)
    // after the initial `sub sp, sp, #alloc` in the prologue.
    // The layout on our local stack will be: [ return_buffer | args_array | saved_args_data ]
    layout->return_buffer_offset = 0;
    layout->args_array_offset = layout->return_buffer_offset + (int32_t)return_size;
    layout->saved_args_offset = layout->args_array_offset + (int32_t)args_array_size;
    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Reverse): Generates the prologue for the reverse trampoline stub.
 * @details This function emits the standard AArch64 function entry code. It saves the
 *          caller's frame pointer (X29) and the link register (X30, the return address)
 *          to the stack, establishes a new frame by pointing X29 to the current stack
 *          pointer, and allocates the pre-calculated stack space for local variables.
 *
 * @param buf The code buffer to write to.
 * @param layout The blueprint containing the total stack space to allocate.
 * @return `INFIX_SUCCESS` on success.
 */
static infix_status generate_reverse_prologue_arm64(code_buffer * buf, infix_reverse_call_frame_layout * layout) {
    // `stp x29, x30, [sp, #-16]!` : Save Frame Pointer and Link Register, pre-decrementing SP.
    emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
    // `mov x29, sp` : Establish the new frame pointer.
    emit_arm64_mov_reg(buf, true, X29_FP_REG, SP_REG);
    // `sub sp, sp, #total_stack_alloc` : Allocate space for our local variables.
    if (layout->total_stack_alloc > 0)
        emit_arm64_sub_imm(buf, true, false, SP_REG, SP_REG, (uint32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3 (Reverse): Generates code to marshal arguments into the `void**` array.
 * @details This generates `STR` instructions to copy argument data from their native
 *          locations (GPRs, VPRs, or the caller's stack) into a contiguous "saved args"
 *          area on the stub's local stack. It then populates the `args_array` with
 *          pointers to this saved data, respecting all platform-specific ABI deviations.
 *
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @param context The reverse context.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_argument_marshalling_arm64(code_buffer * buf,
                                                                infix_reverse_call_frame_layout * layout,
                                                                infix_reverse_t * context) {
    // Handle Return Value Pointer (Indirect Result Location)
    // If the return type is a large struct (> 16 bytes), the caller passes a hidden pointer in X8.
    // X8 is volatile, so we must save this pointer into our stack frame immediately.
    bool ret_is_aggregate =
        (context->return_type->category == INFIX_TYPE_STRUCT || context->return_type->category == INFIX_TYPE_UNION ||
         context->return_type->category == INFIX_TYPE_ARRAY || context->return_type->category == INFIX_TYPE_COMPLEX);
    bool return_in_memory = ret_is_aggregate && context->return_type->size > 16;

    if (return_in_memory) {
        // str x8, [sp, #return_buffer_offset]
        emit_arm64_str_imm(buf, true, X8_REG, SP_REG, layout->return_buffer_offset);
    }

    // Iterate over arguments
    size_t gpr_idx = 0;
    size_t vpr_idx = 0;
    size_t current_saved_data_offset = 0;

    // Arguments passed on the caller's stack start at offset 16 from our new frame pointer (X29).
    // X29 was established after pushing X29, X30, X19, X20, X21, X22.
    // [X29]    -> saved X29
    // [X29+8]  -> saved X30 (LR)
    // [X29+16] -> caller's first stack argument
    size_t caller_stack_offset = 16;

    for (size_t i = 0; i < context->num_args; ++i) {
        infix_type * type = context->arg_types[i];
        bool is_variadic_arg = i >= context->num_fixed_args;

        // Calculate where to save this argument's data in our local stack frame.
        int32_t arg_save_loc = (int32_t)(layout->saved_args_offset + current_saved_data_offset);

#if defined(INFIX_OS_MACOS)
        // macOS ABI deviation:
        // On macOS ARM64, ALL variadic arguments are passed on the stack.
        // They are also promoted: types < 8 bytes occupy a full 8-byte stack slot.
        if (is_variadic_arg) {
            size_t size_on_stack = (type->size < 8) ? 8 : type->size;
            size_on_stack = (size_on_stack + 7) & ~7;  // Align to 8 bytes

            // Copy from caller's stack to our local save area
            for (size_t offset = 0; offset < size_on_stack; offset += 8) {
                // ldr x9, [fp, #caller_offset]
                emit_arm64_ldr_imm(buf, true, X9_REG, X29_FP_REG, (int32_t)(caller_stack_offset + offset));

                int32_t dest_offset = arg_save_loc + (int32_t)offset;
                if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
                    emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
                else {
                    emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
                    emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
                }
            }
            caller_stack_offset += size_on_stack;

            // Set the pointer in args_array[i] to point to the saved data
            int32_t dest_offset = layout->args_array_offset + (int32_t)(i * sizeof(void *));
            emit_arm64_add_imm(buf, true, false, X9_REG, SP_REG, (uint32_t)arg_save_loc);

            if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
                emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
            else {
                emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
                emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
            }

            current_saved_data_offset += (type->size + 15) & ~15;
            continue;  // Argument handled, move to next
        }
#endif

        // Standard AAPCS64 logic
        bool is_pass_by_ref = (type->size > 16) && !is_variadic_arg;
        bool is_from_stack = false;

        bool expect_in_vpr = is_float16(type) || is_float(type) || is_double(type) || is_long_double(type) ||
            type->category == INFIX_TYPE_VECTOR;
#if defined(INFIX_OS_WINDOWS)
        // Windows on ARM ABI disables HFA rules for variadic functions; floats go to GPRs.
        if (context->is_variadic)
            expect_in_vpr = false;
#endif

        if (is_pass_by_ref) {
            // Large aggregates passed by reference. The argument is a pointer.
            // We store this pointer directly into args_array[i].
            int32_t dest_offset = layout->args_array_offset + (int32_t)(i * sizeof(void *));
            arm64_gpr src_reg;

            if (gpr_idx < NUM_GPR_ARGS)
                src_reg = GPR_ARGS[gpr_idx++];
            else {
                // Pointer passed on stack
                emit_arm64_ldr_imm(buf, true, X9_REG, X29_FP_REG, (int32_t)caller_stack_offset);
                src_reg = X9_REG;
                caller_stack_offset += 8;
            }

            if (dest_offset >= 0 && (dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
                emit_arm64_str_imm(buf, true, src_reg, SP_REG, dest_offset);
            else {
                emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
                emit_arm64_str_imm(buf, true, src_reg, X10_REG, 0);
            }
            continue;  // Argument handled (no data copying needed)
        }

        const infix_type * hfa_base_type = nullptr;
        bool is_hfa_candidate = !is_variadic_arg && is_hfa(type, &hfa_base_type);
#if defined(INFIX_OS_WINDOWS)
        if (context->is_variadic)
            is_hfa_candidate = false;
#endif

        if (is_hfa_candidate) {
            // Homogeneous Floating-point Aggregate
            size_t num_elements = type->size / hfa_base_type->size;
            if (vpr_idx + num_elements <= NUM_VPR_ARGS) {
                const int scale = (int)hfa_base_type->size;
                for (size_t j = 0; j < num_elements; ++j) {
                    int32_t dest_offset = arg_save_loc + (int32_t)(j * hfa_base_type->size);
                    if (dest_offset >= 0 && ((unsigned)dest_offset / scale) <= 0xFFF && (dest_offset % scale == 0))
                        emit_arm64_str_vpr(buf, hfa_base_type->size, VPR_ARGS[vpr_idx++], SP_REG, dest_offset);
                    else {
                        emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
                        emit_arm64_str_vpr(buf, hfa_base_type->size, VPR_ARGS[vpr_idx++], X10_REG, 0);
                    }
                }
            }
            else {
                is_from_stack = true;
            }
        }
        else if (expect_in_vpr) {
            // Single FP/Vector argument
            if (vpr_idx < NUM_VPR_ARGS) {
                // Determine width: 128-bit (Quad), 64-bit (Double), 32-bit (Single), or 16-bit (Half).
                // On macOS ARM64, long double is 8 bytes, so we must check size == 16.
                bool is_128bit = (type->size == 16);

// On Windows, always use 128-bit stores for robustness against partial register updates.
#if defined(INFIX_OS_WINDOWS)
                is_128bit = true;
#endif

                if (is_128bit && ((type->category == INFIX_TYPE_VECTOR) || is_long_double(type))) {
                    // Use STR Qn for 128-bit types
                    if (arg_save_loc >= 0 && ((unsigned)arg_save_loc / 16) <= 0xFFF && (arg_save_loc % 16 == 0))
                        emit_arm64_str_q_imm(buf, VPR_ARGS[vpr_idx++], SP_REG, arg_save_loc);
                    else {
                        emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, arg_save_loc);
                        emit_arm64_str_q_imm(buf, VPR_ARGS[vpr_idx++], X10_REG, 0);
                    }
                }
                else {
                    // Use STR Hn (16-bit), STR Sn (32-bit), or STR Dn (64-bit)
                    // Note: macOS long double (8 bytes) falls into path here via size check/alias logic
                    const int scale = (int)type->size;

                    if (arg_save_loc >= 0 && ((unsigned)arg_save_loc / scale) <= 0xFFF && (arg_save_loc % scale == 0)) {
                        emit_arm64_str_vpr(buf, type->size, VPR_ARGS[vpr_idx++], SP_REG, arg_save_loc);
                    }
                    else {
                        emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, arg_save_loc);
                        emit_arm64_str_vpr(buf, type->size, VPR_ARGS[vpr_idx++], X10_REG, 0);
                    }
                }
            }
            else {
                is_from_stack = true;
            }
        }
        else {
            // Integer / Pointer / Small Struct in GPR
            if (type->size > 8) {
                // 16-byte aggregate in Xn, Xn+1
                bool needs_alignment = true;
#if defined(INFIX_OS_MACOS)
                if (type->category == INFIX_TYPE_PRIMITIVE)
                    needs_alignment = false;
#elif defined(INFIX_OS_WINDOWS)
                if (is_variadic_arg)
                    needs_alignment = false;
#endif
                if (needs_alignment && (gpr_idx % 2 != 0))
                    gpr_idx++;

                if (gpr_idx + 1 < NUM_GPR_ARGS) {
                    // Store first half
                    if (arg_save_loc >= 0 && (((unsigned)arg_save_loc + 8) / 8) <= 0xFFF && (arg_save_loc % 8 == 0)) {
                        emit_arm64_str_imm(buf, true, GPR_ARGS[gpr_idx++], SP_REG, arg_save_loc);
                        emit_arm64_str_imm(buf, true, GPR_ARGS[gpr_idx++], SP_REG, arg_save_loc + 8);
                    }
                    else {
                        emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, arg_save_loc);
                        emit_arm64_str_imm(buf, true, GPR_ARGS[gpr_idx++], X10_REG, 0);
                        emit_arm64_str_imm(buf, true, GPR_ARGS[gpr_idx++], X10_REG, 8);
                    }
                }
                else {
                    is_from_stack = true;
                }
            }
            else if (gpr_idx < NUM_GPR_ARGS) {
                // <= 8 bytes in single GPR
                if (arg_save_loc >= 0 && ((unsigned)arg_save_loc / 8) <= 0xFFF && (arg_save_loc % 8 == 0))
                    emit_arm64_str_imm(buf, true, GPR_ARGS[gpr_idx++], SP_REG, arg_save_loc);
                else {
                    emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, arg_save_loc);
                    emit_arm64_str_imm(buf, true, GPR_ARGS[gpr_idx++], X10_REG, 0);
                }
            }
            else {
                is_from_stack = true;
            }
        }

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

                    emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
                }
            }
            caller_stack_offset += size_on_stack;
        }

        // Write pointer to this saved data into the args_array[i]
        int32_t dest_offset = layout->args_array_offset + (int32_t)(i * sizeof(void *));

        // Calculate absolute address of saved arg: X9 = SP + arg_save_loc
        emit_arm64_add_imm(buf, true, false, X9_REG, SP_REG, (uint32_t)arg_save_loc);

        if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
            emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
        else {
            emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
            emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
        }

        current_saved_data_offset += (type->size + 15) & ~15;
    }
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 4 (Reverse): Generates the code to call the high-level C dispatcher function.
 * @details This emits the instructions to load the three arguments for the dispatcher
 *          (`context`, `return_buffer_ptr`, `args_array_ptr`) into the correct registers
 *          (X0, X1, X2) and then calls the dispatcher via `blr` (branch with link to register).
 *
 * @param buf The code buffer.
 * @param layout The blueprint containing stack offsets.
 * @param context The context, containing the dispatcher's address.
 * @return `INFIX_SUCCESS` on success.
 */
static infix_status generate_reverse_dispatcher_call_arm64(code_buffer * buf,
                                                           infix_reverse_call_frame_layout * layout,
                                                           infix_reverse_t * context) {
    // Arg 1: Load context pointer into X0.
    emit_arm64_load_u64_immediate(buf, X0_REG, (uint64_t)context);
    bool ret_is_aggregate =
        (context->return_type->category == INFIX_TYPE_STRUCT || context->return_type->category == INFIX_TYPE_UNION ||
         context->return_type->category == INFIX_TYPE_ARRAY || context->return_type->category == INFIX_TYPE_COMPLEX);
    bool return_in_memory = ret_is_aggregate && context->return_type->size > 16;
    // Arg 2: Load pointer to return buffer into X1.
    if (return_in_memory)
        // We saved the pointer from X8 earlier, now we load it back.
        emit_arm64_ldr_imm(buf, true, X1_REG, SP_REG, layout->return_buffer_offset);
    else
        // The return buffer is on our stack, so we calculate its address.
        emit_arm64_add_imm(buf, true, false, X1_REG, SP_REG, (uint32_t)layout->return_buffer_offset);
    // Arg 3: Load pointer to args_array into X2.
    emit_arm64_add_imm(buf, true, false, X2_REG, SP_REG, (uint32_t)layout->args_array_offset);
    // Load the C dispatcher's address into a scratch register (X9) and call it.
    emit_arm64_load_u64_immediate(buf, X9_REG, (uint64_t)context->internal_dispatcher);
    emit_arm64_blr_reg(buf, X9_REG);  // blr x9
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 5 (Reverse): Generates the epilogue for the reverse trampoline stub.
 * @details After the C dispatcher returns, this code retrieves the return value from the
 *          return buffer on the stub's local stack and places it into the correct native return
 *          registers (X0, X1, V0, etc.) as required by the AAPCS64. It then tears down the
 *          stack frame and returns control to the native caller.
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @param context The reverse context.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_epilogue_arm64(code_buffer * buf,
                                                    infix_reverse_call_frame_layout * layout,
                                                    infix_reverse_t * context) {
    bool ret_is_aggregate =
        (context->return_type->category == INFIX_TYPE_STRUCT || context->return_type->category == INFIX_TYPE_UNION ||
         context->return_type->category == INFIX_TYPE_ARRAY || context->return_type->category == INFIX_TYPE_COMPLEX);
    bool return_in_memory = ret_is_aggregate && context->return_type->size > 16;
    if (context->return_type->category != INFIX_TYPE_VOID && !return_in_memory) {
        const infix_type * base = nullptr;

        // Explicitly check for 128-bit types.
        // Note: On macOS ARM64, long double is 8 bytes, so is_long_double() is true but size is 8.
        // We only want the 128-bit load if the size matches.
        bool is_128bit = (context->return_type->size == 16);
        if (is_128bit && (is_long_double(context->return_type) || context->return_type->category == INFIX_TYPE_VECTOR))
            emit_arm64_ldr_q_imm(buf, V0_REG, SP_REG, layout->return_buffer_offset);
        else if (is_hfa(context->return_type, &base)) {
            size_t num_elements = context->return_type->size / base->size;
            for (size_t i = 0; i < num_elements; ++i) {
                emit_arm64_ldr_vpr(buf,
                                   base->size,
                                   VPR_ARGS[i],
                                   SP_REG,
                                   (int32_t)(layout->return_buffer_offset + i * base->size));  // Explicit cast
            }
        }
        else if (is_long_double(context->return_type) ||
                 (context->return_type->category == INFIX_TYPE_VECTOR && context->return_type->size == 16))
            emit_arm64_ldr_q_imm(buf, V0_REG, SP_REG, layout->return_buffer_offset);
        else if (is_float16(context->return_type))
            emit_arm64_ldr_vpr(buf, 2, V0_REG, SP_REG, layout->return_buffer_offset);
        else if (is_float(context->return_type) || is_double(context->return_type) ||
                 (is_long_double(context->return_type) && context->return_type->size == 8))
            emit_arm64_ldr_vpr(buf, context->return_type->size, V0_REG, SP_REG, layout->return_buffer_offset);
        else {
            // Integer, pointer, or small struct returned in GPRs.
            emit_arm64_ldr_imm(buf, true, X0_REG, SP_REG, layout->return_buffer_offset);
            if (context->return_type->size > 8)
                emit_arm64_ldr_imm(buf, true, X1_REG, SP_REG, layout->return_buffer_offset + 8);
        }
    }
    // Deallocate stack and restore frame.
    if (layout->total_stack_alloc > 0)
        // add sp, sp, #total_stack_alloc
        emit_arm64_add_imm(buf, true, false, SP_REG, SP_REG, (uint32_t)layout->total_stack_alloc);  // Cast size_t
    // Restore Frame Pointer and Link Register, then return.
    emit_arm64_ldp_post_index(
        buf, true, X29_FP_REG, X30_LR_REG, SP_REG, 16);  // ldp x29, x30, [sp], #16 (Load pair, post-indexed)
    emit_arm64_ret(buf, X30_LR_REG);                     // ret
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 1 (Direct): Analyzes a signature and creates a call frame layout for AAPCS64.
 */
static infix_status prepare_direct_forward_call_frame_arm64(infix_arena_t * arena,
                                                            infix_direct_call_frame_layout ** out_layout,
                                                            infix_type * ret_type,
                                                            infix_type ** arg_types,
                                                            size_t num_args,
                                                            infix_direct_arg_handler_t * handlers,
                                                            void * target_fn) {
    // Reuse the standard classification logic.
    infix_call_frame_layout * standard_layout = nullptr;
    infix_status status =
        prepare_forward_call_frame_arm64(arena, &standard_layout, ret_type, arg_types, num_args, num_args, target_fn);
    if (status != INFIX_SUCCESS)
        return status;

    // Create the new direct layout and copy basic info.
    infix_direct_call_frame_layout * layout =
        infix_arena_calloc(arena, 1, sizeof(infix_direct_call_frame_layout), _Alignof(infix_direct_call_frame_layout));
    if (!layout)
        return INFIX_ERROR_ALLOCATION_FAILED;

    layout->args =
        infix_arena_calloc(arena, num_args, sizeof(infix_direct_arg_layout), _Alignof(infix_direct_arg_layout));
    if (!layout->args && num_args > 0)
        return INFIX_ERROR_ALLOCATION_FAILED;

    layout->num_args = num_args;
    layout->target_fn = target_fn;
    layout->return_value_in_memory = standard_layout->return_value_in_memory;

    // Calculate scratch space needed on the stack.
    // Note: We do NOT store the scratch offset in layout->args[i].location.stack_offset,
    // because that field is needed for the *outgoing* ABI stack offset.
    // Instead, we just calculate the total size here, and recalculate the offsets
    // sequentially during generation.
    size_t scratch_space_needed = 0;
    for (size_t i = 0; i < num_args; ++i) {
        layout->args[i].location = standard_layout->arg_locations[i];
        layout->args[i].type = arg_types[i];
        layout->args[i].handler = &handlers[i];

        if (handlers[i].aggregate_marshaller) {
            scratch_space_needed = _infix_align_up(scratch_space_needed, arg_types[i]->alignment);
            scratch_space_needed += arg_types[i]->size;
        }
        else if (handlers[i].scalar_marshaller) {
            // Scalars need scratch space to bounce X0 -> Stack -> V0
            scratch_space_needed = _infix_align_up(scratch_space_needed, 16);
            scratch_space_needed += 16;
        }
        else if (handlers[i].writeback_handler) {
            const infix_type * pointee = (arg_types[i]->category == INFIX_TYPE_POINTER)
                ? arg_types[i]->meta.pointer_info.pointee_type
                : arg_types[i];
            scratch_space_needed = _infix_align_up(scratch_space_needed, pointee->alignment);
            scratch_space_needed += pointee->size;
        }
    }

    size_t total_needed = standard_layout->total_stack_alloc + scratch_space_needed;
    layout->total_stack_alloc = (total_needed + 15) & ~15;

    *out_layout = layout;
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 2 (Direct): Generates the function prologue.
 */
static infix_status generate_direct_forward_prologue_arm64(code_buffer * buf, infix_direct_call_frame_layout * layout) {
    // Standard prologue: save FP/LR, set up new FP.
    emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
    // Save callee-saved registers for our context.
    // X19: target_fn, X20: ret_ptr, X21: lang_args array
    emit_arm64_stp_pre_index(buf, true, X19_REG, X20_REG, SP_REG, -16);
    emit_arm64_stp_pre_index(buf, true, X21_REG, X22_REG, SP_REG, -16);  // X22 as scratch
    emit_arm64_mov_reg(buf, true, X29_FP_REG, SP_REG);

    // The direct CIF is called with (ret_ptr, lang_args) in X0, X1.
    emit_arm64_mov_reg(buf, true, X20_REG, X0_REG);  // x20 = ret_ptr
    emit_arm64_mov_reg(buf, true, X21_REG, X1_REG);  // x21 = lang_args

    // Allocate total stack space.
    if (layout->total_stack_alloc > 0)
        emit_arm64_sub_imm(buf, true, false, SP_REG, SP_REG, (uint32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 3 (Direct): Generates code to call marshallers and move arguments.
 */
static infix_status generate_direct_forward_argument_moves_arm64(code_buffer * buf,
                                                                 infix_direct_call_frame_layout * layout) {
    if (layout->return_value_in_memory)
        emit_arm64_mov_reg(buf, true, X8_REG, X20_REG);

    // Re-calculate standard stack size to find where scratch space begins
    size_t standard_alloc_size = 0;
    {
        size_t stack_offset = 0;
        for (size_t i = 0; i < layout->num_args; ++i) {
            if (layout->args[i].location.type == ARG_LOCATION_STACK) {
                size_t s = layout->args[i].type->size;
                size_t end = layout->args[i].location.stack_offset + ((s + 7) & ~7);
                if (end > stack_offset)
                    stack_offset = end;
            }
        }
        standard_alloc_size = (stack_offset + 15) & ~15;
    }
    size_t scratch_base_from_sp = standard_alloc_size;
    size_t current_scratch_offset = 0;

    // PHASE 1: MARSHALL & SAVE TO STACK

    for (size_t i = 0; i < layout->num_args; ++i) {
        const infix_direct_arg_layout * arg_layout = &layout->args[i];
        int32_t my_scratch_offset = -1;

        // Calculate offset for all types requiring scratch space
        bool needs_scratch = false;
        size_t size = 0;
        size_t align = 0;

        if (arg_layout->handler->aggregate_marshaller) {
            size = arg_layout->type->size;
            align = arg_layout->type->alignment;
            needs_scratch = true;
        }
        else if (arg_layout->handler->scalar_marshaller) {

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

                    }
                }
                break;
            case ARG_LOCATION_STACK:
                for (size_t offset = 0; offset < arg_layout->type->size; offset += 8) {
                    emit_arm64_ldr_imm(buf, true, X9_REG, SP_REG, my_scratch_offset + (int32_t)offset);
                    emit_arm64_str_imm(buf, true, X9_REG, SP_REG, arg_layout->location.stack_offset + (int32_t)offset);
                }
                break;
            default:
                break;
            }
        }
        else if (arg_layout->handler->scalar_marshaller) {
            // Value was returned in X0 and saved to scratch slot.
            if (arg_layout->location.type == ARG_LOCATION_GPR) {
                // Load from scratch to destination GPR
                emit_arm64_ldr_imm(buf, true, GPR_ARGS[arg_layout->location.reg_index], SP_REG, my_scratch_offset);
            }
            else if (arg_layout->location.type == ARG_LOCATION_VPR) {
                if (is_float(arg_layout->type)) {
                    // Load 64-bit double from scratch into D-reg (use dest reg as temp)
                    arm64_vpr dest_v = VPR_ARGS[arg_layout->location.reg_index];
                    emit_arm64_ldr_vpr(buf, 8, dest_v, SP_REG, my_scratch_offset);

                    // FCVT S, D (Double to Single)
                    // Opcode: 0x1e624000 | (Rn << 5) | Rd.
                    // Rn=dest_v, Rd=dest_v (in place conversion)
                    uint32_t fcvt = 0x1e624000 | ((dest_v & 0x1F) << 5) | (dest_v & 0x1F);
                    emit_int32(buf, fcvt);
                }
                else {
                    // Load directly (double)
                    emit_arm64_ldr_vpr(buf, 8, VPR_ARGS[arg_layout->location.reg_index], SP_REG, my_scratch_offset);
                }
            }
            else if (arg_layout->location.type == ARG_LOCATION_STACK) {
                emit_arm64_ldr_imm(buf, true, X9_REG, SP_REG, my_scratch_offset);
                emit_arm64_str_imm(buf, true, X9_REG, SP_REG, arg_layout->location.stack_offset);
            }
        }
    }
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 3.5 (Direct): Generates the call instruction.
 */
static infix_status generate_direct_forward_call_instruction_arm64(code_buffer * buf,
                                                                   infix_direct_call_frame_layout * layout) {
    emit_arm64_load_u64_immediate(buf, X19_REG, (uint64_t)layout->target_fn);
    emit_arm64_cbnz(buf, true, X19_REG, 8);
    emit_arm64_brk(buf, 0);
    emit_arm64_blr_reg(buf, X19_REG);
    return INFIX_SUCCESS;
}

/**
 * @internal
 * @brief Stage 4 (Direct): Generates the epilogue, including write-back calls.
 */
static infix_status generate_direct_forward_epilogue_arm64(code_buffer * buf,
                                                           infix_direct_call_frame_layout * layout,
                                                           infix_type * ret_type) {
    layout->epilogue_offset = (uint32_t)buf->size;
    // Handle C function's return value.
    if (ret_type->category != INFIX_TYPE_VOID && !layout->return_value_in_memory) {
        const infix_type * hfa_base = nullptr;
        if ((is_long_double(ret_type) && ret_type->size == 16) ||
            (ret_type->category == INFIX_TYPE_VECTOR && ret_type->size == 16))
            emit_arm64_str_q_imm(buf, V0_REG, X20_REG, 0);
        else if (is_hfa(ret_type, &hfa_base)) {
            size_t num_elements = ret_type->size / hfa_base->size;
            for (size_t i = 0; i < num_elements; ++i)
                emit_arm64_str_vpr(buf,
                                   hfa_base->size,
                                   VPR_ARGS[i],
                                   X20_REG,
                                   (int32_t)(i * hfa_base->size));  // Explicit cast
        }
        else if (is_float16(ret_type))
            emit_arm64_str_vpr(buf, 2, V0_REG, X20_REG, 0);
        else if (is_float(ret_type))
            emit_arm64_str_vpr(buf, 4, V0_REG, X20_REG, 0);
        else if (is_double(ret_type))
            emit_arm64_str_vpr(buf, 8, V0_REG, X20_REG, 0);
        else {
            // Integer, pointer, or small aggregate return.
            switch (ret_type->size) {
            case 1:
                emit_arm64_strb_imm(buf, X0_REG, X20_REG, 0);
                break;
            case 2:
                emit_arm64_strh_imm(buf, X0_REG, X20_REG, 0);
                break;
            case 4:
                emit_arm64_str_imm(buf, false, X0_REG, X20_REG, 0);
                break;
            case 8:
                emit_arm64_str_imm(buf, true, X0_REG, X20_REG, 0);
                break;
            case 16:
                emit_arm64_str_imm(buf, true, X0_REG, X20_REG, 0);
                emit_arm64_str_imm(buf, true, X1_REG, X20_REG, 8);
                break;
            default:
                break;
            }
        }
    }

    // Re-calculate standard stack size to find scratch base
    size_t standard_alloc_size = 0;
    {
        size_t stack_offset = 0;
        for (size_t i = 0; i < layout->num_args; ++i) {
            if (layout->args[i].location.type == ARG_LOCATION_STACK) {
                size_t s = layout->args[i].type->size;
                size_t end = layout->args[i].location.stack_offset + ((s + 7) & ~7);
                if (end > stack_offset)
                    stack_offset = end;
            }
        }
        standard_alloc_size = (stack_offset + 15) & ~15;
    }

    // Call Write-Back Handlers
    size_t epilogue_scratch_offset = 0;  // Track offset locally to ensure consistency

    for (size_t i = 0; i < layout->num_args; ++i) {
        const infix_direct_arg_layout * arg_layout = &layout->args[i];

        // Re-calculate offset for this arg (Must match Phase 1 & 2 logic exactly)
        int32_t my_scratch_offset = -1;
        bool needs_scratch = false;
        size_t size = 0;
        size_t align = 0;

        if (arg_layout->handler->aggregate_marshaller) {
            size = arg_layout->type->size;
            align = arg_layout->type->alignment;
            needs_scratch = true;
        }
        else if (arg_layout->handler->scalar_marshaller) {
            size = 16;
            align = 16;
            needs_scratch = true;
        }
        else if (arg_layout->handler->writeback_handler) {
            const infix_type * pointee = (arg_layout->type->category == INFIX_TYPE_POINTER)
                ? arg_layout->type->meta.pointer_info.pointee_type
                : arg_layout->type;
            size = pointee->size;
            align = pointee->alignment;
            needs_scratch = true;
        }

        if (needs_scratch) {
            epilogue_scratch_offset = _infix_align_up(epilogue_scratch_offset, align);
            my_scratch_offset = (int32_t)(standard_alloc_size + epilogue_scratch_offset);
            epilogue_scratch_offset += size;
        }

        if (arg_layout->handler->writeback_handler) {
            // Save C return value (in X0/V0) before calling out.
            // Note: Technically should save more registers for HFA returns, but this matches basic needs.
            emit_arm64_sub_imm(buf, true, false, SP_REG, SP_REG, 32);
            emit_arm64_str_imm(buf, true, X0_REG, SP_REG, 0);
            emit_arm64_str_imm(buf, true, X1_REG, SP_REG, 8);
            emit_arm64_str_q_imm(buf, V0_REG, SP_REG, 16);  // Save V0 (covers float/double/vector)

            // Arg 1 (X0): Original language object pointer.
            emit_arm64_ldr_imm(buf, true, X0_REG, X21_REG, (int32_t)(i * sizeof(void *)));

            // Arg 2 (X1): Pointer to the C data.
            // Address = Current SP (which is Original SP - 32) + 32 + offset
            int32_t total_offset = 32 + my_scratch_offset;
            emit_arm64_add_imm(buf, true, false, X1_REG, SP_REG, total_offset);

            // Arg 3 (X2): The infix_type*.
            emit_arm64_load_u64_immediate(buf, X2_REG, (uint64_t)arg_layout->type);

            // Call the handler.
            emit_arm64_load_u64_immediate(buf, X10_REG, (uint64_t)arg_layout->handler->writeback_handler);
            emit_arm64_blr_reg(buf, X10_REG);

            // Restore C return value.
            emit_arm64_ldr_q_imm(buf, V0_REG, SP_REG, 16);
            emit_arm64_ldr_imm(buf, true, X1_REG, SP_REG, 8);
            emit_arm64_ldr_imm(buf, true, X0_REG, SP_REG, 0);
            emit_arm64_add_imm(buf, true, false, SP_REG, SP_REG, 32);
        }
    }

    // Standard Epilogue
    // Restore stack pointer to the saved registers area.
    // X29 was set to SP after all pushes.
    // mov sp, x29
    emit_arm64_mov_reg(buf, true, SP_REG, X29_FP_REG);

    emit_arm64_ldp_post_index(buf, true, X21_REG, X22_REG, SP_REG, 16);
    emit_arm64_ldp_post_index(buf, true, X19_REG, X20_REG, SP_REG, 16);
    emit_arm64_ldp_post_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, 16);
    emit_arm64_ret(buf, X30_LR_REG);

    return INFIX_SUCCESS;
}



( run in 1.228 second using v1.01-cache-2.11-cpan-5735350b133 )