Affix

 view release on metacpan or  search on metacpan

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

/**
 * Copyright (c) 2025 Sanko Robinson
 *
 * This source code is dual-licensed under the Artistic License 2.0 or the MIT License.
 * You may choose to use this code under the terms of either license.
 *
 * SPDX-License-Identifier: (Artistic-2.0 OR MIT)
 *
 * The documentation blocks within this file are licensed under the
 * Creative Commons Attribution 4.0 International License (CC BY 4.0).
 *
 * SPDX-License-Identifier: CC-BY-4.0
 */
/**
 * @file abi_arm64.c
 * @brief Implements the FFI logic for the AArch64 (ARM64) architecture.
 * @ingroup internal_abi_aarch64
 *
 * @internal
 * This file provides the concrete implementation of the `infix_forward_abi_spec`
 * and `infix_reverse_abi_spec` for the ARM64 architecture. It primarily follows
 * the standard "Procedure Call Standard for the ARM 64-bit Architecture" (AAPCS64),
 * but also contains critical conditional logic to handle deviations for specific
 * platforms like Apple macOS and Windows on ARM.
 *
 * @section aapcs64_rules Key AAPCS64 Rules Implemented
 *
 * - **Register Usage:**
 *   - The first 8 integer/pointer arguments are passed in GPRs (X0-X7).
 *   - The first 8 floating-point/vector arguments are passed in VPRs (V0-V7).
 *
 * - **Homogeneous Floating-point Aggregates (HFAs):** Structs or arrays composed
 *   entirely of 1 to 4 identical floating-point types (`float` or `double`) are
 *   passed in consecutive VPRs.
 *
 * - **Return Values:**
 *   - Aggregates up to 16 bytes are returned in registers (GPRs and/or VPRs).
 *   - Larger aggregates are returned via a hidden pointer passed by the caller
 *     in the dedicated "indirect result location register", `X8`.
 *
 * @section platform_deviations Platform-Specific Deviations
 *
 * - **Variadic Calls (Apple macOS):** All variadic arguments are passed on the
 *   stack. Arguments smaller than 8 bytes are promoted to fill 8-byte stack slots.
 *
 * - **Variadic Calls (Windows on ARM):** The HFA rule is disabled for variadic
 *   arguments. Floating-point scalars are passed in GPRs, not VPRs.
 *
 * - **16-Byte Argument Alignment:**
 *   - **Standard/macOS:** 16-byte aggregates passed in GPRs must start in an
 *     even-numbered register (X0, X2, X4, X6).
 *   - **macOS Exception:** `__int128_t` does NOT require even-GPR alignment.
 *   - **Windows Exception:** Variadic 16-byte aggregates do NOT require even-GPR alignment.
 * @endinternal
 */
#include "arch/aarch64/abi_arm64_common.h"
#include "arch/aarch64/abi_arm64_emitters.h"
#include "common/infix_internals.h"
#include "common/utility.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * @internal
 * The General-Purpose Registers used for the first 8 integer/pointer arguments,
 * listed in allocation order (index 0 is the first register handed out).
 */
static const arm64_gpr GPR_ARGS[] = {X0_REG, X1_REG, X2_REG, X3_REG, X4_REG, X5_REG, X6_REG, X7_REG};
/**
 * @internal
 * The SIMD/Floating-Point Registers used for the first 8 float/double/vector arguments,
 * listed in allocation order (index 0 is the first register handed out).
 */
static const arm64_vpr VPR_ARGS[] = {V0_REG, V1_REG, V2_REG, V3_REG, V4_REG, V5_REG, V6_REG, V7_REG};
/** @internal The total number of GPRs available for argument passing. Must equal the length of `GPR_ARGS`. */
#define NUM_GPR_ARGS 8
/** @internal The total number of VPRs available for argument passing. Must equal the length of `VPR_ARGS`. */
#define NUM_VPR_ARGS 8
/**
 * @internal
 * A safe limit on the number of fields to classify to prevent DoS from exponential complexity.
 * NOTE(review): the code that consumes this limit is not visible in this excerpt; presumably
 * the aggregate/HFA classification logic -- confirm.
 */
#define MAX_AGGREGATE_FIELDS_TO_CLASSIFY 32

/**
 * @internal
 * @brief Forward declaration of the Homogeneous Floating-point Aggregate (HFA) test.
 * @details The argument classifier calls this as `is_hfa(type, &hfa_base_type)` and, when it
 *          returns true, divides `type->size` by the size of the type stored through
 *          `base_type` to obtain the number of VPRs required.
 * @param type The type to examine.
 * @param[out] base_type Receives the element type when `type` is an HFA.
 *             NOTE(review): its value when the function returns false is not visible here.
 * @return `true` if `type` is treated as an HFA.
 */
static bool is_hfa(const infix_type * type, const infix_type ** base_type);

/**
 * @internal
 * Prototypes for the AArch64 forward-trampoline stage functions. They are declared ahead of
 * their definitions so that `g_arm64_forward_spec` below can reference them.
 */
/** @internal Classifies each argument into a register or stack location and sizes the outgoing stack area. */
static infix_status prepare_forward_call_frame_arm64(infix_arena_t * arena,
                                                     infix_call_frame_layout ** out_layout,
                                                     infix_type * ret_type,
                                                     infix_type ** arg_types,
                                                     size_t num_args,
                                                     size_t num_fixed_args,
                                                     void * target_fn);
/** @internal Emits the forward trampoline's function prologue. */
static infix_status generate_forward_prologue_arm64(code_buffer * buf, infix_call_frame_layout * layout);
/** @internal Emits the moves that place arguments in their assigned locations (`num_fixed_args` may be unused). */
static infix_status generate_forward_argument_moves_arm64(code_buffer * buf,
                                                          infix_call_frame_layout * layout,
                                                          infix_type ** arg_types,
                                                          size_t num_args,
                                                          c23_maybe_unused size_t num_fixed_args);
/** @internal Emits the call to the target function; parameters are the code buffer and the layout. */
static infix_status generate_forward_call_instruction_arm64(code_buffer *, infix_call_frame_layout *);
/** @internal Emits the forward trampoline's epilogue for the given return type. */
static infix_status generate_forward_epilogue_arm64(code_buffer * buf,
                                                    infix_call_frame_layout * layout,
                                                    infix_type * ret_type);
/**
 * @internal
 * The v-table of AArch64 functions for generating forward trampolines. Each slot is bound to
 * the identically named `*_arm64` implementation declared above. The object has external
 * linkage (it is not `static`).
 */
const infix_forward_abi_spec g_arm64_forward_spec = {
    .prepare_forward_call_frame = prepare_forward_call_frame_arm64,
    .generate_forward_prologue = generate_forward_prologue_arm64,
    .generate_forward_argument_moves = generate_forward_argument_moves_arm64,
    .generate_forward_call_instruction = generate_forward_call_instruction_arm64,
    .generate_forward_epilogue = generate_forward_epilogue_arm64};

/**
 * @internal
 * Prototypes for the AArch64 functions that generate reverse trampolines.
 * NOTE(review): the reverse v-table that collects these (an `infix_reverse_abi_spec`, per the
 * file header) is not visible in this excerpt.
 */
/** @internal Computes the stack layout of the reverse trampoline stub for `context`. */
static infix_status prepare_reverse_call_frame_arm64(infix_arena_t * arena,
                                                     infix_reverse_call_frame_layout ** out_layout,
                                                     infix_reverse_t * context);
/** @internal Emits the stub prologue: saves X29/X30, sets up X29, and allocates the local stack area. */
static infix_status generate_reverse_prologue_arm64(code_buffer * buf, infix_reverse_call_frame_layout * layout);
/** @internal Emits code that saves incoming arguments and fills the `void**` argument array. */
static infix_status generate_reverse_argument_marshalling_arm64(code_buffer * buf,
                                                                infix_reverse_call_frame_layout * layout,
                                                                infix_reverse_t * context);
static infix_status generate_reverse_dispatcher_call_arm64(code_buffer * buf,

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

        if (layout->is_variadic && is_variadic_arg) {
            layout->arg_locations[i].type = ARG_LOCATION_STACK;
            layout->arg_locations[i].stack_offset = (uint32_t)stack_offset;
            // Any argument smaller than 8 bytes must be promoted to an 8-byte slot on the stack.
            size_t arg_size_on_stack = (type->size < 8) ? 8 : type->size;
            stack_offset += (arg_size_on_stack + 7) & ~7;
            layout->num_stack_args++;
            continue;  // Argument classified, proceed to the next one.
        }
#endif
        bool pass_fp_in_vpr = is_float16(type) || is_float(type) || is_double(type) || is_long_double(type) ||
            type->category == INFIX_TYPE_VECTOR;
        const infix_type * hfa_base_type = nullptr;
        bool is_hfa_candidate = is_hfa(type, &hfa_base_type);
#if defined(INFIX_OS_WINDOWS)
        // Windows on ARM ABI Deviation: If the function is variadic, HFA rules are ignored,
        // and all floating-point scalars are passed in GPRs.
        if (layout->is_variadic) {
            pass_fp_in_vpr = false;
            is_hfa_candidate = false;
        }
#endif
        // The order of these checks is critical to follow the ABI specification correctly.
        if (is_hfa_candidate) {
            size_t num_elements = type->size / hfa_base_type->size;
            if (vpr_count + num_elements <= NUM_VPR_ARGS) {
                layout->arg_locations[i].type = ARG_LOCATION_VPR_HFA;
                layout->arg_locations[i].reg_index = (uint8_t)vpr_count;
                layout->arg_locations[i].num_regs = (uint32_t)num_elements;
                vpr_count += num_elements;
                placed_in_register = true;
            }
        }
        else if (type->size > 16) {
            // Aggregates > 16 bytes are passed by reference (a pointer in a GPR).
            if (gpr_count < NUM_GPR_ARGS) {
                layout->arg_locations[i].type = ARG_LOCATION_GPR_REFERENCE;
                layout->arg_locations[i].reg_index = (uint8_t)gpr_count++;
                placed_in_register = true;
            }
        }
        else if (pass_fp_in_vpr) {
            if (vpr_count < NUM_VPR_ARGS) {
                layout->arg_locations[i].type = ARG_LOCATION_VPR;
                layout->arg_locations[i].reg_index = (uint8_t)vpr_count++;
                placed_in_register = true;
            }
        }
        else {                     // Integers, pointers, small aggregates, and variadic floats on Windows.
            if (type->size > 8) {  // Types > 8 and <= 16 bytes are passed in a pair of GPRs.
                bool needs_alignment = true;
#if defined(INFIX_OS_MACOS)
                // macOS Deviation: `__int128_t` does not require even-GPR alignment.
                if (type->category == INFIX_TYPE_PRIMITIVE)
                    needs_alignment = false;
#elif defined(INFIX_OS_WINDOWS)
                // Windows Deviation: Variadic 16-byte arguments do not require even-GPR alignment.
                if (is_variadic_arg)
                    needs_alignment = false;
#endif
                // Standard rule: 16-byte args must start in an even-numbered GPR.
                if (needs_alignment && (gpr_count % 2 != 0))
                    gpr_count++;
                if (gpr_count + 1 < NUM_GPR_ARGS) {
                    layout->arg_locations[i].type = ARG_LOCATION_GPR_PAIR;
                    layout->arg_locations[i].reg_index = (uint8_t)gpr_count;
                    gpr_count += 2;
                    placed_in_register = true;
                }
            }
            else {  // Types <= 8 bytes passed in a single GPR.
                if (gpr_count < NUM_GPR_ARGS) {
                    layout->arg_locations[i].type = ARG_LOCATION_GPR;
                    layout->arg_locations[i].reg_index = (uint8_t)gpr_count++;
                    placed_in_register = true;
                }
            }
        }
        // If it couldn't be placed in a register, it must go on the stack.
        if (!placed_in_register) {
            layout->arg_locations[i].type = ARG_LOCATION_STACK;

            // Enforce natural alignment for stack arguments on ARM64
            size_t align = type->alignment;
            if (align < 8)
                align = 8;

            // Align the current stack offset
            stack_offset = (stack_offset + (align - 1)) & ~(align - 1);
            layout->arg_locations[i].stack_offset = (uint32_t)stack_offset;
            stack_offset += (type->size + 7) & ~7;  // Stack slots are 8-byte aligned.
            layout->num_stack_args++;
        }
    }
    // The total stack space for arguments must be 16-byte aligned before the call.
    layout->total_stack_alloc = (stack_offset + 15) & ~15;
    layout->num_gpr_args = (uint8_t)gpr_count;
    layout->num_vpr_args = (uint8_t)vpr_count;
    // Security: Prevent excessive stack allocation.
    if (layout->total_stack_alloc > INFIX_MAX_STACK_ALLOC) {
        *out_layout = nullptr;
        return INFIX_ERROR_LAYOUT_FAILED;
    }
    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Forward): Generates the function prologue for the AArch64 trampoline.
 * @details Sets up the stack frame by saving the frame pointer (X29) and link register (X30),
 *          saves callee-saved registers (X19-X22) that will be used to hold the trampoline's
 *          context, moves the trampoline's arguments into those preserved registers, and
 *          allocates the necessary stack space for stack-passed arguments.
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_forward_prologue_arm64(code_buffer * buf, infix_call_frame_layout * layout) {
    // `stp x29, x30, [sp, #-16]!` : Push Frame Pointer and Link Register to the stack, pre-decrementing SP.
    emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
    // `stp x19, x20, [sp, #-16]!` : Save callee-saved registers that we will use for our context.

infix/src/arch/aarch64/abi_arm64.c  view on Meta::CPAN

    layout->saved_args_offset = layout->args_array_offset + (int32_t)args_array_size;
    *out_layout = layout;
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Reverse): Emits the entry sequence of the reverse trampoline stub.
 * @details Up to three instructions are emitted, in this order:
 *          1. `stp x29, x30, [sp, #-16]!` - store the caller's frame pointer and the link
 *             register as a pair, pre-decrementing SP by 16.
 *          2. `mov x29, sp` - make X29 point at the pair that was just stored.
 *          3. `sub sp, sp, #total_stack_alloc` - reserve the stub's local area. This step is
 *             omitted when `layout->total_stack_alloc` is not greater than zero.
 *
 * @param buf The code buffer that receives the instructions.
 * @param layout The reverse-call blueprint; only `total_stack_alloc` is read here.
 * @return Always `INFIX_SUCCESS`; the results of the emitter calls are not inspected.
 */
static infix_status generate_reverse_prologue_arm64(code_buffer * buf, infix_reverse_call_frame_layout * layout) {
    // Frame record: push FP and LR below the incoming stack pointer.
    emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
    // From here on X29 addresses the frame record written above.
    emit_arm64_mov_reg(buf, true, X29_FP_REG, SP_REG);
    // A stub without locals needs no further SP adjustment.
    if (!(layout->total_stack_alloc > 0))
        return INFIX_SUCCESS;
    // Carve the pre-computed local area out of the stack.
    emit_arm64_sub_imm(buf, true, false, SP_REG, SP_REG, (uint32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 3 (Reverse): Generates code to marshal arguments into the `void**` array.
 * @details This generates `STR` instructions to copy argument data from their native
 *          locations (GPRs, VPRs, or the caller's stack) into a contiguous "saved args"
 *          area on the stub's local stack. It then populates the `args_array` with
 *          pointers to this saved data, respecting all platform-specific ABI deviations.
 *
 * @param buf The code buffer.
 * @param layout The layout blueprint.
 * @param context The reverse context.
 * @return `INFIX_SUCCESS`.
 */
static infix_status generate_reverse_argument_marshalling_arm64(code_buffer * buf,
                                                                infix_reverse_call_frame_layout * layout,
                                                                infix_reverse_t * context) {
    // Handle Return Value Pointer (Indirect Result Location)
    // If the return type is a large struct (> 16 bytes), the caller passes a hidden pointer in X8.
    // X8 is volatile, so we must save this pointer into our stack frame immediately.
    bool ret_is_aggregate =
        (context->return_type->category == INFIX_TYPE_STRUCT || context->return_type->category == INFIX_TYPE_UNION ||
         context->return_type->category == INFIX_TYPE_ARRAY || context->return_type->category == INFIX_TYPE_COMPLEX);
    bool return_in_memory = ret_is_aggregate && context->return_type->size > 16;

    if (return_in_memory) {
        // str x8, [sp, #return_buffer_offset]
        emit_arm64_str_imm(buf, true, X8_REG, SP_REG, layout->return_buffer_offset);
    }

    // Iterate over arguments
    size_t gpr_idx = 0;
    size_t vpr_idx = 0;
    size_t current_saved_data_offset = 0;

    // Arguments passed on the caller's stack start at offset 16 from our new frame pointer (X29).
    // X29 was established immediately after pushing only X29 and X30 (see generate_reverse_prologue_arm64).
    // [X29]    -> saved X29
    // [X29+8]  -> saved X30 (LR)
    // [X29+16] -> caller's first stack argument
    size_t caller_stack_offset = 16;

    for (size_t i = 0; i < context->num_args; ++i) {
        infix_type * type = context->arg_types[i];
        bool is_variadic_arg = i >= context->num_fixed_args;

        // Calculate where to save this argument's data in our local stack frame.
        int32_t arg_save_loc = (int32_t)(layout->saved_args_offset + current_saved_data_offset);

#if defined(INFIX_OS_MACOS)
        // macOS ABI deviation:
        // On macOS ARM64, ALL variadic arguments are passed on the stack.
        // They are also promoted: types < 8 bytes occupy a full 8-byte stack slot.
        if (is_variadic_arg) {
            size_t size_on_stack = (type->size < 8) ? 8 : type->size;
            size_on_stack = (size_on_stack + 7) & ~7;  // Align to 8 bytes

            // Copy from caller's stack to our local save area
            for (size_t offset = 0; offset < size_on_stack; offset += 8) {
                // ldr x9, [fp, #caller_offset]
                emit_arm64_ldr_imm(buf, true, X9_REG, X29_FP_REG, (int32_t)(caller_stack_offset + offset));

                int32_t dest_offset = arg_save_loc + (int32_t)offset;
                if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
                    emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
                else {
                    emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
                    emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
                }
            }
            caller_stack_offset += size_on_stack;

            // Set the pointer in args_array[i] to point to the saved data
            int32_t dest_offset = layout->args_array_offset + (int32_t)(i * sizeof(void *));
            emit_arm64_add_imm(buf, true, false, X9_REG, SP_REG, (uint32_t)arg_save_loc);

            if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
                emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
            else {
                emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
                emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
            }

            current_saved_data_offset += (type->size + 15) & ~15;
            continue;  // Argument handled, move to next
        }
#endif

        // Standard AAPCS64 logic
        bool is_pass_by_ref = (type->size > 16) && !is_variadic_arg;
        bool is_from_stack = false;

        bool expect_in_vpr = is_float16(type) || is_float(type) || is_double(type) || is_long_double(type) ||
            type->category == INFIX_TYPE_VECTOR;
#if defined(INFIX_OS_WINDOWS)
        // Windows on ARM ABI disables HFA rules for variadic functions; floats go to GPRs.



( run in 1.779 second using v1.01-cache-2.11-cpan-99c4e6809bf )