Affix
view release on metacpan or search on metacpan
infix/src/arch/aarch64/abi_arm64.c view on Meta::CPAN
/**
* Copyright (c) 2025 Sanko Robinson
*
* This source code is dual-licensed under the Artistic License 2.0 or the MIT License.
* You may choose to use this code under the terms of either license.
*
* SPDX-License-Identifier: (Artistic-2.0 OR MIT)
*
* The documentation blocks within this file are licensed under the
* Creative Commons Attribution 4.0 International License (CC BY 4.0).
*
* SPDX-License-Identifier: CC-BY-4.0
*/
/**
* @file abi_arm64.c
* @brief Implements the FFI logic for the AArch64 (ARM64) architecture.
* @ingroup internal_abi_aarch64
*
* @internal
* This file provides the concrete implementation of the `infix_forward_abi_spec`
* and `infix_reverse_abi_spec` for the ARM64 architecture. It primarily follows
* the standard "Procedure Call Standard for the ARM 64-bit Architecture" (AAPCS64),
* but also contains critical conditional logic to handle deviations for specific
* platforms like Apple macOS and Windows on ARM.
*
* @section aapcs64_rules Key AAPCS64 Rules Implemented
*
* - **Register Usage:**
* - The first 8 integer/pointer arguments are passed in GPRs (X0-X7).
* - The first 8 floating-point/vector arguments are passed in VPRs (V0-V7).
*
* - **Homogeneous Floating-point Aggregates (HFAs):** Structs or arrays composed
* entirely of 1 to 4 identical floating-point types (`float` or `double`) are
* passed in consecutive VPRs.
*
* - **Return Values:**
* - Aggregates up to 16 bytes are returned in registers (GPRs and/or VPRs).
* - Larger aggregates are returned via a hidden pointer passed by the caller
* in the dedicated "indirect result location register", `X8`.
*
* @section platform_deviations Platform-Specific Deviations
*
* - **Variadic Calls (Apple macOS):** All variadic arguments are passed on the
* stack. Arguments smaller than 8 bytes are promoted to fill 8-byte stack slots.
*
* - **Variadic Calls (Windows on ARM):** The HFA rule is disabled for variadic
* arguments. Floating-point scalars are passed in GPRs, not VPRs.
*
* - **16-Byte Argument Alignment:**
* - **Standard/macOS:** 16-byte aggregates passed in GPRs must start in an
* even-numbered register (X0, X2, X4, X6).
* - **macOS Exception:** `__int128_t` does NOT require even-GPR alignment.
* - **Windows Exception:** Variadic 16-byte aggregates do NOT require even-GPR alignment.
* @endinternal
*/
#include "arch/aarch64/abi_arm64_common.h"
#include "arch/aarch64/abi_arm64_emitters.h"
#include "common/infix_internals.h"
#include "common/utility.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * @internal The General-Purpose Registers used for the first 8 integer/pointer arguments.
 * Entries are listed in ascending order (X0 first), so slot N of the table names register XN.
 * The table length must stay in sync with `NUM_GPR_ARGS`.
 */
static const arm64_gpr GPR_ARGS[] = {X0_REG, X1_REG, X2_REG, X3_REG, X4_REG, X5_REG, X6_REG, X7_REG};
/**
 * @internal The SIMD/Floating-Point Registers used for the first 8 float/double/vector arguments.
 * Entries are listed in ascending order (V0 first), so slot N of the table names register VN.
 * The table length must stay in sync with `NUM_VPR_ARGS`.
 */
static const arm64_vpr VPR_ARGS[] = {V0_REG, V1_REG, V2_REG, V3_REG, V4_REG, V5_REG, V6_REG, V7_REG};
/**
 * @internal The total number of GPRs available for argument passing.
 * The argument classifier compares its running GPR count against this bound before
 * assigning a register; equals the number of entries in `GPR_ARGS`.
 */
#define NUM_GPR_ARGS 8
/**
 * @internal The total number of VPRs available for argument passing.
 * The argument classifier compares its running VPR count against this bound before
 * assigning a register; equals the number of entries in `VPR_ARGS`.
 */
#define NUM_VPR_ARGS 8
/**
 * @internal A safe limit on the number of fields to classify to prevent DoS from exponential complexity.
 * NOTE(review): the code that consumes this limit is not part of this excerpt; presumably it is
 * the aggregate/HFA classification logic -- confirm against its use site.
 */
#define MAX_AGGREGATE_FIELDS_TO_CLASSIFY 32
//
/**
 * @internal
 * Forward declaration of the Homogeneous Floating-point Aggregate test, defined elsewhere in this
 * translation unit. The argument classifier in this file passes the address of a
 * `const infix_type *` as `base_type` and, when the call returns true, divides the aggregate's
 * size by `base_type->size` to obtain the number of registers needed.
 */
static bool is_hfa(const infix_type * type, const infix_type ** base_type);
/*
 * Forward declarations of the five callbacks that are installed in `g_arm64_forward_spec` below.
 * They are `static`, so they are reachable from outside this file only through that table.
 */
static infix_status prepare_forward_call_frame_arm64(infix_arena_t * arena,
infix_call_frame_layout ** out_layout,
infix_type * ret_type,
infix_type ** arg_types,
size_t num_args,
size_t num_fixed_args,
void * target_fn);
static infix_status generate_forward_prologue_arm64(code_buffer * buf, infix_call_frame_layout * layout);
// `num_fixed_args` is tagged `c23_maybe_unused`: the parameter is part of the callback signature
// but may go unreferenced in the body.
static infix_status generate_forward_argument_moves_arm64(code_buffer * buf,
infix_call_frame_layout * layout,
infix_type ** arg_types,
size_t num_args,
c23_maybe_unused size_t num_fixed_args);
static infix_status generate_forward_call_instruction_arm64(code_buffer *, infix_call_frame_layout *);
static infix_status generate_forward_epilogue_arm64(code_buffer * buf,
infix_call_frame_layout * layout,
infix_type * ret_type);
/**
 * @internal The v-table of AArch64 functions for generating forward trampolines.
 * Each member is bound to the same-named `*_arm64` function declared above. The object has
 * external linkage (it is not `static`); presumably it is declared `extern` in a project
 * header and selected by the core at runtime -- confirm against the header.
 */
const infix_forward_abi_spec g_arm64_forward_spec = {
.prepare_forward_call_frame = prepare_forward_call_frame_arm64,
.generate_forward_prologue = generate_forward_prologue_arm64,
.generate_forward_argument_moves = generate_forward_argument_moves_arm64,
.generate_forward_call_instruction = generate_forward_call_instruction_arm64,
.generate_forward_epilogue = generate_forward_epilogue_arm64};
/*
 * Forward declarations of the AArch64 callbacks used for generating reverse trampolines.
 * NOTE(review): the reverse v-table object that these functions populate is not visible in
 * this excerpt; presumably it mirrors `g_arm64_forward_spec` -- confirm further down the file.
 */
// Builds the reverse call frame layout for `context`, allocating from `arena`, and hands it back
// through `out_layout`.
static infix_status prepare_reverse_call_frame_arm64(infix_arena_t * arena,
infix_reverse_call_frame_layout ** out_layout,
infix_reverse_t * context);
// Stage 2 (Reverse): emits the stub's entry sequence (frame record, frame pointer, local stack space).
static infix_status generate_reverse_prologue_arm64(code_buffer * buf, infix_reverse_call_frame_layout * layout);
// Stage 3 (Reverse): emits the code that saves incoming arguments and fills the `void**` args array.
static infix_status generate_reverse_argument_marshalling_arm64(code_buffer * buf,
infix_reverse_call_frame_layout * layout,
infix_reverse_t * context);
static infix_status generate_reverse_dispatcher_call_arm64(code_buffer * buf,
infix/src/arch/aarch64/abi_arm64.c view on Meta::CPAN
if (layout->is_variadic && is_variadic_arg) {
layout->arg_locations[i].type = ARG_LOCATION_STACK;
layout->arg_locations[i].stack_offset = (uint32_t)stack_offset;
// Any argument smaller than 8 bytes must be promoted to an 8-byte slot on the stack.
size_t arg_size_on_stack = (type->size < 8) ? 8 : type->size;
stack_offset += (arg_size_on_stack + 7) & ~7;
layout->num_stack_args++;
continue; // Argument classified, proceed to the next one.
}
#endif
bool pass_fp_in_vpr = is_float16(type) || is_float(type) || is_double(type) || is_long_double(type) ||
type->category == INFIX_TYPE_VECTOR;
const infix_type * hfa_base_type = nullptr;
bool is_hfa_candidate = is_hfa(type, &hfa_base_type);
#if defined(INFIX_OS_WINDOWS)
// Windows on ARM ABI Deviation: If the function is variadic, HFA rules are ignored,
// and all floating-point scalars are passed in GPRs.
if (layout->is_variadic) {
pass_fp_in_vpr = false;
is_hfa_candidate = false;
}
#endif
// The order of these checks is critical to follow the ABI specification correctly.
if (is_hfa_candidate) {
size_t num_elements = type->size / hfa_base_type->size;
if (vpr_count + num_elements <= NUM_VPR_ARGS) {
layout->arg_locations[i].type = ARG_LOCATION_VPR_HFA;
layout->arg_locations[i].reg_index = (uint8_t)vpr_count;
layout->arg_locations[i].num_regs = (uint32_t)num_elements;
vpr_count += num_elements;
placed_in_register = true;
}
}
else if (type->size > 16) {
// Aggregates > 16 bytes are passed by reference (a pointer in a GPR).
if (gpr_count < NUM_GPR_ARGS) {
layout->arg_locations[i].type = ARG_LOCATION_GPR_REFERENCE;
layout->arg_locations[i].reg_index = (uint8_t)gpr_count++;
placed_in_register = true;
}
}
else if (pass_fp_in_vpr) {
if (vpr_count < NUM_VPR_ARGS) {
layout->arg_locations[i].type = ARG_LOCATION_VPR;
layout->arg_locations[i].reg_index = (uint8_t)vpr_count++;
placed_in_register = true;
}
}
else { // Integers, pointers, small aggregates, and variadic floats on Windows.
if (type->size > 8) { // Types > 8 and <= 16 bytes are passed in a pair of GPRs.
bool needs_alignment = true;
#if defined(INFIX_OS_MACOS)
// macOS Deviation: `__int128_t` does not require even-GPR alignment.
if (type->category == INFIX_TYPE_PRIMITIVE)
needs_alignment = false;
#elif defined(INFIX_OS_WINDOWS)
// Windows Deviation: Variadic 16-byte arguments do not require even-GPR alignment.
if (is_variadic_arg)
needs_alignment = false;
#endif
// Standard rule: 16-byte args must start in an even-numbered GPR.
if (needs_alignment && (gpr_count % 2 != 0))
gpr_count++;
if (gpr_count + 1 < NUM_GPR_ARGS) {
layout->arg_locations[i].type = ARG_LOCATION_GPR_PAIR;
layout->arg_locations[i].reg_index = (uint8_t)gpr_count;
gpr_count += 2;
placed_in_register = true;
}
}
else { // Types <= 8 bytes passed in a single GPR.
if (gpr_count < NUM_GPR_ARGS) {
layout->arg_locations[i].type = ARG_LOCATION_GPR;
layout->arg_locations[i].reg_index = (uint8_t)gpr_count++;
placed_in_register = true;
}
}
}
// If it couldn't be placed in a register, it must go on the stack.
if (!placed_in_register) {
layout->arg_locations[i].type = ARG_LOCATION_STACK;
// Enforce natural alignment for stack arguments on ARM64
size_t align = type->alignment;
if (align < 8)
align = 8;
// Align the current stack offset
stack_offset = (stack_offset + (align - 1)) & ~(align - 1);
layout->arg_locations[i].stack_offset = (uint32_t)stack_offset;
stack_offset += (type->size + 7) & ~7; // Stack slots are 8-byte aligned.
layout->num_stack_args++;
}
}
// The total stack space for arguments must be 16-byte aligned before the call.
layout->total_stack_alloc = (stack_offset + 15) & ~15;
layout->num_gpr_args = (uint8_t)gpr_count;
layout->num_vpr_args = (uint8_t)vpr_count;
// Security: Prevent excessive stack allocation.
if (layout->total_stack_alloc > INFIX_MAX_STACK_ALLOC) {
*out_layout = nullptr;
return INFIX_ERROR_LAYOUT_FAILED;
}
*out_layout = layout;
return INFIX_SUCCESS;
}
/**
* @internal
* @brief Stage 2 (Forward): Generates the function prologue for the AArch64 trampoline.
* @details Sets up the stack frame by saving the frame pointer (X29) and link register (X30),
* saves callee-saved registers (X19-X22) that will be used to hold the trampoline's
* context, moves the trampoline's arguments into those preserved registers, and
* allocates the necessary stack space for stack-passed arguments.
* @param buf The code buffer.
* @param layout The layout blueprint.
* @return `INFIX_SUCCESS`.
*/
static infix_status generate_forward_prologue_arm64(code_buffer * buf, infix_call_frame_layout * layout) {
// `stp x29, x30, [sp, #-16]!` : Push Frame Pointer and Link Register to the stack, pre-decrementing SP.
emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
// `stp x19, x20, [sp, #-16]!` : Save callee-saved registers that we will use for our context.
infix/src/arch/aarch64/abi_arm64.c view on Meta::CPAN
layout->saved_args_offset = layout->args_array_offset + (int32_t)args_array_size;
*out_layout = layout;
return INFIX_SUCCESS;
}
/**
 * @internal
 * @brief Stage 2 (Reverse): Emits the entry sequence of the reverse trampoline stub.
 * @details The following instructions are written to `buf`, in this order:
 *            1. `stp x29, x30, [sp, #-16]!` - push the caller's frame pointer and the
 *               link register (return address) as a 16-byte frame record.
 *            2. `mov x29, sp`               - make X29 point at that frame record.
 *            3. `sub sp, sp, #N`            - reserve `N = layout->total_stack_alloc` bytes
 *               for the stub's locals; omitted entirely when `N` is not positive.
 *
 * After this sequence, `[x29]` holds the saved X29, `[x29 + 8]` the saved X30, and the
 * stub's local area lies below X29.
 *
 * @param buf The code buffer that receives the instructions.
 * @param layout The reverse call frame blueprint; only `total_stack_alloc` is read here.
 * @return `INFIX_SUCCESS` unconditionally; the outcome of the individual emitter calls is
 *         not inspected by this function.
 */
static infix_status generate_reverse_prologue_arm64(code_buffer * buf, infix_reverse_call_frame_layout * layout) {
    // Frame record first, then anchor the frame pointer to it.
    emit_arm64_stp_pre_index(buf, true, X29_FP_REG, X30_LR_REG, SP_REG, -16);
    emit_arm64_mov_reg(buf, true, X29_FP_REG, SP_REG);

    // Nothing to reserve: the prologue is complete without touching SP again.
    const bool needs_local_area = layout->total_stack_alloc > 0;
    if (!needs_local_area)
        return INFIX_SUCCESS;

    // Carve the pre-computed local area out of the stack.
    emit_arm64_sub_imm(buf, true, false, SP_REG, SP_REG, (uint32_t)layout->total_stack_alloc);
    return INFIX_SUCCESS;
}
/**
* @internal
* @brief Stage 3 (Reverse): Generates code to marshal arguments into the `void**` array.
* @details This generates `STR` instructions to copy argument data from their native
* locations (GPRs, VPRs, or the caller's stack) into a contiguous "saved args"
* area on the stub's local stack. It then populates the `args_array` with
* pointers to this saved data, respecting all platform-specific ABI deviations.
*
* @param buf The code buffer.
* @param layout The layout blueprint.
* @param context The reverse context.
* @return `INFIX_SUCCESS`.
*/
static infix_status generate_reverse_argument_marshalling_arm64(code_buffer * buf,
infix_reverse_call_frame_layout * layout,
infix_reverse_t * context) {
// Handle Return Value Pointer (Indirect Result Location)
// If the return type is a large struct (> 16 bytes), the caller passes a hidden pointer in X8.
// X8 is volatile, so we must save this pointer into our stack frame immediately.
bool ret_is_aggregate =
(context->return_type->category == INFIX_TYPE_STRUCT || context->return_type->category == INFIX_TYPE_UNION ||
context->return_type->category == INFIX_TYPE_ARRAY || context->return_type->category == INFIX_TYPE_COMPLEX);
bool return_in_memory = ret_is_aggregate && context->return_type->size > 16;
if (return_in_memory) {
// str x8, [sp, #return_buffer_offset]
emit_arm64_str_imm(buf, true, X8_REG, SP_REG, layout->return_buffer_offset);
}
// Iterate over arguments
size_t gpr_idx = 0;
size_t vpr_idx = 0;
size_t current_saved_data_offset = 0;
// Arguments passed on the caller's stack start at offset 16 from our new frame pointer (X29).
// X29 was established by the reverse prologue right after pushing only X29 and X30 (one 16-byte frame record).
// [X29] -> saved X29
// [X29+8] -> saved X30 (LR)
// [X29+16] -> caller's first stack argument
size_t caller_stack_offset = 16;
for (size_t i = 0; i < context->num_args; ++i) {
infix_type * type = context->arg_types[i];
bool is_variadic_arg = i >= context->num_fixed_args;
// Calculate where to save this argument's data in our local stack frame.
int32_t arg_save_loc = (int32_t)(layout->saved_args_offset + current_saved_data_offset);
#if defined(INFIX_OS_MACOS)
// macOS ABI deviation:
// On macOS ARM64, ALL variadic arguments are passed on the stack.
// They are also promoted: types < 8 bytes occupy a full 8-byte stack slot.
if (is_variadic_arg) {
size_t size_on_stack = (type->size < 8) ? 8 : type->size;
size_on_stack = (size_on_stack + 7) & ~7; // Align to 8 bytes
// Copy from caller's stack to our local save area
for (size_t offset = 0; offset < size_on_stack; offset += 8) {
// ldr x9, [fp, #caller_offset]
emit_arm64_ldr_imm(buf, true, X9_REG, X29_FP_REG, (int32_t)(caller_stack_offset + offset));
int32_t dest_offset = arg_save_loc + (int32_t)offset;
if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
else {
emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
}
}
caller_stack_offset += size_on_stack;
// Set the pointer in args_array[i] to point to the saved data
int32_t dest_offset = layout->args_array_offset + (int32_t)(i * sizeof(void *));
emit_arm64_add_imm(buf, true, false, X9_REG, SP_REG, (uint32_t)arg_save_loc);
if (dest_offset >= 0 && ((unsigned)dest_offset / 8) <= 0xFFF && (dest_offset % 8 == 0))
emit_arm64_str_imm(buf, true, X9_REG, SP_REG, dest_offset);
else {
emit_arm64_add_imm(buf, true, false, X10_REG, SP_REG, dest_offset);
emit_arm64_str_imm(buf, true, X9_REG, X10_REG, 0);
}
current_saved_data_offset += (type->size + 15) & ~15;
continue; // Argument handled, move to next
}
#endif
// Standard AAPCS64 logic
bool is_pass_by_ref = (type->size > 16) && !is_variadic_arg;
bool is_from_stack = false;
bool expect_in_vpr = is_float16(type) || is_float(type) || is_double(type) || is_long_double(type) ||
type->category == INFIX_TYPE_VECTOR;
#if defined(INFIX_OS_WINDOWS)
// Windows on ARM ABI disables HFA rules for variadic functions; floats go to GPRs.
( run in 1.779 second using v1.01-cache-2.11-cpan-99c4e6809bf )