278 lines
11 KiB
Diff
278 lines
11 KiB
Diff
From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001
|
|
From: Richard Sandiford <richard.sandiford@arm.com>
|
|
Date: Tue, 12 Sep 2023 16:08:56 +0100
|
|
Subject: [PATCH] aarch64: Explicitly record probe registers in frame info
|
|
|
|
The stack frame is currently divided into three areas:
|
|
|
|
A: the area above the hard frame pointer
|
|
B: the SVE saves below the hard frame pointer
|
|
C: the outgoing arguments
|
|
|
|
If the stack frame is allocated in one chunk, the allocation needs a
|
|
probe if the frame size is >= guard_size - 1KiB. In addition, if the
|
|
function is not a leaf function, it must probe an address no more than
|
|
1KiB above the outgoing SP. We ensured the second condition by
|
|
|
|
(1) using single-chunk allocations for non-leaf functions only if
|
|
the link register save slot is within 512 bytes of the bottom
|
|
of the frame; and
|
|
|
|
(2) using the link register save as a probe (meaning, for instance,
|
|
that it can't be individually shrink wrapped)
|
|
|
|
If instead the stack is allocated in multiple chunks, then:
|
|
|
|
* an allocation involving only the outgoing arguments (C above) requires
|
|
a probe if the allocation size is > 1KiB
|
|
|
|
* any other allocation requires a probe if the allocation size
|
|
is >= guard_size - 1KiB
|
|
|
|
* second and subsequent allocations require the previous allocation
|
|
to probe at the bottom of the allocated area, regardless of the size
|
|
of that previous allocation
|
|
|
|
The final point means that, unlike for single allocations,
|
|
it can be necessary to have both a non-SVE register probe and
|
|
an SVE register probe. For example:
|
|
|
|
* allocate A, probe using a non-SVE register save
|
|
* allocate B, probe using an SVE register save
|
|
* allocate C
|
|
|
|
The non-SVE register used in this case was again the link register.
|
|
It was previously used even if the link register save slot was some
|
|
bytes above the bottom of the non-SVE register saves, but an earlier
|
|
patch avoided that by putting the link register save slot first.
|
|
|
|
As a belt-and-braces fix, this patch explicitly records which
|
|
probe registers we're using and allows the non-SVE probe to be
|
|
whichever register comes first (as for SVE).
|
|
|
|
The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
|
|
|
|
gcc/
|
|
* config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
|
|
(aarch64_frame::hard_fp_save_and_probe): New fields.
|
|
* config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them.
|
|
Rather than asserting that a non-leaf function saves LR, instead assert
|
|
that a non-leaf function saves something.
|
|
(aarch64_get_separate_components): Prevent the chosen probe
|
|
registers from being individually shrink-wrapped.
|
|
(aarch64_allocate_and_probe_stack_space): Remove workaround for
|
|
probe registers that aren't at the bottom of the previous allocation.
|
|
|
|
gcc/testsuite/
|
|
* gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
|
|
---
|
|
gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++----
|
|
gcc/config/aarch64/aarch64.h | 8 +++
|
|
.../aarch64/sve/pcs/stack_clash_3.c | 6 +-
|
|
3 files changed, 64 insertions(+), 18 deletions(-)
|
|
|
|
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
|
|
index 8abf3d7a1e2b..a8d907df8843 100644
|
|
--- a/gcc/config/aarch64/aarch64.cc
|
|
+++ b/gcc/config/aarch64/aarch64.cc
|
|
@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void)
|
|
&& !crtl->abi->clobbers_full_reg_p (regno))
|
|
frame.reg_offset[regno] = SLOT_REQUIRED;
|
|
|
|
- /* With stack-clash, LR must be saved in non-leaf functions. The saving of
|
|
- LR counts as an implicit probe which allows us to maintain the invariant
|
|
- described in the comment at expand_prologue. */
|
|
- gcc_assert (crtl->is_leaf
|
|
- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
|
|
|
|
poly_int64 offset = crtl->outgoing_args_size;
|
|
gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
|
|
frame.bytes_below_saved_regs = offset;
|
|
+ frame.sve_save_and_probe = INVALID_REGNUM;
|
|
|
|
/* Now assign stack slots for the registers. Start with the predicate
|
|
registers, since predicate LDR and STR have a relatively small
|
|
@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void)
|
|
for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
|
|
if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
|
|
{
|
|
+ if (frame.sve_save_and_probe == INVALID_REGNUM)
|
|
+ frame.sve_save_and_probe = regno;
|
|
frame.reg_offset[regno] = offset;
|
|
offset += BYTES_PER_SVE_PRED;
|
|
}
|
|
@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void)
|
|
for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
|
|
if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
|
|
{
|
|
+ if (frame.sve_save_and_probe == INVALID_REGNUM)
|
|
+ frame.sve_save_and_probe = regno;
|
|
frame.reg_offset[regno] = offset;
|
|
offset += vector_save_size;
|
|
}
|
|
@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void)
|
|
frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
|
|
bool saves_below_hard_fp_p
|
|
= maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
|
|
+ gcc_assert (!saves_below_hard_fp_p
|
|
+ || (frame.sve_save_and_probe != INVALID_REGNUM
|
|
+ && known_eq (frame.reg_offset[frame.sve_save_and_probe],
|
|
+ frame.bytes_below_saved_regs)));
|
|
+
|
|
frame.bytes_below_hard_fp = offset;
|
|
+ frame.hard_fp_save_and_probe = INVALID_REGNUM;
|
|
|
|
auto allocate_gpr_slot = [&](unsigned int regno)
|
|
{
|
|
+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
|
|
+ frame.hard_fp_save_and_probe = regno;
|
|
frame.reg_offset[regno] = offset;
|
|
if (frame.wb_push_candidate1 == INVALID_REGNUM)
|
|
frame.wb_push_candidate1 = regno;
|
|
@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void)
|
|
for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
|
|
if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
|
|
{
|
|
+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
|
|
+ frame.hard_fp_save_and_probe = regno;
|
|
/* If there is an alignment gap between integer and fp callee-saves,
|
|
allocate the last fp register to it if possible. */
|
|
if (regno == last_fp_reg
|
|
@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void)
|
|
offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
|
|
|
|
frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
|
|
+ gcc_assert (known_eq (frame.saved_regs_size,
|
|
+ frame.below_hard_fp_saved_regs_size)
|
|
+ || (frame.hard_fp_save_and_probe != INVALID_REGNUM
|
|
+ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
|
|
+ frame.bytes_below_hard_fp)));
|
|
+
|
|
+ /* With stack-clash, a register must be saved in non-leaf functions.
|
|
+ The saving of the bottommost register counts as an implicit probe,
|
|
+ which allows us to maintain the invariant described in the comment
|
|
+ at expand_prologue. */
|
|
+ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
|
|
|
|
offset += get_frame_size ();
|
|
offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
|
|
@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void)
|
|
frame.final_adjust = frame.bytes_below_saved_regs;
|
|
}
|
|
|
|
+ /* The frame is allocated in pieces, with each non-final piece
|
|
+ including a register save at offset 0 that acts as a probe for
|
|
+ the following piece. In addition, the save of the bottommost register
|
|
+ acts as a probe for callees and allocas. Roll back any probes that
|
|
+ aren't needed.
|
|
+
|
|
+ A probe isn't needed if it is associated with the final allocation
|
|
+ (including callees and allocas) that happens before the epilogue is
|
|
+ executed. */
|
|
+ if (crtl->is_leaf
|
|
+ && !cfun->calls_alloca
|
|
+ && known_eq (frame.final_adjust, 0))
|
|
+ {
|
|
+ if (maybe_ne (frame.sve_callee_adjust, 0))
|
|
+ frame.sve_save_and_probe = INVALID_REGNUM;
|
|
+ else
|
|
+ frame.hard_fp_save_and_probe = INVALID_REGNUM;
|
|
+ }
|
|
+
|
|
/* Make sure the individual adjustments add up to the full frame size. */
|
|
gcc_assert (known_eq (frame.initial_adjust
|
|
+ frame.callee_adjust
|
|
@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void)
|
|
|
|
poly_int64 offset = frame.reg_offset[regno];
|
|
|
|
- /* If the register is saved in the first SVE save slot, we use
|
|
- it as a stack probe for -fstack-clash-protection. */
|
|
- if (flag_stack_clash_protection
|
|
- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
|
|
- && known_eq (offset, frame.bytes_below_saved_regs))
|
|
- continue;
|
|
-
|
|
/* Get the offset relative to the register we'll use. */
|
|
if (frame_pointer_needed)
|
|
offset -= frame.bytes_below_hard_fp;
|
|
@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void)
|
|
|
|
bitmap_clear_bit (components, LR_REGNUM);
|
|
bitmap_clear_bit (components, SP_REGNUM);
|
|
+ if (flag_stack_clash_protection)
|
|
+ {
|
|
+ if (frame.sve_save_and_probe != INVALID_REGNUM)
|
|
+ bitmap_clear_bit (components, frame.sve_save_and_probe);
|
|
+ if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
|
|
+ bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
|
|
+ }
|
|
|
|
return components;
|
|
}
|
|
@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno)
|
|
When probing is needed, we emit a probe at the start of the prologue
|
|
and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
|
|
|
|
- We have to track how much space has been allocated and the only stores
|
|
- to the stack we track as implicit probes are the FP/LR stores.
|
|
+ We can also use register saves as probes. These are stored in
|
|
+ sve_save_and_probe and hard_fp_save_and_probe.
|
|
|
|
For outgoing arguments we probe if the size is larger than 1KB, such that
|
|
the ABI specified buffer is maintained for the next callee.
|
|
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
|
|
index b61358370732..46d4693e2064 100644
|
|
--- a/gcc/config/aarch64/aarch64.h
|
|
+++ b/gcc/config/aarch64/aarch64.h
|
|
@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame
|
|
This is the register they should use. */
|
|
unsigned spare_pred_reg;
|
|
|
|
+ /* An SVE register that is saved below the hard frame pointer and that acts
|
|
+ as a probe for later allocations, or INVALID_REGNUM if none. */
|
|
+ unsigned sve_save_and_probe;
|
|
+
|
|
+ /* A register that is saved at the hard frame pointer and that acts
|
|
+ as a probe for later allocations, or INVALID_REGNUM if none. */
|
|
+ unsigned hard_fp_save_and_probe;
|
|
+
|
|
bool laid_out;
|
|
|
|
/* True if shadow call stack should be enabled for the current function. */
|
|
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
|
|
index 3e01ec36c3a4..3530a0d504ba 100644
|
|
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
|
|
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
|
|
@@ -11,11 +11,10 @@
|
|
** mov x11, sp
|
|
** ...
|
|
** sub sp, sp, x13
|
|
-** str p4, \[sp\]
|
|
** cbz w0, [^\n]*
|
|
+** str p4, \[sp\]
|
|
** ...
|
|
** ptrue p0\.b, all
|
|
-** ldr p4, \[sp\]
|
|
** addvl sp, sp, #1
|
|
** ldr x24, \[sp\], 32
|
|
** ret
|
|
@@ -39,13 +38,12 @@ test_1 (int n)
|
|
** mov x11, sp
|
|
** ...
|
|
** sub sp, sp, x13
|
|
-** str p4, \[sp\]
|
|
** cbz w0, [^\n]*
|
|
+** str p4, \[sp\]
|
|
** str p5, \[sp, #1, mul vl\]
|
|
** str p6, \[sp, #2, mul vl\]
|
|
** ...
|
|
** ptrue p0\.b, all
|
|
-** ldr p4, \[sp\]
|
|
** addvl sp, sp, #1
|
|
** ldr x24, \[sp\], 32
|
|
** ret
|
|
--
|
|
2.43.5
|
|
|