From 3a9ddc2f95926a75a9b436ad4dfd4070f535a113 Mon Sep 17 00:00:00 2001
From: rickyleung <leung.wing.chung@huawei.com>
Date: Tue, 7 May 2024 21:25:52 +0800
Subject: [PATCH 4/7] [backport][AArch64] Stack probing for function prologues

Reference: https://github.com/llvm/llvm-project/commit/cc944f502f1ee20d73ff88c2c86cc909f12caadb

This adds code to AArch64 function prologues to protect against stack
clash attacks by probing (writing to) the stack at regular enough
intervals to ensure that the guard page cannot be skipped over.

The patch depends on and maintains the following invariants:

* Upon function entry the caller guarantees that it has probed the
  stack (e.g. performed a store) at some address [sp, #N], where
  `0 <= N <= 1024`. This invariant comes from a requirement for
  compatibility with GCC.
* Any address range in the allocated stack, no smaller than
  stack-probe-size bytes, contains at least one probe.
* At any time the stack pointer is above or in the guard page.
* Probes are performed in decreasing address order.

The stack-probe-size is a function attribute that can be set by a
platform to correspond to the guard page size.

By default, the stack probe size is 4KiB, which is a safe default as
this is the smallest possible page size for AArch64. Linux uses a 64KiB
guard for AArch64, so this can be overridden by the stack-probe-size
function attribute.
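
To make the knobs concrete, here is a minimal sketch (not part of the
patch; the function is invented, but the attribute spellings match the
tests added below) of how a function opts in to inline probing with a
64KiB guard:

    define void @example(ptr %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" {
    entry:
      ; Larger than one 64KiB guard page, so the prologue must probe.
      %v = alloca i8, i64 131072, align 1
      store ptr %v, ptr %out, align 8
      ret void
    }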

For small frames without a frame pointer (<= 240 bytes), no probes are
needed.

For larger frame sizes, LLVM always stores x29 to the stack. This serves
as an implicit stack probe. Thus, while allocating stack objects the
compiler assumes that the stack has been probed at [sp].
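
For example (a sketch, not from the patch; the function names and exact
sizes are invented around the 240-byte boundary described above):

    ; Fits in the unprobed allowance; no probe is emitted.
    define void @small() "probe-stack"="inline-asm" {
      %buf = alloca [240 x i8], align 8
      ret void
    }

    ; Needs probing; the store of x29 in the prologue doubles as the
    ; probe for the first part of the allocation.
    define void @larger() "probe-stack"="inline-asm" {
      %buf = alloca [4096 x i8], align 8
      ret void
    }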

There are multiple probing sequences that can be emitted, depending on
the size of the stack allocation:

* A straight-line sequence of subtracts and stores, used when the
  allocation size is smaller than 5 guard pages.
* A loop allocating and probing one page size per iteration, plus at
  most a single probe to deal with the remainder, used when the
  allocation size is larger but still known at compile time.
* A loop which moves the SP down to the target value held in a
  register (or a loop, moving a scratch register to the target value
  held in SP), used when the allocation size is not known at compile
  time, such as when allocating space for SVE values, or when
  over-aligning the stack (see the sketch after this list). This is
  emitted in AArch64InstrInfo because it will also be used for dynamic
  allocas in a future patch.
* A single probe, where the amount of stack adjustment is unknown but
  is known to be less than or equal to a page size.
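
As an illustration of the third case (a sketch modelled on the
static_16_align_131072 test added below; only the attributes and sizes
shown are taken from this patch):

    define void @overaligned(ptr %out) "probe-stack"="inline-asm" {
    entry:
      ; Realigning to 128KiB requires an SP adjustment that is larger
      ; than the probe size and unknown at compile time, so the
      ; prologue emits the probing loop from probedStackAlloc.
      %v = alloca i8, i64 16, align 131072
      store ptr %v, ptr %out, align 8
      ret void
    }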

---------

Co-authored-by: Oliver Stannard <oliver.stannard@linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 335 +++++++-
 .../lib/Target/AArch64/AArch64FrameLowering.h |  17 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   6 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  10 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  90 +++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   6 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  24 +-
 .../AArch64/AArch64MachineFunctionInfo.cpp    |  43 +-
 .../AArch64/AArch64MachineFunctionInfo.h      |   6 +
 .../test/CodeGen/AArch64/stack-probing-64k.ll | 392 ++++++++++
 .../AArch64/stack-probing-last-in-block.mir   | 146 ++++
 .../test/CodeGen/AArch64/stack-probing-sve.ll | 724 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/stack-probing.ll    | 539 +++++++++++++
 13 files changed, 2300 insertions(+), 38 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-64k.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-sve.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index eeb6185fa36d..af019ab23770 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -672,10 +672,18 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores(
   emitCalleeSavedRestores(MBB, MBBI, true);
 }
 
+// Return the maximum possible number of bytes for `Size` due to the
+// architectural limit on the size of an SVE register.
+static int64_t upperBound(StackOffset Size) {
+  static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
+  return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
+}
+
 void AArch64FrameLowering::allocateStackSpace(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI,
-    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const {
+    int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
+    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
+    bool FollowupAllocs) const {
 
   if (!AllocSize)
     return;
@@ -687,27 +695,129 @@
   AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  Register TargetReg =
-      NeedsRealignment ? findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP;
-  // SUB Xd/SP, SP, AllocSize
+  const int64_t MaxAlign = MFI.getMaxAlign().value();
+  const uint64_t AndMask = ~(MaxAlign - 1);
+
+  if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
+    Register TargetReg = RealignmentPadding
+                             ? findScratchNonCalleeSaveRegister(&MBB)
+                             : AArch64::SP;
+    // SUB Xd/SP, SP, AllocSize
+    emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
+                    EmitCFI, InitialOffset);
+
+    if (RealignmentPadding) {
+      // AND SP, X9, 0b11111...0000
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+          .addReg(TargetReg, RegState::Kill)
+          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+          .setMIFlags(MachineInstr::FrameSetup);
+      AFI.setStackRealigned(true);
+
+      // No need for SEH instructions here; if we're realigning the stack,
+      // we've set a frame pointer and already finished the SEH prologue.
+      assert(!NeedsWinCFI);
+    }
+    return;
+  }
+
+  //
+  // Stack probing allocation.
+  //
+
+  // Fixed length allocation. If we don't need to re-align the stack and don't
+  // have SVE objects, we can use a more efficient sequence for stack probing.
+  if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
+    Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+    assert(ScratchReg != AArch64::NoRegister);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC))
+        .addDef(ScratchReg)
+        .addImm(AllocSize.getFixed())
+        .addImm(InitialOffset.getFixed())
+        .addImm(InitialOffset.getScalable());
+    // The fixed allocation may leave unprobed bytes at the top of the
+    // stack. If we have a subsequent allocation (e.g. if we have
+    // variable-sized objects), we need to issue an extra probe, so these
+    // allocations start in a known state.
+    if (FollowupAllocs) {
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+
+    return;
+  }
+
+  // Variable length allocation.
+
+  // If the (unknown) allocation size cannot exceed the probe size, decrement
+  // the stack pointer right away.
+  int64_t ProbeSize = AFI.getStackProbeSize();
+  if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) {
+    Register ScratchReg = RealignmentPadding
+                              ? findScratchNonCalleeSaveRegister(&MBB)
+                              : AArch64::SP;
+    assert(ScratchReg != AArch64::NoRegister);
+    // SUB Xd, SP, AllocSize
+    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
+                    EmitCFI, InitialOffset);
+    if (RealignmentPadding) {
+      // AND SP, Xn, 0b11111...0000
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+          .setMIFlags(MachineInstr::FrameSetup);
+      AFI.setStackRealigned(true);
+    }
+    if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
+                              AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  // Emit a variable-length allocation probing loop.
+  // TODO: As an optimisation, the loop can be "unrolled" into a few parts,
+  // each of them guaranteed to adjust the stack by less than the probe size.
+  Register TargetReg = findScratchNonCalleeSaveRegister(&MBB);
+  assert(TargetReg != AArch64::NoRegister);
+  // SUB Xd, SP, AllocSize
   emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
                   MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
                   EmitCFI, InitialOffset);
 
-  if (NeedsRealignment) {
-    const int64_t MaxAlign = MFI.getMaxAlign().value();
-    const uint64_t AndMask = ~(MaxAlign - 1);
-    // AND SP, Xd, 0b11111...0000
-    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+  if (RealignmentPadding) {
+    // AND Xn, Xn, 0b11111...0000
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg)
         .addReg(TargetReg, RegState::Kill)
         .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
         .setMIFlags(MachineInstr::FrameSetup);
-    AFI.setStackRealigned(true);
+  }
 
-    // No need for SEH instructions here; if we're realigning the stack,
-    // we've set a frame pointer and already finished the SEH prologue.
-    assert(!NeedsWinCFI);
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
+      .addReg(TargetReg);
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg =
+        Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
+  if (RealignmentPadding)
+    AFI.setStackRealigned(true);
 }
 
 static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
@@ -893,9 +1003,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
 
-  // Don't need a scratch register if we're not going to re-align the stack.
-  if (!RegInfo->hasStackRealignment(*MF))
+  // Don't need a scratch register if we're not going to re-align the stack or
+  // emit stack probes.
+  if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
     return true;
   // Otherwise, we can use any block as long as it has a scratch register
   // available.
@@ -905,15 +1017,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
 static bool windowsRequiresStackProbe(MachineFunction &MF,
                                       uint64_t StackSizeInBytes) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  if (!Subtarget.isTargetWindows())
-    return false;
-  const Function &F = MF.getFunction();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
   // TODO: When implementing stack protectors, take that into account
   // for the probe threshold.
-  unsigned StackProbeSize =
-      F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
-  return (StackSizeInBytes >= StackProbeSize) &&
-         !F.hasFnAttribute("no-stack-arg-probe");
+  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
+         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
 }
 
 static bool needsWinCFI(const MachineFunction &MF) {
@@ -1678,7 +1786,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Alignment is required for the parent frame, not the funclet
   const bool NeedsRealignment =
       NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
-  int64_t RealignmentPadding =
+  const int64_t RealignmentPadding =
       (NeedsRealignment && MFI.getMaxAlign() > Align(16))
           ? MFI.getMaxAlign().value() - 16
           : 0;
@@ -1814,6 +1922,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // Process the SVE callee-saves to determine what space needs to be
     // allocated.
     if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+      LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+                        << "\n");
       // Find callee save instructions in frame.
       CalleeSavesBegin = MBBI;
       assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
@@ -1828,8 +1938,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // Allocate space for the callee saves (if any).
     StackOffset CFAOffset =
         StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
-    allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false,
-                       nullptr, EmitAsyncCFI && !HasFP, CFAOffset);
+    StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
+    allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
+                       nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
+                       MFI.hasVarSizedObjects() || LocalsSize);
     CFAOffset += SVECalleeSavesSize;
 
     if (EmitAsyncCFI)
@@ -1843,10 +1955,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
     // the correct value here, as NumBytes also includes padding bytes,
     // which shouldn't be counted here.
-    allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment,
+    allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
                        SVELocalsSize + StackOffset::getFixed(NumBytes),
                        NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
-                       CFAOffset);
+                       CFAOffset, MFI.hasVarSizedObjects());
   }
 
   // If we need a base pointer, set it up here. It's whatever the value of the
@@ -4028,3 +4140,170 @@ void AArch64FrameLowering::orderFrameObjects(
     dbgs() << "\n";
   });
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every ProbeSize bytes. Returns an iterator of the first instruction
+/// after the loop. The difference between SP and TargetReg must be an exact
+/// multiple of ProbeSize.
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+    MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
+    Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
+  // in SUB).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+                  StackOffset::getFixed(-ProbeSize), TII,
+                  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(LoopMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+void AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
+    StackOffset CFAOffset) const {
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+  bool HasFP = hasFP(MF);
+
+  DebugLoc DL;
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
+  int64_t NumBlocks = FrameSize / ProbeSize;
+  int64_t ResidualSize = FrameSize % ProbeSize;
+
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
+                    << NumBlocks << " blocks of " << ProbeSize
+                    << " bytes, plus " << ResidualSize << " bytes\n");
+
+  // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or
+  // ordinary loop.
+  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
+      // encodable in a SUB).
+      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(-ProbeSize), TII,
+                      MachineInstr::FrameSetup, false, false, nullptr,
+                      EmitAsyncCFI && !HasFP, CFAOffset);
+      CFAOffset += StackOffset::getFixed(ProbeSize);
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  } else if (NumBlocks != 0) {
+    // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
+    // encodable in ADD). ScratchReg may temporarily become the CFA register.
+    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
+                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
+    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
+    MBB = MBBI->getParent();
+    if (EmitAsyncCFI && !HasFP) {
+      // Set the CFA register back to SP.
+      const AArch64RegisterInfo &RegInfo =
+          *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (ResidualSize != 0) {
+    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not
+    // encodable in SUB).
+    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(-ResidualSize), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+}
+
+void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+  // Get the instructions that need to be replaced. We emit at most two of
+  // these. Remember them in order to avoid complications coming from the need
+  // to traverse the block while potentially creating more blocks.
+  SmallVector<MachineInstr *, 4> ToReplace;
+  for (MachineInstr &MI : MBB)
+    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
+        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
+      ToReplace.push_back(&MI);
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      Register ScratchReg = MI->getOperand(0).getReg();
+      int64_t FrameSize = MI->getOperand(1).getImm();
+      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
+                                               MI->getOperand(3).getImm());
+      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
+                            CFAOffset);
+    } else {
+      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
+             "Stack probe pseudo-instruction expected");
+      const AArch64InstrInfo *TII =
+          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
+      Register TargetReg = MI->getOperand(0).getReg();
+      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
+    }
+    MI->eraseFromParent();
+  }
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index f3313f3b53ff..941af03a78b7 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -152,13 +152,26 @@ private:
                                       MachineBasicBlock::iterator MBBI) const;
   void allocateStackSpace(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
-                          bool NeedsRealignment, StackOffset AllocSize,
+                          int64_t RealignmentPadding, StackOffset AllocSize,
                           bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
-                          StackOffset InitialOffset) const;
+                          StackOffset InitialOffset, bool FollowupAllocs) const;
 
   /// Emit target zero call-used regs.
   void emitZeroCallUsedRegs(BitVector RegsToZero,
                             MachineBasicBlock &MBB) const override;
+
+  /// Replace a StackProbe stub (if any) with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+
+  void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI,
+                             Register ScratchReg, int64_t FrameSize,
+                             StackOffset CFAOffset) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
+                                    int64_t ProbeSize,
+                                    Register TargetReg) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6e721b937846..082043420fb9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26051,3 +26051,9 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
   }
   return true;
 }
+
+bool AArch64TargetLowering::hasInlineStackProbe(
+    const MachineFunction &MF) const {
+  return !Subtarget->isTargetWindows() &&
+         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index aca45f113e73..643d363e234a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -508,6 +508,13 @@ const unsigned RoundingBitsPos = 22;
 const ArrayRef<MCPhysReg> getGPRArgRegs();
 const ArrayRef<MCPhysReg> getFPRArgRegs();
 
+/// Maximum allowed number of unprobed bytes above SP at an ABI
+/// boundary.
+const unsigned StackProbeMaxUnprobedStack = 1024;
+
+/// Maximum number of iterations to unroll for a constant size probing loop.
+const unsigned StackProbeMaxLoopUnroll = 4;
+
 } // namespace AArch64
 
 class AArch64Subtarget;
@@ -942,6 +949,9 @@ public:
   // used for 64bit and 128bit vectors as well.
   bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
 
+  /// True if stack clash protection is enabled for this function.
+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 0691e07a639b..b3b42a97e8c9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64ExpandImm.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -18,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -8428,6 +8430,94 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
   return AArch64::BLR;
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                   Register TargetReg, bool FrameSetup) const {
+  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
+
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+  MachineInstr::MIFlag Flags =
+      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
+                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
+
+  //   CMP SP, TargetReg
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(Flags);
+
+  //   B.<Cond> LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::LE)
+      .addMBB(ExitMBB)
+      .setMIFlags(Flags);
+
+  //   STR XZR, [SP]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+      .addMBB(LoopTestMBB)
+      .setMIFlags(Flags);
+
+  // LoopExit:
+  //   MOV SP, TargetReg
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlags(Flags);
+
+  //   STR XZR, [SP]
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 20210a96d67a..7e84b86fc52c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -340,6 +340,12 @@ public:
   static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
                                                   int64_t &ByteSized,
                                                   int64_t &VGSized);
+  // Decrement the SP, issuing probes along the way. `TargetReg` is the new top
+  // of the stack. `FrameSetup` is true if the allocation is part of
+  // constructing the activation frame of a function.
+  MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                               Register TargetReg,
+                                               bool FrameSetup) const;
 #define GET_INSTRINFO_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9e72d37880c5..09980c2f45e6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -880,7 +880,8 @@ include "SMEInstrFormats.td"
 // Miscellaneous instructions.
 //===----------------------------------------------------------------------===//
 
-let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+let Defs = [SP], Uses = [SP] in {
 // We set Sched to empty list because we expect these instructions to simply get
 // removed in most cases.
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -889,7 +890,26 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
 def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
                             Sched<[]>;
-} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+}
+
+let Defs = [SP, NZCV], Uses = [SP] in {
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
+                               (ins i64imm:$stacksize, i64imm:$fixed_offset,
+                                i64imm:$scalable_offset),
+                               []>,
+                               Sched<[]>;
+
+// Probed stack allocation of a variable size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC_VAR : Pseudo<(outs),
+                                   (ins GPR64sp:$target),
+                                   []>,
+                                   Sched<[]>;
+
+} // Defs = [SP, NZCV], Uses = [SP]
+} // hasSideEffects = 1, isCodeGenOnly = 1
 
 let isReMaterializable = 1, isCodeGenOnly = 1 in {
 // FIXME: The following pseudo instructions are only needed because remat
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 961a19317d66..0bef3c2d2483 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
             F.getParent()->getModuleFlag("branch-target-enforcement")))
       BranchTargetEnforcement = BTE->getZExtValue();
-    return;
+  } else {
+    const StringRef BTIEnable =
+        F.getFnAttribute("branch-target-enforcement").getValueAsString();
+    assert(BTIEnable.equals_insensitive("true") ||
+           BTIEnable.equals_insensitive("false"));
+    BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
   }
 
-  const StringRef BTIEnable =
-      F.getFnAttribute("branch-target-enforcement").getValueAsString();
-  assert(BTIEnable.equals_insensitive("true") ||
-         BTIEnable.equals_insensitive("false"));
-  BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  uint64_t ProbeSize = 4096;
+  if (F.hasFnAttribute("stack-probe-size"))
+    ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size");
+  else if (const auto *PS = mdconst::extract_or_null<ConstantInt>(
+               F.getParent()->getModuleFlag("stack-probe-size")))
+    ProbeSize = PS->getZExtValue();
+  assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
+
+  if (STI->isTargetWindows()) {
+    if (!F.hasFnAttribute("no-stack-arg-probe"))
+      StackProbeSize = ProbeSize;
+  } else {
+    // Round down to the stack alignment.
+    uint64_t StackAlign =
+        STI->getFrameLowering()->getTransientStackAlign().value();
+    ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
+    StringRef ProbeKind;
+    if (F.hasFnAttribute("probe-stack"))
+      ProbeKind = F.getFnAttribute("probe-stack").getValueAsString();
+    else if (const auto *PS = dyn_cast_or_null<MDString>(
+                 F.getParent()->getModuleFlag("probe-stack")))
+      ProbeKind = PS->getString();
+    if (ProbeKind.size()) {
+      if (ProbeKind != "inline-asm")
+        report_fatal_error("Unsupported stack probing method");
+      StackProbeSize = ProbeSize;
+    }
+  }
 }
 
 MachineFunctionInfo *AArch64FunctionInfo::clone(
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d82fb436925e..d50011594eb1 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -192,6 +192,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// True if the function need asynchronous unwind information.
   mutable std::optional<bool> NeedsAsyncDwarfUnwindInfo;
 
+  int64_t StackProbeSize = 0;
+
 public:
   AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
 
@@ -447,6 +449,10 @@ public:
   bool needsDwarfUnwindInfo(const MachineFunction &MF) const;
   bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const;
 
+  bool hasStackProbing() const { return StackProbeSize != 0; }
+
+  int64_t getStackProbeSize() const { return StackProbeSize; }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
new file mode 100644
index 000000000000..0a3198fc520e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing, when using a 64KiB stack guard.
+
+; 64k bytes is the largest frame we can probe in one go.
+define void @static_65536(ptr %out) #0 {
+; CHECK-LABEL: static_65536:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 65536, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+16 bytes, still needs just one probe.
+define void @static_65552(ptr %out) #0 {
+; CHECK-LABEL: static_65552:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp], #-16
+; CHECK-NEXT: .cfi_def_cfa_offset 65568
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 65552, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024 bytes, the largest frame which needs just one probe.
+define void @static_66560(ptr %out) #0 {
+; CHECK-LABEL: static_66560:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 66576
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 66560, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024+16 bytes, the smallest frame which needs two probes.
+define void @static_66576(ptr %out) #0 {
+; CHECK-LABEL: static_66576:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 66592
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 66576, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 2*64k+1024, the largest frame needing two probes.
+define void @static_132096(ptr %out) #0 {
+; CHECK-LABEL: static_132096:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 131088
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 132112
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 132096, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k-16, the largest frame probed without a loop.
+define void @static_327664(ptr %out) #0 {
+; CHECK-LABEL: static_327664:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 131088
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 196624
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 262160
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440
+; CHECK-NEXT: .cfi_def_cfa_offset 323600
+; CHECK-NEXT: sub sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 327680
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584
+; CHECK-NEXT: .cfi_def_cfa_offset 4096
+; CHECK-NEXT: add sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 327664, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k, smallest frame probed with a loop.
+define void @static_327680(ptr %out) #0 {
+; CHECK-LABEL: static_327680:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327696
+; CHECK-NEXT: .LBB6_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB6_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 327680, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB
+; so has a remainder, but no extra probe.
+define void @static_328704(ptr %out) #0 {
+; CHECK-LABEL: static_328704:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327696
+; CHECK-NEXT: .LBB7_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB7_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 328720
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 328704, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1040, large enough to use a loop, has a remainder and
+; an extra probe.
+define void @static_328720(ptr %out) #0 {
+; CHECK-LABEL: static_328720:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327696
+; CHECK-NEXT: .LBB8_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB8_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 328736
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 328720, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so must probe the whole of that larger SP move.
+define void @static_16_align_131072(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_131072:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000
+; CHECK-NEXT: .LBB9_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB9_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB9_1
+; CHECK-NEXT: .LBB9_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 16, align 131072
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement which
+; is nevertheless small enough as to not need a loop.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: and sp, x9, #0xffffffffffffe000
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 16, align 8192
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A large allocation with a very large alignment requirement which
+; is nevertheless small enough as to not need a loop.
+define void @static_32752_align_32k(ptr %out) #0 {
+; CHECK-LABEL: static_32752_align_32k:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: and sp, x9, #0xffffffffffff8000
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 32752, align 32768
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
new file mode 100644
index 000000000000..a8a21ab330ba
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s
+# Regression test for a crash when the probing instruction
+# to replace is last in the block.
+--- |
+  source_filename = "tt.ll"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-linux"
+
+  declare i1 @g(ptr)
+
+  define void @f(ptr %out) #0 {
+  entry:
+    %p = alloca i32, i32 50000, align 4
+    br label %loop
+
+  loop: ; preds = %loop, %entry
+    %c = call i1 @g(ptr %p)
+    br i1 %c, label %loop, label %exit
+
+  exit: ; preds = %loop
+    ret void
+  }
+
+  attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" }
+
+...
+---
+name: f
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: true
+registers: []
+liveins: []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: true
+  hasCalls: true
+  stackProtector: ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  hasTailCall: false
+  localFrameSize: 200000
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -200000, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: f
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT: successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: liveins: $lr, $fp
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1)
+  ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+  ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+  ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 48, 12
+  ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w9, 196624
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3.entry:
+  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: liveins: $x9
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12
+  ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0
+  ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv
+  ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.4.entry:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $wsp
+  ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 3392, 0
+  ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 200016
+  ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.loop:
+  ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0
+  ; CHECK-NEXT: BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT: TBNZW killed renamable $w0, 0, %bb.1
+  ; CHECK-NEXT: B %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2.exit:
+  ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 48, 12
+  ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 3408
+  ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3392, 0
+  ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+  ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT: RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+
+
+  bb.1.loop:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.p, 0, 0
+    BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    TBNZW killed renamable $w0, 0, %bb.1
+    B %bb.2
+
+  bb.2.exit:
+    RET_ReallyLR
+
+...
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
|
|
new file mode 100644
|
|
index 000000000000..e765d071e722
|
|
--- /dev/null
|
|
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
|
|
@@ -0,0 +1,724 @@
|
|
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
|
|
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
|
|
+
|
|
+; Test prolog sequences for stack probing when SVE objects are involved.
|
|
+
|
|
+; The space for SVE objects needs probing in the general case, because
|
|
+; the stack adjustment may happen to be too big (i.e. greater than the
|
|
+; probe size) to allocate with a single `addvl`.
|
|
+; When we do know that the stack adjustment cannot exceed the probe size
|
|
+; we can avoid emitting a probe loop and emit a simple `addvl; str`
|
|
+; sequence instead.
|
|
+
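+; For illustration only (not part of the original test file): when the
+; adjustment is known to fit within one probe size, the prologue can use a
+; straight-line sequence such as
+;   addvl sp, sp, #-N
+;   str xzr, [sp]
+; (the store only being emitted when a probe is needed at all). Otherwise it
+; falls back to a loop that moves SP down one probe-size page at a time,
+; storing to each page, until SP reaches a target kept in a scratch register
+; (x9 in the tests below).
+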
+define void @sve_1_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 4 SVE vectors of stack space.
+define void @sve_4_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_4_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 16 SVE vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096 bytes
+; (an SVE vector is at most 256 bytes), so we can allocate the locals at once.
+define void @sve_16_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_16_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ %vec6 = alloca <vscale x 4 x float>, align 16
+ %vec7 = alloca <vscale x 4 x float>, align 16
+ %vec8 = alloca <vscale x 4 x float>, align 16
+ %vec9 = alloca <vscale x 4 x float>, align 16
+ %vec10 = alloca <vscale x 4 x float>, align 16
+ %vec11 = alloca <vscale x 4 x float>, align 16
+ %vec12 = alloca <vscale x 4 x float>, align 16
+ %vec13 = alloca <vscale x 4 x float>, align 16
+ %vec14 = alloca <vscale x 4 x float>, align 16
+ %vec15 = alloca <vscale x 4 x float>, align 16
+ %vec16 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 17 SVE vectors of stack space. Now we need
+; a probing loop, since the stack adjustment may be greater than
+; the probe size (17 x 256 = 4352 bytes).
+; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
+; emitting `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]` (see the
+; sketch after this function).
+define void @sve_17_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_17_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x9, sp, #-17
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT: .LBB3_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB3_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB3_1
+; CHECK-NEXT: .LBB3_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ %vec6 = alloca <vscale x 4 x float>, align 16
+ %vec7 = alloca <vscale x 4 x float>, align 16
+ %vec8 = alloca <vscale x 4 x float>, align 16
+ %vec9 = alloca <vscale x 4 x float>, align 16
+ %vec10 = alloca <vscale x 4 x float>, align 16
+ %vec11 = alloca <vscale x 4 x float>, align 16
+ %vec12 = alloca <vscale x 4 x float>, align 16
+ %vec13 = alloca <vscale x 4 x float>, align 16
+ %vec14 = alloca <vscale x 4 x float>, align 16
+ %vec15 = alloca <vscale x 4 x float>, align 16
+ %vec16 = alloca <vscale x 4 x float>, align 16
+ %vec17 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
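+
+; For illustration only (not part of the original test file): for
+; 17 = 1*16 + 1 vectors, the unrolled alternative mentioned in the TODO above
+; would look roughly like
+;   addvl sp, sp, #-16
+;   str xzr, [sp]
+;   addvl sp, sp, #-1
+;   str xzr, [sp]
+; i.e. one probed allocation per chunk of at most 16 vector lengths, since
+; 16 x 256 = 4096 bytes never exceeds the default probe size.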
+
+; Space for callee-saved SVE registers is allocated similarly to allocating
+; space for SVE locals. When we know the stack adjustment cannot exceed the
+; probe size we can skip the explicit probe, since saving SVE registers serves
+; as an implicit probe.
+define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1v_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{z8}" ()
+ ret void
+}
+
+define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4v_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
+ ret void
+}
+
+define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+ ret void
+}
+
+define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1p_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{p8}" ()
+ ret void
+}
+
+define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4p_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
+ ret void
+}
+
+define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_1p_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x9, sp, #-17
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT: .LBB9_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB9_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB9_1
+; CHECK-NEXT: .LBB9_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+ ret void
+}
+
+; An SVE vector and a 16-byte fixed-size object.
+define void @sve_1_vector_16_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ %arr = alloca i8, i64 16, align 1
+ ret void
+}
+
+; A large SVE stack object and a large stack slot, both of which need probing.
+; TODO: This could be optimised by combining the fixed-size offset into the
+; loop.
+define void @sve_1_vector_4096_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_4096_arr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288
+; CHECK-NEXT: .cfi_def_cfa w9, 12304
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG
+; CHECK-NEXT: .LBB11_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB11_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB11_1
+; CHECK-NEXT: .LBB11_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: .cfi_def_cfa wsp, 12304
+; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 256 x float>, align 16
+ %arr = alloca i8, i64 12288, align 1
+ ret void
+}
+
+; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
+; supported even without stack-probing.
+
+; An SVE vector, and a 16-byte fixed size object, which
+; has a large alignment requirement.
+define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: addvl x9, x9, #-1
+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT: .LBB12_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB12_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB12_1
+; CHECK-NEXT: .LBB12_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ %arr = alloca i8, i64 16, align 8192
+ ret void
+}
+
+; With 64k guard pages, we can allocate bigger SVE space without a probing loop.
+define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1024_64k_guard:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: addvl sp, sp, #8
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 1024 x float>, align 16
+ ret void
+}
+
+define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1028_64k_guard:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x9, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
+; CHECK-NEXT: addvl x9, x9, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
+; CHECK-NEXT: .LBB14_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB14_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB14_1
+; CHECK-NEXT: .LBB14_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 1024 x float>, align 16
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; With 5 SVE vectors of stack space, the unprobed area
+; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
+; hence we need to issue a probe.
+define void @sve_5_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_5_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; Test with 14 scalable bytes (so up to 14 * 16 = 224 bytes) of unprobed
+; area below the save location of `p9`.
+define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
+; CHECK-LABEL: sve_unprobed_area:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()
+
+ %v0 = alloca <vscale x 4 x float>, align 16
+ %v1 = alloca <vscale x 4 x float>, align 16
+ %v2 = alloca <vscale x 4 x float>, align 16
+ %v3 = alloca <vscale x 4 x float>, align 16
+
+ ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
new file mode 100644
index 000000000000..95001450622f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
+
+; The stack probing parameters in function attributes take precedence over
+; the ones in the module flags.
+
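+; For reference (not part of the original test file; the attribute and
+; metadata definitions live at the end of the file, outside this excerpt):
+; the functions marked #0 below request probing through the
+; "probe-stack"="inline-asm" function attribute, while @static_9232 carries
+; no such attribute and instead relies on a "probe-stack" module flag,
+; assumed here to take the usual form
+;   !llvm.module.flags = !{!0}
+;   !0 = !{i32 4, !"probe-stack", !"inline-asm"}
+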
+; Small stack frame, no probing required.
+define void @static_64(ptr %out) #0 {
+; CHECK-LABEL: static_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 64, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; At 256 bytes we start to always create a frame pointer. No frame smaller than
+; this needs a probe, so we can use the saving of at least one CSR as a probe
+; at the top of our frame.
+define void @static_256(ptr %out) #0 {
+; CHECK-LABEL: static_256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #272
+; CHECK-NEXT: .cfi_def_cfa_offset 272
+; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #272
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 256, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; At 1024 bytes, this is the largest frame which doesn't need probing.
+define void @static_1024(ptr %out) #0 {
+; CHECK-LABEL: static_1024:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 1024, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; At 1024+16 bytes, this is the smallest frame which needs probing.
+define void @static_1040(ptr %out) #0 {
+; CHECK-LABEL: static_1040:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 1040, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 4k bytes is the largest frame we can probe in one go.
+define void @static_4096(ptr %out) #0 {
+; CHECK-LABEL: static_4096:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 4096, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 4k+16 bytes, still needs just one probe.
+define void @static_4112(ptr %out) #0 {
+; CHECK-LABEL: static_4112:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp], #-16
+; CHECK-NEXT: .cfi_def_cfa_offset 4128
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 4112, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
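+
+; Note (not part of the original test file): the post-indexed
+; `str xzr, [sp], #-16` above folds the probing store together with the
+; remaining 16-byte allocation, so no separate `sub sp, sp, #16` is needed.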
+
+; 4k+1024 bytes, the largest frame which needs just one probe.
+define void @static_5120(ptr %out) #0 {
+; CHECK-LABEL: static_5120:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 5136
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 5120, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 4k+1024+16, the smallest frame which needs two probes.
+define void @static_5136(ptr %out) #0 {
+; CHECK-LABEL: static_5136:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 5152
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 5136, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 2*4k+1024, the largest frame needing two probes
+define void @static_9216(ptr %out) #0 {
+; CHECK-LABEL: static_9216:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 8208
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 9232
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 9216, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 5*4k-16, the largest frame probed without a loop
+define void @static_20464(ptr %out) #0 {
+; CHECK-LABEL: static_20464:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 8208
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 12304
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 16400
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 20480
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384
+; CHECK-NEXT: .cfi_def_cfa_offset 4096
+; CHECK-NEXT: add sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 20464, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 5*4k, the smallest frame probed with a loop
+define void @static_20480(ptr %out) #0 {
+; CHECK-LABEL: static_20480:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa w9, 20496
+; CHECK-NEXT: .LBB10_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB10_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 20480, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
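+
+; For comparison (not part of the original test file): the constant-size loop
+; above probes as it goes (sub; str; cmp; b.ne) because SP is known to land
+; exactly on the target, whereas the loops used for the SVE and realignment
+; cases compare first and branch with b.le, since SP may overshoot the
+; target, and probe the remainder after the loop.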
+
+; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB
+; so it has a remainder, but no extra probe.
+define void @static_21504(ptr %out) #0 {
+; CHECK-LABEL: static_21504:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa w9, 20496
+; CHECK-NEXT: .LBB11_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB11_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 21520
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 21504, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; 5*4k+1040, large enough to use a loop, has a remainder and
+; an extra probe.
+define void @static_21520(ptr %out) #0 {
+; CHECK-LABEL: static_21520:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa w9, 20496
+; CHECK-NEXT: .LBB12_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB12_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 21536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 21520, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We handle
+; this by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so we must probe the whole of that larger SP move.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT: .LBB13_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB13_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB13_1
+; CHECK-NEXT: .LBB13_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 16, align 8192
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; A small allocation with a very large alignment requirement, but
+; nevertheless small enough not to need a loop.
+define void @static_16_align_2048(ptr %out) #0 {
|
|
+; CHECK-LABEL: static_16_align_2048:
|
|
+; CHECK: // %bb.0: // %entry
|
|
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
|
|
+; CHECK-NEXT: .cfi_def_cfa_offset 16
|
|
+; CHECK-NEXT: mov x29, sp
|
|
+; CHECK-NEXT: .cfi_def_cfa w29, 16
|
|
+; CHECK-NEXT: .cfi_offset w30, -8
|
|
+; CHECK-NEXT: .cfi_offset w29, -16
|
|
+; CHECK-NEXT: sub x9, sp, #2032
|
|
+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800
|
|
+; CHECK-NEXT: str xzr, [sp]
|
|
+; CHECK-NEXT: mov x8, sp
|
|
+; CHECK-NEXT: str x8, [x0]
|
|
+; CHECK-NEXT: mov sp, x29
|
|
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
|
|
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
|
|
+; CHECK-NEXT: .cfi_def_cfa_offset 0
|
|
+; CHECK-NEXT: .cfi_restore w30
|
|
+; CHECK-NEXT: .cfi_restore w29
|
|
+; CHECK-NEXT: ret
|
|
+entry:
|
|
+ %v = alloca i8, i64 16, align 2048
|
|
+ store ptr %v, ptr %out, align 8
|
|
+ ret void
|
|
+}
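+; Note: the SP move above is at most 2032 bytes plus alignment padding,
+; comfortably below the 4096-byte probe interval, so the single probe
+; after the "and" is enough and no loop is emitted.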
+
+; A large(-ish) allocation with a very large alignment requirement, but
+; nevertheless small enough not to need a loop.
+define void @static_2032_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_2032_align_2048:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #2032
+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 2032, align 2048
+ store ptr %v, ptr %out, align 8
+ ret void
+}
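+; Note: as above, 2032 bytes plus at most 2032 bytes of padding gives a
+; worst case of 4064 bytes, still within a single 4096-byte probe
+; interval.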
+
+; Test that stack probing is enabled by module flags.
+define void @static_9232(ptr %out) uwtable(async) {
+; CHECK-LABEL: static_9232:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT: .cfi_def_cfa_offset 8208
+; CHECK-NEXT: sub sp, sp, #800
+; CHECK-NEXT: .cfi_def_cfa_offset 9008
+; CHECK-NEXT: str xzr, [sp], #-240
+; CHECK-NEXT: .cfi_def_cfa_offset 9248
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 9232, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
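+; Note: this function carries no "probe-stack" attribute; probing is
+; enabled by the module flags at the end of the file, with a probe size
+; of 9000. The 8192 + 800 = 8992 bytes allocated before the first probe
+; fit under that limit, and the post-indexed store both probes and
+; allocates the final 240 bytes.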
+
+; Test for a tight upper bound on the amount of stack adjustment
+; due to stack realignment. No probes should appear.
+define void @static_1008(ptr %out) #0 {
+; CHECK-LABEL: static_1008:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1008
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i32 1008, align 32
+ store ptr %v, ptr %out, align 8
+ ret void
+}
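+; Note: SP is 16-byte aligned on entry, so realigning to 32 bytes adds
+; at most 16 bytes to the 1008-byte allocation, for a worst case of
+; 1024 bytes, which is covered by the probe the caller is required to
+; have performed within 1024 bytes of the incoming SP.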
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 4, !"probe-stack", !"inline-asm"}
+!1 = !{i32 8, !"stack-probe-size", i32 9000}
\ No newline at end of file
--
2.42.0.windows.2