From e433199a7dbe87324a671299f6509f19d295382f Mon Sep 17 00:00:00 2001
From: rickyleung <leung.wing.chung@huawei.com>
Date: Fri, 26 Apr 2024 16:59:48 +0800
Subject: [PATCH 5/7] [backport][AArch64] Stack probing for dynamic allocas in
 SelectionDAG

Reference: https://github.com/llvm/llvm-project/commit/b1806e6a1f0589acc88499419531c4eb82488f1a

Add support for probing for dynamic allocas (variable-size objects and
outgoing stack arguments).

Co-authored-by: Oliver Stannard <oliver.stannard@linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  28 ++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 152 +++++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  13 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +
 .../stack-probing-dynamic-no-frame-setup.ll   |  14 +
 .../CodeGen/AArch64/stack-probing-dynamic.ll  | 362 ++++++++++++++++++
 6 files changed, 526 insertions(+), 57 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index af019ab23770..fe21173f531f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -462,6 +462,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
 /// included as part of the stack frame.
 bool
 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // The stack probing code for the dynamically allocated outgoing arguments
+  // area assumes that the stack is probed at the top - either by the prologue
+  // code, which issues a probe if `hasVarSizedObjects` returns true, or by the
+  // most recent variable-sized object allocation. Changing the condition here
+  // may need to be followed up by changes to the probe issuing logic.
   return !MF.getFrameInfo().hasVarSizedObjects();
 }
 
@@ -470,6 +475,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     MachineBasicBlock::iterator I) const {
   const AArch64InstrInfo *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   DebugLoc DL = I->getDebugLoc();
   unsigned Opc = I->getOpcode();
   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
@@ -496,6 +504,22 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     // Most call frames will be allocated at the start of a function so
     // this is OK, but it is a limitation that needs dealing with.
     assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
-    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
-                    StackOffset::getFixed(Amount), TII);
+
+    if (TLI->hasInlineStackProbe(MF) &&
+        -Amount >= AArch64::StackProbeMaxUnprobedStack) {
+      // When stack probing is enabled, the decrement of SP may need to be
+      // probed. We only need to do this if the call site needs 1024 bytes of
+      // space or more, because a region smaller than that is allowed to be
+      // unprobed at an ABI boundary. We rely on the fact that SP has been
+      // probed exactly at this point, either by the prologue or most recent
+      // dynamic allocation.
+      assert(MFI.hasVarSizedObjects() &&
+             "non-reserved call frame without var sized objects?");
+      Register ScratchReg =
+          MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+      inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
+    } else {
+      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(Amount), TII);
+    }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 082043420fb9..eff0722e1c77 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -556,10 +556,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
-  if (Subtarget->isTargetWindows())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2288,6 +2285,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CSINC)
     MAKE_CASE(AArch64ISD::THREAD_POINTER)
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+    MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
    MAKE_CASE(AArch64ISD::HADDS_PRED)
@@ -2646,6 +2644,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+                                              MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MBB->findDebugLoc(MBBI);
+  const AArch64InstrInfo &TII =
+      *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  Register TargetReg = MI.getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst =
+      TII.probedStackAlloc(MBBI, TargetReg, false);
+
+  MI.eraseFromParent();
+  return NextInst->getParent();
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                     MachineInstr &MI,
@@ -2774,6 +2788,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
 
   case AArch64::CATCHRET:
     return EmitLoweredCatchRet(MI, BB);
+  case AArch64::PROBED_STACKALLOC_DYN:
+    return EmitDynamicProbedAlloc(MI, BB);
   case AArch64::LD1_MXIPXX_H_PSEUDO_B:
     return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
   case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -13666,9 +13682,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                          AN->getMemOperand());
 }
 
-SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
-    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+
   SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  EVT VT = Node->getValueType(0);
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          "no-stack-arg-probe")) {
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+    Chain = SP.getValue(1);
+    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+    if (Align)
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+    SDValue Ops[2] = {SP, Chain};
+    return DAG.getMergeValues(Ops, dl);
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
                                                PtrVT, 0);
@@ -13692,7 +13733,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
 
   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
                      DAG.getConstant(4, dl, MVT::i64));
-  return Chain;
+
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  SDLoc dl(Op);
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (Subtarget->isTargetWindows())
+    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+  else if (hasInlineStackProbe(MF))
+    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+  else
+    return SDValue();
 }
 
 // When x and y are extended, lower:
@@ -13746,51 +13839,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
 }
 
-SDValue
-AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetWindows() &&
-         "Only Windows alloca probing supported");
-  SDLoc dl(Op);
-  // Get the inputs.
-  SDNode *Node = Op.getNode();
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size = Op.getOperand(1);
-  MaybeAlign Align =
-      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
-  EVT VT = Node->getValueType(0);
-
-  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
-          "no-stack-arg-probe")) {
-    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-    Chain = SP.getValue(1);
-    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-    if (Align)
-      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-    SDValue Ops[2] = {SP, Chain};
-    return DAG.getMergeValues(Ops, dl);
-  }
-
-  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
-
-  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
-
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-  Chain = SP.getValue(1);
-  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-  if (Align)
-    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-
-  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
-
-  SDValue Ops[2] = {SP, Chain};
-  return DAG.getMergeValues(Ops, dl);
-}
-
 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                            SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 643d363e234a..9b388c7f8668 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -90,6 +90,10 @@ enum NodeType : unsigned {
   ADC,
   SBC, // adc, sbc instructions
 
+  // To avoid stack clash, allocation is performed by block and each block is
+  // probed.
+  PROBED_ALLOCA,
+
   // Predicated instructions where inactive lanes produce undefined results.
   ABDS_PRED,
   ABDU_PRED,
@@ -610,6 +614,9 @@ public:
   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const;
+
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
@@ -1113,10 +1120,10 @@ private:
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
-                                         SDValue &Size,
-                                         SelectionDAG &DAG) const;
+
   SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
 
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 09980c2f45e6..9b9103e01d67 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -818,6 +818,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+
+def AArch64probedalloca
+    : SDNode<"AArch64ISD::PROBED_ALLOCA",
+             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+             [SDNPHasChain, SDNPMayStore]>;
+
 def AArch64mrs : SDNode<"AArch64ISD::MRS",
                         SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
                         [SDNPHasChain, SDNPOutGlue]>;
@@ -908,6 +914,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
                                    []>,
                             Sched<[]>;
 
+// Probed stack allocations of a variable size, used for allocas of unknown size
+// when stack-clash protection is enabled.
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs),
+                                   (ins GPR64common:$target),
+                                   [(AArch64probedalloca GPR64common:$target)]>,
+                            Sched<[]>;
+
 } // Defs = [SP, NZCV], Uses = [SP] in
 } // hasSideEffects = 1, isCodeGenOnly = 1
 
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
new file mode 100644
index 000000000000..673f9038a35f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s --stop-after=finalize-isel -o - | FileCheck %s
+target triple = "aarch64-linux"
+
+; Check dynamic stack allocation and probing instructions do not have
+; the FrameSetup flag.
+
+; CHECK-NOT: frame-setup
+define void @no_frame_setup(i64 %size, ptr %out) #0 {
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
new file mode 100644
index 000000000000..4d9ef77f7a0d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -0,0 +1,362 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+
+; A dynamically-sized allocation needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB0_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed-size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here, because otherwise the CSR save and the first probe of the dynamic
+; allocation could be too far apart when the size of the dynamic allocation is
+; close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; CHECK-LABEL: dynamic_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    str xzr, [sp, #-64]!
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    sub x10, x29, #64
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    str x10, [x1]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB1_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB1_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x2]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v1 = alloca i8, i64 64, align 1
+  store ptr %v1, ptr %out1, align 8
+  %v2 = alloca i8, i64 %size, align 1
+  store ptr %v2, ptr %out2, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic;
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #32
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
+; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB2_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 64
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference from the plain dynamic allocation is the constant used for
+; aligning the target SP; the loop will probe the whole allocation without
+; needing to know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_8192:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4064
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB3_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_1
+; CHECK-NEXT:  .LBB3_3:
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB3_6
+; CHECK-NEXT:  // %bb.5: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_4
+; CHECK-NEXT:  .LBB3_6:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; For 64k guard pages, the only difference is the constant subtracted from SP
+; in the loop.
+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: dynamic_64k_guard:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB4_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB4_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI (1024
+; bytes), so this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: no_reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    lsl x9, x0, #2
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    add x9, x9, #15
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x0, x8, x9
+; CHECK-NEXT:  .LBB5_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x0
+; CHECK-NEXT:    b.le .LBB5_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB5_1
+; CHECK-NEXT:  .LBB5_3: // %entry
+; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1104
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 %n
+  call void @callee_stack_args(ptr %v, [138 x i64] undef)
+  ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub sp, sp, #1504
+; CHECK-NEXT:    add x0, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1504
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 100
+  call void @callee_stack_args(ptr %v, [138 x i64] undef)
+  ret void
+}
+
+declare void @callee_stack_args(ptr, [138 x i64])
+
+; Dynamic allocation of SVE vectors
+define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
+; CHECK-LABEL: dynamic_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    mov x10, #15 // =0xf
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    madd x9, x0, x9, x10
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB7_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB7_1
+; CHECK-NEXT:  .LBB7_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca <vscale x 4 x float>, i64 %size, align 16
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
\ No newline at end of file
--
2.42.0.windows.2