package init as gcc-7.3.0

eastb233 2020-01-19 17:29:40 +08:00
commit 523d5a7c93
25 changed files with 10216 additions and 0 deletions


@@ -0,0 +1,126 @@
From 900ccfa89dda3ab5f7e44a0dd4d1e9d108b5dc8b Mon Sep 17 00:00:00 2001
From: rguenth <rguenth@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Tue, 26 Mar 2019 13:18:23 +0000
Subject: [PATCH] 2019-02-26 Richard Biener <rguenther@suse.de>
    Backport from mainline
    2019-02-12  Richard Biener  <rguenther@suse.de>

    PR tree-optimization/89253
    * tree-ssa-loop-split.c (tree_ssa_split_loops): Check we can
    duplicate the loop.
    * gfortran.dg/pr89253.f: New testcase.

    2019-02-08  Richard Biener  <rguenther@suse.de>

    PR middle-end/89223
    * tree-data-ref.c (initialize_matrix_A): Fail if constant
    doesn't fit in HWI.
    (analyze_subscript_affine_affine): Handle failure from
    initialize_matrix_A.
    * gcc.dg/torture/pr89223.c: New testcase.

    2019-01-28  Richard Biener  <rguenther@suse.de>

    PR tree-optimization/88739
    * tree-ssa-sccvn.c (vn_reference_lookup_3): Avoid generating
    BIT_FIELD_REFs of non-mode-precision integral operands.
    * gcc.c-torture/execute/pr88739.c: New test.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gcc-7-branch@269942 138bc75d-0d04-0410-961f-82ee72b054a4
---
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 2480f4e..a349e3e 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -2118,6 +2118,8 @@ initialize_matrix_A (lambda_matrix A, tree chrec, unsigned index, int mult)
switch (TREE_CODE (chrec))
{
case POLYNOMIAL_CHREC:
+ if (!cst_and_fits_in_hwi (CHREC_RIGHT (chrec)))
+ return chrec_dont_know;
A[index][0] = mult * int_cst_value (CHREC_RIGHT (chrec));
return initialize_matrix_A (A, CHREC_LEFT (chrec), index + 1, mult);
@@ -2499,7 +2501,7 @@ analyze_subscript_affine_affine (tree chrec_a,
tree *last_conflicts)
{
unsigned nb_vars_a, nb_vars_b, dim;
- HOST_WIDE_INT init_a, init_b, gamma, gcd_alpha_beta;
+ HOST_WIDE_INT gamma, gcd_alpha_beta;
lambda_matrix A, U, S;
struct obstack scratch_obstack;
@@ -2536,9 +2538,20 @@ analyze_subscript_affine_affine (tree chrec_a,
A = lambda_matrix_new (dim, 1, &scratch_obstack);
S = lambda_matrix_new (dim, 1, &scratch_obstack);
- init_a = int_cst_value (initialize_matrix_A (A, chrec_a, 0, 1));
- init_b = int_cst_value (initialize_matrix_A (A, chrec_b, nb_vars_a, -1));
- gamma = init_b - init_a;
+ tree init_a = initialize_matrix_A (A, chrec_a, 0, 1);
+ tree init_b = initialize_matrix_A (A, chrec_b, nb_vars_a, -1);
+ if (init_a == chrec_dont_know
+ || init_b == chrec_dont_know)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "affine-affine test failed: "
+ "representation issue.\n");
+ *overlaps_a = conflict_fn_not_known ();
+ *overlaps_b = conflict_fn_not_known ();
+ *last_conflicts = chrec_dont_know;
+ goto end_analyze_subs_aa;
+ }
+ gamma = int_cst_value (init_b) - int_cst_value (init_a);
/* Don't do all the hard work of solving the Diophantine equation
when we already know the solution: for example,
diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index fd97213..3992597 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -649,7 +649,8 @@ tree_ssa_split_loops (void)
false, true)
&& niter.cmp != ERROR_MARK
/* We can't yet handle loops controlled by a != predicate. */
- && niter.cmp != NE_EXPR)
+ && niter.cmp != NE_EXPR
+ && can_duplicate_loop_p (loop))
{
if (split_loop (loop, &niter))
{
diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c
index c93f1f2..a2e3ce2 100644
--- a/gcc/tree-ssa-sccvn.c
+++ b/gcc/tree-ssa-sccvn.c
@@ -2029,6 +2029,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *vr_,
base2 = get_ref_base_and_extent (gimple_assign_lhs (def_stmt),
&offset2, &size2, &maxsize2,
&reverse);
+ tree def_rhs = gimple_assign_rhs1 (def_stmt);
if (!reverse
&& maxsize2 != -1
&& maxsize2 == size2
@@ -2041,11 +2042,14 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *vr_,
according to endianness. */
&& (! INTEGRAL_TYPE_P (vr->type)
|| ref->size == TYPE_PRECISION (vr->type))
- && ref->size % BITS_PER_UNIT == 0)
+ && ref->size % BITS_PER_UNIT == 0
+ && (! INTEGRAL_TYPE_P (TREE_TYPE (def_rhs))
+ || (TYPE_PRECISION (TREE_TYPE (def_rhs))
+ == GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (def_rhs))))))
{
code_helper rcode = BIT_FIELD_REF;
tree ops[3];
- ops[0] = SSA_VAL (gimple_assign_rhs1 (def_stmt));
+ ops[0] = SSA_VAL (def_rhs);
ops[1] = bitsize_int (ref->size);
ops[2] = bitsize_int (offset - offset2);
tree val = vn_nary_build_or_lookup (rcode, vr->type, ops);
--
2.9.3

CVE-2018-12886.patch

@@ -0,0 +1,655 @@
diff -urpN a/gcc/cfgexpand.c b/gcc/cfgexpand.c
--- a/gcc/cfgexpand.c 2019-05-30 16:58:45.350508770 +0800
+++ b/gcc/cfgexpand.c 2019-05-30 11:53:13.315156625 +0800
@@ -6094,6 +6094,23 @@ stack_protect_prologue (void)
rtx x, y;
x = expand_normal (crtl->stack_protect_guard);
+
+ if (targetm.have_stack_protect_combined_set () && guard_decl)
+ {
+ gcc_assert (DECL_P (guard_decl));
+ y = DECL_RTL (guard_decl);
+
+ /* Allow the target to compute address of Y and copy it to X without
+ leaking Y into a register. This combined address + copy pattern
+ allows the target to prevent spilling of any intermediate results by
+ splitting it after register allocator. */
+ if (rtx_insn *insn = targetm.gen_stack_protect_combined_set (x, y))
+ {
+ emit_insn (insn);
+ return;
+ }
+ }
+
if (guard_decl)
y = expand_normal (guard_decl);
else
diff -urpN a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
--- a/gcc/config/arm/arm.c 2019-05-30 16:58:45.354508770 +0800
+++ b/gcc/config/arm/arm.c 2019-05-30 16:59:05.058508073 +0800
@@ -7236,21 +7236,34 @@ legitimate_pic_operand_p (rtx x)
return 1;
}
-/* Record that the current function needs a PIC register. Initialize
- cfun->machine->pic_reg if we have not already done so. */
+/* Record that the current function needs a PIC register. If PIC_REG is null,
+ a new pseudo is allocated as PIC register, otherwise PIC_REG is used. In
+ both case cfun->machine->pic_reg is initialized if we have not already done
+ so. COMPUTE_NOW decide whether and where to set the PIC register. If true,
+ PIC register is reloaded in the current position of the instruction stream
+ irregardless of whether it was loaded before. Otherwise, it is only loaded
+ if not already done so (crtl->uses_pic_offset_table is null). Note that
+ nonnull PIC_REG is only supported iff COMPUTE_NOW is true and null PIC_REG
+ is only supported iff COMPUTE_NOW is false. */
static void
-require_pic_register (void)
+require_pic_register (rtx pic_reg, bool compute_now)
{
+ gcc_assert (compute_now == (pic_reg != NULL_RTX));
+
/* A lot of the logic here is made obscure by the fact that this
routine gets called as part of the rtx cost estimation process.
We don't want those calls to affect any assumptions about the real
function; and further, we can't call entry_of_function() until we
start the real expansion process. */
- if (!crtl->uses_pic_offset_table)
+ if (!crtl->uses_pic_offset_table || compute_now)
{
- gcc_assert (can_create_pseudo_p ());
+ gcc_assert (can_create_pseudo_p ()
+ || (pic_reg != NULL_RTX
+ && REG_P (pic_reg)
+ && GET_MODE (pic_reg) == Pmode));
if (arm_pic_register != INVALID_REGNUM
+ && !compute_now
&& !(TARGET_THUMB1 && arm_pic_register > LAST_LO_REGNUM))
{
if (!cfun->machine->pic_reg)
@@ -7266,8 +7279,19 @@ require_pic_register (void)
{
rtx_insn *seq, *insn;
- if (!cfun->machine->pic_reg)
- cfun->machine->pic_reg = gen_reg_rtx (Pmode);
+ if (pic_reg == NULL_RTX && cfun->machine->pic_reg == NULL_RTX)
+ {
+ pic_reg = gen_reg_rtx (Pmode);
+ cfun->machine->pic_reg = pic_reg;
+ }
+ else if (pic_reg == NULL_RTX)
+ {
+ pic_reg = cfun->machine->pic_reg;
+ }
+ else if (cfun->machine->pic_reg == NULL_RTX)
+ {
+ cfun->machine->pic_reg = pic_reg;
+ }
/* Play games to avoid marking the function as needing pic
if we are being called as part of the cost-estimation
@@ -7278,11 +7306,12 @@ require_pic_register (void)
start_sequence ();
if (TARGET_THUMB1 && arm_pic_register != INVALID_REGNUM
- && arm_pic_register > LAST_LO_REGNUM)
+ && arm_pic_register > LAST_LO_REGNUM
+ && !compute_now)
emit_move_insn (cfun->machine->pic_reg,
gen_rtx_REG (Pmode, arm_pic_register));
else
- arm_load_pic_register (0UL);
+ arm_load_pic_register (0UL, pic_reg);
seq = get_insns ();
end_sequence ();
@@ -7295,16 +7324,33 @@ require_pic_register (void)
we can't yet emit instructions directly in the final
insn stream. Queue the insns on the entry edge, they will
be committed after everything else is expanded. */
- insert_insn_on_edge (seq,
- single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+ if (currently_expanding_to_rtl)
+ insert_insn_on_edge (seq,
+ single_succ_edge
+ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+ else
+ emit_insn (seq);
}
}
}
}
+/* Legitimize PIC load to ORIG into REG. If REG is NULL, a new pseudo is
+ created to hold the result of the load. If not NULL, PIC_REG indicates
+ which register to use as PIC register, otherwise it is decided by register
+ allocator. COMPUTE_NOW forces the PIC register to be loaded at the current
+ location in the instruction stream, irregardless of whether it was loaded
+ previously. Note that nonnull PIC_REG is only supported iff COMPUTE_NOW is
+ true and null PIC_REG is only supported iff COMPUTE_NOW is false.
+
+ Returns the register REG into which the PIC load is performed. */
+
rtx
-legitimize_pic_address (rtx orig, machine_mode mode, rtx reg)
+legitimize_pic_address (rtx orig, machine_mode mode, rtx reg, rtx pic_reg,
+ bool compute_now)
{
+ gcc_assert (compute_now == (pic_reg != NULL_RTX));
+
if (GET_CODE (orig) == SYMBOL_REF
|| GET_CODE (orig) == LABEL_REF)
{
@@ -7337,9 +7383,12 @@ legitimize_pic_address (rtx orig, machin
rtx mem;
/* If this function doesn't have a pic register, create one now. */
- require_pic_register ();
+ require_pic_register (pic_reg, compute_now);
+
+ if (pic_reg == NULL_RTX)
+ pic_reg = cfun->machine->pic_reg;
- pat = gen_calculate_pic_address (reg, cfun->machine->pic_reg, orig);
+ pat = gen_calculate_pic_address (reg, pic_reg, orig);
/* Make the MEM as close to a constant as possible. */
mem = SET_SRC (pat);
@@ -7388,9 +7437,11 @@ legitimize_pic_address (rtx orig, machin
gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS);
- base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg);
+ base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg,
+ pic_reg, compute_now);
offset = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode,
- base == reg ? 0 : reg);
+ base == reg ? 0 : reg, pic_reg,
+ compute_now);
if (CONST_INT_P (offset))
{
@@ -7490,16 +7541,17 @@ static GTY(()) int pic_labelno;
low register. */
void
-arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
+arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED, rtx pic_reg)
{
- rtx l1, labelno, pic_tmp, pic_rtx, pic_reg;
+ rtx l1, labelno, pic_tmp, pic_rtx;
if (crtl->uses_pic_offset_table == 0 || TARGET_SINGLE_PIC_BASE)
return;
gcc_assert (flag_pic);
- pic_reg = cfun->machine->pic_reg;
+ if (pic_reg == NULL_RTX)
+ pic_reg = cfun->machine->pic_reg;
if (TARGET_VXWORKS_RTP)
{
pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE);
@@ -8558,7 +8610,8 @@ arm_legitimize_address (rtx x, rtx orig_
{
/* We need to find and carefully transform any SYMBOL and LABEL
references; so go back to the original address expression. */
- rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX);
+ rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX, NULL_RTX,
+ false /*compute_now*/);
if (new_x != orig_x)
x = new_x;
@@ -8626,7 +8679,8 @@ thumb_legitimize_address (rtx x, rtx ori
{
/* We need to find and carefully transform any SYMBOL and LABEL
references; so go back to the original address expression. */
- rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX);
+ rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX, NULL_RTX,
+ false /*compute_now*/);
if (new_x != orig_x)
x = new_x;
@@ -17800,7 +17854,7 @@ arm_emit_call_insn (rtx pat, rtx addr, b
? !targetm.binds_local_p (SYMBOL_REF_DECL (addr))
: !SYMBOL_REF_LOCAL_P (addr)))
{
- require_pic_register ();
+ require_pic_register (NULL_RTX, false /*compute_now*/);
use_reg (&CALL_INSN_FUNCTION_USAGE (insn), cfun->machine->pic_reg);
}
@@ -21706,7 +21760,7 @@ arm_expand_prologue (void)
mask &= THUMB2_WORK_REGS;
if (!IS_NESTED (func_type))
mask |= (1 << IP_REGNUM);
- arm_load_pic_register (mask);
+ arm_load_pic_register (mask, NULL_RTX);
}
/* If we are profiling, make sure no instructions are scheduled before
@@ -24909,7 +24963,7 @@ thumb1_expand_prologue (void)
/* Load the pic register before setting the frame pointer,
so we can use r7 as a temporary work register. */
if (flag_pic && arm_pic_register != INVALID_REGNUM)
- arm_load_pic_register (live_regs_mask);
+ arm_load_pic_register (live_regs_mask, NULL_RTX);
if (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)
emit_move_insn (gen_rtx_REG (Pmode, ARM_HARD_FRAME_POINTER_REGNUM),
diff -urpN a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
--- a/gcc/config/arm/arm.md 2019-05-30 16:58:45.358508769 +0800
+++ b/gcc/config/arm/arm.md 2019-05-30 11:52:58.491157149 +0800
@@ -6051,7 +6051,8 @@
operands[1] = legitimize_pic_address (operands[1], SImode,
(!can_create_pseudo_p ()
? operands[0]
- : 0));
+ : NULL_RTX), NULL_RTX,
+ false /*compute_now*/);
}
"
)
@@ -6340,7 +6341,7 @@
/* r3 is clobbered by set/longjmp, so we can use it as a scratch
register. */
if (arm_pic_register != INVALID_REGNUM)
- arm_load_pic_register (1UL << 3);
+ arm_load_pic_register (1UL << 3, NULL_RTX);
DONE;
}")
@@ -8666,6 +8667,164 @@
(set_attr "conds" "clob")]
)
+;; Named patterns for stack smashing protection.
+(define_expand "stack_protect_combined_set"
+ [(parallel
+ [(set (match_operand:SI 0 "memory_operand" "")
+ (unspec:SI [(match_operand:SI 1 "guard_operand" "")]
+ UNSPEC_SP_SET))
+ (clobber (match_scratch:SI 2 ""))
+ (clobber (match_scratch:SI 3 ""))])]
+ ""
+ ""
+)
+
+;; Use a separate insn from the above expand to be able to have the mem outside
+;; the operand #1 when register allocation comes. This is needed to avoid LRA
+;; try to reload the guard since we need to control how PIC access is done in
+;; the -fpic/-fPIC case (see COMPUTE_NOW parameter when calling
+;; legitimize_pic_address ()).
+(define_insn_and_split "*stack_protect_combined_set_insn"
+ [(set (match_operand:SI 0 "memory_operand" "=m,m")
+ (unspec:SI [(mem:SI (match_operand:SI 1 "guard_addr_operand" "X,X"))]
+ UNSPEC_SP_SET))
+ (clobber (match_scratch:SI 2 "=&l,&r"))
+ (clobber (match_scratch:SI 3 "=&l,&r"))]
+ ""
+ "#"
+ "reload_completed"
+ [(parallel [(set (match_dup 0) (unspec:SI [(mem:SI (match_dup 2))]
+ UNSPEC_SP_SET))
+ (clobber (match_dup 2))])]
+ "
+{
+ if (flag_pic)
+ {
+ /* Forces recomputing of GOT base now. */
+ legitimize_pic_address (operands[1], SImode, operands[2], operands[3],
+ true /*compute_now*/);
+ }
+ else
+ {
+ if (address_operand (operands[1], SImode))
+ operands[2] = operands[1];
+ else
+ {
+ rtx mem = XEXP (force_const_mem (SImode, operands[1]), 0);
+ emit_move_insn (operands[2], mem);
+ }
+ }
+}"
+ [(set_attr "arch" "t1,32")]
+)
+
+(define_insn "*stack_protect_set_insn"
+ [(set (match_operand:SI 0 "memory_operand" "=m,m")
+ (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "+&l,&r"))]
+ UNSPEC_SP_SET))
+ (clobber (match_dup 1))]
+ ""
+ "@
+ ldr\\t%1, [%1]\;str\\t%1, %0\;movs\t%1,#0
+ ldr\\t%1, [%1]\;str\\t%1, %0\;mov\t%1,#0"
+ [(set_attr "length" "8,12")
+ (set_attr "conds" "clob,nocond")
+ (set_attr "type" "multiple")
+ (set_attr "arch" "t1,32")]
+)
+
+(define_expand "stack_protect_combined_test"
+ [(parallel
+ [(set (pc)
+ (if_then_else
+ (eq (match_operand:SI 0 "memory_operand" "")
+ (unspec:SI [(match_operand:SI 1 "guard_operand" "")]
+ UNSPEC_SP_TEST))
+ (label_ref (match_operand 2))
+ (pc)))
+ (clobber (match_scratch:SI 3 ""))
+ (clobber (match_scratch:SI 4 ""))
+ (clobber (reg:CC CC_REGNUM))])]
+ ""
+ ""
+)
+
+;; Use a separate insn from the above expand to be able to have the mem outside
+;; the operand #1 when register allocation comes. This is needed to avoid LRA
+;; try to reload the guard since we need to control how PIC access is done in
+;; the -fpic/-fPIC case (see COMPUTE_NOW parameter when calling
+;; legitimize_pic_address ()).
+(define_insn_and_split "*stack_protect_combined_test_insn"
+ [(set (pc)
+ (if_then_else
+ (eq (match_operand:SI 0 "memory_operand" "m,m")
+ (unspec:SI [(mem:SI (match_operand:SI 1 "guard_addr_operand" "X,X"))]
+ UNSPEC_SP_TEST))
+ (label_ref (match_operand 2))
+ (pc)))
+ (clobber (match_scratch:SI 3 "=&l,&r"))
+ (clobber (match_scratch:SI 4 "=&l,&r"))
+ (clobber (reg:CC CC_REGNUM))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+{
+ rtx eq;
+
+ if (flag_pic)
+ {
+ /* Forces recomputing of GOT base now. */
+ legitimize_pic_address (operands[1], SImode, operands[3], operands[4],
+ true /*compute_now*/);
+ }
+ else
+ {
+ if (address_operand (operands[1], SImode))
+ operands[3] = operands[1];
+ else
+ {
+ rtx mem = XEXP (force_const_mem (SImode, operands[1]), 0);
+ emit_move_insn (operands[3], mem);
+ }
+ }
+ if (TARGET_32BIT)
+ {
+ emit_insn (gen_arm_stack_protect_test_insn (operands[4], operands[0],
+ operands[3]));
+ rtx cc_reg = gen_rtx_REG (CC_Zmode, CC_REGNUM);
+ eq = gen_rtx_EQ (CC_Zmode, cc_reg, const0_rtx);
+ emit_jump_insn (gen_arm_cond_branch (operands[2], eq, cc_reg));
+ }
+ else
+ {
+ emit_insn (gen_thumb1_stack_protect_test_insn (operands[4], operands[0],
+ operands[3]));
+ eq = gen_rtx_EQ (VOIDmode, operands[4], const0_rtx);
+ emit_jump_insn (gen_cbranchsi4 (eq, operands[4], const0_rtx,
+ operands[2]));
+ }
+ DONE;
+}
+ [(set_attr "arch" "t1,32")]
+)
+
+(define_insn "arm_stack_protect_test_insn"
+ [(set (reg:CC_Z CC_REGNUM)
+ (compare:CC_Z (unspec:SI [(match_operand:SI 1 "memory_operand" "m,m")
+ (mem:SI (match_operand:SI 2 "register_operand" "+l,r"))]
+ UNSPEC_SP_TEST)
+ (const_int 0)))
+ (clobber (match_operand:SI 0 "register_operand" "=&l,&r"))
+ (clobber (match_dup 2))]
+ "TARGET_32BIT"
+ "ldr\t%0, [%2]\;ldr\t%2, %1\;eors\t%0, %2, %0"
+ [(set_attr "length" "8,12")
+ (set_attr "conds" "set")
+ (set_attr "type" "multiple")
+ (set_attr "arch" "t,32")]
+)
+
(define_expand "casesi"
[(match_operand:SI 0 "s_register_operand" "") ; index to jump on
(match_operand:SI 1 "const_int_operand" "") ; lower bound
diff -urpN a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
--- a/gcc/config/arm/arm-protos.h 2019-05-30 16:58:45.358508769 +0800
+++ b/gcc/config/arm/arm-protos.h 2019-05-30 11:52:58.491157149 +0800
@@ -28,7 +28,7 @@ extern enum unwind_info_type arm_except_
extern int use_return_insn (int, rtx);
extern bool use_simple_return_p (void);
extern enum reg_class arm_regno_class (int);
-extern void arm_load_pic_register (unsigned long);
+extern void arm_load_pic_register (unsigned long, rtx);
extern int arm_volatile_func (void);
extern void arm_expand_prologue (void);
extern void arm_expand_epilogue (bool);
@@ -69,7 +69,7 @@ extern int const_ok_for_dimode_op (HOST_
extern int arm_split_constant (RTX_CODE, machine_mode, rtx,
HOST_WIDE_INT, rtx, rtx, int);
extern int legitimate_pic_operand_p (rtx);
-extern rtx legitimize_pic_address (rtx, machine_mode, rtx);
+extern rtx legitimize_pic_address (rtx, machine_mode, rtx, rtx, bool);
extern rtx legitimize_tls_address (rtx, rtx);
extern bool arm_legitimate_address_p (machine_mode, rtx, bool);
extern int arm_legitimate_address_outer_p (machine_mode, rtx, RTX_CODE, int);
diff -urpN a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
--- a/gcc/config/arm/predicates.md 2019-05-30 16:58:45.358508769 +0800
+++ b/gcc/config/arm/predicates.md 2019-05-30 11:52:58.491157149 +0800
@@ -31,6 +31,23 @@
|| REGNO_REG_CLASS (REGNO (op)) != NO_REGS));
})
+; Predicate for stack protector guard's address in
+; stack_protect_combined_set_insn and stack_protect_combined_test_insn patterns
+(define_predicate "guard_addr_operand"
+ (match_test "true")
+{
+ return (CONSTANT_ADDRESS_P (op)
+ || !targetm.cannot_force_const_mem (mode, op));
+})
+
+; Predicate for stack protector guard in stack_protect_combined_set and
+; stack_protect_combined_test patterns
+(define_predicate "guard_operand"
+ (match_code "mem")
+{
+ return guard_addr_operand (XEXP (op, 0), mode);
+})
+
(define_predicate "imm_for_neon_inv_logic_operand"
(match_code "const_vector")
{
diff -urpN a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
--- a/gcc/config/arm/thumb1.md 2019-05-30 16:58:45.358508769 +0800
+++ b/gcc/config/arm/thumb1.md 2019-05-30 11:52:58.491157149 +0800
@@ -1964,4 +1964,17 @@
}"
[(set_attr "type" "mov_reg")]
)
+
+(define_insn "thumb1_stack_protect_test_insn"
+ [(set (match_operand:SI 0 "register_operand" "=&l")
+ (unspec:SI [(match_operand:SI 1 "memory_operand" "m")
+ (mem:SI (match_operand:SI 2 "register_operand" "+l"))]
+ UNSPEC_SP_TEST))
+ (clobber (match_dup 2))]
+ "TARGET_THUMB1"
+ "ldr\t%0, [%2]\;ldr\t%2, %1\;eors\t%0, %2, %0"
+ [(set_attr "length" "8")
+ (set_attr "conds" "set")
+ (set_attr "type" "multiple")]
+)
diff -urpN a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
--- a/gcc/config/arm/unspecs.md 2019-05-30 16:58:45.358508769 +0800
+++ b/gcc/config/arm/unspecs.md 2019-05-30 11:52:58.491157149 +0800
@@ -86,6 +86,9 @@
UNSPEC_PROBE_STACK ; Probe stack memory reference
UNSPEC_NONSECURE_MEM ; Represent non-secure memory in ARMv8-M with
; security extension
+ UNSPEC_SP_SET ; Represent the setting of stack protector's canary
+ UNSPEC_SP_TEST ; Represent the testing of stack protector's canary
+ ; against the guard.
])
(define_c_enum "unspec" [
diff -urpN a/gcc/doc/md.texi b/gcc/doc/md.texi
--- a/gcc/doc/md.texi 2019-05-30 16:58:45.362508769 +0800
+++ b/gcc/doc/md.texi 2019-05-30 11:52:58.491157149 +0800
@@ -6955,22 +6955,61 @@ builtins.
The get/set patterns have a single output/input operand respectively,
with @var{mode} intended to be @code{Pmode}.
+@cindex @code{stack_protect_combined_set} instruction pattern
+@item @samp{stack_protect_combined_set}
+This pattern, if defined, moves a @code{ptr_mode} value from an address
+whose declaration RTX is given in operand 1 to the memory in operand 0
+without leaving the value in a register afterward. If several
+instructions are needed by the target to perform the operation (eg. to
+load the address from a GOT entry then load the @code{ptr_mode} value
+and finally store it), it is the backend's responsibility to ensure no
+intermediate result gets spilled. This is to avoid leaking the value
+some place that an attacker might use to rewrite the stack guard slot
+after having clobbered it.
+
+If this pattern is not defined, then the address declaration is
+expanded first in the standard way and a @code{stack_protect_set}
+pattern is then generated to move the value from that address to the
+address in operand 0.
+
@cindex @code{stack_protect_set} instruction pattern
@item @samp{stack_protect_set}
-This pattern, if defined, moves a @code{ptr_mode} value from the memory
-in operand 1 to the memory in operand 0 without leaving the value in
-a register afterward. This is to avoid leaking the value some place
-that an attacker might use to rewrite the stack guard slot after
-having clobbered it.
+This pattern, if defined, moves a @code{ptr_mode} value from the valid
+memory location in operand 1 to the memory in operand 0 without leaving
+the value in a register afterward. This is to avoid leaking the value
+some place that an attacker might use to rewrite the stack guard slot
+after having clobbered it.
+
+Note: on targets where the addressing modes do not allow to load
+directly from stack guard address, the address is expanded in a standard
+way first which could cause some spills.
If this pattern is not defined, then a plain move pattern is generated.
+@cindex @code{stack_protect_combined_test} instruction pattern
+@item @samp{stack_protect_combined_test}
+This pattern, if defined, compares a @code{ptr_mode} value from an
+address whose declaration RTX is given in operand 1 with the memory in
+operand 0 without leaving the value in a register afterward and
+branches to operand 2 if the values were equal. If several
+instructions are needed by the target to perform the operation (eg. to
+load the address from a GOT entry then load the @code{ptr_mode} value
+and finally store it), it is the backend's responsibility to ensure no
+intermediate result gets spilled. This is to avoid leaking the value
+some place that an attacker might use to rewrite the stack guard slot
+after having clobbered it.
+
+If this pattern is not defined, then the address declaration is
+expanded first in the standard way and a @code{stack_protect_test}
+pattern is then generated to compare the value from that address to the
+value at the memory in operand 0.
+
@cindex @code{stack_protect_test} instruction pattern
@item @samp{stack_protect_test}
This pattern, if defined, compares a @code{ptr_mode} value from the
-memory in operand 1 with the memory in operand 0 without leaving the
-value in a register afterward and branches to operand 2 if the values
-were equal.
+valid memory location in operand 1 with the memory in operand 0 without
+leaving the value in a register afterward and branches to operand 2 if
+the values were equal.
If this pattern is not defined, then a plain compare pattern and
conditional branch pattern is used.
diff -urpN a/gcc/function.c b/gcc/function.c
--- a/gcc/function.c 2019-05-30 16:58:45.362508769 +0800
+++ b/gcc/function.c 2019-05-30 11:53:14.071156599 +0800
@@ -5065,18 +5065,34 @@ stack_protect_epilogue (void)
tree guard_decl = targetm.stack_protect_guard ();
rtx_code_label *label = gen_label_rtx ();
rtx x, y;
- rtx_insn *seq;
+ rtx_insn *seq = NULL;
x = expand_normal (crtl->stack_protect_guard);
- if (guard_decl)
- y = expand_normal (guard_decl);
+
+ if (targetm.have_stack_protect_combined_test () && guard_decl)
+ {
+ gcc_assert (DECL_P (guard_decl));
+ y = DECL_RTL (guard_decl);
+ /* Allow the target to compute address of Y and compare it with X without
+ leaking Y into a register. This combined address + compare pattern
+ allows the target to prevent spilling of any intermediate results by
+ splitting it after register allocator. */
+ seq = targetm.gen_stack_protect_combined_test (x, y, label);
+ }
else
- y = const0_rtx;
+ {
+ if (guard_decl)
+ y = expand_normal (guard_decl);
+ else
+ y = const0_rtx;
+
+ /* Allow the target to compare Y with X without leaking either into
+ a register. */
+ if (targetm.have_stack_protect_test ())
+ seq = targetm.gen_stack_protect_test (x, y, label);
+ }
- /* Allow the target to compare Y with X without leaking either into
- a register. */
- if (targetm.have_stack_protect_test ()
- && ((seq = targetm.gen_stack_protect_test (x, y, label)) != NULL_RTX))
+ if (seq)
emit_insn (seq);
else
emit_cmp_and_jump_insns (x, y, EQ, NULL_RTX, ptr_mode, 1, label);
diff -urpN a/gcc/genpreds.c b/gcc/genpreds.c
--- a/gcc/genpreds.c 2019-05-30 16:58:45.362508769 +0800
+++ b/gcc/genpreds.c 2019-05-30 11:53:14.163156595 +0800
@@ -1581,7 +1581,8 @@ write_insn_preds_c (void)
#include \"reload.h\"\n\
#include \"regs.h\"\n\
#include \"emit-rtl.h\"\n\
-#include \"tm-constrs.h\"\n");
+#include \"tm-constrs.h\"\n\
+#include \"target.h\"\n");
FOR_ALL_PREDICATES (p)
write_one_predicate_function (p);
diff -urpN a/gcc/target-insns.def b/gcc/target-insns.def
--- a/gcc/target-insns.def 2019-05-30 16:58:45.362508769 +0800
+++ b/gcc/target-insns.def 2019-05-30 11:52:58.495157149 +0800
@@ -96,7 +96,9 @@ DEF_TARGET_INSN (sibcall_value, (rtx x0,
DEF_TARGET_INSN (simple_return, (void))
DEF_TARGET_INSN (split_stack_prologue, (void))
DEF_TARGET_INSN (split_stack_space_check, (rtx x0, rtx x1))
+DEF_TARGET_INSN (stack_protect_combined_set, (rtx x0, rtx x1))
DEF_TARGET_INSN (stack_protect_set, (rtx x0, rtx x1))
+DEF_TARGET_INSN (stack_protect_combined_test, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (stack_protect_test, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (store_multiple, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (tablejump, (rtx x0, rtx x1))
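
The md.texi hunks above spell out the intent of the new combined patterns: compute the guard's address and load or compare the canary in one sequence that register allocation cannot split, so no intermediate result is spilled where an overflow could reach it (the CVE-2018-12886 scenario). A minimal sketch of the kind of function this protects follows; it is illustrative only, not taken from the patch, and the compile flags are assumed.

#include <string.h>

/* Built with something like
     arm-linux-gnueabi-gcc -O2 -fPIC -fstack-protector-all -c canary.c
   (an assumed invocation).  The prologue stores the canary above `buf`;
   with the combined set/test patterns the guard's address is computed
   and dereferenced in one unsplittable sequence, so an overflow of
   `buf` cannot redirect where the epilogue re-reads the guard from.  */
void
copy_name (char *dst, const char *src)
{
  char buf[16];

  strcpy (buf, src);              /* potential overflow; caught at return */
  memcpy (dst, buf, sizeof buf);
}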

CVE-2019-15847.patch

@@ -0,0 +1,51 @@
diff -urpN a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
--- a/gcc/config/rs6000/altivec.md 2018-01-15 01:47:30.483964000 +0800
+++ b/gcc/config/rs6000/altivec.md 2019-09-09 00:01:25.770835633 +0800
@@ -74,9 +74,6 @@
UNSPEC_VUNPACK_LO_SIGN_DIRECT
UNSPEC_VUPKHPX
UNSPEC_VUPKLPX
- UNSPEC_DARN
- UNSPEC_DARN_32
- UNSPEC_DARN_RAW
UNSPEC_DST
UNSPEC_DSTT
UNSPEC_DSTST
@@ -3770,21 +3767,21 @@
(define_insn "darn_32"
[(set (match_operand:SI 0 "register_operand" "=r")
- (unspec:SI [(const_int 0)] UNSPEC_DARN_32))]
+ (unspec_volatile:SI [(const_int 0)] UNSPECV_DARN_32))]
"TARGET_P9_MISC"
"darn %0,0"
[(set_attr "type" "integer")])
(define_insn "darn_raw"
[(set (match_operand:DI 0 "register_operand" "=r")
- (unspec:DI [(const_int 0)] UNSPEC_DARN_RAW))]
+ (unspec_volatile:DI [(const_int 0)] UNSPECV_DARN_RAW))]
"TARGET_P9_MISC && TARGET_64BIT"
"darn %0,2"
[(set_attr "type" "integer")])
(define_insn "darn"
[(set (match_operand:DI 0 "register_operand" "=r")
- (unspec:DI [(const_int 0)] UNSPEC_DARN))]
+ (unspec_volatile:DI [(const_int 0)] UNSPECV_DARN))]
"TARGET_P9_MISC && TARGET_64BIT"
"darn %0,1"
[(set_attr "type" "integer")])
diff -urpN a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
--- a/gcc/config/rs6000/rs6000.md 2018-01-21 21:32:58.843504000 +0800
+++ b/gcc/config/rs6000/rs6000.md 2019-09-08 23:53:13.122859153 +0800
@@ -163,6 +163,9 @@
UNSPECV_EH_RR ; eh_reg_restore
UNSPECV_ISYNC ; isync instruction
UNSPECV_MFTB ; move from time base
+ UNSPECV_DARN ; darn 1 (deliver a random number)
+ UNSPECV_DARN_32 ; darn 2
+ UNSPECV_DARN_RAW ; darn 0
UNSPECV_NLGR ; non-local goto receiver
UNSPECV_MFFS ; Move from FPSCR
UNSPECV_MTFSF ; Move to FPSCR Fields


@@ -0,0 +1,24 @@
diff -urpN a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c 2018-10-09 11:49:19.000000000 +0800
+++ b/gcc/config/aarch64/aarch64.c 2018-10-09 13:42:15.000000000 +0800
@@ -1619,7 +1619,7 @@ aarch64_load_symref_appropriately (rtx d
case SYMBOL_SMALL_TLSDESC:
{
machine_mode mode = GET_MODE (dest);
- rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
+ rtx x0 = gen_rtx_REG (ptr_mode, R0_REGNUM);
rtx tp;
gcc_assert (mode == Pmode || mode == ptr_mode);
@@ -1635,6 +1635,11 @@ aarch64_load_symref_appropriately (rtx d
if (mode != Pmode)
tp = gen_lowpart (mode, tp);
+ if (mode != ptr_mode)
+ {
+ x0 = force_reg (mode, gen_rtx_SIGN_EXTEND (mode, x0));
+ }
+
emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
return;
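
This hunk targets aarch64 ILP32: the TLS descriptor call returns a 32-bit (ptr_mode) offset in x0, which the fixed code extends to the destination mode before adding the thread pointer instead of reading x0 as a full 64-bit register. A small sketch of code that exercises this path follows; it is illustrative and assumes an ILP32 build with -mabi=ilp32 -fpic and the default TLS descriptor scheme.

/* Illustrative only: a PIC TLS access compiled for ILP32 goes through
   SYMBOL_SMALL_TLSDESC.  With the fix, x0 is created in ptr_mode
   (SImode) and sign-extended before the thread-pointer addition.  */
extern __thread int counter;

int
bump_counter (void)
{
  return ++counter;
}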


@@ -0,0 +1,31 @@
diff -urpN a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
--- a/gcc/config/aarch64/aarch64.md 2018-10-09 11:30:50.000000000 +0800
+++ b/gcc/config/aarch64/aarch64.md 2018-10-09 11:52:54.000000000 +0800
@@ -857,6 +857,13 @@
: !REG_P (callee))
XEXP (operands[0], 0) = force_reg (Pmode, callee);
+ if (TARGET_ILP32
+ && GET_CODE (XEXP (operands[0], 0)) == SYMBOL_REF
+ && GET_MODE (XEXP (operands[0], 0)) == SImode)
+ XEXP (operands[0], 0) = convert_memory_address (DImode,
+ XEXP (operands[0], 0));
+
+
if (operands[2] == NULL_RTX)
operands[2] = const0_rtx;
@@ -889,6 +896,13 @@
: !REG_P (callee))
XEXP (operands[1], 0) = force_reg (Pmode, callee);
+ if (TARGET_ILP32
+ && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
+ && GET_MODE (XEXP (operands[1], 0)) == SImode)
+ XEXP (operands[1], 0) = convert_memory_address (DImode,
+ XEXP (operands[1], 0));
+
+
if (operands[3] == NULL_RTX)
operands[3] = const0_rtx;
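
The call and call_value expander hunks handle the ILP32 case where the callee address is still an SImode SYMBOL_REF: convert_memory_address widens it to DImode so the call patterns always see a Pmode address. An ordinary external call built with -mabi=ilp32 can take this path; the sketch below is illustrative, not from the patch.

/* Illustrative only: under -mabi=ilp32, pointers and symbol addresses
   are 32-bit (SImode) while Pmode is DImode, so the expander now widens
   the callee address of a call such as this one.  */
extern int helper (int x);

int
call_helper (int x)
{
  return helper (x) + 1;
}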


@@ -0,0 +1,780 @@
diff -urpN a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c 2019-04-15 14:50:25.866378665 +0800
+++ b/gcc/config/aarch64/aarch64.c 2019-04-15 14:49:21.986376983 +0800
@@ -554,6 +554,31 @@ static const struct tune_params generic_
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
+static const struct tune_params tsv110_tunings =
+{
+ &cortexa57_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &generic_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ 4, /* memmov_cost */
+ 4, /* issue_rate */
+ AARCH64_FUSE_NOTHING, /* fusible_ops */
+ 16, /* function_align. */
+ 16, /* jump_align. */
+ 8, /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ 0, /* cache_line_size. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+};
+
static const struct tune_params cortexa35_tunings =
{
&cortexa53_extra_costs,
diff -urpN a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
--- a/gcc/config/aarch64/aarch64-cores.def 2017-02-15 08:09:28.845771000 +0800
+++ b/gcc/config/aarch64/aarch64-cores.def 2019-04-15 14:49:21.986376983 +0800
@@ -78,6 +78,8 @@ AARCH64_CORE("xgene1", xgene1, x
AARCH64_CORE("thunderx2t99p1", thunderx2t99p1, thunderx2t99, 8_1A, AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, thunderx2t99, 0x42, 0x516, -1)
AARCH64_CORE("vulcan", vulcan, thunderx2t99, 8_1A, AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, thunderx2t99, 0x42, 0x516, -1)
+AARCH64_CORE("tsv110", tsv110, tsv110, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, tsv110, 0x48, 0xd01, -1)
+
/* V8 big.LITTLE implementations. */
AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1)
diff -urpN a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
--- a/gcc/config/aarch64/aarch64.md 2019-04-15 14:50:25.870378665 +0800
+++ b/gcc/config/aarch64/aarch64.md 2019-04-15 14:49:21.986376983 +0800
@@ -226,6 +226,7 @@
(include "thunderx.md")
(include "../arm/xgene1.md")
(include "thunderx2t99.md")
+(include "tsv110.md")
;; -------------------------------------------------------------------
;; Jumps and other miscellaneous insns
diff -urpN a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
--- a/gcc/config/aarch64/aarch64-tune.md 2017-02-15 08:09:28.845771000 +0800
+++ b/gcc/config/aarch64/aarch64-tune.md 2019-04-15 14:49:21.986376983 +0800
@@ -1,5 +1,5 @@
;; -*- buffer-read-only: t -*-
;; Generated automatically by gentune.sh from aarch64-cores.def
(define_attr "tune"
- "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,exynosm1,falkor,qdf24xx,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,thunderx2t99,xgene1,thunderx2t99p1,vulcan,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53"
+ "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,exynosm1,falkor,qdf24xx,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,thunderx2t99,xgene1,tsv110,thunderx2t99p1,vulcan,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff -urpN a/gcc/config/aarch64/tsv110.md b/gcc/config/aarch64/tsv110.md
--- a/gcc/config/aarch64/tsv110.md 1970-01-01 08:00:00.000000000 +0800
+++ b/gcc/config/aarch64/tsv110.md 2019-04-15 14:55:30.420081420 +0800
@@ -0,0 +1,708 @@
+;; tsv110 pipeline description
+;; Copyright (C) 2018 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "tsv110")
+
+(define_attr "tsv110_neon_type"
+ "neon_arith_acc, neon_arith_acc_q,
+ neon_arith_basic, neon_arith_complex,
+ neon_reduc_add_acc, neon_multiply, neon_multiply_q,
+ neon_multiply_long, neon_mla, neon_mla_q, neon_mla_long,
+ neon_sat_mla_long, neon_shift_acc, neon_shift_imm_basic,
+ neon_shift_imm_complex,
+ neon_shift_reg_basic, neon_shift_reg_basic_q, neon_shift_reg_complex,
+ neon_shift_reg_complex_q, neon_fp_negabs, neon_fp_arith,
+ neon_fp_arith_q, neon_fp_reductions_q, neon_fp_cvt_int,
+ neon_fp_cvt_int_q, neon_fp_cvt16, neon_fp_minmax, neon_fp_mul,
+ neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q, neon_fp_recpe_rsqrte,
+ neon_fp_recpe_rsqrte_q, neon_fp_recps_rsqrts, neon_fp_recps_rsqrts_q,
+ neon_bitops, neon_bitops_q, neon_from_gp,
+ neon_from_gp_q, neon_move, neon_tbl3_tbl4, neon_zip_q, neon_to_gp,
+ neon_load_a, neon_load_b, neon_load_c, neon_load_d, neon_load_e,
+ neon_load_f, neon_store_a, neon_store_b, neon_store_complex,
+ unknown"
+ (cond [
+ (eq_attr "type" "neon_arith_acc, neon_reduc_add_acc,\
+ neon_reduc_add_acc_q")
+ (const_string "neon_arith_acc")
+ (eq_attr "type" "neon_arith_acc_q")
+ (const_string "neon_arith_acc_q")
+ (eq_attr "type" "neon_abs,neon_abs_q,neon_add, neon_add_q, neon_add_long,\
+ neon_add_widen, neon_neg, neon_neg_q,\
+ neon_reduc_add, neon_reduc_add_q,\
+ neon_reduc_add_long, neon_sub, neon_sub_q,\
+ neon_sub_long, neon_sub_widen, neon_logic,\
+ neon_logic_q, neon_tst, neon_tst_q,\
+ neon_compare, neon_compare_q,\
+ neon_compare_zero, neon_compare_zero_q,\
+ neon_minmax, neon_minmax_q, neon_reduc_minmax,\
+ neon_reduc_minmax_q")
+ (const_string "neon_arith_basic")
+ (eq_attr "type" "neon_add_halve_narrow_q,\
+ neon_add_halve, neon_add_halve_q,\
+ neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+ neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+ neon_qneg_q, neon_qsub, neon_qsub_q,\
+ neon_sub_halve_narrow_q")
+ (const_string "neon_arith_complex")
+
+ (eq_attr "type" "neon_mul_b, neon_mul_h, neon_mul_s,\
+ neon_mul_h_scalar, neon_mul_s_scalar,\
+ neon_sat_mul_b, neon_sat_mul_h,\
+ neon_sat_mul_s, neon_sat_mul_h_scalar,\
+ neon_sat_mul_s_scalar,\
+ neon_mul_b_long, neon_mul_h_long,\
+ neon_mul_s_long,\
+ neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+ neon_sat_mul_b_long, neon_sat_mul_h_long,\
+ neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+ neon_sat_mul_s_scalar_long,\
+ neon_mla_b, neon_mla_h, neon_mla_s,\
+ neon_mla_h_scalar, neon_mla_s_scalar,\
+ neon_mla_b_long, neon_mla_h_long,\
+ neon_mla_s_long,\
+ neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+ neon_sat_mla_b_long, neon_sat_mla_h_long,\
+ neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+ neon_sat_mla_s_scalar_long")
+ (const_string "neon_multiply")
+ (eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q,\
+ neon_mul_h_scalar_q, neon_mul_s_scalar_q,\
+ neon_sat_mul_b_q, neon_sat_mul_h_q,\
+ neon_sat_mul_s_q, neon_sat_mul_h_scalar_q,\
+ neon_sat_mul_s_scalar_q,\
+ neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+ neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+ (const_string "neon_multiply_q")
+
+ (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+ (const_string "neon_shift_acc")
+ (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+ neon_shift_imm_narrow_q, neon_shift_imm_long")
+ (const_string "neon_shift_imm_basic")
+ (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+ neon_sat_shift_imm_narrow_q")
+ (const_string "neon_shift_imm_complex")
+ (eq_attr "type" "neon_shift_reg")
+ (const_string "neon_shift_reg_basic")
+ (eq_attr "type" "neon_shift_reg_q")
+ (const_string "neon_shift_reg_basic_q")
+ (eq_attr "type" "neon_sat_shift_reg")
+ (const_string "neon_shift_reg_complex")
+ (eq_attr "type" "neon_sat_shift_reg_q")
+ (const_string "neon_shift_reg_complex_q")
+
+ (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+ neon_fp_abs_s, neon_fp_abs_s_q,\
+ neon_fp_neg_d, neon_fp_neg_d_q,\
+ neon_fp_abs_d, neon_fp_abs_d_q,\
+ neon_fp_minmax_s,neon_fp_minmax_d,\
+ neon_fp_reduc_minmax_s,neon_fp_reduc_minmax_d")
+ (const_string "neon_fp_negabs")
+ (eq_attr "type" "neon_fp_addsub_s, neon_fp_abd_s,\
+ neon_fp_reduc_add_s, neon_fp_compare_s,\
+ neon_fp_round_s,\
+ neon_fp_addsub_d, neon_fp_abd_d,\
+ neon_fp_reduc_add_d, neon_fp_compare_d,\
+ neon_fp_round_d")
+ (const_string "neon_fp_arith")
+ (eq_attr "type" "neon_fp_addsub_s_q, neon_fp_abd_s_q,\
+ neon_fp_reduc_add_s_q, neon_fp_compare_s_q,\
+ neon_fp_minmax_s_q, neon_fp_round_s_q,\
+ neon_fp_addsub_d_q, neon_fp_abd_d_q,\
+ neon_fp_reduc_add_d_q, neon_fp_compare_d_q,\
+ neon_fp_minmax_d_q, neon_fp_round_d_q")
+ (const_string "neon_fp_arith_q")
+ (eq_attr "type" "neon_fp_reduc_minmax_s_q,\
+ neon_fp_reduc_minmax_d_q,\
+ neon_fp_reduc_add_s_q, neon_fp_reduc_add_d_q")
+ (const_string "neon_fp_reductions_q")
+ (eq_attr "type" "neon_fp_to_int_s, neon_int_to_fp_s,\
+ neon_fp_to_int_d, neon_int_to_fp_d")
+ (const_string "neon_fp_cvt_int")
+ (eq_attr "type" "neon_fp_to_int_s_q, neon_int_to_fp_s_q,\
+ neon_fp_to_int_d_q, neon_int_to_fp_d_q")
+ (const_string "neon_fp_cvt_int_q")
+ (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h")
+ (const_string "neon_fp_cvt16")
+ (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_scalar,\
+ neon_fp_mul_d")
+ (const_string "neon_fp_mul")
+ (eq_attr "type" "neon_fp_mul_s_q, neon_fp_mul_s_scalar_q,\
+ neon_fp_mul_d_q, neon_fp_mul_d_scalar_q")
+ (const_string "neon_fp_mul_q")
+ (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_scalar,\
+ neon_fp_mla_d")
+ (const_string "neon_fp_mla")
+ (eq_attr "type" "neon_fp_mla_s_q, neon_fp_mla_s_scalar_q,
+ neon_fp_mla_d_q, neon_fp_mla_d_scalar_q")
+ (const_string "neon_fp_mla_q")
+ (eq_attr "type" "neon_fp_recpe_s, neon_fp_rsqrte_s,\
+ neon_fp_recpx_s,\
+ neon_fp_recpe_d, neon_fp_rsqrte_d,\
+ neon_fp_recpx_d")
+ (const_string "neon_fp_recpe_rsqrte")
+ (eq_attr "type" "neon_fp_recpe_s_q, neon_fp_rsqrte_s_q,\
+ neon_fp_recpx_s_q,\
+ neon_fp_recpe_d_q, neon_fp_rsqrte_d_q,\
+ neon_fp_recpx_d_q")
+ (const_string "neon_fp_recpe_rsqrte_q")
+ (eq_attr "type" "neon_fp_recps_s, neon_fp_rsqrts_s,\
+ neon_fp_recps_d, neon_fp_rsqrts_d")
+ (const_string "neon_fp_recps_rsqrts")
+ (eq_attr "type" "neon_fp_recps_s_q, neon_fp_rsqrts_s_q,\
+ neon_fp_recps_d_q, neon_fp_rsqrts_d_q")
+ (const_string "neon_fp_recps_rsqrts_q")
+ (eq_attr "type" "neon_bsl, neon_cls, neon_cnt,\
+ neon_rev, neon_permute, neon_rbit,\
+ neon_tbl1, neon_tbl2, neon_zip,\
+ neon_dup, neon_dup_q, neon_ext, neon_ext_q,\
+ neon_move, neon_move_q, neon_move_narrow_q")
+ (const_string "neon_bitops")
+ (eq_attr "type" "neon_bsl_q, neon_cls_q, neon_cnt_q,\
+ neon_rev_q, neon_permute_q, neon_rbit_q")
+ (const_string "neon_bitops_q")
+ (eq_attr "type" "neon_from_gp,f_mcr,f_mcrr")
+ (const_string "neon_from_gp")
+ (eq_attr "type" "neon_from_gp_q")
+ (const_string "neon_from_gp_q")
+
+ (eq_attr "type" "f_loads, f_loadd,\
+ neon_load1_1reg, neon_load1_1reg_q,\
+ neon_load1_2reg, neon_load1_2reg_q")
+ (const_string "neon_load_a")
+ (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q,\
+ neon_load1_4reg, neon_load1_4reg_q")
+ (const_string "neon_load_b")
+ (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q,\
+ neon_load1_all_lanes, neon_load1_all_lanes_q,\
+ neon_load2_2reg, neon_load2_2reg_q,\
+ neon_load2_all_lanes, neon_load2_all_lanes_q")
+ (const_string "neon_load_c")
+ (eq_attr "type" "neon_load2_4reg, neon_load2_4reg_q,\
+ neon_load3_3reg, neon_load3_3reg_q,\
+ neon_load3_one_lane, neon_load3_one_lane_q,\
+ neon_load4_4reg, neon_load4_4reg_q")
+ (const_string "neon_load_d")
+ (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q,\
+ neon_load3_all_lanes, neon_load3_all_lanes_q,\
+ neon_load4_all_lanes, neon_load4_all_lanes_q")
+ (const_string "neon_load_e")
+ (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+ (const_string "neon_load_f")
+
+ (eq_attr "type" "f_stores, f_stored,\
+ neon_store1_1reg")
+ (const_string "neon_store_a")
+ (eq_attr "type" "neon_store1_2reg, neon_store1_1reg_q")
+ (const_string "neon_store_b")
+ (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q,\
+ neon_store3_3reg, neon_store3_3reg_q,\
+ neon_store2_4reg, neon_store2_4reg_q,\
+ neon_store4_4reg, neon_store4_4reg_q,\
+ neon_store2_2reg, neon_store2_2reg_q,\
+ neon_store3_one_lane, neon_store3_one_lane_q,\
+ neon_store4_one_lane, neon_store4_one_lane_q,\
+ neon_store1_4reg, neon_store1_4reg_q,\
+ neon_store1_one_lane, neon_store1_one_lane_q,\
+ neon_store2_one_lane, neon_store2_one_lane_q")
+ (const_string "neon_store_complex")]
+ (const_string "unknown")))
+
+;; The tsv110 core is modelled as issues pipeline that has
+;; the following functional units.
+;; 1. Three pipelines for integer operations: ALU1, ALU2, ALU3
+
+(define_cpu_unit "tsv110_alu1_issue" "tsv110")
+(define_reservation "tsv110_alu1" "tsv110_alu1_issue")
+
+(define_cpu_unit "tsv110_alu2_issue" "tsv110")
+(define_reservation "tsv110_alu2" "tsv110_alu2_issue")
+
+(define_cpu_unit "tsv110_alu3_issue" "tsv110")
+(define_reservation "tsv110_alu3" "tsv110_alu3_issue")
+
+;; 2. One pipeline for complex integer operations: MDU
+
+(define_cpu_unit "tsv110_mdu_issue" "tsv110")
+(define_reservation "tsv110_mdu" "tsv110_mdu_issue")
+
+;; 3. Two asymmetric pipelines for Asimd and FP operations: FSU1, FSU2
+(define_automaton "tsv110_fsu")
+
+(define_cpu_unit "tsv110_fsu1_issue"
+ "tsv110_fsu")
+(define_cpu_unit "tsv110_fsu2_issue"
+ "tsv110_fsu")
+
+(define_reservation "tsv110_fsu1" "tsv110_fsu1_issue")
+(define_reservation "tsv110_fsu2" "tsv110_fsu2_issue")
+
+;; 4. Two pipeline for branch operations but same with alu2 and alu3: BRU1, BRU2
+
+;; 5. Two pipelines for load and store operations: LS1, LS2.
+
+(define_cpu_unit "tsv110_ls1_issue" "tsv110")
+(define_cpu_unit "tsv110_ls2_issue" "tsv110")
+(define_reservation "tsv110_ls1" "tsv110_ls1_issue")
+(define_reservation "tsv110_ls2" "tsv110_ls2_issue")
+
+;; Block all issue queues.
+
+(define_reservation "tsv110_block" "tsv110_fsu1_issue + tsv110_fsu2_issue
+ + tsv110_mdu_issue + tsv110_alu1_issue
+ + tsv110_alu2_issue + tsv110_alu3_issue + tsv110_ls1_issue + tsv110_ls2_issue")
+
+;; Simple Execution Unit:
+;;
+;; Simple ALU without shift
+(define_insn_reservation "tsv110_alu" 1
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "alu_imm,logic_imm,\
+ alu_sreg,logic_reg,\
+ adc_imm,adc_reg,\
+ adr,bfm,clz,rbit,rev,\
+ shift_imm,shift_reg,\
+ mov_imm,mov_reg,\
+ mvn_imm,mvn_reg,\
+ mrs,multiple,no_insn"))
+ "tsv110_alu1|tsv110_alu2|tsv110_alu3")
+
+(define_insn_reservation "tsv110_alus" 1
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "alus_imm,logics_imm,\
+ alus_sreg,logics_reg,\
+ adcs_imm,adcs_reg"))
+ "tsv110_alu2|tsv110_alu3")
+
+;; ALU ops with shift
+(define_insn_reservation "tsv110_alu_shift" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "extend,\
+ alu_shift_imm,alu_shift_reg,\
+ crc,logic_shift_imm,logic_shift_reg,\
+ mov_shift,mvn_shift,\
+ mov_shift_reg,mvn_shift_reg"))
+ "tsv110_mdu")
+
+(define_insn_reservation "tsv110_alus_shift" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "alus_shift_imm,alus_shift_reg,\
+ logics_shift_imm,logics_shift_reg"))
+ "tsv110_alu2|tsv110_alu3")
+
+;; Multiplies instructions
+(define_insn_reservation "tsv110_mult" 3
+ (and (eq_attr "tune" "tsv110")
+ (ior (eq_attr "mul32" "yes")
+ (eq_attr "mul64" "yes")))
+ "tsv110_mdu")
+
+;; Integer divide
+(define_insn_reservation "tsv110_div" 10
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "udiv,sdiv"))
+ "tsv110_mdu")
+
+;; Block all issue pipes for a cycle
+(define_insn_reservation "tsv110_block" 1
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "block"))
+ "tsv110_block")
+
+;; Branch execution Unit
+;;
+;; Branches take two issue slot.
+;; No latency as there is no result
+(define_insn_reservation "tsv110_branch" 0
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "branch"))
+ "tsv110_alu2|tsv110_alu3")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to two words.
+(define_insn_reservation "tsv110_load1" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "load1,load2"))
+ "tsv110_ls1|tsv110_ls2")
+
+;; Stores of up to two words.
+(define_insn_reservation "tsv110_store1" 0
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "store1,store2"))
+ "tsv110_ls1|tsv110_ls2")
+
+;; Advanced SIMD Unit - Integer Arithmetic Instructions.
+
+(define_insn_reservation "tsv110_neon_abd_aba" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_abd,neon_arith_acc"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_neon_abd_aba_q" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_arith_acc_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_neon_arith_basic" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_arith_basic"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_neon_arith_complex" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_arith_complex"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+;; Integer Multiply Instructions.
+;; D-form
+(define_insn_reservation "tsv110_neon_multiply" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_multiply"))
+ "tsv110_fsu1")
+
+(define_insn_reservation "tsv110_neon_multiply_dlong" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_mul_d_long"))
+ "tsv110_fsu1")
+
+;; Q-form
+(define_insn_reservation "tsv110_neon_multiply_q" 8
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_multiply_q"))
+ "tsv110_fsu1")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+ "tsv110_neon_shift_acc" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_shift_acc,\
+ neon_shift_imm_basic,neon_shift_imm_complex,neon_shift_reg_basic,\
+ neon_shift_reg_complex"))
+ "tsv110_fsu1")
+
+(define_insn_reservation
+ "tsv110_neon_shift_acc_q" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_shift_reg_basic_q,\
+ neon_shift_reg_complex_q"))
+ "tsv110_fsu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+ "tsv110_neon_fp_negabs" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_negabs"))
+ "(tsv110_fsu1|tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_fp_arith" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_arith"))
+ "(tsv110_fsu1|tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_fp_arith_q" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_arith_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_minmax_q" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_fp_minmax_s_q,neon_fp_minmax_d_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_reductions_q" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_reductions_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_cvt_int" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_cvt_int,neon_fp_cvt_int_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_mul" 5
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_mul"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_mul_q" 5
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_mul_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_mla" 7
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_mla,\
+ neon_fp_recps_rsqrts"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_recpe_rsqrte" 3
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_recpe_rsqrte"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_mla_q" 7
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_mla_q,\
+ neon_fp_recps_rsqrts_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_fp_recpe_rsqrte_q" 3
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_fp_recpe_rsqrte_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+ "tsv110_neon_bitops" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_bitops"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_dup" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_from_gp,f_mcr"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_mov" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "f_mcrr"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_bitops_q" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_bitops_q"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_from_gp_q" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_from_gp_q"))
+ "(tsv110_alu1+tsv110_fsu1)|(tsv110_alu1+tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_to_gp" 3
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_to_gp,neon_to_gp_q"))
+ "tsv110_fsu1")
+
+;; Load Instructions.
+
+(define_insn_reservation
+ "tsv110_neon_ld1_lane" 8
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q,\
+ neon_load1_all_lanes,neon_load1_all_lanes_q"))
+ "(tsv110_ls1 + tsv110_fsu1)|(tsv110_ls1 + tsv110_fsu2)|(tsv110_ls2 + tsv110_fsu1)|(tsv110_ls2 + tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_ld1_reg1" 6
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "f_loads,f_loadd,neon_load1_1reg,neon_load1_1reg_q"))
+ "tsv110_ls1|tsv110_ls2")
+
+(define_insn_reservation
+ "tsv110_neon_ld1_reg2" 6
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load1_2reg,neon_load1_2reg_q"))
+ "tsv110_ls1|tsv110_ls2")
+
+(define_insn_reservation
+ "tsv110_neon_ld1_reg3" 7
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load1_3reg,neon_load1_3reg_q"))
+ "tsv110_ls1|tsv110_ls2")
+
+(define_insn_reservation
+ "tsv110_neon_ld1_reg4" 7
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load1_4reg,neon_load1_4reg_q"))
+ "tsv110_ls1|tsv110_ls2")
+
+(define_insn_reservation
+ "tsv110_neon_ld2" 8
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load1_2reg,neon_load1_2reg_q,\
+ neon_load2_2reg,neon_load2_2reg_q,neon_load2_all_lanes,\
+ neon_load2_all_lanes_q,neon_load2_one_lane,neon_load2_one_lane_q"))
+ "(tsv110_ls1 + tsv110_fsu1)|(tsv110_ls1 + tsv110_fsu2)|(tsv110_ls2 + tsv110_fsu1)|(tsv110_ls2 + tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_ld3" 9
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q,\
+ neon_load3_one_lane,neon_load3_one_lane_q,\
+ neon_load3_all_lanes,neon_load3_all_lanes_q"))
+ "(tsv110_ls1 + tsv110_fsu1)|(tsv110_ls1 + tsv110_fsu2)|(tsv110_ls2 + tsv110_fsu1)|(tsv110_ls2 + tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_ld4_lane" 9
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load4_all_lanes,neon_load4_all_lanes_q,\
+ neon_load4_one_lane,neon_load4_one_lane_q"))
+ "(tsv110_ls1 + tsv110_fsu1)|(tsv110_ls1 + tsv110_fsu2)|(tsv110_ls2 + tsv110_fsu1)|(tsv110_ls2 + tsv110_fsu2)")
+
+(define_insn_reservation
+ "tsv110_neon_ld4_reg" 11
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "neon_load4_all_lanes,neon_load4_all_lanes_q,\
+ neon_load4_one_lane,neon_load4_one_lane_q"))
+ "(tsv110_ls1 + tsv110_fsu1)|(tsv110_ls1 + tsv110_fsu2)|(tsv110_ls2 + tsv110_fsu1)|(tsv110_ls2 + tsv110_fsu2)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+ "tsv110_neon_store_a" 0
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_store_a"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation
+ "tsv110_neon_store_b" 0
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_store_b"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+;; These block issue for a number of cycles proportional to the number
+;; of 64-bit chunks they will store, we don't attempt to model that
+;; precisely, treat them as blocking execution for two cycles when
+;; issued.
+(define_insn_reservation
+ "tsv110_neon_store_complex" 0
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "tsv110_neon_type" "neon_store_complex"))
+ "tsv110_block*2")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "tsv110_fp_const" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "fconsts,fconstd,fmov"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_fp_add_sub" 5
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "fadds,faddd,fmuls,fmuld"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_fp_mac" 7
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "fmacs,ffmas,fmacd,ffmad"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_fp_cvt" 3
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "f_cvt"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_fp_cvtf2i" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "f_cvtf2i"))
+ "tsv110_fsu1")
+
+(define_insn_reservation "tsv110_fp_cvti2f" 5
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "f_cvti2f"))
+ "(tsv110_alu1+tsv110_fsu1)|(tsv110_alu1+tsv110_fsu2)")
+
+(define_insn_reservation "tsv110_fp_cmp" 4
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "fcmps,fcmpd"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_fp_arith" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "ffariths,ffarithd"))
+ "tsv110_fsu1|tsv110_fsu2")
+
+(define_insn_reservation "tsv110_fp_divs" 12
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "fdivs,neon_fp_div_s,fdivd,neon_fp_div_d,\
+ neon_fp_div_s_q,neon_fp_div_d_q"))
+ "tsv110_fsu1")
+
+(define_insn_reservation "tsv110_fp_sqrts" 24
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "fsqrts,neon_fp_sqrt_s,fsqrtd,neon_fp_sqrt_d,\
+ neon_fp_sqrt_s_q,neon_fp_sqrt_d_q"))
+ "tsv110_fsu2")
+
+(define_insn_reservation "tsv110_crypto_aes" 3
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "crypto_aese,crypto_aesmc"))
+ "tsv110_fsu1")
+
+(define_insn_reservation "tsv110_crypto_sha1_fast" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "crypto_sha1_fast,crypto_sha1_xor"))
+ "(tsv110_fsu1|tsv110_fsu2)")
+
+(define_insn_reservation "tsv110_crypto_sha256_fast" 2
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "crypto_sha256_fast"))
+ "tsv110_fsu1")
+
+(define_insn_reservation "tsv110_crypto_complex" 5
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "crypto_sha1_slow,crypto_sha256_slow"))
+ "tsv110_fsu1")
+
+;; We lie with calls. They take up all issue slots, but are otherwise
+;; not harmful.
+(define_insn_reservation "tsv110_call" 1
+ (and (eq_attr "tune" "tsv110")
+ (eq_attr "type" "call"))
+ "tsv110_alu1_issue+tsv110_alu2_issue+tsv110_alu3_issue+tsv110_fsu1_issue+tsv110_fsu2_issue\
+ +tsv110_mdu_issue+tsv110_ls1_issue+tsv110_ls2_issue"
+)
+
+;; Simple execution unit bypasses
+(define_bypass 1 "tsv110_alu"
+ "tsv110_alu,tsv110_alu_shift")
+(define_bypass 2 "tsv110_alu_shift"
+ "tsv110_alu,tsv110_alu_shift")
+
+;; An MLA or a MUL can feed a dependent MLA.
+(define_bypass 3 "tsv110_neon_*mla*,tsv110_neon_*mul*"
+ "tsv110_neon_*mla*")
+
+;; We don't need to care about control hazards: either the branch is
+;; predicted, in which case we pay no penalty, or the branch is
+;; mispredicted, in which case instruction scheduling is unlikely to
+;; help.
+(define_bypass 1 "tsv110_*"
+ "tsv110_call,tsv110_branch")


@@ -0,0 +1,60 @@
diff -urp a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
--- a/gcc/config/arm/arm.c 2019-01-18 11:25:20.840179114 +0800
+++ b/gcc/config/arm/arm.c 2019-01-18 11:25:47.548179817 +0800
@@ -14306,18 +14306,36 @@ gen_movmem_ldrd_strd (rtx *operands)
emit_move_insn (reg0, src);
else
{
- emit_insn (gen_unaligned_loadsi (low_reg, src));
- src = next_consecutive_mem (src);
- emit_insn (gen_unaligned_loadsi (hi_reg, src));
+ if (flag_lsrd_be_adjust && BYTES_BIG_ENDIAN && WORDS_BIG_ENDIAN)
+ {
+ emit_insn (gen_unaligned_loadsi (hi_reg, src));
+ src = next_consecutive_mem (src);
+ emit_insn (gen_unaligned_loadsi (low_reg, src));
+ }
+ else
+ {
+ emit_insn (gen_unaligned_loadsi (low_reg, src));
+ src = next_consecutive_mem (src);
+ emit_insn (gen_unaligned_loadsi (hi_reg, src));
+ }
}
if (dst_aligned)
emit_move_insn (dst, reg0);
else
{
- emit_insn (gen_unaligned_storesi (dst, low_reg));
- dst = next_consecutive_mem (dst);
- emit_insn (gen_unaligned_storesi (dst, hi_reg));
+ if (flag_lsrd_be_adjust && BYTES_BIG_ENDIAN && WORDS_BIG_ENDIAN)
+ {
+ emit_insn (gen_unaligned_storesi (dst, hi_reg));
+ dst = next_consecutive_mem (dst);
+ emit_insn (gen_unaligned_storesi (dst, low_reg));
+ }
+ else
+ {
+ emit_insn (gen_unaligned_storesi (dst, low_reg));
+ dst = next_consecutive_mem (dst);
+ emit_insn (gen_unaligned_storesi (dst, hi_reg));
+ }
}
src = next_consecutive_mem (src);
diff -urp a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
--- a/gcc/config/arm/arm.opt 2019-01-18 11:25:20.840179114 +0800
+++ b/gcc/config/arm/arm.opt 2019-01-18 11:28:51.744184666 +0800
@@ -274,6 +274,10 @@ masm-syntax-unified
Target Report Var(inline_asm_unified) Init(0) Save
Assume unified syntax for inline assembly code.
+mlsrd-be-adjust
+Target Report Var(flag_lsrd_be_adjust) Init(1)
+Adjust ldrd/strd splitting order when it's big-endian.
+
mpure-code
Target Report Var(target_pure_code) Init(0)
Do not allow constant data to be placed in code sections.
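The new -mlsrd-be-adjust option (on by default) only changes the order in which gen_movmem_ldrd_strd emits the two unaligned 32-bit halves on big-endian targets. A hedged sketch of the kind of copy that can reach this expansion follows; whether this particular copy actually goes through gen_movmem_ldrd_strd depends on alignment, size and other target options, so treat it as an assumption for illustration.

/* Illustrative only: an 8-byte copy between possibly unaligned objects.
   When arm's block-move expansion splits it into two unaligned 32-bit
   loads/stores, a big-endian build with the default -mlsrd-be-adjust
   transfers the half destined for the high register first;
   -mno-lsrd-be-adjust keeps the original low-half-first order.  */
struct __attribute__ ((packed)) pair
{
  unsigned int lo;
  unsigned int hi;
};

void
copy_pair (struct pair *dst, const struct pair *src)
{
  *dst = *src;
}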


@@ -0,0 +1,19 @@
diff -urpN gcc-7.3.0-bak/gcc/config/arm/arm.c gcc-7.3.0/gcc/config/arm/arm.c
--- gcc-7.3.0-bak/gcc/config/arm/arm.c 2018-11-13 14:23:21.362347728 +0800
+++ gcc-7.3.0/gcc/config/arm/arm.c 2018-11-13 14:31:15.722360215 +0800
@@ -26853,7 +26853,14 @@ static bool
arm_array_mode_supported_p (machine_mode mode,
unsigned HOST_WIDE_INT nelems)
{
- if (TARGET_NEON
+
+
+ /* We don't want to enable interleaved loads and stores for BYTES_BIG_ENDIAN
+ for now, as the lane-swapping logic needs to be extended in the expanders.
+ See PR target/82518. */
+
+
+ if (TARGET_NEON && !BYTES_BIG_ENDIAN
&& (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
&& (nelems >= 2 && nelems <= 4))
return true;
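arm_array_mode_supported_p is what lets the vectorizer use the NEON structure load/store array modes (vld2/vst2 and friends); the hunk above switches that off for big-endian until the lane-swapping in the expanders is extended. A hedged, editor-added example of a loop whose vectorization strategy this gates:

/* Illustrative only: a de-interleaving loop that little-endian NEON
   typically vectorizes with structure loads (vld2).  After the change
   above, a big-endian build no longer advertises the array modes, so
   the vectorizer must pick another strategy or leave the loop scalar.  */
void
deinterleave (const short *in, short *re, short *im, int n)
{
  for (int i = 0; i < n; i++)
    {
      re[i] = in[2 * i];
      im[i] = in[2 * i + 1];
    }
}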


@@ -0,0 +1,25 @@
diff -Nurp a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
--- a/gcc/config/arm/arm.md 2019-08-10 00:21:12.658523444 +0800
+++ b/gcc/config/arm/arm.md 2019-08-10 00:21:53.478521496 +0800
@@ -5337,7 +5337,9 @@
#
ldrh%?\\t%0, %1"
[(set_attr "type" "alu_shift_reg,load_byte")
- (set_attr "predicable" "yes")]
+ (set_attr "predicable" "yes")
+ (set_attr "pool_range" "*,256")
+ (set_attr "neg_pool_range" "*,244")]
)
(define_insn "*arm_zero_extendhisi2_v6"
@@ -5348,7 +5350,9 @@
uxth%?\\t%0, %1
ldrh%?\\t%0, %1"
[(set_attr "predicable" "yes")
- (set_attr "type" "extend,load_byte")]
+ (set_attr "type" "extend,load_byte")
+ (set_attr "pool_range" "*,256")
+ (set_attr "neg_pool_range" "*,244")]
)
(define_insn "*arm_zero_extendhisi2addsi"

cloog-0.18.4.tar.gz Normal file

Binary file not shown.


@@ -0,0 +1,21 @@
diff -N -urp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c 2018-11-16 18:02:11.000000000 +0800
+++ b/gcc/config/aarch64/aarch64.c 2018-11-16 18:07:39.000000000 +0800
@@ -6102,7 +6102,7 @@ aarch64_elf_asm_constructor (rtx symbol,
-Wformat-truncation false positive, use a larger size. */
char buf[23];
snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
- s = get_section (buf, SECTION_WRITE, NULL);
+ s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
switch_to_section (s);
assemble_align (POINTER_SIZE);
assemble_aligned_integer (POINTER_BYTES, symbol);
@@ -6122,7 +6122,7 @@ aarch64_elf_asm_destructor (rtx symbol,
-Wformat-truncation false positive, use a larger size. */
char buf[23];
snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
- s = get_section (buf, SECTION_WRITE, NULL);
+ s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
switch_to_section (s);
assemble_align (POINTER_SIZE);
assemble_aligned_integer (POINTER_BYTES, symbol);
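Only constructors and destructors with an explicit, non-default priority go through these hooks and into the numbered .init_array.NNNNN/.fini_array.NNNNN sections, which are now requested with SECTION_NOTYPE, i.e. without an explicit section type in the .section directive. A small editor-added example that exercises that path (the priority value is arbitrary):

/* Illustrative only: priority ctors/dtors are emitted into numbered
   .init_array.NNNNN/.fini_array.NNNNN sections by the hooks patched
   above; default-priority ones typically take a different path.  */
extern int puts (const char *);

__attribute__ ((constructor (200)))
static void
early_init (void)
{
  puts ("early_init");
}

__attribute__ ((destructor (200)))
static void
late_fini (void)
{
  puts ("late_fini");
}

int
main (void)
{
  return 0;
}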


@@ -0,0 +1,155 @@
diff -N -urp a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
--- a/gcc/config/i386/sse.md 2019-10-30 10:02:45.894920908 +0800
+++ b/gcc/config/i386/sse.md 2019-10-30 10:17:39.682887612 +0800
@@ -16012,9 +16012,11 @@
switch (INTVAL (operands[4]))
{
case 3:
- return "vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+ /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+ gas changed what it requires incompatibly. */
+ return "vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
case 2:
- return "vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+ return "vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
default:
gcc_unreachable ();
}
@@ -16057,9 +16059,11 @@
switch (INTVAL (operands[4]))
{
case 3:
- return "vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+ /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+ gas changed what it requires incompatibly. */
+ return "vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
case 2:
- return "vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+ return "vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
default:
gcc_unreachable ();
}
@@ -16103,10 +16107,12 @@
{
case 3:
case 7:
- return "vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+ /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+ gas changed what it requires incompatibly. */
+ return "vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
case 2:
case 6:
- return "vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+ return "vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
default:
gcc_unreachable ();
}
@@ -16150,10 +16156,12 @@
{
case 3:
case 7:
- return "vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+ /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+ gas changed what it requires incompatibly. */
+ return "vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
case 2:
case 6:
- return "vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+ return "vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
default:
gcc_unreachable ();
}
@@ -19153,12 +19161,6 @@
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
-;; Memory operand override for -masm=intel of the v*gatherq* patterns.
-(define_mode_attr gatherq_mode
- [(V4SI "q") (V2DI "x") (V4SF "q") (V2DF "x")
- (V8SI "x") (V4DI "t") (V8SF "x") (V4DF "t")
- (V16SI "t") (V8DI "g") (V16SF "t") (V8DF "g")])
-
(define_expand "<avx512>_gathersi<mode>"
[(parallel [(set (match_operand:VI48F 0 "register_operand")
(unspec:VI48F
@@ -19192,7 +19194,9 @@
UNSPEC_GATHER))
(clobber (match_scratch:<avx512fmaskmode> 2 "=&Yk"))]
"TARGET_AVX512F"
- "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %<xtg_mode>6}"
+;; %X6 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+ "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %X6}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
@@ -19211,7 +19215,9 @@
UNSPEC_GATHER))
(clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
"TARGET_AVX512F"
- "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<xtg_mode>5}"
+;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+ "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %X5}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
@@ -19250,9 +19256,9 @@
UNSPEC_GATHER))
(clobber (match_scratch:QI 2 "=&Yk"))]
"TARGET_AVX512F"
-{
- return "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %<gatherq_mode>6}";
-}
+;; %X6 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+ "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %X6}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
@@ -19272,14 +19278,16 @@
(clobber (match_scratch:QI 1 "=&Yk"))]
"TARGET_AVX512F"
{
+ /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+ gas changed what it requires incompatibly. */
if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
{
if (<MODE_SIZE> != 64)
- return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %<gatherq_mode>5}";
+ return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %X5}";
else
- return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %t5}";
+ return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %X5}";
}
- return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<gatherq_mode>5}";
+ return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %X5}";
}
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
@@ -19316,7 +19324,9 @@
UNSPEC_SCATTER))
(clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
"TARGET_AVX512F"
- "v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}"
+;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+ "v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%X5%{%1%}, %3}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
@@ -19352,11 +19362,9 @@
UNSPEC_SCATTER))
(clobber (match_scratch:QI 1 "=&Yk"))]
"TARGET_AVX512F"
-{
- if (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) == 8)
- return "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}";
- return "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%t5%{%1%}, %3}";
-}
+;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+ "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%X5%{%1%}, %3}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])

floop-interchange.patch Normal file

File diff suppressed because it is too large.

floop-unroll-and-jam.patch Normal file

@@ -0,0 +1,905 @@
diff -N -urp a/gcc/Makefile.in b/gcc/Makefile.in
--- a/gcc/Makefile.in 2018-11-07 11:37:24.615223860 +0800
+++ b/gcc/Makefile.in 2018-11-07 11:38:26.155223860 +0800
@@ -1292,6 +1292,7 @@ OBJS = \
gimple-iterator.o \
gimple-fold.o \
gimple-laddress.o \
+ gimple-loop-jam.o \
gimple-low.o \
gimple-pretty-print.o \
gimple-ssa-backprop.o \
diff -N -urp a/gcc/cfgloop.c b/gcc/cfgloop.c
--- a/gcc/cfgloop.c 2018-11-07 11:37:24.947223860 +0800
+++ b/gcc/cfgloop.c 2018-11-07 11:38:26.155223860 +0800
@@ -296,13 +296,25 @@ establish_preds (struct loop *loop, stru
/* Add LOOP to the loop hierarchy tree where FATHER is father of the
added loop. If LOOP has some children, take care of that their
- pred field will be initialized correctly. */
+ pred field will be initialized correctly. If AFTER is non-null
+ then it's expected it's a pointer into FATHERs inner sibling
+ list and LOOP is added behind AFTER, otherwise it's added in front
+ of FATHERs siblings. */
void
-flow_loop_tree_node_add (struct loop *father, struct loop *loop)
+flow_loop_tree_node_add (struct loop *father, struct loop *loop,
+ struct loop *after)
{
- loop->next = father->inner;
- father->inner = loop;
+ if (after)
+ {
+ loop->next = after->next;
+ after->next = loop;
+ }
+ else
+ {
+ loop->next = father->inner;
+ father->inner = loop;
+ }
establish_preds (loop, father);
}
diff -N -urp a/gcc/cfgloop.h b/gcc/cfgloop.h
--- a/gcc/cfgloop.h 2018-11-07 11:37:24.331223860 +0800
+++ b/gcc/cfgloop.h 2018-11-07 11:38:26.155223860 +0800
@@ -324,7 +324,8 @@ void record_loop_exits (void);
void rescan_loop_exit (edge, bool, bool);
/* Loop data structure manipulation/querying. */
-extern void flow_loop_tree_node_add (struct loop *, struct loop *);
+extern void flow_loop_tree_node_add (struct loop *, struct loop *,
+ struct loop * = NULL);
extern void flow_loop_tree_node_remove (struct loop *);
extern bool flow_loop_nested_p (const struct loop *, const struct loop *);
extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block);
diff -N -urp a/gcc/cfgloopmanip.c b/gcc/cfgloopmanip.c
--- a/gcc/cfgloopmanip.c 2018-11-07 11:37:24.847223860 +0800
+++ b/gcc/cfgloopmanip.c 2018-11-07 11:38:26.155223860 +0800
@@ -1026,9 +1026,11 @@ copy_loop_info (struct loop *loop, struc
}
/* Copies copy of LOOP as subloop of TARGET loop, placing newly
- created loop into loops structure. */
+ created loop into loops structure. If AFTER is non-null
+ the new loop is added at AFTER->next, otherwise in front of TARGETs
+ sibling list. */
struct loop *
-duplicate_loop (struct loop *loop, struct loop *target)
+duplicate_loop (struct loop *loop, struct loop *target, struct loop *after)
{
struct loop *cloop;
cloop = alloc_loop ();
@@ -1040,36 +1042,46 @@ duplicate_loop (struct loop *loop, struc
set_loop_copy (loop, cloop);
/* Add it to target. */
- flow_loop_tree_node_add (target, cloop);
+ flow_loop_tree_node_add (target, cloop, after);
return cloop;
}
/* Copies structure of subloops of LOOP into TARGET loop, placing
- newly created loops into loop tree. */
+ newly created loops into loop tree at the end of TARGETs sibling
+ list in the original order. */
void
duplicate_subloops (struct loop *loop, struct loop *target)
{
- struct loop *aloop, *cloop;
+ struct loop *aloop, *cloop, *tail;
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
+ ;
for (aloop = loop->inner; aloop; aloop = aloop->next)
{
- cloop = duplicate_loop (aloop, target);
+ cloop = duplicate_loop (aloop, target, tail);
+ tail = cloop;
+ gcc_assert (!tail->next);
duplicate_subloops (aloop, cloop);
}
}
/* Copies structure of subloops of N loops, stored in array COPIED_LOOPS,
- into TARGET loop, placing newly created loops into loop tree. */
+ into TARGET loop, placing newly created loops into loop tree adding
+ them to TARGETs sibling list at the end in order. */
static void
copy_loops_to (struct loop **copied_loops, int n, struct loop *target)
{
- struct loop *aloop;
+ struct loop *aloop, *tail;
int i;
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
+ ;
for (i = 0; i < n; i++)
{
- aloop = duplicate_loop (copied_loops[i], target);
+ aloop = duplicate_loop (copied_loops[i], target, tail);
+ tail = aloop;
+ gcc_assert (!tail->next);
duplicate_subloops (copied_loops[i], aloop);
}
}
@@ -1133,14 +1145,15 @@ set_zero_probability (edge e)
}
/* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating
- loop structure and dominators. E's destination must be LOOP header for
- this to work, i.e. it must be entry or latch edge of this loop; these are
- unique, as the loops must have preheaders for this function to work
- correctly (in case E is latch, the function unrolls the loop, if E is entry
- edge, it peels the loop). Store edges created by copying ORIG edge from
- copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to
- original LOOP body, the other copies are numbered in order given by control
- flow through them) into TO_REMOVE array. Returns false if duplication is
+ loop structure and dominators (order of inner subloops is retained).
+ E's destination must be LOOP header for this to work, i.e. it must be entry
+ or latch edge of this loop; these are unique, as the loops must have
+ preheaders for this function to work correctly (in case E is latch, the
+ function unrolls the loop, if E is entry edge, it peels the loop). Store
+ edges created by copying ORIG edge from copies corresponding to set bits in
+ WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies
+ are numbered in order given by control flow through them) into TO_REMOVE
+ array. Returns false if duplication is
impossible. */
bool
diff -N -urp a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h
--- a/gcc/cfgloopmanip.h 2018-11-07 11:37:24.939223860 +0800
+++ b/gcc/cfgloopmanip.h 2018-11-07 11:38:26.155223860 +0800
@@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge,
unsigned, unsigned);
extern void unloop (struct loop *, bool *, bitmap);
extern void copy_loop_info (struct loop *loop, struct loop *target);
-extern struct loop * duplicate_loop (struct loop *, struct loop *);
+extern struct loop * duplicate_loop (struct loop *, struct loop *,
+ struct loop * = NULL);
extern void duplicate_subloops (struct loop *, struct loop *);
extern bool can_duplicate_loop_p (const struct loop *loop);
extern bool duplicate_loop_to_header_edge (struct loop *, edge,
diff -N -urp a/gcc/common.opt b/gcc/common.opt
--- a/gcc/common.opt 2018-11-07 11:37:24.859223860 +0800
+++ b/gcc/common.opt 2018-11-07 11:38:26.159223860 +0800
@@ -1496,8 +1496,8 @@ Common Alias(floop-nest-optimize)
Enable loop nest transforms. Same as -floop-nest-optimize.
floop-unroll-and-jam
-Common Alias(floop-nest-optimize)
-Enable loop nest transforms. Same as -floop-nest-optimize.
+Common Report Var(flag_unroll_jam) Optimization
+Perform unroll-and-jam on loops.
fgnu-tm
Common Report Var(flag_tm)
diff -N -urp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
--- a/gcc/doc/invoke.texi 2018-11-07 11:37:24.915223860 +0800
+++ b/gcc/doc/invoke.texi 2018-11-07 11:39:49.031223860 +0800
@@ -7120,7 +7120,8 @@ Optimize yet more. @option{-O3} turns o
by @option{-O2} and also turns on the @option{-finline-functions},
@option{-funswitch-loops}, @option{-fpredictive-commoning},
@option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
-@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}
+@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths},
+@option{-floop-unroll-and-jam},
@option{-ftree-slp-vectorize}, @option{-fvect-cost-model},
@option{-ftree-partial-pre}, @option{-fpeel-loops}
and @option{-fipa-cp-clone} options.
@@ -8226,12 +8227,10 @@ at @option{-O} and higher.
@itemx -floop-interchange
@itemx -floop-strip-mine
@itemx -floop-block
-@itemx -floop-unroll-and-jam
@opindex ftree-loop-linear
@opindex floop-interchange
@opindex floop-strip-mine
@opindex floop-block
-@opindex floop-unroll-and-jam
Perform loop nest optimizations. Same as
@option{-floop-nest-optimize}. To use this code transformation, GCC has
to be configured with @option{--with-isl} to enable the Graphite loop
@@ -8323,6 +8322,12 @@ ENDDO
@end smallexample
and the initialization loop is transformed into a call to memset zero.
+@item -floop-unroll-and-jam
+@opindex floop-unroll-and-jam
+Apply unroll and jam transformations on feasible loops. In a loop
+nest this unrolls the outer loop by some factor and fuses the resulting
+multiple inner loops. This flag is enabled by default at @option{-O3}.
+
@item -ftree-loop-im
@opindex ftree-loop-im
Perform loop invariant motion on trees. This pass moves only invariants that
@@ -10353,13 +10358,13 @@ loop in the loop nest by a given number
length can be changed using the @option{loop-block-tile-size}
parameter. The default value is 51 iterations.
-@item loop-unroll-jam-size
-Specify the unroll factor for the @option{-floop-unroll-and-jam} option. The
-default value is 4.
-
-@item loop-unroll-jam-depth
-Specify the dimension to be unrolled (counting from the most inner loop)
-for the @option{-floop-unroll-and-jam}. The default value is 2.
+@item unroll-jam-min-percent
+The minimum percentage of memory references that must be optimized
+away for the unroll-and-jam transformation to be considered profitable.
+
+@item unroll-jam-max-unroll
+The maximum number of times the outer loop should be unrolled by
+the unroll-and-jam transformation.
@item ipa-cp-value-list-size
IPA-CP attempts to track all possible values and types passed to a function's
diff -N -urp a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
--- a/gcc/gimple-loop-jam.c 1970-01-01 08:00:00.000000000 +0800
+++ b/gcc/gimple-loop-jam.c 2018-11-07 11:38:26.167223860 +0800
@@ -0,0 +1,598 @@
+/* Loop unroll-and-jam.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "ssa.h"
+#include "fold-const.h"
+#include "tree-cfg.h"
+#include "tree-ssa.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-ssa-loop-manip.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
+#include "tree-data-ref.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-vectorizer.h"
+
+/* Unroll and Jam transformation
+
+ This is a combination of two transformations, where the second
+ is not always valid. It's applicable if a loop nest has redundancies
+ over the iterations of an outer loop while not having that with
+ an inner loop.
+
+ Given this nest:
+ for (i) {
+ for (j) {
+ B (i,j)
+ }
+ }
+
+ first unroll:
+ for (i by 2) {
+ for (j) {
+ B (i,j)
+ }
+ for (j) {
+ B (i+1,j)
+ }
+ }
+
+ then fuse the two adjacent inner loops resulting from that:
+ for (i by 2) {
+ for (j) {
+ B (i,j)
+ B (i+1,j)
+ }
+ }
+
+ As the order of evaluations of the body B changes this is valid
+ only in certain situations: all distance vectors need to be forward.
+   Additionally, if there are induction variables other than just
+   a counting control IV (j above), we can also deal with some situations.
+
+ The validity is checked by unroll_jam_possible_p, and the data-dep
+ testing below.
+
+ A trivial example where the fusion is wrong would be when
+ B (i,j) == x[j-1] = x[j];
+ for (i by 2) {
+ for (j) {
+ x[j-1] = x[j];
+ }
+ for (j) {
+ x[j-1] = x[j];
+ }
+ } effect: move content to front by two elements
+ -->
+ for (i by 2) {
+ for (j) {
+ x[j-1] = x[j];
+ x[j-1] = x[j];
+ }
+ } effect: move content to front by one element
+*/
+
+/* Modify the loop tree for the fact that all code once belonging
+ to the OLD loop or the outer loop of OLD now is inside LOOP. */
+
+static void
+merge_loop_tree (struct loop *loop, struct loop *old)
+{
+ basic_block *bbs;
+ int i, n;
+ struct loop *subloop;
+ edge e;
+ edge_iterator ei;
+
+ /* Find its nodes. */
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
+ n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
+
+ for (i = 0; i < n; i++)
+ {
+ /* If the block was direct child of OLD loop it's now part
+ of LOOP. If it was outside OLD, then it moved into LOOP
+ as well. This avoids changing the loop father for BBs
+ in inner loops of OLD. */
+ if (bbs[i]->loop_father == old
+ || loop_depth (bbs[i]->loop_father) < loop_depth (old))
+ {
+ remove_bb_from_loops (bbs[i]);
+ add_bb_to_loop (bbs[i], loop);
+ continue;
+ }
+
+ /* If we find a direct subloop of OLD, move it to LOOP. */
+ subloop = bbs[i]->loop_father;
+ if (loop_outer (subloop) == old && subloop->header == bbs[i])
+ {
+ flow_loop_tree_node_remove (subloop);
+ flow_loop_tree_node_add (loop, subloop);
+ }
+ }
+
+ /* Update the information about loop exit edges. */
+ for (i = 0; i < n; i++)
+ {
+ FOR_EACH_EDGE (e, ei, bbs[i]->succs)
+ {
+ rescan_loop_exit (e, false, false);
+ }
+ }
+
+ loop->num_nodes = n;
+
+ free (bbs);
+}
+
+/* BB is part of the outer loop of an unroll-and-jam situation.
+ Check if any statements therein would prevent the transformation. */
+
+static bool
+bb_prevents_fusion_p (basic_block bb)
+{
+ gimple_stmt_iterator gsi;
+ /* BB is duplicated by outer unrolling and then all N-1 first copies
+ move into the body of the fused inner loop. If BB exits the outer loop
+ the last copy still does so, and the first N-1 copies are cancelled
+ by loop unrolling, so also after fusion it's the exit block.
+ But there might be other reasons that prevent fusion:
+ * stores or unknown side-effects prevent fusion
+ * loads don't
+ * computations into SSA names: these aren't problematic. Their
+ result will be unused on the exit edges of the first N-1 copies
+ (those aren't taken after unrolling). If they are used on the
+ other edge (the one leading to the outer latch block) they are
+ loop-carried (on the outer loop) and the Nth copy of BB will
+ compute them again (i.e. the first N-1 copies will be dead). */
+ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *g = gsi_stmt (gsi);
+ if (gimple_vdef (g) || gimple_has_side_effects (g))
+ return true;
+ }
+ return false;
+}
+
+/* Given an inner loop LOOP (of some OUTER loop) determine if
+ we can safely fuse copies of it (generated by outer unrolling).
+ If so return true, otherwise return false. */
+
+static bool
+unroll_jam_possible_p (struct loop *outer, struct loop *loop)
+{
+ basic_block *bbs;
+ int i, n;
+ struct tree_niter_desc niter;
+
+ /* When fusing the loops we skip the latch block
+ of the first one, so it mustn't have any effects to
+ preserve. */
+ if (!empty_block_p (loop->latch))
+ return false;
+
+ if (!single_exit (loop))
+ return false;
+
+ /* We need a perfect nest. Quick check for adjacent inner loops. */
+ if (outer->inner != loop || loop->next)
+ return false;
+
+ /* Prevent head-controlled inner loops, that we usually have.
+ The guard block would need to be accepted
+ (invariant condition either entering or skipping the loop),
+ without also accepting arbitrary control flow. When unswitching
+ ran before us (as with -O3) this won't be a problem because its
+ outer loop unswitching will have moved out the invariant condition.
+
+ If we do that we need to extend fuse_loops () to cope with this
+ by threading through the (still invariant) copied condition
+ between the two loop copies. */
+ if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
+ return false;
+
+ /* The number of iterations of the inner loop must be loop invariant
+ with respect to the outer loop. */
+ if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
+ false, true)
+ || niter.cmp == ERROR_MARK
+ || !integer_zerop (niter.may_be_zero)
+ || !expr_invariant_in_loop_p (outer, niter.niter))
+ return false;
+
+ /* If the inner loop produces any values that are used inside the
+ outer loop (except the virtual op) then it can flow
+ back (perhaps indirectly) into the inner loop. This prevents
+ fusion: without fusion the value at the last iteration is used,
+ with fusion the value after the initial iteration is used.
+
+ If all uses are outside the outer loop this doesn't prevent fusion;
+ the value of the last iteration is still used (and the values from
+ all intermediate iterations are dead). */
+ gphi_iterator psi;
+ for (psi = gsi_start_phis (single_exit (loop)->dest);
+ !gsi_end_p (psi); gsi_next (&psi))
+ {
+ imm_use_iterator imm_iter;
+ use_operand_p use_p;
+ tree op = gimple_phi_result (psi.phi ());
+ if (virtual_operand_p (op))
+ continue;
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
+ {
+ gimple *use_stmt = USE_STMT (use_p);
+ if (!is_gimple_debug (use_stmt)
+ && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
+ return false;
+ }
+ }
+
+ /* And check blocks belonging to just outer loop. */
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
+ n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
+
+ for (i = 0; i < n; i++)
+ if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
+ break;
+ free (bbs);
+ if (i != n)
+ return false;
+
+ /* For now we can safely fuse copies of LOOP only if all
+ loop carried variables are inductions (or the virtual op).
+
+ We could handle reductions as well (the initial value in the second
+ body would be the after-iter value of the first body) if it's over
+ an associative and commutative operation. We wouldn't
+ be able to handle unknown cycles. */
+ for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
+ {
+ affine_iv iv;
+ tree op = gimple_phi_result (psi.phi ());
+
+ if (virtual_operand_p (op))
+ continue;
+ if (!simple_iv (loop, loop, op, &iv, true))
+ return false;
+ /* The inductions must be regular, loop invariant step and initial
+ value. */
+ if (!expr_invariant_in_loop_p (outer, iv.step)
+ || !expr_invariant_in_loop_p (outer, iv.base))
+ return false;
+ /* XXX With more effort we could also be able to deal with inductions
+ where the initial value is loop variant but a simple IV in the
+ outer loop. The initial value for the second body would be
+ the original initial value plus iv.base.step. The next value
+ for the fused loop would be the original next value of the first
+ copy, _not_ the next value of the second body. */
+ }
+
+ return true;
+}
+
+/* Fuse LOOP with all further neighbors. The loops are expected to
+ be in appropriate form. */
+
+static void
+fuse_loops (struct loop *loop)
+{
+ struct loop *next = loop->next;
+
+ while (next)
+ {
+ edge e;
+
+ remove_branch (single_pred_edge (loop->latch));
+ /* Make delete_basic_block not fiddle with the loop structure. */
+ basic_block oldlatch = loop->latch;
+ loop->latch = NULL;
+ delete_basic_block (oldlatch);
+ e = redirect_edge_and_branch (loop_latch_edge (next),
+ loop->header);
+ loop->latch = e->src;
+ flush_pending_stmts (e);
+
+ gcc_assert (EDGE_COUNT (next->header->preds) == 1);
+
+ /* The PHI nodes of the second body (single-argument now)
+ need adjustments to use the right values: either directly
+ the value of the corresponding PHI in the first copy or
+ the one leaving the first body which unrolling did for us.
+
+ See also unroll_jam_possible_p () for further possibilities. */
+ gphi_iterator psi_first, psi_second;
+ e = single_pred_edge (next->header);
+ for (psi_first = gsi_start_phis (loop->header),
+ psi_second = gsi_start_phis (next->header);
+ !gsi_end_p (psi_first);
+ gsi_next (&psi_first), gsi_next (&psi_second))
+ {
+ gphi *phi_first = psi_first.phi ();
+ gphi *phi_second = psi_second.phi ();
+ tree firstop = gimple_phi_result (phi_first);
+ /* The virtual operand is correct already as it's
+ always live at exit, hence has a LCSSA node and outer
+ loop unrolling updated SSA form. */
+ if (virtual_operand_p (firstop))
+ continue;
+
+ /* Due to unroll_jam_possible_p () we know that this is
+ an induction. The second body goes over the same
+ iteration space. */
+ add_phi_arg (phi_second, firstop, e,
+ gimple_location (phi_first));
+ }
+ gcc_assert (gsi_end_p (psi_second));
+
+ merge_loop_tree (loop, next);
+ gcc_assert (!next->num_nodes);
+ struct loop *ln = next->next;
+ delete_loop (next);
+ next = ln;
+ }
+ rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
+}
+
+/* Returns true if the distance in DDR can be determined and adjusts
+ the unroll factor in *UNROLL to make unrolling valid for that distance.
+ Otherwise return false.
+
+ If this data dep can lead to a removed memory reference, increment
+ *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
+ for this to happen. */
+
+static bool
+adjust_unroll_factor (struct data_dependence_relation *ddr,
+ unsigned *unroll, unsigned *profit_unroll,
+ unsigned *removed)
+{
+ bool ret = false;
+ if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
+ {
+ if (DDR_NUM_DIST_VECTS (ddr) == 0)
+ return false;
+ unsigned i;
+ lambda_vector dist_v;
+ FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+ {
+ /* A distance (a,b) is at worst transformed into (a/N,b) by the
+ unrolling (factor N), so the transformation is valid if
+ a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
+ factor needs to be limited so that the first condition holds.
+ That may limit the factor down to zero in the worst case. */
+ int dist = dist_v[0];
+ if (dist < 0)
+ gcc_unreachable ();
+ else if ((unsigned)dist >= *unroll)
+ ;
+ else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
+ || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
+ && dist > 0))
+ ;
+ else
+ *unroll = dist;
+
+ /* With a distance (a,0) it's always profitable to unroll-and-jam
+ (by a+1), because one memory reference will go away. With
+ (a,b) and b != 0 that's less clear. We will increase the
+ number of streams without lowering the number of mem refs.
+ So for now only handle the first situation. */
+ if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
+ {
+ *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
+ (*removed)++;
+ }
+
+ ret = true;
+ }
+ }
+ return ret;
+}
+
+/* Main entry point for the unroll-and-jam transformation
+ described above. */
+
+static unsigned int
+tree_loop_unroll_and_jam (void)
+{
+ struct loop *loop;
+ bool changed = false;
+
+ gcc_assert (scev_initialized_p ());
+
+ /* Go through all innermost loops. */
+ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+ {
+ struct loop *outer = loop_outer (loop);
+
+ if (loop_depth (loop) < 2
+ || optimize_loop_nest_for_size_p (outer))
+ continue;
+
+ if (!unroll_jam_possible_p (outer, loop))
+ continue;
+
+ vec<data_reference_p> datarefs;
+ vec<ddr_p> dependences;
+ unsigned unroll_factor, profit_unroll, removed;
+ struct tree_niter_desc desc;
+ bool unroll = false;
+
+ auto_vec<loop_p, 3> loop_nest;
+ dependences.create (10);
+ datarefs.create (10);
+ if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
+ &datarefs, &dependences))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Cannot analyze data dependencies\n");
+ free_data_refs (datarefs);
+ free_dependence_relations (dependences);
+ return false;
+ }
+ if (!datarefs.length ())
+ continue;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_data_dependence_relations (dump_file, dependences);
+
+ unroll_factor = (unsigned)-1;
+ profit_unroll = 1;
+ removed = 0;
+
+ /* Check all dependencies. */
+ unsigned i;
+ struct data_dependence_relation *ddr;
+ FOR_EACH_VEC_ELT (dependences, i, ddr)
+ {
+ struct data_reference *dra, *drb;
+
+	  /* If the refs are independent there's nothing to do. */
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+ continue;
+ dra = DDR_A (ddr);
+ drb = DDR_B (ddr);
+ /* Nothing interesting for the self dependencies. */
+ if (dra == drb)
+ continue;
+
+ /* Now check the distance vector, for determining a sensible
+ outer unroll factor, and for validity of merging the inner
+ loop copies. */
+ if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
+ &removed))
+ {
+ /* Couldn't get the distance vector. For two reads that's
+ harmless (we assume we should unroll). For at least
+ one write this means we can't check the dependence direction
+ and hence can't determine safety. */
+
+ if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
+ {
+ unroll_factor = 0;
+ break;
+ }
+ }
+ }
+
+ /* We regard a user-specified minimum percentage of zero as a request
+ to ignore all profitability concerns and apply the transformation
+ always. */
+ if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
+ profit_unroll = 2;
+ else if (removed * 100 / datarefs.length ()
+ < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
+ profit_unroll = 1;
+ if (unroll_factor > profit_unroll)
+ unroll_factor = profit_unroll;
+ if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
+ unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
+ unroll = (unroll_factor > 1
+ && can_unroll_loop_p (outer, unroll_factor, &desc));
+
+ if (unroll)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
+ find_loop_location (outer),
+ "applying unroll and jam with factor %d\n",
+ unroll_factor);
+ initialize_original_copy_tables ();
+ tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
+ &desc);
+ free_original_copy_tables ();
+ fuse_loops (outer->inner);
+ changed = true;
+ }
+
+ loop_nest.release ();
+ free_dependence_relations (dependences);
+ free_data_refs (datarefs);
+ }
+
+ if (changed)
+ {
+ scev_reset ();
+ free_dominance_info (CDI_DOMINATORS);
+ return TODO_cleanup_cfg;
+ }
+ return 0;
+}
+
+/* Pass boilerplate. */
+
+namespace {
+
+const pass_data pass_data_loop_jam =
+{
+ GIMPLE_PASS, /* type. */
+ "unrolljam", /* name. */
+ OPTGROUP_LOOP, /* optinfo_flags. */
+ TV_LOOP_JAM, /* tv_id. */
+ PROP_cfg, /* properties_required. */
+ 0, /* properties_provided. */
+ 0, /* properties_destroyed. */
+ 0, /* todo_flags_start. */
+ 0, /* todo_flags_finish. */
+};
+
+class pass_loop_jam : public gimple_opt_pass
+{
+public:
+ pass_loop_jam (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_loop_jam, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return flag_unroll_jam != 0;
+ }
+ virtual unsigned int execute (function *);
+
+};
+
+unsigned int
+pass_loop_jam::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ return tree_loop_unroll_and_jam ();
+}
+
+}
+
+gimple_opt_pass *
+make_pass_loop_jam (gcc::context *ctxt)
+{
+ return new pass_loop_jam (ctxt);
+}
+
diff -N -urp a/gcc/opts.c b/gcc/opts.c
--- a/gcc/opts.c 2018-11-07 11:37:24.891223860 +0800
+++ b/gcc/opts.c 2018-11-07 11:38:26.171223860 +0800
@@ -534,6 +534,7 @@ static const struct default_options defa
{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 },
+ { OPT_LEVELS_3_PLUS, OPT_floop_unroll_and_jam, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 },
diff -N -urp a/gcc/params.def b/gcc/params.def
--- a/gcc/params.def 2018-11-07 11:37:27.543223860 +0800
+++ b/gcc/params.def 2018-11-07 11:38:26.171223860 +0800
@@ -1280,6 +1280,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK,
"Enable loop epilogue vectorization using smaller vector size.",
0, 0, 1)
+DEFPARAM (PARAM_UNROLL_JAM_MIN_PERCENT,
+ "unroll-jam-min-percent",
+ "Minimum percentage of memrefs that must go away for unroll-and-jam to be considered profitable.",
+ 1, 0, 100)
+
+DEFPARAM (PARAM_UNROLL_JAM_MAX_UNROLL,
+ "unroll-jam-max-unroll",
+ "Maximum unroll factor for the unroll-and-jam transformation.",
+ 4, 0, 0)
+
/*
Local variables:
diff -N -urp a/gcc/passes.def b/gcc/passes.def
--- a/gcc/passes.def 2018-11-07 11:37:24.859223860 +0800
+++ b/gcc/passes.def 2018-11-07 11:38:26.171223860 +0800
@@ -272,6 +272,7 @@ along with GCC; see the file COPYING3.
NEXT_PASS (pass_tree_unswitch);
NEXT_PASS (pass_scev_cprop);
NEXT_PASS (pass_loop_split);
+ NEXT_PASS (pass_loop_jam);
/* All unswitching, final value replacement and splitting can expose
empty loops. Remove them now. */
NEXT_PASS (pass_cd_dce);
diff -N -urp a/gcc/timevar.def b/gcc/timevar.def
--- a/gcc/timevar.def 2018-11-07 11:37:24.935223860 +0800
+++ b/gcc/timevar.def 2018-11-07 11:38:26.175223860 +0800
@@ -186,6 +186,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "
DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop")
DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
+DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
diff -N -urp a/gcc/tree-pass.h b/gcc/tree-pass.h
--- a/gcc/tree-pass.h 2018-11-07 11:37:24.887223860 +0800
+++ b/gcc/tree-pass.h 2018-11-07 11:38:26.175223860 +0800
@@ -369,6 +369,7 @@ extern gimple_opt_pass *make_pass_tree_l
extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_loop_jam (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt);
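Taken together, the patch adds the pass (gated on -floop-unroll-and-jam, enabled at -O3), its timevar, and the two profitability params. Below is a hedged, editor-added example of the kind of nest the pass targets, with an illustrative invocation; whether the transformation actually fires here depends on the dependence analysis and the parameter values, so it is a sketch rather than a guaranteed result.

/* Illustrative only: x[j] is re-read on every iteration of the outer
   i loop, so unrolling i and fusing the copies of the j loop lets one
   load of x[j] feed several rows; such removed memory references are
   what unroll-jam-min-percent counts.  Possible invocation (the dump
   name "unrolljam" comes from pass_data_loop_jam above):
     gcc -O3 --param unroll-jam-max-unroll=4 \
         -fdump-tree-unrolljam-details matvec.c  */
#define N 1024

void
matvec (double y[N], const double a[N][N], const double x[N])
{
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
      y[i] += a[i][j] * x[j];
}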


@@ -0,0 +1,768 @@
diff -N -urp a/gcc/combine-stack-adj.c b/gcc/combine-stack-adj.c
--- a/gcc/combine-stack-adj.c 2017-01-20 08:05:30.925466000 +0800
+++ b/gcc/combine-stack-adj.c 2019-01-10 17:10:16.606528459 +0800
@@ -508,6 +508,8 @@ combine_stack_adjustments_for_block (bas
continue;
set = single_set_for_csa (insn);
+ if (set && find_reg_note (insn, REG_STACK_CHECK, NULL_RTX))
+ set = NULL_RTX;
if (set)
{
rtx dest = SET_DEST (set);
diff -N -urp a/gcc/common.opt b/gcc/common.opt
--- a/gcc/common.opt 2019-01-10 13:33:20.926185828 +0800
+++ b/gcc/common.opt 2019-01-10 16:37:35.238476827 +0800
@@ -2336,13 +2336,18 @@ Common Report Var(flag_variable_expansio
Apply variable expansion when loops are unrolled.
fstack-check=
-Common Report RejectNegative Joined
+Common Report RejectNegative Joined Optimization
-fstack-check=[no|generic|specific] Insert stack checking code into the program.
fstack-check
Common Alias(fstack-check=, specific, no)
Insert stack checking code into the program. Same as -fstack-check=specific.
+fstack-clash-protection
+Common Report Var(flag_stack_clash_protection) Optimization
+Insert code to probe each page of stack space as it is allocated to protect
+from stack-clash style attacks.
+
fstack-limit
Common Var(common_deferred_options) Defer
diff -N -urp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c 2019-01-10 13:33:20.914185828 +0800
+++ b/gcc/config/aarch64/aarch64.c 2019-01-11 14:12:22.248521895 +0800
@@ -3881,12 +3881,14 @@ aarch64_expand_prologue (void)
{
if (crtl->is_leaf && !cfun->calls_alloca)
{
- if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
- aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
- frame_size - STACK_CHECK_PROTECT);
+ if (frame_size > PROBE_INTERVAL
+ && frame_size > get_stack_check_protect ())
+ aarch64_emit_probe_stack_range (get_stack_check_protect (),
+ (frame_size
+ - get_stack_check_protect ()));
}
else if (frame_size > 0)
- aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
+ aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
}
aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
diff -N -urp a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
--- a/gcc/config/i386/i386.c 2019-01-10 13:33:20.674185822 +0800
+++ b/gcc/config/i386/i386.c 2019-01-28 10:55:37.006876481 +0800
@@ -14396,7 +14396,7 @@ ix86_expand_prologue (void)
HOST_WIDE_INT size = allocate;
if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
- size = 0x80000000 - STACK_CHECK_PROTECT - 1;
+ size = 0x80000000 - get_stack_check_protect () - 1;
if (TARGET_STACK_PROBE)
{
@@ -14406,18 +14406,21 @@ ix86_expand_prologue (void)
ix86_emit_probe_stack_range (0, size);
}
else
- ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
+ ix86_emit_probe_stack_range (0,
+ size + get_stack_check_protect ());
}
else
{
if (crtl->is_leaf && !cfun->calls_alloca)
{
- if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
- ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
- size - STACK_CHECK_PROTECT);
+ if (size > PROBE_INTERVAL
+ && size > get_stack_check_protect ())
+ ix86_emit_probe_stack_range (get_stack_check_protect (),
+ (size
+ - get_stack_check_protect ()));
}
else
- ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
+ ix86_emit_probe_stack_range (get_stack_check_protect (), size);
}
}
}
diff -N -urp a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
--- a/gcc/config/ia64/ia64.c 2017-01-01 20:07:43.905435000 +0800
+++ b/gcc/config/ia64/ia64.c 2019-01-28 10:58:37.582881234 +0800
@@ -3481,15 +3481,16 @@ ia64_expand_prologue (void)
if (crtl->is_leaf && !cfun->calls_alloca)
{
- if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
- ia64_emit_probe_stack_range (STACK_CHECK_PROTECT,
- size - STACK_CHECK_PROTECT,
+ if (size > PROBE_INTERVAL && size > get_stack_check_protect ())
+ ia64_emit_probe_stack_range (get_stack_check_protect (),
+ size - get_stack_check_protect (),
bs_size);
- else if (size + bs_size > STACK_CHECK_PROTECT)
- ia64_emit_probe_stack_range (STACK_CHECK_PROTECT, 0, bs_size);
+ else if (size + bs_size > get_stack_check_protect ())
+ ia64_emit_probe_stack_range (get_stack_check_protect (),
+ 0, bs_size);
}
else if (size + bs_size > 0)
- ia64_emit_probe_stack_range (STACK_CHECK_PROTECT, size, bs_size);
+ ia64_emit_probe_stack_range (get_stack_check_protect (), size, bs_size);
}
if (dump_file)
diff -N -urp a/gcc/coretypes.h b/gcc/coretypes.h
--- a/gcc/coretypes.h 2017-01-01 20:07:43.905435000 +0800
+++ b/gcc/coretypes.h 2019-01-11 14:09:58.612518114 +0800
@@ -371,6 +371,7 @@ typedef unsigned char uchar;
#include "input.h"
#include "is-a.h"
#include "memory-block.h"
+#include "dumpfile.h"
#endif /* GENERATOR_FILE && !USED_FOR_TARGET */
#endif /* coretypes.h */
diff -N -urp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
--- a/gcc/doc/invoke.texi 2019-01-10 13:33:20.882185827 +0800
+++ b/gcc/doc/invoke.texi 2019-01-10 16:40:40.066481692 +0800
@@ -10050,6 +10050,21 @@ compilation without. The value for comp
needs to be more conservative (higher) in order to make tracer
effective.
+@item stack-clash-protection-guard-size
+Specify the size of the operating system provided stack guard as
+2 raised to @var{num} bytes. The default value is 12 (4096 bytes).
+Acceptable values are between 12 and 30. Higher values may reduce the
+number of explicit probes, but a value larger than the operating system
+provided guard will leave code vulnerable to stack clash style attacks.
+
+@item stack-clash-protection-probe-interval
+Stack clash protection involves probing stack space as it is allocated. This
+param controls the maximum distance between probes into the stack as 2 raised
+to @var{num} bytes. Acceptable values are between 10 and 16 and defaults to
+12. Higher values may reduce the number of explicit probes, but a value
+larger than the operating system provided guard will leave code vulnerable to
+stack clash style attacks.
+
@item max-cse-path-length
The maximum number of basic blocks on path that CSE considers.
@@ -11248,7 +11263,8 @@ target support in the compiler but comes
@enumerate
@item
Modified allocation strategy for large objects: they are always
-allocated dynamically if their size exceeds a fixed threshold.
+allocated dynamically if their size exceeds a fixed threshold. Note this
+may change the semantics of some code.
@item
Fixed limit on the size of the static frame of functions: when it is
@@ -11263,6 +11279,25 @@ generic implementation, code performance
Note that old-style stack checking is also the fallback method for
@samp{specific} if no target support has been added in the compiler.
+@samp{-fstack-check=} is designed for Ada's needs to detect infinite recursion
+and stack overflows. @samp{specific} is an excellent choice when compiling
+Ada code. It is not generally sufficient to protect against stack-clash
+attacks. To protect against those you want @samp{-fstack-clash-protection}.
+
+@item -fstack-clash-protection
+@opindex fstack-clash-protection
+Generate code to prevent stack clash style attacks. When this option is
+enabled, the compiler will only allocate one page of stack space at a time
+and each page is accessed immediately after allocation. Thus, it prevents
+allocations from jumping over any stack guard page provided by the
+operating system.
+
+Most targets do not fully support stack clash protection. However, on
+those targets @option{-fstack-clash-protection} will protect dynamic stack
+allocations. @option{-fstack-clash-protection} may also provide limited
+protection for static stack allocations if the target supports
+@option{-fstack-check=specific}.
+
@item -fstack-limit-register=@var{reg}
@itemx -fstack-limit-symbol=@var{sym}
@itemx -fno-stack-limit
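The documentation above is the user-visible half; the mechanical half is the new probing expansion added to explow.c later in this patch, which touches the stack at every probe interval as dynamic space is allocated instead of skipping a guard-sized region first. A hedged, editor-added example of code whose expansion changes under -fstack-clash-protection:

/* Illustrative only: a variable-length stack allocation.  With
   -fstack-clash-protection its expansion probes the new space every
   2^stack-clash-protection-probe-interval bytes as it is allocated,
   rather than relying on the skipped STACK_CHECK_PROTECT region that
   -fstack-check uses.  */
extern void consume (char *, unsigned long);

void
make_buffer (unsigned long n)
{
  char buf[n];
  for (unsigned long i = 0; i < n; i++)
    buf[i] = 0;
  consume (buf, n);
}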
diff -N -urp a/gcc/doc/tm.texi b/gcc/doc/tm.texi
--- a/gcc/doc/tm.texi 2017-04-05 01:52:27.193766000 +0800
+++ b/gcc/doc/tm.texi 2019-01-10 16:50:44.006497591 +0800
@@ -3419,6 +3419,10 @@ GCC computed the default from the values
normally not need to override that default.
@end defmac
+@deftypefn {Target Hook} bool TARGET_STACK_CLASH_PROTECTION_FINAL_DYNAMIC_PROBE (rtx @var{residual})
+Some targets make optimistic assumptions about the state of stack probing when they emit their prologues. On such targets a probe into the end of any dynamically allocated space is likely required for safety against stack clash style attacks. Define this variable to return nonzero if such a probe is required or zero otherwise. You need not define this macro if it would always have the value zero.
+@end deftypefn
+
@need 2000
@node Frame Registers
@subsection Registers That Address the Stack Frame
diff -N -urp a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
--- a/gcc/doc/tm.texi.in 2017-04-05 01:52:27.193766000 +0800
+++ b/gcc/doc/tm.texi.in 2019-01-10 16:51:41.530499105 +0800
@@ -2999,6 +2999,8 @@ GCC computed the default from the values
normally not need to override that default.
@end defmac
+@hook TARGET_STACK_CLASH_PROTECTION_FINAL_DYNAMIC_PROBE
+
@need 2000
@node Frame Registers
@subsection Registers That Address the Stack Frame
diff -N -urp a/gcc/explow.c b/gcc/explow.c
--- a/gcc/explow.c 2017-02-02 20:39:09.589196000 +0800
+++ b/gcc/explow.c 2019-01-10 16:56:07.454506105 +0800
@@ -39,8 +39,10 @@ along with GCC; see the file COPYING3.
#include "expr.h"
#include "common/common-target.h"
#include "output.h"
+#include "params.h"
static rtx break_out_memory_refs (rtx);
+static void anti_adjust_stack_and_probe_stack_clash (rtx);
/* Truncate and perhaps sign-extend C as appropriate for MODE. */
@@ -1271,6 +1273,29 @@ get_dynamic_stack_size (rtx *psize, unsi
*psize = size;
}
+/* Return the number of bytes to "protect" on the stack for -fstack-check.
+
+ "protect" in the context of -fstack-check means how many bytes we
+ should always ensure are available on the stack. More importantly
+ this is how many bytes are skipped when probing the stack.
+
+ On some targets we want to reuse the -fstack-check prologue support
+ to give a degree of protection against stack clashing style attacks.
+
+ In that scenario we do not want to skip bytes before probing as that
+ would render the stack clash protections useless.
+
+   So we never use STACK_CHECK_PROTECT directly. Instead we indirect through
+ this helper which allows us to provide different values for
+ -fstack-check and -fstack-clash-protection. */
+HOST_WIDE_INT
+get_stack_check_protect (void)
+{
+ if (flag_stack_clash_protection)
+ return 0;
+ return STACK_CHECK_PROTECT;
+}
+
/* Return an rtx representing the address of an area of memory dynamically
pushed on the stack.
@@ -1429,7 +1454,7 @@ allocate_dynamic_stack_space (rtx size,
probe_stack_range (STACK_OLD_CHECK_PROTECT + STACK_CHECK_MAX_FRAME_SIZE,
size);
else if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
- probe_stack_range (STACK_CHECK_PROTECT, size);
+ probe_stack_range (get_stack_check_protect (), size);
/* Don't let anti_adjust_stack emit notes. */
suppress_reg_args_size = true;
@@ -1482,6 +1507,8 @@ allocate_dynamic_stack_space (rtx size,
if (flag_stack_check && STACK_CHECK_MOVING_SP)
anti_adjust_stack_and_probe (size, false);
+ else if (flag_stack_clash_protection)
+ anti_adjust_stack_and_probe_stack_clash (size);
else
anti_adjust_stack (size);
@@ -1757,6 +1784,237 @@ probe_stack_range (HOST_WIDE_INT first,
emit_insn (gen_blockage ());
}
+/* Compute parameters for stack clash probing a dynamic stack
+ allocation of SIZE bytes.
+
+ We compute ROUNDED_SIZE, LAST_ADDR, RESIDUAL and PROBE_INTERVAL.
+
+ Additionally we conditionally dump the type of probing that will
+ be needed given the values computed. */
+
+void
+compute_stack_clash_protection_loop_data (rtx *rounded_size, rtx *last_addr,
+ rtx *residual,
+ HOST_WIDE_INT *probe_interval,
+ rtx size)
+{
+ /* Round SIZE down to STACK_CLASH_PROTECTION_PROBE_INTERVAL. */
+ *probe_interval
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+ *rounded_size = simplify_gen_binary (AND, Pmode, size,
+ GEN_INT (-*probe_interval));
+
+ /* Compute the value of the stack pointer for the last iteration.
+ It's just SP + ROUNDED_SIZE. */
+ rtx rounded_size_op = force_operand (*rounded_size, NULL_RTX);
+ *last_addr = force_operand (gen_rtx_fmt_ee (STACK_GROW_OP, Pmode,
+ stack_pointer_rtx,
+ rounded_size_op),
+ NULL_RTX);
+
+ /* Compute any residuals not allocated by the loop above. Residuals
+ are just the ROUNDED_SIZE - SIZE. */
+ *residual = simplify_gen_binary (MINUS, Pmode, size, *rounded_size);
+
+ /* Dump key information to make writing tests easy. */
+ if (dump_file)
+ {
+ if (*rounded_size == CONST0_RTX (Pmode))
+ fprintf (dump_file,
+ "Stack clash skipped dynamic allocation and probing loop.\n");
+ else if (CONST_INT_P (*rounded_size)
+ && INTVAL (*rounded_size) <= 4 * *probe_interval)
+ fprintf (dump_file,
+ "Stack clash dynamic allocation and probing inline.\n");
+ else if (CONST_INT_P (*rounded_size))
+ fprintf (dump_file,
+ "Stack clash dynamic allocation and probing in "
+ "rotated loop.\n");
+ else
+ fprintf (dump_file,
+ "Stack clash dynamic allocation and probing in loop.\n");
+
+ if (*residual != CONST0_RTX (Pmode))
+ fprintf (dump_file,
+ "Stack clash dynamic allocation and probing residuals.\n");
+ else
+ fprintf (dump_file,
+ "Stack clash skipped dynamic allocation and "
+ "probing residuals.\n");
+ }
+}
+
+/* Emit the start of an allocate/probe loop for stack
+ clash protection.
+
+ LOOP_LAB and END_LAB are returned for use when we emit the
+ end of the loop.
+
+ LAST addr is the value for SP which stops the loop. */
+void
+emit_stack_clash_protection_probe_loop_start (rtx *loop_lab,
+ rtx *end_lab,
+ rtx last_addr,
+ bool rotated)
+{
+ /* Essentially we want to emit any setup code, the top of loop
+ label and the comparison at the top of the loop. */
+ *loop_lab = gen_label_rtx ();
+ *end_lab = gen_label_rtx ();
+
+ emit_label (*loop_lab);
+ if (!rotated)
+ emit_cmp_and_jump_insns (stack_pointer_rtx, last_addr, EQ, NULL_RTX,
+ Pmode, 1, *end_lab);
+}
+
+/* Emit the end of a stack clash probing loop.
+
+ This consists of just the jump back to LOOP_LAB and
+ emitting END_LOOP after the loop. */
+
+void
+emit_stack_clash_protection_probe_loop_end (rtx loop_lab, rtx end_loop,
+ rtx last_addr, bool rotated)
+{
+ if (rotated)
+ emit_cmp_and_jump_insns (stack_pointer_rtx, last_addr, NE, NULL_RTX,
+ Pmode, 1, loop_lab);
+ else
+ emit_jump (loop_lab);
+
+ emit_label (end_loop);
+
+}
+
+/* Adjust the stack pointer by minus SIZE (an rtx for a number of bytes)
+ while probing it. This pushes when SIZE is positive. SIZE need not
+ be constant.
+
+ This is subtly different from anti_adjust_stack_and_probe, in order to
+ prevent stack-clash attacks:
+
+ 1. It must assume no knowledge of the probing state; any allocation
+ must probe.
+
+ Consider the case of a 1 byte alloca in a loop. If the sum of the
+ allocations is large, then this could be used to jump the guard if
+ probes were not emitted.
+
+ 2. It never skips probes, whereas anti_adjust_stack_and_probe will
+ skip probes on the first couple of PROBE_INTERVALs on the assumption
+ they're done elsewhere.
+
+ 3. It only allocates and probes SIZE bytes; it does not need to
+ allocate/probe beyond that because this probing style does not
+ guarantee signal handling capability if the guard is hit. */
+
+static void
+anti_adjust_stack_and_probe_stack_clash (rtx size)
+{
+ /* First ensure SIZE is Pmode. */
+ if (GET_MODE (size) != VOIDmode && GET_MODE (size) != Pmode)
+ size = convert_to_mode (Pmode, size, 1);
+
+ /* We can get here with a constant size on some targets. */
+ rtx rounded_size, last_addr, residual;
+ HOST_WIDE_INT probe_interval;
+ compute_stack_clash_protection_loop_data (&rounded_size, &last_addr,
+ &residual, &probe_interval, size);
+
+ if (rounded_size != CONST0_RTX (Pmode))
+ {
+ if (CONST_INT_P (rounded_size)
+ && INTVAL (rounded_size) <= 4 * probe_interval)
+ {
+ for (HOST_WIDE_INT i = 0;
+ i < INTVAL (rounded_size);
+ i += probe_interval)
+ {
+ anti_adjust_stack (GEN_INT (probe_interval));
+
+ /* The prologue does not probe residuals. Thus the offset
+ here is to probe just beyond what the prologue had already
+ allocated. */
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ (probe_interval
+ - GET_MODE_SIZE (word_mode))));
+ emit_insn (gen_blockage ());
+ }
+ }
+ else
+ {
+ rtx loop_lab, end_loop;
+ bool rotate_loop = CONST_INT_P (rounded_size);
+ emit_stack_clash_protection_probe_loop_start (&loop_lab, &end_loop,
+ last_addr, rotate_loop);
+
+ anti_adjust_stack (GEN_INT (probe_interval));
+
+ /* The prologue does not probe residuals. Thus the offset here is
+ to probe just beyond what the prologue had already allocated. */
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ (probe_interval
+ - GET_MODE_SIZE (word_mode))));
+
+ emit_stack_clash_protection_probe_loop_end (loop_lab, end_loop,
+ last_addr, rotate_loop);
+ emit_insn (gen_blockage ());
+ }
+ }
+
+ if (residual != CONST0_RTX (Pmode))
+ {
+ rtx label = NULL_RTX;
+ /* RESIDUAL could be zero at runtime and in that case *sp could
+ hold live data. Furthermore, we do not want to probe into the
+ red zone.
+
+ Go ahead and just guard the probe at *sp on RESIDUAL != 0 at
+ runtime if RESIDUAL is not a compile time constant. */
+ if (!CONST_INT_P (residual))
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (residual, CONST0_RTX (GET_MODE (residual)),
+ EQ, NULL_RTX, Pmode, 1, label);
+ }
+
+ rtx x = force_reg (Pmode, plus_constant (Pmode, residual,
+ -GET_MODE_SIZE (word_mode)));
+ anti_adjust_stack (residual);
+ emit_stack_probe (gen_rtx_PLUS (Pmode, stack_pointer_rtx, x));
+ emit_insn (gen_blockage ());
+ if (!CONST_INT_P (residual))
+ emit_label (label);
+ }
+
+ /* Some targets make optimistic assumptions in their prologues about
+ how the caller may have probed the stack. Make sure we honor
+ those assumptions when needed. */
+ if (size != CONST0_RTX (Pmode)
+ && targetm.stack_clash_protection_final_dynamic_probe (residual))
+ {
+ /* SIZE could be zero at runtime and in that case *sp could hold
+ live data. Furthermore, we don't want to probe into the red
+ zone.
+
+ Go ahead and just guard the probe at *sp on SIZE != 0 at runtime
+ if SIZE is not a compile time constant. */
+ rtx label = NULL_RTX;
+ if (!CONST_INT_P (size))
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (size, CONST0_RTX (GET_MODE (size)),
+ EQ, NULL_RTX, Pmode, 1, label);
+ }
+
+ emit_stack_probe (stack_pointer_rtx);
+ emit_insn (gen_blockage ());
+ if (!CONST_INT_P (size))
+ emit_label (label);
+ }
+}
+
/* Adjust the stack pointer by minus SIZE (an rtx for a number of bytes)
while probing it. This pushes when SIZE is positive. SIZE need not
be constant. If ADJUST_BACK is true, adjust back the stack pointer
diff -N -urp a/gcc/explow.h b/gcc/explow.h
--- a/gcc/explow.h 2017-01-01 20:07:43.905435000 +0800
+++ b/gcc/explow.h 2019-01-10 16:57:37.934508487 +0800
@@ -69,6 +69,15 @@ extern void anti_adjust_stack (rtx);
/* Add some bytes to the stack while probing it. An rtx says how many. */
extern void anti_adjust_stack_and_probe (rtx, bool);
+/* Support for building allocation/probing loops for stack-clash
+ protection of dynamically allocated stack space. */
+extern void compute_stack_clash_protection_loop_data (rtx *, rtx *, rtx *,
+ HOST_WIDE_INT *, rtx);
+extern void emit_stack_clash_protection_probe_loop_start (rtx *, rtx *,
+ rtx, bool);
+extern void emit_stack_clash_protection_probe_loop_end (rtx, rtx,
+ rtx, bool);
+
/* This enum is used for the following two functions. */
enum save_level {SAVE_BLOCK, SAVE_FUNCTION, SAVE_NONLOCAL};
diff -N -urp a/gcc/flag-types.h b/gcc/flag-types.h
--- a/gcc/flag-types.h 2017-01-01 20:07:43.905435000 +0800
+++ b/gcc/flag-types.h 2019-01-10 16:42:11.490484099 +0800
@@ -166,7 +166,14 @@ enum permitted_flt_eval_methods
PERMITTED_FLT_EVAL_METHODS_C11
};
-/* Type of stack check. */
+/* Type of stack check.
+
+ Stack checking is designed to detect infinite recursion and stack
+ overflows for Ada programs. Furthermore, in that scenario, stack checking
+ tries to ensure that enough stack space is left to run a signal handler.
+
+ -fstack-check= does not prevent stack-clash style attacks. For that
+ you want -fstack-clash-protection. */
enum stack_check_type
{
/* Do not check the stack. */
diff -N -urp a/gcc/function.c b/gcc/function.c
--- a/gcc/function.c 2017-08-08 21:21:12.755378000 +0800
+++ b/gcc/function.c 2019-01-10 17:07:17.414523742 +0800
@@ -5695,6 +5695,58 @@ get_arg_pointer_save_area (void)
return ret;
}
+
+/* If debugging dumps are requested, dump information about how the
+ target handled -fstack-clash-protection for the prologue.
+
+ PROBES describes what if any probes were emitted.
+
+ RESIDUALS indicates if the prologue had any residual allocation
+ (i.e. total allocation was not a multiple of PROBE_INTERVAL). */
+
+void
+dump_stack_clash_frame_info (enum stack_clash_probes probes, bool residuals)
+{
+ if (!dump_file)
+ return;
+
+ switch (probes)
+ {
+ case NO_PROBE_NO_FRAME:
+ fprintf (dump_file,
+ "Stack clash no probe no stack adjustment in prologue.\n");
+ break;
+ case NO_PROBE_SMALL_FRAME:
+ fprintf (dump_file,
+ "Stack clash no probe small stack adjustment in prologue.\n");
+ break;
+ case PROBE_INLINE:
+ fprintf (dump_file, "Stack clash inline probes in prologue.\n");
+ break;
+ case PROBE_LOOP:
+ fprintf (dump_file, "Stack clash probe loop in prologue.\n");
+ break;
+ }
+
+ if (residuals)
+ fprintf (dump_file, "Stack clash residual allocation in prologue.\n");
+ else
+ fprintf (dump_file, "Stack clash no residual allocation in prologue.\n");
+
+ if (frame_pointer_needed)
+ fprintf (dump_file, "Stack clash frame pointer needed.\n");
+ else
+ fprintf (dump_file, "Stack clash no frame pointer needed.\n");
+
+ if (TREE_THIS_VOLATILE (cfun->decl))
+ fprintf (dump_file,
+ "Stack clash noreturn prologue, assuming no implicit"
+ " probes in caller.\n");
+ else
+ fprintf (dump_file,
+ "Stack clash not noreturn prologue.\n");
+}
+
/* Add a list of INSNS to the hash HASHP, possibly allocating HASHP
for the first time. */
diff -N -urp a/gcc/function.h b/gcc/function.h
--- a/gcc/function.h 2017-01-25 01:07:36.015431000 +0800
+++ b/gcc/function.h 2019-01-10 17:08:12.806525200 +0800
@@ -553,6 +553,14 @@ do { \
((TARGET_PTRMEMFUNC_VBIT_LOCATION == ptrmemfunc_vbit_in_pfn) \
? MAX (FUNCTION_BOUNDARY, 2 * BITS_PER_UNIT) : FUNCTION_BOUNDARY)
+enum stack_clash_probes {
+ NO_PROBE_NO_FRAME,
+ NO_PROBE_SMALL_FRAME,
+ PROBE_INLINE,
+ PROBE_LOOP
+};
+
+extern void dump_stack_clash_frame_info (enum stack_clash_probes, bool);
extern void push_function_context (void);
diff -N -urp a/gcc/params.def b/gcc/params.def
--- a/gcc/params.def 2019-01-10 13:33:20.894185827 +0800
+++ b/gcc/params.def 2019-01-10 16:43:15.414485782 +0800
@@ -213,6 +213,16 @@ DEFPARAM(PARAM_STACK_FRAME_GROWTH,
"Maximal stack frame growth due to inlining (in percent).",
1000, 0, 0)
+DEFPARAM(PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
+ "stack-clash-protection-guard-size",
+ "Size of the stack guard expressed as a power of two.",
+ 12, 12, 30)
+
+DEFPARAM(PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
+ "stack-clash-protection-probe-interval",
+ "Interval in which to probe the stack expressed as a power of two.",
+ 12, 10, 16)
+
/* The GCSE optimization will be disabled if it would require
significantly more memory than this value. */
DEFPARAM(PARAM_MAX_GCSE_MEMORY,
diff -N -urp a/gcc/reg-notes.def b/gcc/reg-notes.def
--- a/gcc/reg-notes.def 2017-03-28 05:00:35.674561000 +0800
+++ b/gcc/reg-notes.def 2019-01-10 17:12:11.678531488 +0800
@@ -223,6 +223,10 @@ REG_NOTE (ARGS_SIZE)
pseudo reg. */
REG_NOTE (RETURNED)
+/* Indicates the instruction is a stack check probe that should not
+ be combined with other stack adjustments. */
+REG_NOTE (STACK_CHECK)
+
/* Used to mark a call with the function decl called by the call.
The decl might not be available in the call due to splitting of the call
insn. This note is a SYMBOL_REF. */
diff -N -urp a/gcc/rtl.h b/gcc/rtl.h
--- a/gcc/rtl.h 2017-03-14 20:47:42.745690000 +0800
+++ b/gcc/rtl.h 2019-01-10 16:59:15.574511058 +0800
@@ -2707,6 +2707,7 @@ get_full_set_src_cost (rtx x, machine_mo
/* In explow.c */
extern HOST_WIDE_INT trunc_int_for_mode (HOST_WIDE_INT, machine_mode);
extern rtx plus_constant (machine_mode, rtx, HOST_WIDE_INT, bool = false);
+extern HOST_WIDE_INT get_stack_check_protect (void);
/* In rtl.c */
extern rtx rtx_alloc_stat (RTX_CODE MEM_STAT_DECL);
diff -N -urp a/gcc/sched-deps.c b/gcc/sched-deps.c
--- a/gcc/sched-deps.c 2017-01-01 20:07:43.905435000 +0800
+++ b/gcc/sched-deps.c 2019-01-10 17:13:37.470533746 +0800
@@ -4717,6 +4717,11 @@ parse_add_or_inc (struct mem_inc_info *m
if (RTX_FRAME_RELATED_P (insn) || !pat)
return false;
+ /* Do not allow breaking data dependencies for insns that are marked
+ with REG_STACK_CHECK. */
+ if (find_reg_note (insn, REG_STACK_CHECK, NULL))
+ return false;
+
/* Result must be single reg. */
if (!REG_P (SET_DEST (pat)))
return false;
diff -N -urp a/gcc/target.def b/gcc/target.def
--- a/gcc/target.def 2019-01-10 13:33:20.762185824 +0800
+++ b/gcc/target.def 2019-01-10 17:01:49.146515100 +0800
@@ -5490,6 +5490,12 @@ these registers when the target switches
void, (void),
hook_void_void)
+DEFHOOK
+(stack_clash_protection_final_dynamic_probe,
+ "Some targets make optimistic assumptions about the state of stack probing when they emit their prologues. On such targets a probe into the end of any dynamically allocated space is likely required for safety against stack clash style attacks. Define this variable to return nonzero if such a probe is required or zero otherwise. You need not define this macro if it would always have the value zero.",
+ bool, (rtx residual),
+ default_stack_clash_protection_final_dynamic_probe)
+
/* Functions specific to the C family of frontends. */
#undef HOOK_PREFIX
#define HOOK_PREFIX "TARGET_C_"
diff -N -urp a/gcc/targhooks.c b/gcc/targhooks.c
--- a/gcc/targhooks.c 2017-02-07 19:29:06.644837000 +0800
+++ b/gcc/targhooks.c 2019-01-10 17:03:23.818517592 +0800
@@ -2107,4 +2107,10 @@ default_excess_precision (enum excess_pr
return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
}
+bool
+default_stack_clash_protection_final_dynamic_probe (rtx residual ATTRIBUTE_UNUSED)
+{
+ return 0;
+}
+
#include "gt-targhooks.h"
diff -N -urp a/gcc/targhooks.h b/gcc/targhooks.h
--- a/gcc/targhooks.h 2017-04-05 01:52:27.193766000 +0800
+++ b/gcc/targhooks.h 2019-01-10 17:04:11.438518846 +0800
@@ -263,5 +263,6 @@ extern unsigned int default_min_arithmet
extern enum flt_eval_method
default_excess_precision (enum excess_precision_type ATTRIBUTE_UNUSED);
+extern bool default_stack_clash_protection_final_dynamic_probe (rtx);
#endif /* GCC_TARGHOOKS_H */
diff -N -urp a/gcc/toplev.c b/gcc/toplev.c
--- a/gcc/toplev.c 2017-09-15 16:18:34.015147000 +0800
+++ b/gcc/toplev.c 2019-01-10 16:45:33.626489420 +0800
@@ -1573,6 +1573,26 @@ process_options (void)
flag_associative_math = 0;
}
+ /* -fstack-clash-protection is not currently supported on targets
+ where the stack grows up. */
+ if (flag_stack_clash_protection && !STACK_GROWS_DOWNWARD)
+ {
+ warning_at (UNKNOWN_LOCATION, 0,
+ "%<-fstack-clash-protection%> is not supported on targets "
+ "where the stack grows from lower to higher addresses");
+ flag_stack_clash_protection = 0;
+ }
+
+ /* We cannot support -fstack-check= and -fstack-clash-protection at
+ the same time. */
+ if (flag_stack_check != NO_STACK_CHECK && flag_stack_clash_protection)
+ {
+ warning_at (UNKNOWN_LOCATION, 0,
+ "%<-fstack-check=%> and %<-fstack-clash_protection%> are "
+ "mutually exclusive. Disabling %<-fstack-check=%>");
+ flag_stack_check = NO_STACK_CHECK;
+ }
+
/* With -fcx-limited-range, we do cheap and quick complex arithmetic. */
if (flag_cx_limited_range)
flag_complex_method = 0;
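
As a reader's aid (not part of the patch), here is a minimal C sketch of the arithmetic performed by compute_stack_clash_protection_loop_data and anti_adjust_stack_and_probe_stack_clash for a compile-time-constant allocation, assuming the default probe interval of 2**12 bytes from params.def; the allocation size is hypothetical.

#include <stdio.h>

int
main (void)
{
  long probe_interval = 1L << 12;              /* Default: 2**12 = 4096 bytes.  */
  long size = 10000;                           /* Hypothetical dynamic allocation.  */
  long rounded_size = size & -probe_interval;  /* Whole intervals: 8192.  */
  long residual = size - rounded_size;         /* Remainder: 1808.  */

  /* The allocate/probe loop touches the stack once per interval...  */
  for (long off = 0; off < rounded_size; off += probe_interval)
    printf ("allocate %ld bytes, probe near the new stack pointer\n",
            probe_interval);

  /* ...and any residual is allocated and probed separately, guarded at
     runtime when its value is not a compile-time constant.  */
  if (residual != 0)
    printf ("allocate residual %ld bytes, probe near the new stack pointer\n",
            residual);
  return 0;
}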

BIN
gcc-7.3.0.tar.gz Normal file

Binary file not shown.

12
gcc-adapt-to-isl.patch Normal file
View File

@@ -0,0 +1,12 @@
diff --git a/gcc/graphite.h b/gcc/graphite.h
index 4e0e58c..be0a22b 100644 (file)
--- a/gcc/graphite.h
+++ b/gcc/graphite.h
@@ -37,6 +37,8 @@ along with GCC; see the file COPYING3. If not see
#include <isl/schedule.h>
#include <isl/ast_build.h>
#include <isl/schedule_node.h>
+#include <isl/id.h>
+#include <isl/space.h>
typedef struct poly_dr *poly_dr_p;

3352
gcc.spec Normal file

File diff suppressed because it is too large

BIN
isl-0.14.tar.xz Normal file

Binary file not shown.

View File

@@ -0,0 +1,13 @@
diff -N -urp a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
--- a/gcc/config/aarch64/aarch64.md 2019-05-30 16:12:52.950606040 +0800
+++ b/gcc/config/aarch64/aarch64.md 2019-05-30 16:15:56.606599549 +0800
@@ -3110,7 +3110,8 @@
(define_insn_and_split "*compare_cstore<mode>_insn"
[(set (match_operand:GPI 0 "register_operand" "=r")
(EQL:GPI (match_operand:GPI 1 "register_operand" "r")
- (match_operand:GPI 2 "aarch64_imm24" "n")))]
+ (match_operand:GPI 2 "aarch64_imm24" "n")))
+ (clobber (reg:CC CC_REGNUM))]
"!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode)
&& !aarch64_plus_operand (operands[2], <MODE>mode)
&& !reload_completed"
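
A hedged illustration (not part of the patch) of source that can reach this pattern: the constant below is a hypothetical 24-bit immediate that is neither a move immediate nor an add/sub immediate, so the post-reload split materializes it with subtract/compare instructions that write the condition codes, which is why the clobber is added.

/* Hypothetical example: an equality cstore against a 24-bit immediate
   that cannot be encoded directly.  */
int
is_magic (long x)
{
  return x == 0x123456;
}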

View File

@@ -0,0 +1,108 @@
diff -N -urp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c 2018-09-19 17:11:42.583520820 +0800
+++ b/gcc/config/aarch64/aarch64.c 2018-09-19 17:10:22.715520820 +0800
@@ -1260,29 +1260,32 @@ aarch64_is_long_call_p (rtx sym)
void
aarch64_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
{
- if (!TARGET_LONG_CALLS)
+ if (flag_fentry)
{
- fprintf (file, "\tmov\tx9, x30\n");
- fprintf (file, "\tbl\t__fentry__\n");
- fprintf (file, "\tmov\tx30, x9\n");
- }
- else
- {
- if (flag_pic)
+ if (!TARGET_LONG_CALLS)
{
fprintf (file, "\tmov\tx9, x30\n");
- fprintf (file, "\tadrp\tx10, :got:__fentry__\n");
- fprintf (file, "\tldr\tx10, [x10, #:got_lo12:__fentry__]\n");
- fprintf (file, "\tblr\tx10\n");
+ fprintf (file, "\tbl\t__fentry__\n");
fprintf (file, "\tmov\tx30, x9\n");
}
else
{
- fprintf (file, "\tmov\tx9, x30\n");
- fprintf (file, "\tadrp\tx10, __fentry__\n");
- fprintf (file, "\tadd\tx10, x10, :lo12:__fentry__\n");
- fprintf (file, "\tblr\tx10\n");
- fprintf (file, "\tmov\tx30, x9\n");
+ if (flag_pic)
+ {
+ fprintf (file, "\tmov\tx9, x30\n");
+ fprintf (file, "\tadrp\tx10, :got:__fentry__\n");
+ fprintf (file, "\tldr\tx10, [x10, #:got_lo12:__fentry__]\n");
+ fprintf (file, "\tblr\tx10\n");
+ fprintf (file, "\tmov\tx30, x9\n");
+ }
+ else
+ {
+ fprintf (file, "\tmov\tx9, x30\n");
+ fprintf (file, "\tadrp\tx10, __fentry__\n");
+ fprintf (file, "\tadd\tx10, x10, :lo12:__fentry__\n");
+ fprintf (file, "\tblr\tx10\n");
+ fprintf (file, "\tmov\tx30, x9\n");
+ }
}
}
}
@@ -12020,6 +12023,15 @@ aarch64_emit_unlikely_jump (rtx insn)
add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
}
+/* Return true if profiling code should be emitted before the
+ prologue, and false otherwise.
+ Note: x86 behaves the same way when -mfentry is used for hot patching. */
+static bool
+aarch64_profile_before_prologue (void)
+{
+ return flag_fentry != 0;
+}
+
/* Expand a compare and swap pattern. */
void
@@ -14952,6 +14964,9 @@ aarch64_run_selftests (void)
#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
+#undef TARGET_PROFILE_BEFORE_PROLOGUE
+#define TARGET_PROFILE_BEFORE_PROLOGUE aarch64_profile_before_prologue
+
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
hook_bool_const_tree_hwi_hwi_const_tree_true
diff -N -urp a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
--- a/gcc/config/aarch64/aarch64.h 2018-09-19 17:11:42.587520820 +0800
+++ b/gcc/config/aarch64/aarch64.h 2018-09-19 17:10:22.715520820 +0800
@@ -850,9 +850,12 @@ typedef struct
{ \
rtx fun, lr; \
const rtx_insn* tmp = get_insns (); \
- lr = get_hard_reg_initial_val (Pmode, LR_REGNUM); \
- fun = gen_rtx_SYMBOL_REF (Pmode, MCOUNT_NAME); \
- emit_library_call (fun, LCT_NORMAL, VOIDmode, 1, lr, Pmode); \
+ if (!flag_fentry) \
+ { \
+ lr = get_hard_reg_initial_val (Pmode, LR_REGNUM); \
+ fun = gen_rtx_SYMBOL_REF (Pmode, MCOUNT_NAME); \
+ emit_library_call (fun, LCT_NORMAL, VOIDmode, 1, lr, Pmode); \
+ } \
if (TARGET_LONG_CALLS) \
{ \
emit_insn (gen_blockage ()); \
diff -N -urp a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
--- a/gcc/config/aarch64/aarch64.opt 2018-09-19 17:11:42.587520820 +0800
+++ b/gcc/config/aarch64/aarch64.opt 2018-09-19 17:10:22.715520820 +0800
@@ -192,3 +192,7 @@ single precision and to 32 bits for doub
mverbose-cost-dump
Common Undocumented Var(flag_aarch64_verbose_cost)
Enables verbose cost model dumping in the debug dump files.
+
+mfentry
+Target Report Var(flag_fentry) Init(0)
+Emit profiling counter call at function entry before prologue.
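
A hypothetical usage sketch (not part of the patch): with a compiler carrying this change, profiling instrumentation would typically be requested as below, and the entry sequence shown in the comment is the one aarch64_function_profiler emits above when TARGET_LONG_CALLS is off.

/* hot.c -- hypothetical example.  Compile with a patched compiler:

     gcc -O2 -pg -mfentry -S hot.c

   With -mfentry, each instrumented function is expected to start with

     mov x9, x30
     bl  __fentry__
     mov x30, x9

   emitted before the prologue (aarch64_profile_before_prologue returns
   true when flag_fentry is set), instead of the mcount-style call from
   PROFILE_HOOK.  */

int
counter (int x)
{
  return x + 1;
}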

362
option-mlong-calls.patch Normal file
View File

@@ -0,0 +1,362 @@
diff -N -urp a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
--- a/gcc/config/aarch64/aarch64-protos.h 2018-11-06 10:43:27.862079389 +0800
+++ b/gcc/config/aarch64/aarch64-protos.h 2018-11-06 10:44:34.930081154 +0800
@@ -353,6 +353,10 @@ bool aarch64_use_return_insn_p (void);
const char *aarch64_mangle_builtin_type (const_tree);
const char *aarch64_output_casesi (rtx *);
+extern void aarch64_pr_long_calls (struct cpp_reader *);
+extern void aarch64_pr_no_long_calls (struct cpp_reader *);
+extern void aarch64_pr_long_calls_off (struct cpp_reader *);
+
enum aarch64_symbol_type aarch64_classify_symbol (rtx, rtx);
enum aarch64_symbol_type aarch64_classify_tls_symbol (rtx);
enum reg_class aarch64_regno_regclass (unsigned);
@@ -384,6 +388,7 @@ void aarch64_expand_epilogue (bool);
void aarch64_expand_mov_immediate (rtx, rtx);
void aarch64_expand_prologue (void);
void aarch64_expand_vector_init (rtx, rtx);
+void aarch64_function_profiler (FILE *, int);
void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
const_tree, unsigned);
void aarch64_init_expanders (void);
diff -N -urp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c 2018-11-06 10:43:27.870079389 +0800
+++ b/gcc/config/aarch64/aarch64.c 2018-11-06 10:44:34.934081154 +0800
@@ -70,6 +70,9 @@
/* This file should be included last. */
#include "target-def.h"
+static void aarch64_set_default_type_attributes (tree);
+static int aarch64_comp_type_attributes (const_tree, const_tree);
+
/* Defined for convenience. */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
@@ -1092,12 +1095,163 @@ aarch64_hard_regno_caller_save_mode (uns
return choose_hard_reg_mode (regno, nregs, false);
}
+/* Table of machine attributes. */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+ /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
+ affects_type_identity }. */
+ /* Function calls made to this symbol must be done indirectly, because
+ it may lie outside of the 26 bit addressing range of a normal function
+ call. */
+ { "long_call", 0, 0, false, true, true, NULL, false },
+ /* Whereas these functions are always known to reside within the 26 bit
+ addressing range. */
+ { "short_call", 0, 0, false, true, true, NULL, false },
+ { NULL, 0, 0, false, false, false, NULL, false }
+};
+
+/* Encode the current state of the #pragma[no_]long_calls. */
+typedef enum
+{
+ OFF, /* No #pragma[no_]long_calls is in effect. */
+ LONG, /* #pragma long_calls is in effect. */
+ SHORT /* #pragma no_long_calls is in effect. */
+} aarch64_pragma_enum;
+
+static aarch64_pragma_enum aarch64_pragma_long_calls = OFF;
+
+void
+aarch64_pr_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED)
+{
+ aarch64_pragma_long_calls = LONG;
+}
+
+void
+aarch64_pr_no_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED)
+{
+ aarch64_pragma_long_calls = SHORT;
+}
+
+void
+aarch64_pr_long_calls_off (struct cpp_reader * pfile ATTRIBUTE_UNUSED)
+{
+ aarch64_pragma_long_calls = OFF;
+}
+
+/* Return 0 if the attributes for two types are incompatible, 1 if they
+ are compatible. */
+static int
+aarch64_comp_type_attributes (const_tree type1, const_tree type2)
+{
+ int l1, l2, s1, s2;
+
+ /* Check for mismatch of non-default calling convention. */
+ if (TREE_CODE (type1) != FUNCTION_TYPE)
+ return 1;
+
+ /* Check for mismatched call attributes. */
+ l1 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type1)) != NULL;
+ l2 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type2)) != NULL;
+ s1 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type1)) != NULL;
+ s2 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type2)) != NULL;
+
+ /* Only bother to check if an attribute is defined. */
+ if (l1 | l2 | s1 | s2)
+ {
+ /* If one type has an attribute, the other
+ must have the same attribute. */
+ if ((l1 != l2) || (s1 != s2))
+ {
+ return 0;
+ }
+
+ /* Disallow mixed attributes. */
+ if ((l1 && s2) || (l2 && s1))
+ {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/* Assign default attributes to a newly defined type. This is used to
+ set short_call/long_call attributes for function types of
+ functions defined inside corresponding #pragma scopes. */
+static void
+aarch64_set_default_type_attributes (tree type)
+{
+ /* Add __attribute__ ((long_call)) to all functions when inside
+ #pragma long_calls, or __attribute__ ((short_call)) when inside
+ #pragma no_long_calls. */
+ if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE)
+ {
+ tree type_attr_list = NULL;
+ tree attr_name = NULL;
+ type_attr_list = TYPE_ATTRIBUTES (type);
+
+ if (aarch64_pragma_long_calls == LONG)
+ {
+ attr_name = get_identifier ("long_call");
+ }
+ else if (aarch64_pragma_long_calls == SHORT)
+ {
+ attr_name = get_identifier ("short_call");
+ }
+ else
+ {
+ return;
+ }
+
+ type_attr_list = tree_cons (attr_name, NULL_TREE, type_attr_list);
+ TYPE_ATTRIBUTES (type) = type_attr_list;
+ }
+}
+
+/* Return true if DECL is known to be linked into section SECTION. */
+static bool
+aarch64_function_in_section_p (tree decl, section *section)
+{
+ /* We can only be certain about the prevailing symbol definition. */
+ if (!decl_binds_to_current_def_p (decl))
+ return false;
+
+ /* If DECL_SECTION_NAME is set, assume it is trustworthy. */
+ if (!DECL_SECTION_NAME (decl))
+ {
+ /* Make sure that we will not create a unique section for DECL. */
+ if (flag_function_sections || DECL_COMDAT_GROUP (decl))
+ return false;
+ }
+
+ return function_section (decl) == section;
+}
+
/* Return true if calls to DECL should be treated as
long-calls (ie called via a register). */
static bool
-aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
+aarch64_decl_is_long_call_p (tree decl)
{
- return false;
+ tree attrs = NULL;
+
+ if (!decl)
+ return TARGET_LONG_CALLS;
+
+ attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+ if (lookup_attribute ("short_call", attrs))
+ return false;
+
+ /* For "f", be conservative, and only cater for cases in which the
+ whole of the current function is placed in the same section. */
+ if (!flag_reorder_blocks_and_partition
+ && TREE_CODE (decl) == FUNCTION_DECL
+ && aarch64_function_in_section_p (decl, current_function_section ()))
+ return false;
+
+ if (lookup_attribute ("long_call", attrs))
+ return true;
+
+ return TARGET_LONG_CALLS;
}
/* Return true if calls to symbol-ref SYM should be treated as
@@ -1108,6 +1257,36 @@ aarch64_is_long_call_p (rtx sym)
return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}
+void
+aarch64_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
+{
+ if (!TARGET_LONG_CALLS)
+ {
+ fprintf (file, "\tmov\tx9, x30\n");
+ fprintf (file, "\tbl\t__fentry__\n");
+ fprintf (file, "\tmov\tx30, x9\n");
+ }
+ else
+ {
+ if (flag_pic)
+ {
+ fprintf (file, "\tmov\tx9, x30\n");
+ fprintf (file, "\tadrp\tx10, :got:__fentry__\n");
+ fprintf (file, "\tldr\tx10, [x10, #:got_lo12:__fentry__]\n");
+ fprintf (file, "\tblr\tx10\n");
+ fprintf (file, "\tmov\tx30, x9\n");
+ }
+ else
+ {
+ fprintf (file, "\tmov\tx9, x30\n");
+ fprintf (file, "\tadrp\tx10, __fentry__\n");
+ fprintf (file, "\tadd\tx10, x10, :lo12:__fentry__\n");
+ fprintf (file, "\tblr\tx10\n");
+ fprintf (file, "\tmov\tx30, x9\n");
+ }
+ }
+}
+
/* Return true if calls to symbol-ref SYM should not go through
plt stubs. */
@@ -15099,6 +15278,15 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
+#undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
+#define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES aarch64_set_default_type_attributes
+
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
+#undef TARGET_COMP_TYPE_ATTRIBUTES
+#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
+
#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
diff -N -urp a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
--- a/gcc/config/aarch64/aarch64.h 2018-11-06 10:43:27.870079389 +0800
+++ b/gcc/config/aarch64/aarch64.h 2018-11-06 10:49:29.574088911 +0800
@@ -28,7 +28,6 @@
-#define REGISTER_TARGET_PRAGMAS() aarch64_register_pragmas ()
/* Target machine storage layout. */
@@ -659,6 +658,14 @@ typedef struct
} CUMULATIVE_ARGS;
#endif
+/* Register the #pragma [no_]long_calls handlers along with the target's other pragmas. */
+#define REGISTER_TARGET_PRAGMAS() do { \
+ c_register_pragma (0, "long_calls", aarch64_pr_long_calls); \
+ c_register_pragma (0, "no_long_calls", aarch64_pr_no_long_calls); \
+ c_register_pragma (0, "long_calls_off", aarch64_pr_long_calls_off); \
+ aarch64_register_pragmas (); \
+} while (0)
+
#define FUNCTION_ARG_PADDING(MODE, TYPE) \
(aarch64_pad_arg_upward (MODE, TYPE) ? upward : downward)
@@ -842,13 +849,20 @@ typedef struct
#define PROFILE_HOOK(LABEL) \
{ \
rtx fun, lr; \
+ const rtx_insn* tmp = get_insns (); \
lr = get_hard_reg_initial_val (Pmode, LR_REGNUM); \
fun = gen_rtx_SYMBOL_REF (Pmode, MCOUNT_NAME); \
emit_library_call (fun, LCT_NORMAL, VOIDmode, 1, lr, Pmode); \
+ if (TARGET_LONG_CALLS) \
+ { \
+ emit_insn (gen_blockage ()); \
+ emit_insn_after (gen_blockage (), NEXT_INSN (tmp)); \
+ } \
}
/* All the work done in PROFILE_HOOK, but still required. */
-#define FUNCTION_PROFILER(STREAM, LABELNO) do { } while (0)
+#define FUNCTION_PROFILER(STREAM, LABELNO) \
+ aarch64_function_profiler (STREAM, LABELNO)
/* For some reason, the Linux headers think they know how to define
these macros. They don't!!! */
diff -N -urp a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
--- a/gcc/config/aarch64/aarch64.md 2018-11-06 10:43:27.874079389 +0800
+++ b/gcc/config/aarch64/aarch64.md 2018-11-06 10:44:34.934081154 +0800
@@ -850,9 +850,10 @@
{
rtx pat;
rtx callee = XEXP (operands[0], 0);
- if (!REG_P (callee)
- && ((GET_CODE (callee) != SYMBOL_REF)
- || aarch64_is_noplt_call_p (callee)))
+
+ if (GET_CODE (callee) == SYMBOL_REF
+ ? (aarch64_is_long_call_p (callee) || aarch64_is_noplt_call_p (callee))
+ : !REG_P (callee))
XEXP (operands[0], 0) = force_reg (Pmode, callee);
if (operands[2] == NULL_RTX)
@@ -881,9 +882,10 @@
{
rtx pat;
rtx callee = XEXP (operands[1], 0);
- if (!REG_P (callee)
- && ((GET_CODE (callee) != SYMBOL_REF)
- || aarch64_is_noplt_call_p (callee)))
+
+ if (GET_CODE (callee) == SYMBOL_REF
+ ? (aarch64_is_long_call_p (callee) || aarch64_is_noplt_call_p (callee))
+ : !REG_P (callee))
XEXP (operands[1], 0) = force_reg (Pmode, callee);
if (operands[3] == NULL_RTX)
diff -N -urp a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
--- a/gcc/config/aarch64/aarch64.opt 2018-11-06 10:43:27.874079389 +0800
+++ b/gcc/config/aarch64/aarch64.opt 2018-11-06 10:44:34.934081154 +0800
@@ -80,6 +80,10 @@ mlittle-endian
Target Report RejectNegative InverseMask(BIG_END)
Assume target CPU is configured as little endian.
+mlong-calls
+Target Report Mask(LONG_CALLS)
+Generate call insns as indirect calls, if necessary.
+
mcmodel=
Target RejectNegative Joined Enum(cmodel) Var(aarch64_cmodel_var) Init(AARCH64_CMODEL_SMALL) Save
Specify the code model.
diff -N -urp a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
--- a/gcc/config/aarch64/predicates.md 2018-11-06 10:43:27.878079389 +0800
+++ b/gcc/config/aarch64/predicates.md 2018-11-06 10:44:34.938081154 +0800
@@ -27,8 +27,9 @@
)
(define_predicate "aarch64_call_insn_operand"
- (ior (match_code "symbol_ref")
- (match_operand 0 "register_operand")))
+ (ior (and (match_code "symbol_ref")
+ (match_test "!aarch64_is_long_call_p (op)"))
+ (match_operand 0 "register_operand")))
;; Return true if OP a (const_int 0) operand.
(define_predicate "const0_operand"

33
sanitizer-pr-85835.patch Normal file
View File

@@ -0,0 +1,33 @@
diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc
index 858bb21..de18e56 100644 (file)
--- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc
+++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc
@@ -157,7 +157,6 @@ typedef struct user_fpregs elf_fpregset_t;
# include <sys/procfs.h>
#endif
#include <sys/user.h>
-#include <sys/ustat.h>
#include <linux/cyclades.h>
#include <linux/if_eql.h>
#include <linux/if_plip.h>
@@ -250,7 +249,19 @@ namespace __sanitizer {
#endif // SANITIZER_LINUX || SANITIZER_FREEBSD
#if SANITIZER_LINUX && !SANITIZER_ANDROID
- unsigned struct_ustat_sz = sizeof(struct ustat);
+ // Use pre-computed size of struct ustat to avoid <sys/ustat.h> which
+ // has been removed from glibc 2.28.
+#if defined(__aarch64__) || defined(__s390x__) || defined (__mips64) \
+ || defined(__powerpc64__) || defined(__arch64__) || defined(__sparcv9) \
+ || defined(__x86_64__)
+#define SIZEOF_STRUCT_USTAT 32
+#elif defined(__arm__) || defined(__i386__) || defined(__mips__) \
+ || defined(__powerpc__) || defined(__s390__)
+#define SIZEOF_STRUCT_USTAT 20
+#else
+#error Unknown size of struct ustat
+#endif
+ unsigned struct_ustat_sz = SIZEOF_STRUCT_USTAT;
unsigned struct_rlimit64_sz = sizeof(struct rlimit64);
unsigned struct_statvfs64_sz = sizeof(struct statvfs64);
#endif // SANITIZER_LINUX && !SANITIZER_ANDROID
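
For reference, a hypothetical one-off check (not part of the patch) that can be run on a system whose glibc still ships <sys/ustat.h>, to confirm the hard-coded sizes used above (32 on the listed 64-bit targets, 20 on the 32-bit ones).

#include <stdio.h>
#include <sys/ustat.h>   /* Only present on glibc older than 2.28.  */

int
main (void)
{
  printf ("sizeof (struct ustat) = %zu\n", sizeof (struct ustat));
  return 0;
}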

11
try-unroll.patch Normal file
View File

@@ -0,0 +1,11 @@
--- a/gcc/tree-ssa-loop-ivcanon.c 2018-12-06 05:05:43.841181211 +0800
+++ b/gcc/tree-ssa-loop-ivcanon.c 2018-12-06 05:03:17.545185153 +0800
@@ -726,7 +726,7 @@ try_unroll_loop_completely (struct loop
edge_to_cancel = NULL;
}
- if (!n_unroll_found)
+ if (!n_unroll_found || SCEV_NOT_KNOWN == TREE_CODE (niter))
return false;
if (n_unroll > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))

View File

@@ -0,0 +1,25 @@
diff -N -urp a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
--- a/gcc/common/config/aarch64/aarch64-common.c 2019-07-02 09:28:49.798701181 +0800
+++ b/gcc/common/config/aarch64/aarch64-common.c 2019-07-02 09:30:15.436282799 +0800
@@ -51,6 +51,10 @@ static const struct default_options aarc
{ OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
/* Enable redundant extension instructions removal at -O2 and higher. */
{ OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
+#if (TARGET_DEFAULT_ASYNC_UNWIND_TABLES == 1)
+ { OPT_LEVELS_ALL, OPT_fasynchronous_unwind_tables, NULL, 1 },
+ { OPT_LEVELS_ALL, OPT_funwind_tables, NULL, 1},
+#endif
{ OPT_LEVELS_NONE, 0, NULL, 0 }
};
diff -N -urp a/gcc/config.gcc b/gcc/config.gcc
--- a/gcc/config.gcc 2019-07-02 09:28:50.114701170 +0800
+++ b/gcc/config.gcc 2019-07-02 09:31:50.636196118 +0800
@@ -966,6 +966,7 @@ aarch64*-*-linux*)
tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux"
+ tm_defines="${tm_defines} TARGET_DEFAULT_ASYNC_UNWIND_TABLES=1"
case $target in
aarch64_be-*)
tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"