491 lines
18 KiB
Diff
491 lines
18 KiB
Diff
|
|
From 1070bc24f53e851cae55320e26715cc594efcd2f Mon Sep 17 00:00:00 2001
|
||
|
|
From: Hongyu Wang <hongyu.wang@intel.com>
|
||
|
|
Date: Thu, 8 Sep 2022 16:52:02 +0800
|
||
|
|
Subject: [PATCH] Enable small loop unrolling for O2
|
||
|
|
|
||
|
|
Modern processors have multi-way instruction decoders.
|
||
|
|
For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
|
||
|
|
instructions (usually has 3 uops with a cmp/jmp pair that can be
|
||
|
|
macro-fused), the decoder would have 2 uops bubble for each iteration
|
||
|
|
and the pipeline could not be fully utilized.
|
||
|
|
|
||
|
|
Therefore, this patch enables loop unrolling for small size loop at O2
|
||
|
|
to fill the decoder as much as possible. It turns on rtl loop
|
||
|
|
unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
|
||
|
|
In the x86 backend the default behavior is to unroll small loops with no more
|
||
|
|
than 4 insns by 1 time.
|
||
|
|
|
||
|
|
This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
|
||
|
|
0.9% code size increase. For other benchmarks the variations are minor
|
||
|
|
and overall codesize increased by 0.2%.
|
||
|
|
|
||
|
|
The kernel image size increased by 0.06%, and no impact on eembc.
|
||
|
|
|
||
|
|
gcc/ChangeLog:
|
||
|
|
|
||
|
|
* common/config/i386/i386-common.cc (ix86_option_optimization_table):
|
||
|
|
Enable small loop unroll at O2 by default.
|
||
|
|
* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
|
||
|
|
factor if -munroll-only-small-loops enabled and -funroll-loops/
|
||
|
|
-funroll-all-loops are disabled.
|
||
|
|
* config/i386/i386.h (struct processor_costs): Add 2 fields
|
||
|
|
small_unroll_ninsns and small_unroll_factor.
|
||
|
|
* config/i386/i386.opt: Add -munroll-only-small-loops.
|
||
|
|
* doc/invoke.texi: Document -munroll-only-small-loops.
|
||
|
|
* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
|
||
|
|
loop unrolling for -O2-speed and above if target hook
|
||
|
|
loop_unroll_adjust exists.
|
||
|
|
(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
|
||
|
|
when target hook loop_unroll_adjust exists.
|
||
|
|
* config/i386/x86-tune-costs.h: Update all processor costs
|
||
|
|
with small_unroll_ninsns = 4 and small_unroll_factor = 2.
|
||
|
|
|
||
|
|
gcc/testsuite/ChangeLog:
|
||
|
|
|
||
|
|
* gcc.dg/guality/loop-1.c: Add additional option
|
||
|
|
-mno-unroll-only-small-loops.
|
||
|
|
* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
|
||
|
|
* gcc.target/i386/pr93002.c: Likewise.
|
||
|
|
---
|
||
|
|
gcc/common/config/i386/i386-common.cc | 1 +
|
||
|
|
gcc/config/i386/i386.cc | 18 ++++++++
|
||
|
|
gcc/config/i386/i386.h | 5 +++
|
||
|
|
gcc/config/i386/i386.opt | 4 ++
|
||
|
|
gcc/config/i386/x86-tune-costs.h | 58 +++++++++++++++++++++++++
|
||
|
|
gcc/doc/invoke.texi | 11 ++++-
|
||
|
|
gcc/loop-init.cc | 10 +++--
|
||
|
|
gcc/testsuite/gcc.dg/guality/loop-1.c | 2 +
|
||
|
|
gcc/testsuite/gcc.target/i386/pr86270.c | 2 +-
|
||
|
|
gcc/testsuite/gcc.target/i386/pr93002.c | 2 +-
|
||
|
|
10 files changed, 107 insertions(+), 6 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
|
||
|
|
index e2594cae4..cdd5caa55 100644
|
||
|
|
--- a/gcc/common/config/i386/i386-common.cc
|
||
|
|
+++ b/gcc/common/config/i386/i386-common.cc
|
||
|
|
@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
|
||
|
|
/* The STC algorithm produces the smallest code at -Os, for x86. */
|
||
|
|
{ OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
|
||
|
|
REORDER_BLOCKS_ALGORITHM_STC },
|
||
|
|
+ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
|
||
|
|
/* Turn off -fschedule-insns by default. It tends to make the
|
||
|
|
problem with not enough registers even worse. */
|
||
|
|
{ OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
|
||
|
|
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
|
||
|
|
index 9a9ff3b34..e56004300 100644
|
||
|
|
--- a/gcc/config/i386/i386.cc
|
||
|
|
+++ b/gcc/config/i386/i386.cc
|
||
|
|
@@ -23570,6 +23570,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
|
||
|
|
unsigned i;
|
||
|
|
unsigned mem_count = 0;
|
||
|
|
|
||
|
|
+ /* Unroll small size loop when unroll factor is not explicitly
|
||
|
|
+ specified. */
|
||
|
|
+ if (!(flag_unroll_loops
|
||
|
|
+ || flag_unroll_all_loops
|
||
|
|
+ || loop->unroll))
|
||
|
|
+ {
|
||
|
|
+ nunroll = 1;
|
||
|
|
+
|
||
|
|
+ /* Any explicit -f{no-}unroll-{all-}loops turns off
|
||
|
|
+ -munroll-only-small-loops. */
|
||
|
|
+ if (ix86_unroll_only_small_loops
|
||
|
|
+ && !OPTION_SET_P (flag_unroll_loops)
|
||
|
|
+ && loop->ninsns <= ix86_cost->small_unroll_ninsns)
|
||
|
|
+ nunroll = ix86_cost->small_unroll_factor;
|
||
|
|
+
|
||
|
|
+ return nunroll;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
if (!TARGET_ADJUST_UNROLL)
|
||
|
|
return nunroll;
|
||
|
|
|
||
|
|
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
|
||
|
|
index fce0b3564..688aaabd3 100644
|
||
|
|
--- a/gcc/config/i386/i386.h
|
||
|
|
+++ b/gcc/config/i386/i386.h
|
||
|
|
@@ -219,6 +219,11 @@ struct processor_costs {
|
||
|
|
const char *const align_jump; /* Jump alignment. */
|
||
|
|
const char *const align_label; /* Label alignment. */
|
||
|
|
const char *const align_func; /* Function alignment. */
|
||
|
|
+
|
||
|
|
+ const unsigned small_unroll_ninsns; /* Insn count limit for small loop
|
||
|
|
+ to be unrolled. */
|
||
|
|
+ const unsigned small_unroll_factor; /* Unroll factor for small loop to
|
||
|
|
+ be unrolled. */
|
||
|
|
};
|
||
|
|
|
||
|
|
extern const struct processor_costs *ix86_cost;
|
||
|
|
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
|
||
|
|
index a3675e515..fc1b944ac 100644
|
||
|
|
--- a/gcc/config/i386/i386.opt
|
||
|
|
+++ b/gcc/config/i386/i386.opt
|
||
|
|
@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
|
||
|
|
-param=x86-stlf-window-ninsns=
|
||
|
|
Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
|
||
|
|
Instructions number above which STFL stall penalty can be compensated.
|
||
|
|
+
|
||
|
|
+munroll-only-small-loops
|
||
|
|
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
|
||
|
|
+Enable conservative small loop unrolling.
|
||
|
|
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
|
||
|
|
index f105d57ca..db4c2da34 100644
|
||
|
|
--- a/gcc/config/i386/x86-tune-costs.h
|
||
|
|
+++ b/gcc/config/i386/x86-tune-costs.h
|
||
|
|
@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
|
||
|
|
NULL, /* Jump alignment. */
|
||
|
|
NULL, /* Label alignment. */
|
||
|
|
NULL, /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* Processor costs (relative to an add) */
|
||
|
|
@@ -244,6 +246,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */
|
||
|
|
"4", /* Jump alignment. */
|
||
|
|
NULL, /* Label alignment. */
|
||
|
|
"4", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs i486_memcpy[2] = {
|
||
|
|
@@ -354,6 +358,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */
|
||
|
|
"16", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs pentium_memcpy[2] = {
|
||
|
|
@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static const
|
||
|
|
@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
|
||
|
|
@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs geode_memcpy[2] = {
|
||
|
|
@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
|
||
|
|
NULL, /* Jump alignment. */
|
||
|
|
NULL, /* Label alignment. */
|
||
|
|
NULL, /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs k6_memcpy[2] = {
|
||
|
|
@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
|
||
|
|
"32:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"32", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* For some reason, Athlon deals better with REP prefix (relative to loops)
|
||
|
|
@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* K8 has optimized REP instruction for medium sized blocks, but for very
|
||
|
|
@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
|
||
|
|
@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
|
||
|
|
"32:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"32", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* BDVER has optimized REP instruction for medium sized blocks, but for
|
||
|
|
@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"11", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
|
||
|
|
@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
|
||
|
|
"16", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* ZNVER2 has optimized REP instruction for medium sized blocks, but for
|
||
|
|
@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
|
||
|
|
"16", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
struct processor_costs znver3_cost = {
|
||
|
|
@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
|
||
|
|
"16", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* This table currently replicates znver3_cost table. */
|
||
|
|
@@ -1952,6 +1982,8 @@ struct processor_costs znver4_cost = {
|
||
|
|
"16", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
|
||
|
|
@@ -2076,6 +2108,8 @@ struct processor_costs skylake_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* icelake_cost should produce code tuned for Icelake family of CPUs.
|
||
|
|
@@ -2202,6 +2236,8 @@ struct processor_costs icelake_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* alderlake_cost should produce code tuned for alderlake family of CPUs. */
|
||
|
|
@@ -2322,6 +2358,8 @@ struct processor_costs alderlake_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
|
||
|
|
@@ -2435,6 +2473,8 @@ const struct processor_costs btver1_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"11", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs btver2_memcpy[2] = {
|
||
|
|
@@ -2545,6 +2585,8 @@ const struct processor_costs btver2_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"11", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs pentium4_memcpy[2] = {
|
||
|
|
@@ -2654,6 +2696,8 @@ struct processor_costs pentium4_cost = {
|
||
|
|
NULL, /* Jump alignment. */
|
||
|
|
NULL, /* Label alignment. */
|
||
|
|
NULL, /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs nocona_memcpy[2] = {
|
||
|
|
@@ -2766,6 +2810,8 @@ struct processor_costs nocona_cost = {
|
||
|
|
NULL, /* Jump alignment. */
|
||
|
|
NULL, /* Label alignment. */
|
||
|
|
NULL, /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs atom_memcpy[2] = {
|
||
|
|
@@ -2876,6 +2922,8 @@ struct processor_costs atom_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs slm_memcpy[2] = {
|
||
|
|
@@ -2986,6 +3034,8 @@ struct processor_costs slm_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs tremont_memcpy[2] = {
|
||
|
|
@@ -3110,6 +3160,8 @@ struct processor_costs tremont_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
static stringop_algs intel_memcpy[2] = {
|
||
|
|
@@ -3220,6 +3272,8 @@ struct processor_costs intel_cost = {
|
||
|
|
"16:8:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* Generic should produce code tuned for Core-i7 (and newer chips)
|
||
|
|
@@ -3339,6 +3393,8 @@ struct processor_costs generic_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
/* core_cost should produce code tuned for Core familly of CPUs. */
|
||
|
|
@@ -3465,5 +3521,7 @@ struct processor_costs core_cost = {
|
||
|
|
"16:11:8", /* Jump alignment. */
|
||
|
|
"0:0:8", /* Label alignment. */
|
||
|
|
"16", /* Func alignment. */
|
||
|
|
+ 4, /* Small unroll limit. */
|
||
|
|
+ 2, /* Small unroll factor. */
|
||
|
|
};
|
||
|
|
|
||
|
|
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
|
||
|
|
index ff8cd032f..16f4b367e 100644
|
||
|
|
--- a/gcc/doc/invoke.texi
|
||
|
|
+++ b/gcc/doc/invoke.texi
|
||
|
|
@@ -1449,7 +1449,8 @@ See RS/6000 and PowerPC Options.
|
||
|
|
-mgeneral-regs-only -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
|
||
|
|
-mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol
|
||
|
|
-mindirect-branch-register -mharden-sls=@var{choice} @gol
|
||
|
|
--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
|
||
|
|
+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
|
||
|
|
+-munroll-only-small-loops}
|
||
|
|
|
||
|
|
@emph{x86 Windows Options}
|
||
|
|
@gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol
|
||
|
|
@@ -33183,6 +33184,14 @@ treat access to protected symbols as local symbols. The default is
|
||
|
|
@option{-mno-direct-extern-access} and executable compiled with
|
||
|
|
@option{-mdirect-extern-access} may not be binary compatible if
|
||
|
|
protected symbols are used in shared libraries and executable.
|
||
|
|
+
|
||
|
|
+@item -munroll-only-small-loops
|
||
|
|
+@opindex munroll-only-small-loops
|
||
|
|
+@opindex mno-unroll-only-small-loops
|
||
|
|
+Controls conservative small loop unrolling. It is enabled by default at
|
||
|
|
+@option{-O2}, and unrolls loops with no more than 4 insns by 1 time. Explicit
|
||
|
|
+-f[no-]unroll-[all-]loops would disable this flag to avoid any
|
||
|
|
+unintended unrolling behavior that user does not want.
|
||
|
|
@end table
|
||
|
|
|
||
|
|
@node x86 Windows Options
|
||
|
|
diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
|
||
|
|
index 1e4f6cfd7..f1c717041 100644
|
||
|
|
--- a/gcc/loop-init.cc
|
||
|
|
+++ b/gcc/loop-init.cc
|
||
|
|
@@ -565,9 +565,12 @@ public:
|
||
|
|
{}
|
||
|
|
|
||
|
|
/* opt_pass methods: */
|
||
|
|
- virtual bool gate (function *)
|
||
|
|
+ virtual bool gate (function *fun)
|
||
|
|
{
|
||
|
|
- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
|
||
|
|
+ return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
|
||
|
|
+ || (targetm.loop_unroll_adjust
|
||
|
|
+ && optimize >= 2
|
||
|
|
+ && optimize_function_for_speed_p (fun)));
|
||
|
|
}
|
||
|
|
|
||
|
|
virtual unsigned int execute (function *);
|
||
|
|
@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
|
||
|
|
if (dump_file)
|
||
|
|
df_dump (dump_file);
|
||
|
|
|
||
|
|
- if (flag_unroll_loops)
|
||
|
|
+ if (flag_unroll_loops
|
||
|
|
+ || targetm.loop_unroll_adjust)
|
||
|
|
flags |= UAP_UNROLL;
|
||
|
|
if (flag_unroll_all_loops)
|
||
|
|
flags |= UAP_UNROLL_ALL;
|
||
|
|
diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
|
||
|
|
index 1b1f6d322..a32ea445a 100644
|
||
|
|
--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
|
||
|
|
+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
|
||
|
|
@@ -1,5 +1,7 @@
|
||
|
|
/* { dg-do run } */
|
||
|
|
/* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
|
||
|
|
+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
|
||
|
|
+
|
||
|
|
|
||
|
|
#include "../nop.h"
|
||
|
|
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
|
||
|
|
index 81841ef5b..cbc9fbb04 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/i386/pr86270.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
|
||
|
|
@@ -1,5 +1,5 @@
|
||
|
|
/* { dg-do compile } */
|
||
|
|
-/* { dg-options "-O2" } */
|
||
|
|
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
|
||
|
|
|
||
|
|
int *a;
|
||
|
|
long len;
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
|
||
|
|
index 0248fcc00..f75a847f7 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/i386/pr93002.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
|
||
|
|
@@ -1,6 +1,6 @@
|
||
|
|
/* PR target/93002 */
|
||
|
|
/* { dg-do compile } */
|
||
|
|
-/* { dg-options "-O2" } */
|
||
|
|
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
|
||
|
|
/* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
|
||
|
|
|
||
|
|
volatile int sink;
|
||
|
|
--
|
||
|
|
2.31.1
|
||
|
|
|