340 lines
13 KiB
Diff
340 lines
13 KiB
Diff
|
|
From 33fff578e7df7aa7e236efc6c9c85c595918d86a Mon Sep 17 00:00:00 2001
|
||
|
|
From: Xi Ruoyao <xry111@xry111.site>
|
||
|
|
Date: Wed, 12 Apr 2023 11:45:48 +0000
|
||
|
|
Subject: [PATCH 043/124] LoongArch: Improve cpymemsi expansion [PR109465]
|
||
|
|
|
||
|
|
We'd been generating really bad block move sequences, which kernel
|
||
|
|
developers who tried __builtin_memcpy recently complained about. To improve
|
||
|
|
it:
|
||
|
|
|
||
|
|
1. Take advantage of -mno-strict-align. When it is set, set mode
|
||
|
|
size to UNITS_PER_WORD regardless of the alignment.
|
||
|
|
2. Halve the mode size when (block size) % (mode size) != 0, instead of
|
||
|
|
falling back to ld.bu/st.b at once.
|
||
|
|
3. Limit the length of block move sequence considering the number of
|
||
|
|
instructions, not the size of block. When -mstrict-align is set and
|
||
|
|
the block is not aligned, the old size limit for straight-line
|
||
|
|
implementation (64 bytes) was definitely too large (we don't have 64
|
||
|
|
registers anyway).
|
||
|
|
|
||
|
|
Change since v1: add a comment about the calculation of num_reg.
|
||
|
|
|
||
|
|
gcc/ChangeLog:
|
||
|
|
|
||
|
|
PR target/109465
|
||
|
|
* config/loongarch/loongarch-protos.h
|
||
|
|
(loongarch_expand_block_move): Add a parameter as alignment RTX.
|
||
|
|
* config/loongarch/loongarch.h:
|
||
|
|
(LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER): Remove.
|
||
|
|
(LARCH_MAX_MOVE_BYTES_STRAIGHT): Remove.
|
||
|
|
(LARCH_MAX_MOVE_OPS_PER_LOOP_ITER): Define.
|
||
|
|
(LARCH_MAX_MOVE_OPS_STRAIGHT): Define.
|
||
|
|
(MOVE_RATIO): Use LARCH_MAX_MOVE_OPS_PER_LOOP_ITER instead of
|
||
|
|
LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER.
|
||
|
|
* config/loongarch/loongarch.cc (loongarch_expand_block_move):
|
||
|
|
Take the alignment from the parameter, but set it to
|
||
|
|
UNITS_PER_WORD if !TARGET_STRICT_ALIGN. Limit the length of
|
||
|
|
straight-line implementation with LARCH_MAX_MOVE_OPS_STRAIGHT
|
||
|
|
instead of LARCH_MAX_MOVE_BYTES_STRAIGHT.
|
||
|
|
(loongarch_block_move_straight): When there are left-over bytes,
|
||
|
|
halve the mode size instead of falling back to byte mode at once.
|
||
|
|
(loongarch_block_move_loop): Limit the length of loop body with
|
||
|
|
LARCH_MAX_MOVE_OPS_PER_LOOP_ITER instead of
|
||
|
|
LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER.
|
||
|
|
* config/loongarch/loongarch.md (cpymemsi): Pass the alignment
|
||
|
|
to loongarch_expand_block_move.
|
||
|
|
|
||
|
|
gcc/testsuite/ChangeLog:
|
||
|
|
|
||
|
|
PR target/109465
|
||
|
|
* gcc.target/loongarch/pr109465-1.c: New test.
|
||
|
|
* gcc.target/loongarch/pr109465-2.c: New test.
|
||
|
|
* gcc.target/loongarch/pr109465-3.c: New test.
|
||
|
|
|
||
|
|
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||
|
|
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||
|
|
---
|
||
|
|
gcc/config/loongarch/loongarch-protos.h | 2 +-
|
||
|
|
gcc/config/loongarch/loongarch.cc | 95 +++++++++++--------
|
||
|
|
gcc/config/loongarch/loongarch.h | 10 +-
|
||
|
|
gcc/config/loongarch/loongarch.md | 3 +-
|
||
|
|
.../gcc.target/loongarch/pr109465-1.c | 9 ++
|
||
|
|
.../gcc.target/loongarch/pr109465-2.c | 9 ++
|
||
|
|
.../gcc.target/loongarch/pr109465-3.c | 12 +++
|
||
|
|
7 files changed, 91 insertions(+), 49 deletions(-)
|
||
|
|
create mode 100644 gcc/testsuite/gcc.target/loongarch/pr109465-1.c
|
||
|
|
create mode 100644 gcc/testsuite/gcc.target/loongarch/pr109465-2.c
|
||
|
|
create mode 100644 gcc/testsuite/gcc.target/loongarch/pr109465-3.c
|
||
|
|
|
||
|
|
diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h
|
||
|
|
index 0a9b47722..3ac3b5e19 100644
|
||
|
|
--- a/gcc/config/loongarch/loongarch-protos.h
|
||
|
|
+++ b/gcc/config/loongarch/loongarch-protos.h
|
||
|
|
@@ -95,7 +95,7 @@ extern void loongarch_expand_conditional_trap (rtx);
|
||
|
|
#endif
|
||
|
|
extern void loongarch_set_return_address (rtx, rtx);
|
||
|
|
extern bool loongarch_move_by_pieces_p (unsigned HOST_WIDE_INT, unsigned int);
|
||
|
|
-extern bool loongarch_expand_block_move (rtx, rtx, rtx);
|
||
|
|
+extern bool loongarch_expand_block_move (rtx, rtx, rtx, rtx);
|
||
|
|
extern bool loongarch_do_optimize_block_move_p (void);
|
||
|
|
|
||
|
|
extern bool loongarch_expand_ext_as_unaligned_load (rtx, rtx, HOST_WIDE_INT,
|
||
|
|
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
|
||
|
|
index 233dddbac..d3c6f22ad 100644
|
||
|
|
--- a/gcc/config/loongarch/loongarch.cc
|
||
|
|
+++ b/gcc/config/loongarch/loongarch.cc
|
||
|
|
@@ -4456,41 +4456,46 @@ loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
|
||
|
|
Assume that the areas do not overlap. */
|
||
|
|
|
||
|
|
static void
|
||
|
|
-loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
|
||
|
|
+loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length,
|
||
|
|
+ HOST_WIDE_INT delta)
|
||
|
|
{
|
||
|
|
- HOST_WIDE_INT offset, delta;
|
||
|
|
- unsigned HOST_WIDE_INT bits;
|
||
|
|
+ HOST_WIDE_INT offs, delta_cur;
|
||
|
|
int i;
|
||
|
|
machine_mode mode;
|
||
|
|
rtx *regs;
|
||
|
|
|
||
|
|
- bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest)));
|
||
|
|
-
|
||
|
|
- mode = int_mode_for_size (bits, 0).require ();
|
||
|
|
- delta = bits / BITS_PER_UNIT;
|
||
|
|
+ /* Calculate how many registers we'll need for the block move.
|
||
|
|
+ We'll emit length / delta move operations with delta as the size
|
||
|
|
+ first. Then we may still have length % delta bytes not copied.
|
||
|
|
+ We handle these remaining bytes by move operations with smaller
|
||
|
|
+ (halfed) sizes. For example, if length = 21 and delta = 8, we'll
|
||
|
|
+ emit two ld.d/st.d pairs, one ld.w/st.w pair, and one ld.b/st.b
|
||
|
|
+ pair. For each load/store pair we use a dedicated register to keep
|
||
|
|
+ the pipeline as populated as possible. */
|
||
|
|
+ HOST_WIDE_INT num_reg = length / delta;
|
||
|
|
+ for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2)
|
||
|
|
+ num_reg += !!(length & delta_cur);
|
||
|
|
|
||
|
|
/* Allocate a buffer for the temporary registers. */
|
||
|
|
- regs = XALLOCAVEC (rtx, length / delta);
|
||
|
|
+ regs = XALLOCAVEC (rtx, num_reg);
|
||
|
|
|
||
|
|
- /* Load as many BITS-sized chunks as possible. Use a normal load if
|
||
|
|
- the source has enough alignment, otherwise use left/right pairs. */
|
||
|
|
- for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
|
||
|
|
+ for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
|
||
|
|
{
|
||
|
|
- regs[i] = gen_reg_rtx (mode);
|
||
|
|
- loongarch_emit_move (regs[i], adjust_address (src, mode, offset));
|
||
|
|
- }
|
||
|
|
+ mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
|
||
|
|
|
||
|
|
- for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
|
||
|
|
- loongarch_emit_move (adjust_address (dest, mode, offset), regs[i]);
|
||
|
|
+ for (; offs + delta_cur <= length; offs += delta_cur, i++)
|
||
|
|
+ {
|
||
|
|
+ regs[i] = gen_reg_rtx (mode);
|
||
|
|
+ loongarch_emit_move (regs[i], adjust_address (src, mode, offs));
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
|
||
|
|
- /* Mop up any left-over bytes. */
|
||
|
|
- if (offset < length)
|
||
|
|
+ for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
|
||
|
|
{
|
||
|
|
- src = adjust_address (src, BLKmode, offset);
|
||
|
|
- dest = adjust_address (dest, BLKmode, offset);
|
||
|
|
- move_by_pieces (dest, src, length - offset,
|
||
|
|
- MIN (MEM_ALIGN (src), MEM_ALIGN (dest)),
|
||
|
|
- (enum memop_ret) 0);
|
||
|
|
+ mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
|
||
|
|
+
|
||
|
|
+ for (; offs + delta_cur <= length; offs += delta_cur, i++)
|
||
|
|
+ loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
@@ -4520,10 +4525,11 @@ loongarch_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
|
||
|
|
|
||
|
|
static void
|
||
|
|
loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
|
||
|
|
- HOST_WIDE_INT bytes_per_iter)
|
||
|
|
+ HOST_WIDE_INT align)
|
||
|
|
{
|
||
|
|
rtx_code_label *label;
|
||
|
|
rtx src_reg, dest_reg, final_src, test;
|
||
|
|
+ HOST_WIDE_INT bytes_per_iter = align * LARCH_MAX_MOVE_OPS_PER_LOOP_ITER;
|
||
|
|
HOST_WIDE_INT leftover;
|
||
|
|
|
||
|
|
leftover = length % bytes_per_iter;
|
||
|
|
@@ -4543,7 +4549,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
|
||
|
|
emit_label (label);
|
||
|
|
|
||
|
|
/* Emit the loop body. */
|
||
|
|
- loongarch_block_move_straight (dest, src, bytes_per_iter);
|
||
|
|
+ loongarch_block_move_straight (dest, src, bytes_per_iter, align);
|
||
|
|
|
||
|
|
/* Move on to the next block. */
|
||
|
|
loongarch_emit_move (src_reg,
|
||
|
|
@@ -4560,7 +4566,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
|
||
|
|
|
||
|
|
/* Mop up any left-over bytes. */
|
||
|
|
if (leftover)
|
||
|
|
- loongarch_block_move_straight (dest, src, leftover);
|
||
|
|
+ loongarch_block_move_straight (dest, src, leftover, align);
|
||
|
|
else
|
||
|
|
/* Temporary fix for PR79150. */
|
||
|
|
emit_insn (gen_nop ());
|
||
|
|
@@ -4570,25 +4576,32 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
|
||
|
|
memory reference SRC to memory reference DEST. */
|
||
|
|
|
||
|
|
bool
|
||
|
|
-loongarch_expand_block_move (rtx dest, rtx src, rtx length)
|
||
|
|
+loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align)
|
||
|
|
{
|
||
|
|
- int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT;
|
||
|
|
+ if (!CONST_INT_P (r_length))
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ HOST_WIDE_INT length = INTVAL (r_length);
|
||
|
|
+ if (length > loongarch_max_inline_memcpy_size)
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ HOST_WIDE_INT align = INTVAL (r_align);
|
||
|
|
+
|
||
|
|
+ if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD)
|
||
|
|
+ align = UNITS_PER_WORD;
|
||
|
|
|
||
|
|
- if (CONST_INT_P (length)
|
||
|
|
- && INTVAL (length) <= loongarch_max_inline_memcpy_size)
|
||
|
|
+ if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT)
|
||
|
|
{
|
||
|
|
- if (INTVAL (length) <= max_move_bytes)
|
||
|
|
- {
|
||
|
|
- loongarch_block_move_straight (dest, src, INTVAL (length));
|
||
|
|
- return true;
|
||
|
|
- }
|
||
|
|
- else if (optimize)
|
||
|
|
- {
|
||
|
|
- loongarch_block_move_loop (dest, src, INTVAL (length),
|
||
|
|
- LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER);
|
||
|
|
- return true;
|
||
|
|
- }
|
||
|
|
+ loongarch_block_move_straight (dest, src, length, align);
|
||
|
|
+ return true;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ if (optimize)
|
||
|
|
+ {
|
||
|
|
+ loongarch_block_move_loop (dest, src, length, align);
|
||
|
|
+ return true;
|
||
|
|
}
|
||
|
|
+
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
|
||
|
|
index 9d3cd9ca0..af24bfa01 100644
|
||
|
|
--- a/gcc/config/loongarch/loongarch.h
|
||
|
|
+++ b/gcc/config/loongarch/loongarch.h
|
||
|
|
@@ -1062,13 +1062,13 @@ typedef struct {
|
||
|
|
|
||
|
|
/* The maximum number of bytes that can be copied by one iteration of
|
||
|
|
a cpymemsi loop; see loongarch_block_move_loop. */
|
||
|
|
-#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)
|
||
|
|
+#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4
|
||
|
|
|
||
|
|
/* The maximum number of bytes that can be copied by a straight-line
|
||
|
|
implementation of cpymemsi; see loongarch_block_move_straight. We want
|
||
|
|
to make sure that any loop-based implementation will iterate at
|
||
|
|
least twice. */
|
||
|
|
-#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2)
|
||
|
|
+#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)
|
||
|
|
|
||
|
|
/* The base cost of a memcpy call, for MOVE_RATIO and friends. These
|
||
|
|
values were determined experimentally by benchmarking with CSiBE.
|
||
|
|
@@ -1076,7 +1076,7 @@ typedef struct {
|
||
|
|
#define LARCH_CALL_RATIO 8
|
||
|
|
|
||
|
|
/* Any loop-based implementation of cpymemsi will have at least
|
||
|
|
- LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory
|
||
|
|
+ LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory
|
||
|
|
moves, so allow individual copies of fewer elements.
|
||
|
|
|
||
|
|
When cpymemsi is not available, use a value approximating
|
||
|
|
@@ -1087,9 +1087,7 @@ typedef struct {
|
||
|
|
value of LARCH_CALL_RATIO to take that into account. */
|
||
|
|
|
||
|
|
#define MOVE_RATIO(speed) \
|
||
|
|
- (HAVE_cpymemsi \
|
||
|
|
- ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \
|
||
|
|
- : CLEAR_RATIO (speed) / 2)
|
||
|
|
+ (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO (speed) / 2)
|
||
|
|
|
||
|
|
/* For CLEAR_RATIO, when optimizing for size, give a better estimate
|
||
|
|
of the length of a memset call, but use the default otherwise. */
|
||
|
|
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
|
||
|
|
index b2f7c7f78..b23248c33 100644
|
||
|
|
--- a/gcc/config/loongarch/loongarch.md
|
||
|
|
+++ b/gcc/config/loongarch/loongarch.md
|
||
|
|
@@ -2488,7 +2488,8 @@
|
||
|
|
""
|
||
|
|
{
|
||
|
|
if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P
|
||
|
|
- && loongarch_expand_block_move (operands[0], operands[1], operands[2]))
|
||
|
|
+ && loongarch_expand_block_move (operands[0], operands[1],
|
||
|
|
+ operands[2], operands[3]))
|
||
|
|
DONE;
|
||
|
|
else
|
||
|
|
FAIL;
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..4cd35d139
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
|
||
|
|
@@ -0,0 +1,9 @@
|
||
|
|
+/* { dg-do compile } */
|
||
|
|
+/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
|
||
|
|
+
|
||
|
|
+extern char a[], b[];
|
||
|
|
+void test() { __builtin_memcpy(a, b, 15); }
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..703eb951c
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
|
||
|
|
@@ -0,0 +1,9 @@
|
||
|
|
+/* { dg-do compile } */
|
||
|
|
+/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
|
||
|
|
+
|
||
|
|
+extern long a[], b[];
|
||
|
|
+void test() { __builtin_memcpy(a, b, 15); }
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..d6a80659b
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
|
||
|
|
@@ -0,0 +1,12 @@
|
||
|
|
+/* { dg-do compile } */
|
||
|
|
+/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
|
||
|
|
+
|
||
|
|
+/* Three loop iterations each contains 4 st.b, and 3 st.b after the loop */
|
||
|
|
+/* { dg-final { scan-assembler-times "st\\.b" 7 } } */
|
||
|
|
+
|
||
|
|
+/* { dg-final { scan-assembler-not "st\\.h" } } */
|
||
|
|
+/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */
|
||
|
|
+/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */
|
||
|
|
+
|
||
|
|
+extern char a[], b[];
|
||
|
|
+void test() { __builtin_memcpy(a, b, 15); }
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|