138 lines
5.0 KiB
Diff
138 lines
5.0 KiB
Diff
|
|
From 4a70bfbf686c2b6a1ecd83fe851de826c612c3e0 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Xi Ruoyao <xry111@xry111.site>
|
||
|
|
Date: Tue, 14 Nov 2023 05:32:38 +0800
|
||
|
|
Subject: [PATCH] LoongArch: Use finer-grained DBAR hints
|
||
|
|
|
||
|
|
LA664 defines DBAR hints 0x1 - 0x1f (except 0xf and 0x1f) as follows [1-2]:
|
||
|
|
|
||
|
|
- Bit 4: kind of constraint (0: completion, 1: ordering)
|
||
|
|
- Bit 3: barrier for previous read (0: true, 1: false)
|
||
|
|
- Bit 2: barrier for previous write (0: true, 1: false)
|
||
|
|
- Bit 1: barrier for succeeding read (0: true, 1: false)
|
||
|
|
- Bit 0: barrier for succeeding write (0: true, 1: false)
|
||
|
|
|
||
|
|
LLVM has already utilized them for different memory orders [3]:
|
||
|
|
|
||
|
|
- Bit 4 is always set to one because it's only intended to be zero for
|
||
|
|
things like MMIO devices, which are out of the scope of memory orders.
|
||
|
|
- An acquire barrier is used to implement acquire loads like
|
||
|
|
|
||
|
|
ld.d $a1, $t0, 0
|
||
|
|
dbar acquire_hint
|
||
|
|
|
||
|
|
where the load operation (ld.d) should not be reordered with any load
|
||
|
|
or store operation after the acquire load. To accomplish this
|
||
|
|
constraint, we need to prevent the load operation from being reordered
|
||
|
|
after the barrier, and also prevent any following load/store operation
|
||
|
|
from being reordered before the barrier. Thus bits 0, 1, and 3 must
|
||
|
|
be zero, and bit 2 can be one, so acquire_hint should be 0b10100.
|
||
|
|
- A release barrier is used to implement release stores like
|
||
|
|
|
||
|
|
dbar release_hint
|
||
|
|
st.d $a1, $t0, 0
|
||
|
|
|
||
|
|
where the store operation (st.d) should not be reordered with any load
|
||
|
|
or store operation before the release store. So we need to prevent
|
||
|
|
the store operation from being reordered before the barrier, and also
|
||
|
|
prevent any preceding load/store operation from being reordered after
|
||
|
|
the barrier. Thus bits 0, 2, and 3 must be zero, and bit 1 can be one, so
|
||
|
|
release_hint should be 0b10010.
|
||
|
|
|
||
|
|
A similar mapping has been utilized for RISC-V GCC [4], LoongArch Linux
|
||
|
|
kernel [1], and LoongArch LLVM [3]. So the mapping should be correct.
|
||
|
|
And I've also bootstrapped & regtested GCC on a LA664 with this patch.
|
||
|
|
|
||
|
|
The LoongArch CPUs should treat "unknown" hints as dbar 0, so we can
|
||
|
|
unconditionally emit the new hints without a compiler switch.
|
||
|
|
|
||
|
|
[1]: https://git.kernel.org/torvalds/c/e031a5f3f1ed
|
||
|
|
[2]: https://github.com/loongson-community/docs/pull/12
|
||
|
|
[3]: https://github.com/llvm/llvm-project/pull/68787
|
||
|
|
[4]: https://gcc.gnu.org/r14-406
|
||
|
|
|
||
|
|
gcc/ChangeLog:
|
||
|
|
|
||
|
|
* config/loongarch/sync.md (mem_thread_fence): Remove redundant
|
||
|
|
check.
|
||
|
|
(mem_thread_fence_1): Emit finer-grained DBAR hints for
|
||
|
|
different memory models, instead of 0.
|
||
|
|
|
||
|
|
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||
|
|
---
|
||
|
|
gcc/config/loongarch/sync.md | 51 +++++++++++++++++++++++++++++-------
|
||
|
|
1 file changed, 42 insertions(+), 9 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md
|
||
|
|
index 9924d522bcd..1ad0c63e0d9 100644
|
||
|
|
--- a/gcc/config/loongarch/sync.md
|
||
|
|
+++ b/gcc/config/loongarch/sync.md
|
||
|
|
@@ -50,23 +50,56 @@
|
||
|
|
[(match_operand:SI 0 "const_int_operand" "")] ;; model
|
||
|
|
""
|
||
|
|
{
|
||
|
|
- if (INTVAL (operands[0]) != MEMMODEL_RELAXED)
|
||
|
|
- {
|
||
|
|
- rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
|
||
|
|
- MEM_VOLATILE_P (mem) = 1;
|
||
|
|
- emit_insn (gen_mem_thread_fence_1 (mem, operands[0]));
|
||
|
|
- }
|
||
|
|
+ rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
|
||
|
|
+ MEM_VOLATILE_P (mem) = 1;
|
||
|
|
+ emit_insn (gen_mem_thread_fence_1 (mem, operands[0]));
|
||
|
|
+
|
||
|
|
DONE;
|
||
|
|
})
|
||
|
|
|
||
|
|
-;; Until the LoongArch memory model (hence its mapping from C++) is finalized,
|
||
|
|
-;; conservatively emit a full FENCE.
|
||
|
|
+;; DBAR hint encoding for LA664 and later micro-architectures, paraphrased from
|
||
|
|
+;; the Linux patch revealing it [1]:
|
||
|
|
+;;
|
||
|
|
+;; - Bit 4: kind of constraint (0: completion, 1: ordering)
|
||
|
|
+;; - Bit 3: barrier for previous read (0: true, 1: false)
|
||
|
|
+;; - Bit 2: barrier for previous write (0: true, 1: false)
|
||
|
|
+;; - Bit 1: barrier for succeeding read (0: true, 1: false)
|
||
|
|
+;; - Bit 0: barrier for succeeding write (0: true, 1: false)
|
||
|
|
+;;
|
||
|
|
+;; [1]: https://git.kernel.org/torvalds/c/e031a5f3f1ed
|
||
|
|
+;;
|
||
|
|
+;; Implementations without support for the finer-granularity hints simply treat
|
||
|
|
+;; all as the full barrier (DBAR 0), so we can unconditionally start emitting
|
||
|
|
+;; the more precise hints right away.
|
||
|
|
(define_insn "mem_thread_fence_1"
|
||
|
|
[(set (match_operand:BLK 0 "" "")
|
||
|
|
(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))
|
||
|
|
(match_operand:SI 1 "const_int_operand" "")] ;; model
|
||
|
|
""
|
||
|
|
- "dbar\t0")
|
||
|
|
+ {
|
||
|
|
+ enum memmodel model = memmodel_base (INTVAL (operands[1]));
|
||
|
|
+
|
||
|
|
+ switch (model)
|
||
|
|
+ {
|
||
|
|
+ case MEMMODEL_ACQUIRE:
|
||
|
|
+ return "dbar\t0b10100";
|
||
|
|
+ case MEMMODEL_RELEASE:
|
||
|
|
+ return "dbar\t0b10010";
|
||
|
|
+ case MEMMODEL_ACQ_REL:
|
||
|
|
+ case MEMMODEL_SEQ_CST:
|
||
|
|
+ return "dbar\t0b10000";
|
||
|
|
+ default:
|
||
|
|
+ /* GCC internal: "For the '__ATOMIC_RELAXED' model no instructions
|
||
|
|
+ need to be issued and this expansion is not invoked."
|
||
|
|
+
|
||
|
|
+ __atomic builtins doc: "Consume is implemented using the
|
||
|
|
+ stronger acquire memory order because of a deficiency in C++11's
|
||
|
|
+ semantics." See PR 59448 and get_memmodel in builtins.cc.
|
||
|
|
+
|
||
|
|
+ Other values should not be returned by memmodel_base. */
|
||
|
|
+ gcc_unreachable ();
|
||
|
|
+ }
|
||
|
|
+ })
|
||
|
|
|
||
|
|
;; Atomic memory operations.
|
||
|
|
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|