143 lines
4.3 KiB
Diff
143 lines
4.3 KiB
Diff
From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001
|
|
From: Jan Hubicka <jh@suse.cz>
|
|
Date: Fri, 29 Dec 2023 23:51:03 +0100
|
|
Subject: [PATCH 17/32] Disable FMADD in chains for Zen4 and generic
|
|
|
|
This patch disables use of FMA in matrix multiplication loop for generic (for
|
|
x86-64-v3) and zen4. I tested this on zen4 and Xeon Gold 6212U.
|
|
|
|
For Intel this is neutral both on the matrix multiplication microbenchmark
|
|
(attached) and spec2k17 where the difference was within noise for Core.
|
|
|
|
On core the micro-benchmark runs as follows:
|
|
|
|
With FMA:
|
|
|
|
578,500,241 cycles:u # 3.645 GHz
|
|
( +- 0.12% )
|
|
753,318,477 instructions:u # 1.30 insn per
|
|
cycle ( +- 0.00% )
|
|
125,417,701 branches:u # 790.227 M/sec
|
|
( +- 0.00% )
|
|
0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% )
|
|
|
|
No FMA:
|
|
|
|
577,573,960 cycles:u # 3.514 GHz
|
|
( +- 0.15% )
|
|
878,318,479 instructions:u # 1.52 insn per
|
|
cycle ( +- 0.00% )
|
|
125,417,702 branches:u # 763.035 M/sec
|
|
( +- 0.00% )
|
|
0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% )
|
|
|
|
So the cycle count is unchanged and discrete multiply+add takes same time as
|
|
FMA.
|
|
|
|
While on zen:
|
|
|
|
With FMA:
|
|
484875179 cycles:u # 3.599 GHz
|
|
( +- 0.05% ) (82.11%)
|
|
752031517 instructions:u # 1.55 insn per
|
|
cycle
|
|
125106525 branches:u # 928.712 M/sec
|
|
( +- 0.03% ) (85.09%)
|
|
128356 branch-misses:u # 0.10% of all
|
|
branches ( +- 0.06% ) (83.58%)
|
|
|
|
No FMA:
|
|
375875209 cycles:u # 3.592 GHz
|
|
( +- 0.08% ) (80.74%)
|
|
875725341 instructions:u # 2.33 insn per
|
|
cycle
|
|
124903825 branches:u # 1.194 G/sec
|
|
( +- 0.04% ) (84.59%)
|
|
0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% )
|
|
|
|
The difference is that Cores understand the fact that fmadd does not need
|
|
all three parameters to start computation, while Zen cores don't.
|
|
|
|
Since this seems a noticeable win on zen and no loss on Core it seems like a good
|
|
default for generic.
|
|
|
|
float a[SIZE][SIZE];
|
|
float b[SIZE][SIZE];
|
|
float c[SIZE][SIZE];
|
|
|
|
void init(void)
|
|
{
|
|
int i, j, k;
|
|
for(i=0; i<SIZE; ++i)
|
|
{
|
|
for(j=0; j<SIZE; ++j)
|
|
{
|
|
a[i][j] = (float)i + j;
|
|
b[i][j] = (float)i - j;
|
|
c[i][j] = 0.0f;
|
|
}
|
|
}
|
|
}
|
|
|
|
void mult(void)
|
|
{
|
|
int i, j, k;
|
|
|
|
for(i=0; i<SIZE; ++i)
|
|
{
|
|
for(j=0; j<SIZE; ++j)
|
|
{
|
|
for(k=0; k<SIZE; ++k)
|
|
{
|
|
c[i][j] += a[i][k] * b[k][j];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int main(void)
|
|
{
|
|
clock_t s, e;
|
|
|
|
init();
|
|
s=clock();
|
|
mult();
|
|
e=clock();
|
|
printf(" mult took %10d clocks\n", (int)(e-s));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
gcc/ChangeLog:
|
|
|
|
* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
|
|
X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic.
|
|
---
|
|
gcc/config/i386/x86-tune.def | 5 +++--
|
|
1 file changed, 3 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
|
|
index bdb455d20..fd095f3ec 100644
|
|
--- a/gcc/config/i386/x86-tune.def
|
|
+++ b/gcc/config/i386/x86-tune.def
|
|
@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
|
|
|
|
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
|
|
smaller FMA chain. */
|
|
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
|
|
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
|
|
+ | m_ZNVER3 | m_ZNVER4 | m_GENERIC)
|
|
|
|
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
|
|
smaller FMA chain. */
|
|
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
|
|
- | m_ALDERLAKE | m_SAPPHIRERAPIDS)
|
|
+ | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC)
|
|
|
|
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
|
|
smaller FMA chain. */
|
|
--
|
|
2.28.0.windows.1
|
|
|