8642 lines
239 KiB
Diff
8642 lines
239 KiB
Diff
From cbf63168fd04e1edda450303ef5c016666a74417 Mon Sep 17 00:00:00 2001
|
|
From: swcompiler <lc@wxiat.com>
|
|
Date: Fri, 29 Nov 2024 14:21:51 +0800
|
|
Subject: [PATCH 12/23] Sw64: Memory and String Implementation
|
|
|
|
---
|
|
sysdeps/sw_64/memchr.c | 179 ++++++
|
|
sysdeps/sw_64/memset.S | 221 +++++++
|
|
sysdeps/sw_64/memusage.h | 24 +
|
|
sysdeps/sw_64/rawmemchr.S | 89 +++
|
|
sysdeps/sw_64/stpcpy.S | 55 ++
|
|
sysdeps/sw_64/stpncpy.S | 106 ++++
|
|
sysdeps/sw_64/strcat.S | 71 +++
|
|
sysdeps/sw_64/strchr.S | 91 +++
|
|
sysdeps/sw_64/strcmp.S | 194 ++++++
|
|
sysdeps/sw_64/strcpy.S | 41 ++
|
|
sysdeps/sw_64/strlen.S | 76 +++
|
|
sysdeps/sw_64/strncat.S | 94 +++
|
|
sysdeps/sw_64/strncmp.S | 277 +++++++++
|
|
sysdeps/sw_64/strncpy.S | 87 +++
|
|
sysdeps/sw_64/strrchr.S | 107 ++++
|
|
sysdeps/sw_64/stxcpy.S | 291 +++++++++
|
|
sysdeps/sw_64/stxncpy.S | 349 +++++++++++
|
|
sysdeps/sw_64/sw6a/memcpy.S | 837 +++++++++++++++++++++++++
|
|
sysdeps/sw_64/sw6a/memset.S | 415 +++++++++++++
|
|
sysdeps/sw_64/sw6a/stxcpy.S | 314 ++++++++++
|
|
sysdeps/sw_64/sw6a/stxncpy.S | 392 ++++++++++++
|
|
sysdeps/sw_64/sw8a/memcpy.S | 320 ++++++++++
|
|
sysdeps/sw_64/sw8a/memmove.S | 1120 ++++++++++++++++++++++++++++++++++
|
|
sysdeps/sw_64/sw8a/memset.S | 332 ++++++++++
|
|
sysdeps/sw_64/sw8a/strcat.S | 669 ++++++++++++++++++++
|
|
sysdeps/sw_64/sw8a/strlen.S | 112 ++++
|
|
sysdeps/sw_64/sw8a/strncat.S | 829 +++++++++++++++++++++++++
|
|
sysdeps/sw_64/sw8a/stxcpy.S | 314 ++++++++++
|
|
sysdeps/sw_64/sw8a/stxncpy.S | 392 ++++++++++++
|
|
29 files changed, 8398 insertions(+)
|
|
create mode 100644 sysdeps/sw_64/memchr.c
|
|
create mode 100644 sysdeps/sw_64/memset.S
|
|
create mode 100644 sysdeps/sw_64/memusage.h
|
|
create mode 100644 sysdeps/sw_64/rawmemchr.S
|
|
create mode 100644 sysdeps/sw_64/stpcpy.S
|
|
create mode 100644 sysdeps/sw_64/stpncpy.S
|
|
create mode 100644 sysdeps/sw_64/strcat.S
|
|
create mode 100644 sysdeps/sw_64/strchr.S
|
|
create mode 100644 sysdeps/sw_64/strcmp.S
|
|
create mode 100644 sysdeps/sw_64/strcpy.S
|
|
create mode 100644 sysdeps/sw_64/strlen.S
|
|
create mode 100644 sysdeps/sw_64/strncat.S
|
|
create mode 100644 sysdeps/sw_64/strncmp.S
|
|
create mode 100644 sysdeps/sw_64/strncpy.S
|
|
create mode 100644 sysdeps/sw_64/strrchr.S
|
|
create mode 100644 sysdeps/sw_64/stxcpy.S
|
|
create mode 100644 sysdeps/sw_64/stxncpy.S
|
|
create mode 100644 sysdeps/sw_64/sw6a/memcpy.S
|
|
create mode 100644 sysdeps/sw_64/sw6a/memset.S
|
|
create mode 100644 sysdeps/sw_64/sw6a/stxcpy.S
|
|
create mode 100644 sysdeps/sw_64/sw6a/stxncpy.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/memcpy.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/memmove.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/memset.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/strcat.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/strlen.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/strncat.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/stxcpy.S
|
|
create mode 100644 sysdeps/sw_64/sw8a/stxncpy.S
|
|
|
|
diff --git a/sysdeps/sw_64/memchr.c b/sysdeps/sw_64/memchr.c
|
|
new file mode 100644
|
|
index 00000000..a21112a3
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/memchr.c
|
|
@@ -0,0 +1,179 @@
|
|
+/* Copyright (C) 2010-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include <string.h>
|
|
+
|
|
+typedef unsigned long word;
|
|
+
|
|
+static inline word
|
|
+ldq_u (const void *s)
|
|
+{
|
|
+ return *(const word *) ((word) s & -8);
|
|
+}
|
|
+
|
|
+#define unlikely(X) __builtin_expect ((X), 0)
|
|
+#define prefetch(X) __builtin_prefetch ((void *) (X), 0)
|
|
+
|
|
+#define cmpbeq0(X) __builtin_sw_64_cmpbge (0, (X))
|
|
+#define find(X, Y) cmpbeq0 ((X) ^ (Y))
|
|
+
|
|
+/* Search no more than N bytes of S for C. */
|
|
+
|
|
+void *
|
|
+__memchr (const void *s, int xc, size_t n)
|
|
+{
|
|
+ const word *s_align;
|
|
+ word t, current, found, mask, offset;
|
|
+
|
|
+ if (unlikely (n == 0))
|
|
+ return 0;
|
|
+
|
|
+ current = ldq_u (s);
|
|
+
|
|
+ /* Replicate low byte of XC into all bytes of C. */
|
|
+ t = xc & 0xff; /* 0000000c */
|
|
+ t = (t << 8) | t; /* 000000cc */
|
|
+ t = (t << 16) | t; /* 0000cccc */
|
|
+ const word c = (t << 32) | t; /* cccccccc */
|
|
+
|
|
+ /* Align the source, and decrement the count by the number
|
|
+ of bytes searched in the first word. */
|
|
+ s_align = (const word *) ((word) s & -8);
|
|
+ {
|
|
+ size_t inc = n + ((word) s & 7);
|
|
+ n = inc | -(inc < n);
|
|
+ }
|
|
+
|
|
+ /* Deal with misalignment in the first word for the comparison. */
|
|
+ mask = (1ul << ((word) s & 7)) - 1;
|
|
+
|
|
+ /* If the entire string fits within one word, we may need masking
|
|
+ at both the front and the back of the string. */
|
|
+ if (unlikely (n <= 8))
|
|
+ {
|
|
+ mask |= -1ul << n;
|
|
+ goto last_quad;
|
|
+ }
|
|
+
|
|
+ found = find (current, c) & ~mask;
|
|
+ if (unlikely (found))
|
|
+ goto found_it;
|
|
+
|
|
+ s_align++;
|
|
+ n -= 8;
|
|
+
|
|
+ /* If the block is sufficiently large, align to cacheline and prefetch. */
|
|
+ if (unlikely (n >= 256))
|
|
+ {
|
|
+ /* Prefetch 3 cache lines beyond the one we're working on. */
|
|
+ prefetch (s_align + 8);
|
|
+ prefetch (s_align + 16);
|
|
+ prefetch (s_align + 24);
|
|
+
|
|
+ while ((word) s_align & 63)
|
|
+ {
|
|
+ current = *s_align;
|
|
+ found = find (current, c);
|
|
+ if (found)
|
|
+ goto found_it;
|
|
+ s_align++;
|
|
+ n -= 8;
|
|
+ }
|
|
+
|
|
+ /* Within each cacheline, advance the load for the next word
|
|
+ before the test for the previous word is complete. This
|
|
+ allows us to hide the 3 cycle L1 cache load latency. We
|
|
+ only perform this advance load within a cacheline to prevent
|
|
+      reading across a page boundary.  */
|
|
+#define CACHELINE_LOOP \
|
|
+ do \
|
|
+ { \
|
|
+ word i, next = s_align[0]; \
|
|
+ for (i = 0; i < 7; ++i) \
|
|
+ { \
|
|
+ current = next; \
|
|
+ next = s_align[1]; \
|
|
+ found = find (current, c); \
|
|
+ if (unlikely (found)) \
|
|
+ goto found_it; \
|
|
+ s_align++; \
|
|
+ } \
|
|
+ current = next; \
|
|
+ found = find (current, c); \
|
|
+ if (unlikely (found)) \
|
|
+ goto found_it; \
|
|
+ s_align++; \
|
|
+ n -= 64; \
|
|
+ } \
|
|
+ while (0)
|
|
+
|
|
+ /* While there's still lots more data to potentially be read,
|
|
+ continue issuing prefetches for the 4th cacheline out. */
|
|
+ while (n >= 256)
|
|
+ {
|
|
+ prefetch (s_align + 24);
|
|
+ CACHELINE_LOOP;
|
|
+ }
|
|
+
|
|
+ /* Up to 3 cache lines remaining. Continue issuing advanced
|
|
+ loads, but stop prefetching. */
|
|
+ while (n >= 64)
|
|
+ CACHELINE_LOOP;
|
|
+
|
|
+ /* We may have exhausted the buffer. */
|
|
+ if (n == 0)
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ /* Quadword aligned loop. */
|
|
+ current = *s_align;
|
|
+ while (n > 8)
|
|
+ {
|
|
+ found = find (current, c);
|
|
+ if (unlikely (found))
|
|
+ goto found_it;
|
|
+ current = *++s_align;
|
|
+ n -= 8;
|
|
+ }
|
|
+
|
|
+ /* The last word may need masking at the tail of the compare. */
|
|
+ mask = -1ul << n;
|
|
+last_quad:
|
|
+ found = find (current, c) & ~mask;
|
|
+ if (found == 0)
|
|
+ return NULL;
|
|
+
|
|
+found_it:
|
|
+#ifdef __sw_64_cix__
|
|
+ offset = __builtin_sw_64_cttz (found);
|
|
+#else
|
|
+ /* Extract LSB. */
|
|
+ found &= -found;
|
|
+
|
|
+ /* Binary search for the LSB. */
|
|
+ offset = (found & 0x0f ? 0 : 4);
|
|
+ offset += (found & 0x33 ? 0 : 2);
|
|
+ offset += (found & 0x55 ? 0 : 1);
|
|
+#endif
|
|
+
|
|
+ return (void *) ((word) s_align + offset);
|
|
+}
|
|
+
|
|
+#ifdef weak_alias
|
|
+weak_alias (__memchr, memchr)
|
|
+#endif
|
|
+ libc_hidden_builtin_def (memchr)
|
|
diff --git a/sysdeps/sw_64/memset.S b/sysdeps/sw_64/memset.S
|
|
new file mode 100644
|
|
index 00000000..5a40ce37
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/memset.S
|
|
@@ -0,0 +1,221 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch sw6b
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ENTRY(memset)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * Serious stalling happens. The only way to mitigate this is to
|
|
+ * undertake a major re-write to interleave the constant materialization
|
|
+ * with other parts of the fall-through code. This is important, even
|
|
+ * though it makes maintenance tougher.
|
|
+ * Do this later.
|
|
+ */
|
|
+ and $17, 255, $1 # E : 00000000000000ch
|
|
+ ins0b $17, 1, $2 # U : 000000000000ch00
|
|
+ mov $16, $0 # E : return value
|
|
+ ble $18, $end # U : zero length requested
|
|
+
|
|
+ addl $18, $16, $6 # E : max address to write to
|
|
+ or $1, $2, $17 # E : 000000000000chch
|
|
+ ins0b $1, 2, $3 # U : 0000000000ch0000
|
|
+ ins0b $1, 3, $4 # U : 00000000ch000000
|
|
+
|
|
+ or $3, $4, $3 # E : 00000000chch0000
|
|
+ ins1b $17, 4, $5 # U : 0000chch00000000
|
|
+ xor $16, $6, $1 # E : will complete write be within one quadword
|
|
+ ins1b $17, 6, $2 # U : chch000000000000
|
|
+
|
|
+ or $17, $3, $17 # E : 00000000chchchch
|
|
+ or $2, $5, $2 # E : chchchch00000000
|
|
+ bic $1, 7, $1 # E : fit within a single quadword
|
|
+ and $16, 7, $3 # E : Target addr misalignment
|
|
+
|
|
+ or $17, $2, $17 # E : chchchchchchchch
|
|
+ beq $1, $within_quad # U :
|
|
+ nop # E :
|
|
+ beq $3, $aligned # U : target is 0mod8
|
|
+
|
|
+ /*
|
|
+ * Target address is misaligned, and won't fit within a quadword.
|
|
+ */
|
|
+ ldl_u $4, 0($16) # L : Fetch first partial
|
|
+ mov $16, $5 # E : Save the address
|
|
+ ins3b $17, $16, $2 # U : Insert new bytes
|
|
+ subl $3, 8, $3 # E : Invert (for addressing uses)
|
|
+
|
|
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
|
|
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
|
|
+ subl $16, $3, $16 # E : $16 is new aligned destination
|
|
+ or $2, $4, $1 # E : Final bytes
|
|
+
|
|
+ nop
|
|
+ stl_u $1,0($5) # L : Store result
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+$aligned:
|
|
+ /*
|
|
+ * We are now guaranteed to be quad aligned, with at least
|
|
+ * one partial quad to write.
|
|
+ */
|
|
+
|
|
+ sra $18, 3, $3 # U : Number of remaining quads to write
|
|
+ and $18, 7, $18 # E : Number of trailing bytes to write
|
|
+ mov $16, $5 # E : Save dest address
|
|
+ beq $3, $no_quad # U : tail stuff only
|
|
+
|
|
+ /*
|
|
+ * It's worth the effort to unroll this and use wh64 if possible.
|
|
+ * At this point, entry values are:
|
|
+ * $16 Current destination address
|
|
+ * $5 A copy of $16
|
|
+ * $6 The max quadword address to write to
|
|
+ * $18 Number trailer bytes
|
|
+ * $3 Number quads to write
|
|
+ */
|
|
+ and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
|
|
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
|
|
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
|
|
+ blt $4, $loop # U :
|
|
+
|
|
+ /*
|
|
+ * We know we've got at least 16 quads, minimum of one trip
|
|
+ * through unrolled loop. Do a quad at a time to get us 0mod64
|
|
+ * aligned.
|
|
+ */
|
|
+
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ beq $1, $bigalign # U :
|
|
+$alignmod32:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : For consistency later
|
|
+ addl $1, 8, $1 # E : Increment towards zero for alignment
|
|
+ addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ blt $1, $alignmod32 # U :
|
|
+
|
|
+$bigalign:
|
|
+ /*
|
|
+ * $3 - number quads left to go
|
|
+ * $5 - target address (aligned 0mod64)
|
|
+ * $17 - mask of stuff to store
|
|
+ * Scratch registers available: $7, $2, $4, $1
|
|
+ * We know that we'll be taking a minimum of one trip through.
|
|
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
|
|
+ * Assumes the wh64 needs to be for 2 trips through the loop in the
|
|
+ * future. The wh64 is issued for the starting destination address for
|
|
+ * trip +2 through the loop, and if there are less than two trips left,
|
|
+ * the target address will be for the current trip. */
|
|
+
|
|
+$do_wh64:
|
|
+ wh64 ($4) # L1 : memory subsystem write hint
|
|
+ subl $3, 24, $2 # E : For determining future wh64 addresses
|
|
+ stl $17, 0($5) # L :
|
|
+ nop # E :
|
|
+
|
|
+ addl $5, 128, $4 # E : speculative target of next wh64
|
|
+ stl $17, 8($5) # L :
|
|
+ stl $17, 16($5) # L :
|
|
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
|
|
+
|
|
+ stl $17, 24($5) # L :
|
|
+ stl $17, 32($5) # L :
|
|
+ sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
|
|
+ nop
|
|
+
|
|
+ stl $17, 40($5) # L :
|
|
+ stl $17, 48($5) # L :
|
|
+ subl $3, 16, $2 # E : Repeat the loop at least once more
|
|
+ nop
|
|
+
|
|
+ stl $17, 56($5) # L :
|
|
+ addl $5, 64, $5 # E :
|
|
+ subl $3, 8, $3 # E :
|
|
+ bge $2, $do_wh64 # U :
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ beq $3, $no_quad # U : Might have finished already
|
|
+
|
|
+ .align 4
|
|
+ /*
|
|
+ * Simple loop for trailing quadwords, or for small amounts
|
|
+ * of data (where we can't use an unrolled loop and wh64)
|
|
+ */
|
|
+$loop:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : Decrement number quads left
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ bne $3, $loop # U : more
|
|
+
|
|
+$no_quad:
|
|
+ /*
|
|
+ * Write 0..7 trailing bytes.
|
|
+ */
|
|
+ nop # E :
|
|
+ beq $18, $end # U : All done
|
|
+ ldl $7, 0($5) # L :
|
|
+ mask7b $7, $6, $2 # U : Mask final quad
|
|
+
|
|
+ ins7b $17, $6, $4 # U : New bits
|
|
+ or $2, $4, $1 # E : Put it all together
|
|
+ stl $1, 0($5) # L : And back to memory
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+$within_quad:
|
|
+ ldl_u $1, 0($16) # L :
|
|
+ ins3b $17, $16, $2 # U : New bits
|
|
+ mask3b $1, $16, $4 # U : Clear old
|
|
+ or $2, $4, $2 # E : New result
|
|
+
|
|
+ mask3b $2, $6, $4 # U :
|
|
+ mask7b $1, $6, $2 # U :
|
|
+ or $2, $4, $1 # E :
|
|
+ stl_u $1, 0($16) # L :
|
|
+
|
|
+$end:
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+ END(memset)
|
|
+libc_hidden_builtin_def (memset)
|
|
diff --git a/sysdeps/sw_64/memusage.h b/sysdeps/sw_64/memusage.h
|
|
new file mode 100644
|
|
index 00000000..ea383e07
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/memusage.h
|
|
@@ -0,0 +1,24 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#define GETSP() \
|
|
+ ({ \
|
|
+ register uintptr_t stack_ptr asm ("$30"); \
|
|
+ stack_ptr; \
|
|
+ })
|
|
+
|
|
+#include <sysdeps/generic/memusage.h>
|
|
diff --git a/sysdeps/sw_64/rawmemchr.S b/sysdeps/sw_64/rawmemchr.S
|
|
new file mode 100644
|
|
index 00000000..8ae57459
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/rawmemchr.S
|
|
@@ -0,0 +1,89 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Return pointer to first occurrence of CH in STR. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+
|
|
+ENTRY(__rawmemchr)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ zapnot a1, 1, a1 # e0 : zero extend the search character
|
|
+ ldl_u t0, 0(a0) # .. e1 : load first quadword
|
|
+ sll a1, 8, t5 # e0 : replicate the search character
|
|
+ andnot a0, 7, v0 # .. e1 : align our loop pointer
|
|
+
|
|
+ or t5, a1, a1 # e0 :
|
|
+ ldi t4, -1 # .. e1 : build garbage mask
|
|
+ sll a1, 16, t5 # e0 :
|
|
+ unop # :
|
|
+
|
|
+ mask7b t4, a0, t4 # e0 :
|
|
+ or t5, a1, a1 # .. e1 :
|
|
+ sll a1, 32, t5 # e0 :
|
|
+ cmpgeb zero, t4, t4 # .. e1 : bits set iff byte is garbage
|
|
+
|
|
+ or t5, a1, a1 # e0 :
|
|
+ xor t0, a1, t1 # .. e1 : make bytes == c zero
|
|
+ cmpgeb zero, t1, t3 # e0 : bits set iff byte == c
|
|
+ unop # :
|
|
+
|
|
+ andnot t3, t4, t0 # e0 : clear garbage bits
|
|
+ fnop # .. fa :
|
|
+ unop # :
|
|
+ bne t0, $found # .. e1 (zdb)
|
|
+
|
|
+ .align 4
|
|
+$loop:
|
|
+ ldl t0, 8(v0) # e0 :
|
|
+ addl v0, 8, v0 # .. e1 :
|
|
+ nop # e0 :
|
|
+ xor t0, a1, t1 # .. e1 (data stall)
|
|
+
|
|
+ cmpgeb zero, t1, t0 # e0 : bits set iff byte == c
|
|
+ beq t0, $loop # .. e1 (zdb)
|
|
+
|
|
+$found:
|
|
+ negl t0, t1 # e0 : clear all but least set bit
|
|
+ and t0, t1, t0 # e1 (stall)
|
|
+ and t0, 0xf0, t2 # e0 : binary search for that set bit
|
|
+ and t0, 0xcc, t3 # .. e1 :
|
|
+
|
|
+ and t0, 0xaa, t4 # e0 :
|
|
+ selne t2, 4, t2, t2 # .. e1 :
|
|
+ selne t3, 2, t3, t3 # e0 :
|
|
+ selne t4, 1, t4, t4 # .. e1 :
|
|
+
|
|
+ addl t2, t3, t2 # e0 :
|
|
+ addl v0, t4, v0 # .. e1 :
|
|
+ addl v0, t2, v0 # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+ END(__rawmemchr)
|
|
+
|
|
+libc_hidden_def (__rawmemchr)
|
|
+weak_alias (__rawmemchr, rawmemchr)
|
|
diff --git a/sysdeps/sw_64/stpcpy.S b/sysdeps/sw_64/stpcpy.S
|
|
new file mode 100644
|
|
index 00000000..733ad089
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/stpcpy.S
|
|
@@ -0,0 +1,55 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy a null-terminated string from SRC to DST. Return a pointer
|
|
+ to the null-terminator in the source. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(__stpcpy)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ .set noat
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .set at
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ call t9, __stxcpy # do the work of the copy
|
|
+
|
|
+ and t8, 0xf0, t2 # binary search for byte offset of the
|
|
+ and t8, 0xcc, t1 # last byte written.
|
|
+ and t8, 0xaa, t0
|
|
+ andnot a0, 7, a0
|
|
+ selne t2, 4, t2, t2
|
|
+ selne t1, 2, t1, t1
|
|
+ selne t0, 1, t0, t0
|
|
+ addl a0, t2, v0
|
|
+ addl t0, t1, t0
|
|
+ addl v0, t0, v0
|
|
+
|
|
+ ret
|
|
+
|
|
+ END(__stpcpy)
|
|
+
|
|
+weak_alias (__stpcpy, stpcpy)
|
|
+libc_hidden_def (__stpcpy)
|
|
+libc_hidden_builtin_def (stpcpy)
|
|
diff --git a/sysdeps/sw_64/stpncpy.S b/sysdeps/sw_64/stpncpy.S
|
|
new file mode 100644
|
|
index 00000000..1a52ba85
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/stpncpy.S
|
|
@@ -0,0 +1,106 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy no more than COUNT bytes of the null-terminated string from
|
|
+ SRC to DST. If SRC does not cover all of COUNT, the balance is
|
|
+ zeroed. Return the address of the terminating null in DEST, if
|
|
+ any, else DEST + COUNT. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(__stpncpy)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ beq a2, $zerocount
|
|
+ call t9, __stxncpy # do the work of the copy
|
|
+
|
|
+ and t8, 0xf0, t3 # binary search for byte offset of the
|
|
+ and t8, 0xcc, t2 # last byte written.
|
|
+ and t8, 0xaa, t1
|
|
+ andnot a0, 7, v0
|
|
+ selne t3, 4, t3, t3
|
|
+ selne t2, 2, t2, t2
|
|
+ selne t1, 1, t1, t1
|
|
+ addl v0, t3, v0
|
|
+ addl t1, t2, t1
|
|
+ addl v0, t1, v0
|
|
+
|
|
+ bne a2, $multiword # do we have full words left?
|
|
+
|
|
+ .align 3
|
|
+ zapnot t0, t8, t4 # e0 : was last byte a null?
|
|
+ subl t8, 1, t2 # .. e1 :
|
|
+ addl v0, 1, t5 # e0 :
|
|
+ subl t10, 1, t3 # .. e1 :
|
|
+ or t2, t8, t2 # e0 : clear the bits between the last
|
|
+ or t3, t10, t3 # .. e1 : written byte and the last byte in
|
|
+ andnot t3, t2, t3 # e0 : COUNT
|
|
+ selne t4, t5, v0, v0 # .. e1 : if last written wasnt null, inc v0
|
|
+ zap t0, t3, t0 # e0 :
|
|
+ stl_u t0, 0(a0) # e1 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+ .align 3
|
|
+$multiword:
|
|
+ subl t8, 1, t7 # e0 : clear the final bits in the prev
|
|
+ or t7, t8, t7 # e1 : word
|
|
+ zapnot t0, t7, t0 # e0 :
|
|
+ subl a2, 1, a2 # .. e1 :
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+
|
|
+ beq a2, 1f # e1 :
|
|
+ blbc a2, 0f # e1 :
|
|
+
|
|
+ stl_u zero, 0(a0) # e0 : zero one word
|
|
+ subl a2, 1, a2 # .. e1 :
|
|
+ addl a0, 8, a0 # e0 :
|
|
+ beq a2, 1f # .. e1 :
|
|
+
|
|
+0: stl_u zero, 0(a0) # e0 : zero two words
|
|
+ subl a2, 2, a2 # .. e1 :
|
|
+ stl_u zero, 8(a0) # e0 :
|
|
+ addl a0, 16, a0 # .. e1 :
|
|
+ bne a2, 0b # e1 :
|
|
+ unop
|
|
+
|
|
+1: ldl_u t0, 0(a0) # e0 : clear the leading bits in the final
|
|
+ subl t10, 1, t7 # .. e1 : word
|
|
+ or t7, t10, t7 # e0 :
|
|
+ zap t0, t7, t0 # e1 (stall)
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+$zerocount:
|
|
+ mov a0, v0
|
|
+ ret
|
|
+
|
|
+ END(__stpncpy)
|
|
+
|
|
+libc_hidden_def (__stpncpy)
|
|
+weak_alias (__stpncpy, stpncpy)
|
|
diff --git a/sysdeps/sw_64/strcat.S b/sysdeps/sw_64/strcat.S
|
|
new file mode 100644
|
|
index 00000000..778a2e3d
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strcat.S
|
|
@@ -0,0 +1,71 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Append a null-terminated string from SRC to DST. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strcat)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ .set noat
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .set at
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ mov a0, v0 # set up return value
|
|
+
|
|
+ /* Find the end of the string. */
|
|
+
|
|
+ ldl_u t0, 0(a0) # load first quadword (a0 may be misaligned)
|
|
+ ldi t1, -1(zero)
|
|
+ ins7b t1, a0, t1
|
|
+ andnot a0, 7, a0
|
|
+ or t1, t0, t0
|
|
+ cmpgeb zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
|
|
+ bne t1, $found
|
|
+
|
|
+$loop: ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ beq t1, $loop
|
|
+
|
|
+$found: negl t1, t2 # clear all but least set bit
|
|
+ and t1, t2, t1
|
|
+
|
|
+ and t1, 0xf0, t2 # binary search for that set bit
|
|
+ and t1, 0xcc, t3
|
|
+ and t1, 0xaa, t4
|
|
+ selne t2, 4, t2, t2
|
|
+ selne t3, 2, t3, t3
|
|
+ selne t4, 1, t4, t4
|
|
+ addl t2, t3, t2
|
|
+ addl a0, t4, a0
|
|
+ addl a0, t2, a0
|
|
+
|
|
+ /* Now do the append. */
|
|
+
|
|
+ mov ra, t9
|
|
+ jmp $31, __stxcpy
|
|
+
|
|
+ END(strcat)
|
|
+libc_hidden_builtin_def (strcat)
|
|
diff --git a/sysdeps/sw_64/strchr.S b/sysdeps/sw_64/strchr.S
|
|
new file mode 100644
|
|
index 00000000..c3d20f05
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strchr.S
|
|
@@ -0,0 +1,91 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Return the address of a given character within a null-terminated
|
|
+ string, or null if it is not found.
|
|
+
|
|
+*/
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+
|
|
+ENTRY(strchr)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ zapnot a1, 1, a1 # e0 : zero extend the search character
|
|
+ ldl_u t0, 0(a0) # .. e1 : load first quadword
|
|
+ sll a1, 8, t5 # e0 : replicate the search character
|
|
+ andnot a0, 7, v0 # .. e1 : align our loop pointer
|
|
+ or t5, a1, a1 # e0 :
|
|
+ ldi t4, -1 # .. e1 : build garbage mask
|
|
+ sll a1, 16, t5 # e0 :
|
|
+ cmpgeb zero, t0, t2 # .. e1 : bits set iff byte == zero
|
|
+ mask7b t4, a0, t4 # e0 :
|
|
+ or t5, a1, a1 # .. e1 :
|
|
+ sll a1, 32, t5 # e0 :
|
|
+ cmpgeb zero, t4, t4 # .. e1 : bits set iff byte is garbage
|
|
+ or t5, a1, a1 # e0 :
|
|
+ xor t0, a1, t1 # .. e1 : make bytes == c zero
|
|
+ cmpgeb zero, t1, t3 # e0 : bits set iff byte == c
|
|
+ or t2, t3, t0 # e1 : bits set iff char match or zero match
|
|
+ andnot t0, t4, t0 # e0 : clear garbage bits
|
|
+ bne t0, $found # .. e1 (zdb)
|
|
+
|
|
+$loop: ldl t0, 8(v0) # e0 :
|
|
+ addl v0, 8, v0 # .. e1 :
|
|
+ nop # e0 :
|
|
+ xor t0, a1, t1 # .. e1 (data stall)
|
|
+ cmpgeb zero, t0, t2 # e0 : bits set iff byte == 0
|
|
+ cmpgeb zero, t1, t3 # .. e1 : bits set iff byte == c
|
|
+ or t2, t3, t0 # e0 :
|
|
+ beq t0, $loop # .. e1 (zdb)
|
|
+
|
|
+$found: negl t0, t1 # e0 : clear all but least set bit
|
|
+ and t0, t1, t0 # e1 (stall)
|
|
+
|
|
+ and t0, t3, t1 # e0 : bit set iff byte was the char
|
|
+ beq t1, $retnull # .. e1 (zdb)
|
|
+
|
|
+ and t0, 0xf0, t2 # e0 : binary search for that set bit
|
|
+ and t0, 0xcc, t3 # .. e1 :
|
|
+ and t0, 0xaa, t4 # e0 :
|
|
+ selne t2, 4, t2, t2 # .. e1 :
|
|
+ selne t3, 2, t3, t3 # e0 :
|
|
+ selne t4, 1, t4, t4 # .. e1 :
|
|
+ addl t2, t3, t2 # e0 :
|
|
+ addl v0, t4, v0 # .. e1 :
|
|
+ addl v0, t2, v0 # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+$retnull:
|
|
+ mov zero, v0 # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+ END(strchr)
|
|
+
|
|
+weak_alias (strchr, index)
|
|
+libc_hidden_builtin_def (strchr)
|
|
diff --git a/sysdeps/sw_64/strcmp.S b/sysdeps/sw_64/strcmp.S
|
|
new file mode 100644
|
|
index 00000000..a3c73feb
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strcmp.S
|
|
@@ -0,0 +1,194 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Bytewise compare two null-terminated strings. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strcmp)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ jmp AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 : give cache time to catch up
|
|
+ xor a0, a1, t2 # .. e1 : are s1 and s2 co-aligned?
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ and t2, 7, t2 # .. e1 :
|
|
+ ldi t3, -1 # e0 :
|
|
+ bne t2, $unaligned # .. e1 :
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word.
|
|
+ t3 == -1. */
|
|
+
|
|
+$aligned:
|
|
+ mask7b t3, a0, t3 # e0 :
|
|
+ nop # .. e1 :
|
|
+ ornot t1, t3, t1 # e0 :
|
|
+ ornot t0, t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 : bits set iff null found
|
|
+ bne t7, $eos # e1 (zdb)
|
|
+
|
|
+ /* Aligned compare main loop.
|
|
+ On entry to this basic block:
|
|
+ t0 == an s1 word.
|
|
+ t1 == an s2 word not containing a null. */
|
|
+
|
|
+$a_loop:
|
|
+ xor t0, t1, t2 # e0 :
|
|
+ bne t2, $wordcmp # .. e1 (zdb)
|
|
+ ldl_u t1, 8(a1) # e0 :
|
|
+ ldl_u t0, 8(a0) # .. e1 :
|
|
+ addl a1, 8, a1 # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 :
|
|
+ beq t7, $a_loop # .. e1 (zdb)
|
|
+ br $eos # e1 :
|
|
+
|
|
+ /* The two strings are not co-aligned. Align s1 and cope. */
|
|
+
|
|
+$unaligned:
|
|
+ and a0, 7, t4 # e0 : find s1 misalignment
|
|
+ and a1, 7, t5 # .. e1 : find s2 misalignment
|
|
+ subl a1, t4, a1 # e0 :
|
|
+
|
|
+ /* If s2 misalignment is larger than s2 misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # .. e1 :
|
|
+ beq t8, $u_head # e1 :
|
|
+
|
|
+ mask7b t3, t5, t3 # e0 :
|
|
+ ornot t1, t3, t3 # e0 :
|
|
+ cmpgeb zero, t3, t7 # e1 : is there a zero?
|
|
+ beq t7, $u_head # e1 :
|
|
+
|
|
+ /* We've found a zero in the first partial word of s2. Align
|
|
+ our current s1 and s2 words and compare what we've got. */
|
|
+
|
|
+ ext3b t1, t5, t1 # e0 :
|
|
+ ext3b t0, a0, t0 # e0 :
|
|
+ cmpgeb zero, t1, t7 # .. e1 : find that zero again
|
|
+ br $eos # e1 : and finish up
|
|
+
|
|
+ .align 3
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full word of s2. We can still find a zero at the end of it.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == first word of s1
|
|
+ t1 == first partial word of s2. */
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 : load second partial s2 word
|
|
+ ldi t3, -1 # .. e1 : create leading garbage mask
|
|
+ ext3b t1, a1, t1 # e0 : create first s2 word
|
|
+ mask7b t3, a0, t3 # e0 :
|
|
+ ext7b t2, a1, t4 # e0 :
|
|
+ ornot t0, t3, t0 # .. e1 : kill s1 garbage
|
|
+ or t1, t4, t1 # e0 : s2 word now complete
|
|
+ cmpgeb zero, t0, t7 # .. e1 : find zero in first s1 word
|
|
+ ornot t1, t3, t1 # e0 : kill s2 garbage
|
|
+ ldi t3, -1 # .. e1 :
|
|
+ mask3b t3, a1, t3 # e0 : mask for s2[1] bits we have seen
|
|
+ bne t7, $eos # .. e1 :
|
|
+ xor t0, t1, t4 # e0 : compare aligned words
|
|
+ bne t4, $wordcmp # .. e1 (zdb)
|
|
+ or t2, t3, t3 # e0 :
|
|
+ cmpgeb zero, t3, t7 # e1 :
|
|
+ bne t7, $u_final # e1 :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned words from s2.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t2 == the unshifted low-bits from the next s2 word. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ ext3b t2, a1, t3 # e0 :
|
|
+ ldl_u t2, 16(a1) # .. e1 : load next s2 high bits
|
|
+ ldl_u t0, 8(a0) # e0 : load next s1 word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ addl a0, 8, a0 # e0 :
|
|
+ nop # .. e1 :
|
|
+ ext7b t2, a1, t1 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 : find zero in current s1 word
|
|
+ or t1, t3, t1 # e0 :
|
|
+ bne t7, $eos # .. e1 :
|
|
+ xor t0, t1, t4 # e0 : compare the words
|
|
+ bne t4, $wordcmp # .. e1 (zdb)
|
|
+ cmpgeb zero, t2, t4 # e0 : find zero in next low bits
|
|
+ beq t4, $u_loop # .. e1 (zdb)
|
|
+
|
|
+ /* We've found a zero in the low bits of the last s2 word. Get
|
|
+ the next s1 word and align them. */
|
|
+$u_final:
|
|
+ ldl_u t0, 8(a0) # e1 :
|
|
+ ext3b t2, a1, t1 # .. e0 :
|
|
+ cmpgeb zero, t1, t7 # e0 :
|
|
+
|
|
+ /* We've found a zero somewhere in a word we just read.
|
|
+ On entry to this basic block:
|
|
+ t0 == s1 word
|
|
+ t1 == s2 word
|
|
+ t7 == cmpgeb mask containing the zero. */
|
|
+
|
|
+ .align 3
|
|
+$eos:
|
|
+ negl t7, t6 # e0 : create bytemask of valid data
|
|
+ and t6, t7, t8 # e1 :
|
|
+ subl t8, 1, t6 # e0 :
|
|
+ or t6, t8, t7 # e1 :
|
|
+ zapnot t0, t7, t0 # e0 : kill the garbage
|
|
+ zapnot t1, t7, t1 # .. e1 :
|
|
+ xor t0, t1, v0 # e0 : and compare
|
|
+ beq v0, $done # .. e1 :
|
|
+
|
|
+ /* Here we have two differing co-aligned words in t0 & t1.
|
|
+ Bytewise compare them and return (t0 > t1 ? 1 : -1). */
|
|
+$wordcmp:
|
|
+ cmpgeb t0, t1, t2 # e0 : comparison yieflds bit mask of ge
|
|
+ cmpgeb t1, t0, t3 # .. e1 :
|
|
+ xor t2, t3, t0 # e0 : bits set iff t0/t1 bytes differ
|
|
+ negl t0, t1 # e1 : clear all but least bit
|
|
+ and t0, t1, t0 # e0 :
|
|
+ ldi v0, -1 # .. e1 :
|
|
+ and t0, t2, t1 # e0 : was bit set in t0 > t1?
|
|
+ selne t1, 1, v0, v0 # .. e1 (zdb)
|
|
+
|
|
+$done:
|
|
+ ret # e1 :
|
|
+
|
|
+ END(strcmp)
|
|
+libc_hidden_builtin_def (strcmp)
|
|
diff --git a/sysdeps/sw_64/strcpy.S b/sysdeps/sw_64/strcpy.S
|
|
new file mode 100644
|
|
index 00000000..19f6427f
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strcpy.S
|
|
@@ -0,0 +1,41 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy a null-terminated string from SRC to DST. Return a pointer
|
|
+ to the null-terminator in the source. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strcpy)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ .set noat
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .set at
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ mov a0, v0 # set up return value
|
|
+ mov ra, t9
|
|
+ jmp $31, __stxcpy # do the copy
|
|
+
|
|
+ END(strcpy)
|
|
+libc_hidden_builtin_def (strcpy)
|
|
diff --git a/sysdeps/sw_64/strlen.S b/sysdeps/sw_64/strlen.S
|
|
new file mode 100644
|
|
index 00000000..ae96ae88
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strlen.S
|
|
@@ -0,0 +1,76 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by David Mosberger (davidm@cs.arizona.edu).
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Finds length of a 0-terminated string. Optimized for the Sw_64
|
|
+ architecture:
|
|
+
|
|
+ - memory accessed as aligned quadwords only
|
|
+ - uses cmpgeb to compare 8 bytes in parallel
|
|
+ - does binary search to find 0 byte in last quadword (HAKMEM
|
|
+ needed 12 instructions to do this instead of the 8 instructions
|
|
+ that the binary search needs).
|
|
+*/
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+
|
|
+ENTRY(strlen)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ ldl_u t0, 0(a0) # load first quadword (a0 may be misaligned)
|
|
+ ldi t1, -1(zero)
|
|
+ ins7b t1, a0, t1
|
|
+ andnot a0, 7, v0
|
|
+ or t1, t0, t0
|
|
+ nop # dual issue the next two
|
|
+ cmpgeb zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
|
|
+ bne t1, $found
|
|
+
|
|
+$loop: ldl t0, 8(v0)
|
|
+ addl v0, 8, v0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ beq t1, $loop
|
|
+
|
|
+$found: negl t1, t2 # clear all but least set bit
|
|
+ and t1, t2, t1
|
|
+
|
|
+ and t1, 0xf0, t2 # binary search for that set bit
|
|
+ and t1, 0xcc, t3
|
|
+ and t1, 0xaa, t4
|
|
+ selne t2, 4, t2, t2
|
|
+ selne t3, 2, t3, t3
|
|
+ selne t4, 1, t4, t4
|
|
+ addl t2, t3, t2
|
|
+ addl v0, t4, v0
|
|
+ addl v0, t2, v0
|
|
+ nop
|
|
+
|
|
+ subl v0, a0, v0
|
|
+ ret
|
|
+
|
|
+ END(strlen)
|
|
+libc_hidden_builtin_def (strlen)
|
|
diff --git a/sysdeps/sw_64/strncat.S b/sysdeps/sw_64/strncat.S
|
|
new file mode 100644
|
|
index 00000000..414696ac
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strncat.S
|
|
@@ -0,0 +1,94 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Append no more than COUNT characters from the null-terminated string SRC
|
|
+ to the null-terminated string DST. Always null-terminate the new DST. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strncat)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ .set noat
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .set at
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ mov a0, v0 # set up return value
|
|
+ beq a2, $zerocount
|
|
+
|
|
+ /* Find the end of the string. */
|
|
+
|
|
+ ldl_u t0, 0(a0) # load first quadword (a0 may be misaligned)
|
|
+ ldi t1, -1(zero)
|
|
+ ins7b t1, a0, t1
|
|
+ andnot a0, 7, a0
|
|
+ or t1, t0, t0
|
|
+ cmpgeb zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
|
|
+ bne t1, $found
|
|
+
|
|
+$loop: ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ beq t1, $loop
|
|
+
|
|
+$found: negl t1, t2 # clear all but least set bit
|
|
+ and t1, t2, t1
|
|
+
|
|
+ and t1, 0xf0, t2 # binary search for that set bit
|
|
+ and t1, 0xcc, t3
|
|
+ and t1, 0xaa, t4
|
|
+ selne t2, 4, t2, t2
|
|
+ selne t3, 2, t3, t3
|
|
+ selne t4, 1, t4, t4
|
|
+ addl t2, t3, t2
|
|
+ addl a0, t4, a0
|
|
+ addl a0, t2, a0
|
|
+
|
|
+ /* Now do the append. */
|
|
+
|
|
+ call t9, __stxncpy
|
|
+
|
|
+ /* Worry about the null termination. */
|
|
+
|
|
+ zapnot t0, t8, t1 # was last byte a null?
|
|
+ bne t1, 0f
|
|
+ ret
|
|
+
|
|
+0: and t10, 0x80, t1
|
|
+ bne t1, 1f
|
|
+
|
|
+ /* Here there are bytes left in the current word. Clear one. */
|
|
+ addl t10, t10, t10 # end-of-count bit <<= 1
|
|
+ zap t0, t10, t0
|
|
+ stl_u t0, 0(a0)
|
|
+ ret
|
|
+
|
|
+1: /* Here we must read the next DST word and clear the first byte. */
|
|
+ ldl_u t0, 8(a0)
|
|
+ zap t0, 1, t0
|
|
+ stl_u t0, 8(a0)
|
|
+
|
|
+$zerocount:
|
|
+ ret
|
|
+
|
|
+ END(strncat)
|
|
diff --git a/sysdeps/sw_64/strncmp.S b/sysdeps/sw_64/strncmp.S
|
|
new file mode 100644
|
|
index 00000000..df49b963
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strncmp.S
|
|
@@ -0,0 +1,277 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Bytewise compare two null-terminated strings of length no longer than N. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+/* sw_64 only predicts one branch per octaword. We'll use these to push
|
|
+ fsubsequent branches back to the next bundle. This will generally add
|
|
+ a fetch+decode cycle to older machines, so skip in that case. */
|
|
+#ifdef __sw_64_fix__
|
|
+# define sw6_unop unop
|
|
+#else
|
|
+# define sw6_unop
|
|
+#endif
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strncmp)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ xor a0, a1, t2 # are s1 and s2 co-aligned?
|
|
+ beq a2, $zerolength
|
|
+ ldl_u t0, 0(a0) # load asap to give cache time to catch up
|
|
+ ldl_u t1, 0(a1)
|
|
+ ldi t3, -1
|
|
+ and t2, 7, t2
|
|
+ srl t3, 1, t6
|
|
+ and a0, 7, t4 # find s1 misalignment
|
|
+ and a1, 7, t5 # find s2 misalignment
|
|
+ sellt a2, t6, a2, a2 # bound neg count to LONG_MAX
|
|
+ addl a1, a2, a3 # s2+count
|
|
+ addl a2, t4, a2 # bias count by s1 misalignment
|
|
+ and a2, 7, t10 # ofs of last byte in s1 last word
|
|
+ srl a2, 3, a2 # remaining full words in s1 count
|
|
+ bne t2, $unaligned
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first word of s1.
|
|
+ t1 == the first word of s2.
|
|
+ t3 == -1. */
|
|
+$aligned:
|
|
+ mask7b t3, a1, t8 # mask off leading garbage
|
|
+ ornot t1, t8, t1
|
|
+ ornot t0, t8, t0
|
|
+ cmpgeb zero, t1, t7 # bits set iff null found
|
|
+ beq a2, $eoc # check end of count
|
|
+ bne t7, $eos
|
|
+ beq t10, $ant_loop
|
|
+
|
|
+ /* Aligned compare main loop.
|
|
+ On entry to this basic block:
|
|
+ t0 == an s1 word.
|
|
+ t1 == an s2 word not containing a null. */
|
|
+
|
|
+ .align 4
|
|
+$a_loop:
|
|
+ xor t0, t1, t2 # e0 :
|
|
+ bne t2, $wordcmp # .. e1 (zdb)
|
|
+ ldl_u t1, 8(a1) # e0 :
|
|
+ ldl_u t0, 8(a0) # .. e1 :
|
|
+
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ addl a0, 8, a0 # e0 :
|
|
+ beq a2, $eoc # .. e1 :
|
|
+
|
|
+ cmpgeb zero, t1, t7 # e0 :
|
|
+ beq t7, $a_loop # .. e1 :
|
|
+
|
|
+ br $eos
|
|
+
|
|
+ /* Alternate aligned compare loop, for when there's no trailing
|
|
+ bytes on the count. We have to avoid reading too much data. */
|
|
+ .align 4
|
|
+$ant_loop:
|
|
+ xor t0, t1, t2 # e0 :
|
|
+ sw6_unop
|
|
+ sw6_unop
|
|
+ bne t2, $wordcmp # .. e1 (zdb)
|
|
+
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ beq a2, $zerolength # .. e1 :
|
|
+ ldl_u t1, 8(a1) # e0 :
|
|
+ ldl_u t0, 8(a0) # .. e1 :
|
|
+
|
|
+ addl a1, 8, a1 # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 :
|
|
+ beq t7, $ant_loop # .. e1 :
|
|
+
|
|
+ br $eos
|
|
+
|
|
+ /* The two strings are not co-aligned. Align s1 and cope. */
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first word of s1.
|
|
+ t1 == the first word of s2.
|
|
+ t3 == -1.
|
|
+ t4 == misalignment of s1.
|
|
+ t5 == misalignment of s2.
|
|
+ t10 == misalignment of s1 end. */
|
|
+ .align 4
|
|
+$unaligned:
|
|
+ /* If s1 misalignment is larger than s2 misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+ subl a1, t4, a1 # adjust s2 for s1 misalignment
|
|
+ cmpult t4, t5, t9
|
|
+ subl a3, 1, a3 # last byte of s2
|
|
+ bic a1, 7, t8
|
|
+ mask7b t3, t5, t7 # mask garbage in s2
|
|
+ subl a3, t8, a3
|
|
+ ornot t1, t7, t7
|
|
+ srl a3, 3, a3 # remaining full words in s2 count
|
|
+ beq t9, $u_head
|
|
+
|
|
+ /* Failing that, we need to look for both eos and eoc within the
|
|
+ first word of s2. If we find either, we can continue by
|
|
+ pretending that the next word of s2 is all zeros. */
|
|
+ ldi t2, 0 # next = zero
|
|
+ cmpeq a3, 0, t8 # eoc in the first word of s2?
|
|
+ cmpgeb zero, t7, t7 # eos in the first word of s2?
|
|
+ or t7, t8, t8
|
|
+ bne t8, $u_head_nl
|
|
+
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full word of s2. We can still find a zero at the end of it.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == first word of s1
|
|
+ t1 == first partial word of s2.
|
|
+ t3 == -1.
|
|
+ t10 == ofs of last byte in s1 last word.
|
|
+ t11 == ofs of last byte in s2 last word. */
|
|
+$u_head:
|
|
+ ldl_u t2, 8(a1) # load second partial s2 word
|
|
+ subl a3, 1, a3
|
|
+$u_head_nl:
|
|
+ ext3b t1, a1, t1 # create first s2 word
|
|
+ mask7b t3, a0, t8
|
|
+ ext7b t2, a1, t4
|
|
+ ornot t0, t8, t0 # kill s1 garbage
|
|
+ or t1, t4, t1 # s2 word now complete
|
|
+ cmpgeb zero, t0, t7 # find eos in first s1 word
|
|
+ ornot t1, t8, t1 # kill s2 garbage
|
|
+ beq a2, $eoc
|
|
+ subl a2, 1, a2
|
|
+ bne t7, $eos
|
|
+ mask3b t3, a1, t8 # mask out s2[1] bits we have seen
|
|
+ xor t0, t1, t4 # compare aligned words
|
|
+ or t2, t8, t8
|
|
+ bne t4, $wordcmp
|
|
+ cmpgeb zero, t8, t7 # eos in high bits of s2[1]?
|
|
+ cmpeq a3, 0, t8 # eoc in s2[1]?
|
|
+ or t7, t8, t7
|
|
+ bne t7, $u_final
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned words from s2.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t2 == the unshifted low-bits from the next s2 word.
|
|
+ t10 == ofs of last byte in s1 last word.
|
|
+ t11 == ofs of last byte in s2 last word. */
|
|
+ .align 4
|
|
+$u_loop:
|
|
+ ext3b t2, a1, t3 # e0 :
|
|
+ ldl_u t2, 16(a1) # .. e1 : load next s2 high bits
|
|
+ ldl_u t0, 8(a0) # e0 : load next s1 word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+
|
|
+ addl a0, 8, a0 # e0 :
|
|
+ subl a3, 1, a3 # .. e1 :
|
|
+ ext7b t2, a1, t1 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 : eos in current s1 word
|
|
+
|
|
+ or t1, t3, t1 # e0 :
|
|
+ beq a2, $eoc # .. e1 : eoc in current s1 word
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t2, t4 # .. e1 : eos in s2[1]
|
|
+
|
|
+ xor t0, t1, t3 # e0 : compare the words
|
|
+ sw6_unop
|
|
+ sw6_unop
|
|
+ bne t7, $eos # .. e1 :
|
|
+
|
|
+ cmpeq a3, 0, t5 # e0 : eoc in s2[1]
|
|
+ sw6_unop
|
|
+ sw6_unop
|
|
+ bne t3, $wordcmp # .. e1 :
|
|
+
|
|
+ or t4, t5, t4 # e0 : eos or eoc in s2[1].
|
|
+ beq t4, $u_loop # .. e1 (zdb)
|
|
+
|
|
+ /* We've found a zero in the low bits of the last s2 word. Get
|
|
+ the next s1 word and align them. */
|
|
+ .align 3
|
|
+$u_final:
|
|
+ ldl_u t0, 8(a0)
|
|
+ ext3b t2, a1, t1
|
|
+ cmpgeb zero, t1, t7
|
|
+ bne a2, $eos
|
|
+
|
|
+ /* We've hit end of count. Zero everything after the count
|
|
+ and compare whats left. */
|
|
+ .align 3
|
|
+$eoc:
|
|
+ mask3b t0, t10, t0
|
|
+ mask3b t1, t10, t1
|
|
+ cmpgeb zero, t1, t7
|
|
+
|
|
+ /* We've found a zero somewhere in a word we just read.
|
|
+ On entry to this basic block:
|
|
+ t0 == s1 word
|
|
+ t1 == s2 word
|
|
+ t7 == cmpgeb mask containing the zero. */
|
|
+ .align 3
|
|
+$eos:
|
|
+ negl t7, t6 # create bytemask of valid data
|
|
+ and t6, t7, t8
|
|
+ subl t8, 1, t6
|
|
+ or t6, t8, t7
|
|
+ zapnot t0, t7, t0 # kill the garbage
|
|
+ zapnot t1, t7, t1
|
|
+ xor t0, t1, v0 # ... and compare
|
|
+ beq v0, $done
|
|
+
|
|
+ /* Here we have two differing co-aligned words in t0 & t1.
|
|
+ Bytewise compare them and return (t0 > t1 ? 1 : -1). */
|
|
+ .align 3
|
|
+$wordcmp:
|
|
+ cmpgeb t0, t1, t2 # comparison yieflds bit mask of ge
|
|
+ cmpgeb t1, t0, t3
|
|
+ xor t2, t3, t0 # bits set iff t0/t1 bytes differ
|
|
+ negl t0, t1 # clear all but least bit
|
|
+ and t0, t1, t0
|
|
+ ldi v0, -1
|
|
+ and t0, t2, t1 # was bit set in t0 > t1?
|
|
+ selne t1, 1, v0, v0
|
|
+$done:
|
|
+ ret
|
|
+
|
|
+ .align 3
|
|
+$zerolength:
|
|
+ clr v0
|
|
+ ret
|
|
+
|
|
+ END(strncmp)
|
|
+libc_hidden_builtin_def (strncmp)
|
|
diff --git a/sysdeps/sw_64/strncpy.S b/sysdeps/sw_64/strncpy.S
|
|
new file mode 100644
|
|
index 00000000..4ea31fb1
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strncpy.S
|
|
@@ -0,0 +1,87 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy no more than COUNT bytes of the null-terminated string from
|
|
+ SRC to DST. If SRC does not cover all of COUNT, the balance is
|
|
+ zeroed. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strncpy)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ mov a0, v0 # set return value now
|
|
+ beq a2, $zerocount
|
|
+ call t9, __stxncpy # do the work of the copy
|
|
+
|
|
+ bne a2, $multiword # do we have full words left?
|
|
+
|
|
+ .align 3
|
|
+ subl t8, 1, t2 # e0 : guess not
|
|
+ subl t10, 1, t3 # .. e1 :
|
|
+ or t2, t8, t2 # e0 : clear the bits between the last
|
|
+ or t3, t10, t3 # .. e1 : written byte and the last byte in
|
|
+ andnot t3, t2, t3 # e0 : COUNT
|
|
+ zap t0, t3, t0 # e1 :
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+$multiword:
|
|
+ subl t8, 1, t7 # e0 : clear the final bits in the prev
|
|
+ or t7, t8, t7 # e1 : word
|
|
+ zapnot t0, t7, t0 # e0 :
|
|
+ subl a2, 1, a2 # .. e1 :
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+
|
|
+ beq a2, 1f # e1 :
|
|
+ blbc a2, 0f # e1 :
|
|
+
|
|
+ stl_u zero, 0(a0) # e0 : zero one word
|
|
+ subl a2, 1, a2 # .. e1 :
|
|
+ addl a0, 8, a0 # e0 :
|
|
+ beq a2, 1f # .. e1 :
|
|
+
|
|
+0: stl_u zero, 0(a0) # e0 : zero two words
|
|
+ subl a2, 2, a2 # .. e1 :
|
|
+ stl_u zero, 8(a0) # e0 :
|
|
+ addl a0, 16, a0 # .. e1 :
|
|
+ bne a2, 0b # e1 :
|
|
+ unop
|
|
+
|
|
+1: ldl_u t0, 0(a0) # e0 : clear the leading bits in the final
|
|
+ subl t10, 1, t7 # .. e1 : word
|
|
+ or t7, t10, t7 # e0 :
|
|
+ zap t0, t7, t0 # e1 (stall)
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+
|
|
+$zerocount:
|
|
+ ret # .. e1 :
|
|
+
|
|
+ END(strncpy)
|
|
+libc_hidden_builtin_def (strncpy)
|
|
diff --git a/sysdeps/sw_64/strrchr.S b/sysdeps/sw_64/strrchr.S
|
|
new file mode 100644
|
|
index 00000000..eef5fd10
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/strrchr.S
|
|
@@ -0,0 +1,107 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Return the address of the last occurrence of a given character
|
|
+ within a null-terminated string, or null if it is not found.
|
|
+
|
|
+*/
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+
|
|
+ENTRY(strrchr)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ and a1, 0xff, a1 # e0 : zero extend our test character
|
|
+ mov zero, t6 # .. e1 : t6 is last match aligned addr
|
|
+ sll a1, 8, t5 # e0 : replicate our test character
|
|
+ mov zero, t7 # .. e1 : t7 is last match byte compare mask
|
|
+ or t5, a1, a1 # e0 :
|
|
+ ldl_u t0, 0(a0) # .. e1 : load first quadword
|
|
+ sll a1, 16, t5 # e0 :
|
|
+ andnot a0, 7, v0 # .. e1 : align source addr
|
|
+ or t5, a1, a1 # e0 :
|
|
+ ldi t4, -1 # .. e1 : build garbage mask
|
|
+ sll a1, 32, t5 # e0 :
|
|
+ cmpgeb zero, t0, t1 # .. e1 : bits set iff byte == zero
|
|
+ mask7b t4, a0, t4 # e0 :
|
|
+ or t5, a1, a1 # .. e1 : character replication complete
|
|
+ xor t0, a1, t2 # e0 : make bytes == c zero
|
|
+ cmpgeb zero, t4, t4 # .. e1 : bits set iff byte is garbage
|
|
+ cmpgeb zero, t2, t3 # e0 : bits set iff byte == c
|
|
+ andnot t1, t4, t1 # .. e1 : clear garbage from null test
|
|
+ andnot t3, t4, t3 # e0 : clear garbage from char test
|
|
+ bne t1, $eos # .. e1 : did we already hit the terminator?
|
|
+
|
|
+ /* Character search main loop */
|
|
+$loop:
|
|
+ ldl t0, 8(v0) # e0 : load next quadword
|
|
+ selne t3, v0, t6, t6 # .. e1 : save previous comparisons match
|
|
+ selne t3, t3, t7, t7 # e0 :
|
|
+ addl v0, 8, v0 # .. e1 :
|
|
+ xor t0, a1, t2 # e0 :
|
|
+ cmpgeb zero, t0, t1 # .. e1 : bits set iff byte == zero
|
|
+ cmpgeb zero, t2, t3 # e0 : bits set iff byte == c
|
|
+ beq t1, $loop # .. e1 : if we havnt seen a null, loop
|
|
+
|
|
+ /* Mask out character matches after terminator */
|
|
+$eos:
|
|
+ negl t1, t4 # e0 : isolate first null byte match
|
|
+ and t1, t4, t4 # e1 :
|
|
+ subl t4, 1, t5 # e0 : build a mask of the bytes upto...
|
|
+ or t4, t5, t4 # e1 : ... and including the null
|
|
+
|
|
+ and t3, t4, t3 # e0 : mask out char matches after null
|
|
+ selne t3, t3, t7, t7 # .. e1 : save it, if match found
|
|
+ selne t3, v0, t6, t6 # e0 :
|
|
+
|
|
+ /* Locate the address of the last matched character */
|
|
+
|
|
+ /* Retain the early exit for the sw_64
|
|
+ -- the same as just falling through. */
|
|
+ beq t7, $retnull # .. e1 :
|
|
+
|
|
+ and t7, 0xf0, t2 # e0 : binary search for the high bit set
|
|
+ selne t2, t2, t7, t7 # .. e1 (zdb)
|
|
+ selne t2, 4, t2, t2 # e0 :
|
|
+ and t7, 0xcc, t1 # .. e1 :
|
|
+ selne t1, t1, t7, t7 # e0 :
|
|
+ selne t1, 2, t1, t1 # .. e1 :
|
|
+ and t7, 0xaa, t0 # e0 :
|
|
+ selne t0, 1, t0, t0 # .. e1 (zdb)
|
|
+ addl t2, t1, t1 # e0 :
|
|
+ addl t6, t0, v0 # .. e1 : add our aligned base ptr to the mix
|
|
+ addl v0, t1, v0 # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+$retnull:
|
|
+ mov zero, v0 # e0 :
|
|
+ ret # .. e1 :
|
|
+
|
|
+ END(strrchr)
|
|
+
|
|
+weak_alias (strrchr, rindex)
|
|
+libc_hidden_builtin_def (strrchr)
|
|
diff --git a/sysdeps/sw_64/stxcpy.S b/sysdeps/sw_64/stxcpy.S
|
|
new file mode 100644
|
|
index 00000000..3f9ae356
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/stxcpy.S
|
|
@@ -0,0 +1,291 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy a null-terminated string from SRC to DST.
|
|
+
|
|
+ This is an internal routine used by strcpy, stpcpy, and strcat.
|
|
+ As such, it uses special linkage conventions to make implementation
|
|
+ of these public functions more efficient.
|
|
+
|
|
+ On input:
|
|
+ t9 = return address
|
|
+ a0 = DST
|
|
+ a1 = SRC
|
|
+
|
|
+ On output:
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
|
+ a0 = unaligned address of the last *word* written
|
|
+
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
|
+*/
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+ .type __stxcpy, @function
|
|
+ .globl __stxcpy
|
|
+ .usepv __stxcpy, no
|
|
+
|
|
+ cfi_startproc
|
|
+ cfi_return_column (t9)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word. */
|
|
+ .align 3
|
|
+stxcpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # e1 : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # e0 : detection in the src word
|
|
+ mask7b t1, a1, t3 # e0 :
|
|
+ ornot t1, t2, t2 # .. e1 :
|
|
+ mask3b t0, a1, t0 # e0 : assemble the first output word
|
|
+ cmpgeb zero, t2, t7 # .. e1 : bits set iff null found
|
|
+ or t0, t3, t1 # e0 :
|
|
+ bne t7, $a_eos # .. e1 :
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == a source word not containing a null. */
|
|
+$a_loop:
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ beq t7, $a_loop # .. e1 (zdb)
|
|
+
|
|
+ /* Take care of the final (partial) word store.
|
|
+ On entry to this basic block we have:
|
|
+ t1 == the source word containing the null
|
|
+ t7 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t7, t6 # e0 : find low bit set
|
|
+ and t7, t6, t8 # e1 (stall)
|
|
+
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # e0 :
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ zapnot t1, t6, t1 # e0 : clear src bytes >= null
|
|
+ or t8, t6, t7 # .. e1 :
|
|
+ zap t0, t7, t0 # e0 : clear dst bytes <= null
|
|
+ or t0, t1, t1 # e1 :
|
|
+
|
|
+1: stl_u t1, 0(a0) # e0 :
|
|
+ ret (t9) # .. e1 :
|
|
+
|
|
+ .align 3
|
|
+__stxcpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ xor a0, a1, t0 # e0 :
|
|
+ unop # :
|
|
+ and t0, 7, t0 # e0 :
|
|
+ bne t0, $unaligned # .. e1 :
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+ ldl_u t1, 0(a1) # e0 : load first src word
|
|
+ and a0, 7, t0 # .. e1 : take care not to load a word ...
|
|
+ addl a1, 8, a1 # e0 :
|
|
+ beq t0, stxcpy_aligned # .. e1 : ... if we won't need it
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ br stxcpy_aligned # .. e1 :
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 3
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, for masking back in, if needed else 0
|
|
+ t1 == the low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+
|
|
+ ext3b t1, a1, t1 # e0 :
|
|
+ ext7b t2, a1, t4 # e0 :
|
|
+ mask3b t0, a0, t0 # e0 :
|
|
+ or t1, t4, t1 # .. e1 :
|
|
+ mask7b t1, a0, t1 # e0 :
|
|
+ or t0, t1, t1 # e1 :
|
|
+
|
|
+ or t1, t6, t6 # e0 :
|
|
+ cmpgeb zero, t6, t7 # .. e1 :
|
|
+ ldi t6, -1 # e0 : for masking just below
|
|
+ bne t7, $u_final # .. e1 :
|
|
+
|
|
+ mask3b t6, a1, t6 # e0 : mask out the bits we have
|
|
+ or t6, t2, t2 # e1 : already extracted before
|
|
+ cmpgeb zero, t2, t7 # e0 : testing eos
|
|
+ bne t7, $u_late_head_exit # .. e1 (zdb)
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 : store first output word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t0 # e0 : position high-order bits of lo word
|
|
+ ldl_u t2, 8(a1) # .. e1 : read next high-order source word
|
|
+ addl a1, 8, a1 # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 :
|
|
+ nop # e0 :
|
|
+ bne t7, $u_eos # .. e1 :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ beq t7, $u_loop # .. e1 :
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ ext7b t2, a1, t1 # e0 :
|
|
+ or t0, t1, t1 # e1 : first (partial) source word complete
|
|
+
|
|
+ cmpgeb zero, t1, t7 # e0 : is the null in this first bit?
|
|
+ bne t7, $u_final # .. e1 (zdb)
|
|
+
|
|
+$u_late_head_exit:
|
|
+ stl_u t1, 0(a0) # e0 : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 :
|
|
+ cmpgeb zero, t1, t7 # .. e1 :
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t1 == assembled source word
|
|
+ t7 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t7, t6 # e0 : isolate low bit set
|
|
+ and t6, t7, t8 # e1 :
|
|
+
|
|
+ and t8, 0x80, t6 # e0 : avoid dest word load if we can
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ or t6, t8, t7 # e0 :
|
|
+ zapnot t1, t6, t1 # .. e1 : kill source bytes >= null
|
|
+ zap t0, t7, t0 # e0 : kill dest bytes <= null
|
|
+ or t0, t1, t1 # e1 :
|
|
+
|
|
+1: stl_u t1, 0(a0) # e0 :
|
|
+ ret (t9) # .. e1 :
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 3
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # e0 : load first source word
|
|
+
|
|
+ and a0, 7, t4 # .. e1 : find dest misalignment
|
|
+ and a1, 7, t5 # e0 : find src misalignment
|
|
+
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+
|
|
+ mov zero, t0 # .. e1 :
|
|
+ mov zero, t6 # e0 :
|
|
+ beq t4, 1f # .. e1 :
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ ldi t6, -1 # .. e1 :
|
|
+ mask3b t6, a0, t6 # e0 :
|
|
+1:
|
|
+ subl a1, t4, a1 # .. e1 : sub dest misalignment from src addr
|
|
+
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # e0 :
|
|
+ beq t8, $u_head # .. e1 (zdb)
|
|
+
|
|
+ ldi t2, -1 # e1 : mask out leading garbage in source
|
|
+ mask7b t2, t5, t2 # e0 :
|
|
+ nop # e0 :
|
|
+ ornot t1, t2, t3 # .. e1 :
|
|
+ cmpgeb zero, t3, t7 # e0 : is there a zero?
|
|
+ beq t7, $u_head # .. e1 (zdb)
|
|
+
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+
|
|
+ negl t7, t6 # .. e1 : build bitmask of bytes <= zero
|
|
+ and t6, t7, t8 # e0 :
|
|
+ and a1, 7, t5 # .. e1 :
|
|
+ subl t8, 1, t6 # e0 :
|
|
+ or t6, t8, t7 # e1 :
|
|
+ srl t8, t5, t8 # e0 : adjust final null return value
|
|
+
|
|
+ zapnot t2, t7, t2 # .. e1 : prepare source word; mirror changes
|
|
+ and t1, t2, t1 # e1 : to source validity mask
|
|
+ ext3b t2, a1, t2 # .. e0 :
|
|
+ ext3b t1, a1, t1 # e0 :
|
|
+
|
|
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside
|
|
+ or t0, t1, t1 # e1 : and put it there
|
|
+ stl_u t1, 0(a0) # .. e0 :
|
|
+ ret (t9)
|
|
+
|
|
+ cfi_endproc
|
|
diff --git a/sysdeps/sw_64/stxncpy.S b/sysdeps/sw_64/stxncpy.S
|
|
new file mode 100644
|
|
index 00000000..aba57c9f
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/stxncpy.S
|
|
@@ -0,0 +1,349 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy no more than COUNT bytes of the null-terminated string from
|
|
+ SRC to DST.
|
|
+
|
|
+ This is an internal routine used by strncpy, stpncpy, and strncat.
|
|
+ As such, it uses special linkage conventions to make implementation
|
|
+ of these public functions more efficient.
|
|
+
|
|
+ On input:
|
|
+ t9 = return address
|
|
+ a0 = DST
|
|
+ a1 = SRC
|
|
+ a2 = COUNT
|
|
+
|
|
+ Furthermore, COUNT may not be zero.
|
|
+
|
|
+ On output:
|
|
+ t0 = last word written
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
|
+ t10 = bitmask (with one bit set) indicating the byte position of
|
|
+ the end of the range specified by COUNT
|
|
+ a0 = unaligned address of the last *word* written
|
|
+ a2 = the number of full words left in COUNT
|
|
+
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
|
+*/
|
|
+
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+ .type __stxncpy, @function
|
|
+ .globl __stxncpy
|
|
+ .usepv __stxncpy, no
|
|
+
|
|
+ cfi_startproc
|
|
+ cfi_return_column (t9)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word. */
|
|
+ .align 3
|
|
+stxncpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # e1 : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # e0 : detection in the src word
|
|
+ mask7b t1, a1, t3 # e0 :
|
|
+ ornot t1, t2, t2 # .. e1 :
|
|
+ mask3b t0, a1, t0 # e0 : assemble the first output word
|
|
+ cmpgeb zero, t2, t7 # .. e1 : bits set iff null found
|
|
+ or t0, t3, t0 # e0 :
|
|
+ beq a2, $a_eoc # .. e1 :
|
|
+ bne t7, $a_eos # .. e1 :
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == a source word not containing a null. */
|
|
+$a_loop:
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ beq t7, $a_loop # e1 :
|
|
+
|
|
+ /* Take care of the final (partial) word store. At this point
|
|
+ the end-of-count bit is set in t7 iff it applies.
|
|
+
|
|
+ On entry to this basic block we have:
|
|
+ t0 == the source word containing the null
|
|
+ t7 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t7, t8 # e0 : find low bit set
|
|
+ and t7, t8, t8 # e1 (stall)
|
|
+
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # e0 :
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t1, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ or t8, t6, t7 # e0 :
|
|
+ unop #
|
|
+ zapnot t0, t7, t0 # e0 : clear src bytes > null
|
|
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
|
|
+ or t0, t1, t0 # e1 :
|
|
+
|
|
+1: stl_u t0, 0(a0) # e0 :
|
|
+ ret (t9) # e1 :
|
|
+
|
|
+ /* Add the end-of-count bit to the eos detection bitmask. */
|
|
+$a_eoc:
|
|
+ or t10, t7, t7
|
|
+ br $a_eos
|
|
+
|
|
+ .align 3
|
|
+__stxncpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ ldi t2, -1
|
|
+ xor a0, a1, t1
|
|
+ srl t2, 1, t2
|
|
+ and a0, 7, t0 # find dest misalignment
|
|
+ sellt a2, t2, a2, a2 # bound neg count to LONG_MAX
|
|
+ and t1, 7, t1
|
|
+ addl a2, t0, a2 # bias count by dest misalignment
|
|
+ subl a2, 1, a2
|
|
+ and a2, 7, t2
|
|
+ srl a2, 3, a2 # a2 = loop counter = (count - 1)/8
|
|
+ addl zero, 1, t10
|
|
+ sll t10, t2, t10 # t10 = bitmask of last count byte
|
|
+ bne t1, $unaligned
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+
|
|
+ ldl_u t1, 0(a1) # e0 : load first src word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+
|
|
+ beq t0, stxncpy_aligned # avoid loading dest word if not needed
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ br stxncpy_aligned # .. e1 :
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 3
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, unmasked
|
|
+ t1 == the shifted low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 : load second src word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ mask3b t0, a0, t0 # e0 : mask trailing garbage in dst
|
|
+ ext7b t2, a1, t4 # e0 :
|
|
+ or t1, t4, t1 # e1 : first aligned src word complete
|
|
+ mask7b t1, a0, t1 # e0 : mask leading garbage in src
|
|
+ or t0, t1, t0 # e0 : first output word complete
|
|
+ or t0, t6, t6 # e1 : mask original data for zero test
|
|
+ cmpgeb zero, t6, t7 # e0 :
|
|
+ beq a2, $u_eocfin # .. e1 :
|
|
+ ldi t6, -1 # e0 :
|
|
+ bne t7, $u_final # .. e1 :
|
|
+
|
|
+ mask3b t6, a1, t6 # e0 : mask out bits already seen
|
|
+ nop # .. e1 :
|
|
+ stl_u t0, 0(a0) # e0 : store first output word
|
|
+ or t6, t2, t2 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : find nulls in second partial
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ bne t7, $u_late_head_exit # .. e1 :
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+
|
|
+ ext3b t2, a1, t1 # e0 : position hi-bits of lo word
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : read next high-order source word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext7b t2, a1, t0 # e0 : position lo-bits of hi word
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ nop # e0 :
|
|
+ bne t7, $u_eos # .. e1 :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ beq t7, $u_loop # .. e1 :
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ or t0, t1, t0 # e0 : first (partial) source word complete
|
|
+ cmpgeb zero, t0, t7 # e0 : is the null in this first bit?
|
|
+ bne t7, $u_final # .. e1 (zdb)
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+
|
|
+$u_late_head_exit:
|
|
+ ext3b t2, a1, t0 # e0 :
|
|
+ cmpgeb zero, t0, t7 # e0 :
|
|
+ or t7, t10, t6 # e1 :
|
|
+ seleq a2, t6, t7, t7 # e0 :
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t0 == assembled source word
|
|
+ t7 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t7, t6 # e0 : isolate low bit set
|
|
+ and t6, t7, t8 # e1 :
|
|
+
|
|
+ and t8, 0x80, t6 # e0 : avoid dest word load if we can
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ ldl_u t1, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ or t6, t8, t7 # e0 :
|
|
+ zapnot t0, t7, t0 # .. e1 : kill source bytes > null
|
|
+ zap t1, t7, t1 # e0 : kill dest bytes <= null
|
|
+ or t0, t1, t0 # e1 :
|
|
+
|
|
+1: stl_u t0, 0(a0) # e0 :
|
|
+ ret (t9) # .. e1 :
|
|
+
|
|
+ /* Got to end-of-count before end of string.
|
|
+ On entry to this basic block:
|
|
+ t1 == the shifted high-order bits from the previous source word */
|
|
+$u_eoc:
|
|
+ and a1, 7, t6 # e1 :
|
|
+ sll t10, t6, t6 # e0 :
|
|
+ and t6, 0xff, t6 # e0 :
|
|
+ bne t6, 1f # e1 : avoid src word load if we can
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 : load final src word
|
|
+ nop # .. e1 :
|
|
+ ext7b t2, a1, t0 # e0 : extract high bits for last word
|
|
+ or t1, t0, t1 # e1 :
|
|
+
|
|
+1: cmpgeb zero, t1, t7
|
|
+ mov t1, t0
|
|
+
|
|
+$u_eocfin: # end-of-count, final word
|
|
+ or t10, t7, t7
|
|
+ br $u_final
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 3
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # e0 : load first source word
|
|
+
|
|
+ and a0, 7, t4 # .. e1 : find dest misalignment
|
|
+ and a1, 7, t5 # e0 : find src misalignment
|
|
+
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+
|
|
+ mov zero, t0 # .. e1 :
|
|
+ mov zero, t6 # e0 :
|
|
+ beq t4, 1f # .. e1 :
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ ldi t6, -1 # .. e1 :
|
|
+ mask3b t6, a0, t6 # e0 :
|
|
+1:
|
|
+ subl a1, t4, a1 # .. e1 : sub dest misalignment from src addr
|
|
+
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # e1 :
|
|
+ ext3b t1, a1, t1 # .. e0 : shift src into place
|
|
+ ldi t2, -1 # e0 : for creating masks later
|
|
+ beq t8, $u_head # e1 :
|
|
+
|
|
+ mask7b t2, t5, t2 # e0 : begin src byte validity mask
|
|
+ cmpgeb zero, t1, t7 # .. e1 : is there a zero?
|
|
+ ext3b t2, a1, t2 # e0 :
|
|
+ or t7, t10, t5 # .. e1 : test for end-of-count too
|
|
+ cmpgeb zero, t2, t3 # e0 :
|
|
+ seleq a2, t5, t7, t7 # .. e1 :
|
|
+ andnot t7, t3, t7 # e0 :
|
|
+ beq t7, $u_head # .. e1 (zdb)
|
|
+
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ negl t7, t6 # .. e1 : build bitmask of bytes <= zero
|
|
+ mask7b t1, t4, t1 # e0 :
|
|
+ and t6, t7, t8 # .. e1 :
|
|
+ subl t8, 1, t6 # e0 :
|
|
+ or t6, t8, t7 # e1 :
|
|
+
|
|
+ zapnot t2, t7, t2 # e0 : prepare source word; mirror changes
|
|
+ zapnot t1, t7, t1 # .. e1 : to source validity mask
|
|
+
|
|
+ andnot t0, t2, t0 # e0 : zero place for source to reside
|
|
+ or t0, t1, t0 # e1 : and put it there
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ ret (t9) # .. e1 :
|
|
+
|
|
+ cfi_endproc
|
|
diff --git a/sysdeps/sw_64/sw6a/memcpy.S b/sysdeps/sw_64/sw6a/memcpy.S
|
|
new file mode 100644
|
|
index 00000000..92597933
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw6a/memcpy.S
|
|
@@ -0,0 +1,837 @@
|
|
+/*
|
|
+ * Inputs:
|
|
+ * length in $18
|
|
+ * destination address in $16
|
|
+ * source address in $17
|
|
+ * return address in $26
|
|
+ *
|
|
+ * Outputs:
|
|
+ * bytes copied in $18
|
|
+ *
|
|
+ * Clobbers:
|
|
+ * $1,$2,$3,$4,$5,$16,$17
|
|
+ * $f10, $f11, $f12, $f13, $f15, $f17, $f22, $f23
|
|
+ */
|
|
+
|
|
+#ifndef STRING_NOOPT
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch sw6a
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+
|
|
+ENTRY(memcpy)
|
|
+ .prologue 0
|
|
+
|
|
+ mov $16, $0 # E : copy dest to return
|
|
+ ble $18, $nomoredata # U : done with the copy?
|
|
+ xor $16, $17, $1 # E : are source and dest alignments the same?
|
|
+ and $1, 7, $1 # E : are they the same mod 8?
|
|
+
|
|
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
|
|
+ /* source and dest are same mod 8 address */
|
|
+ and $16, 7, $1 # E : Are both 0mod8?
|
|
+ beq $1, $both_0mod8 # U : Yes
|
|
+ nop # E :
|
|
+
|
|
+ /*
|
|
+ * source and dest are same misalignment. move a byte at a time
|
|
+ * until a 0mod8 alignment for both is reached.
|
|
+ * At least one byte more to move
|
|
+ */
|
|
+
|
|
+$head_align:
|
|
+ ldbu $1, 0($17) # L : grab a byte
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ stb $1, 0($16) # L :
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ and $16, 7, $1 # E : Are we at 0mod8 yet?
|
|
+ ble $18, $nomoredata # U : done with the copy?
|
|
+ bne $1, $head_align # U :
|
|
+
|
|
+$both_0mod8:
|
|
+ cmple $18, 127, $1 # E : Can we unroll the loop?
|
|
+ bne $1, $no_unroll # U :
|
|
+ and $16, 63, $1 # E : get mod64 alignment
|
|
+ beq $1, $do_unroll # U : no single quads to fiddle
|
|
+
|
|
+$single_head_quad:
|
|
+ ldl $1, 0($17) # L : get 8 bytes
|
|
+ subl $18, 8, $18 # E : count -= 8
|
|
+ addl $17, 8, $17 # E : src += 8
|
|
+ nop # E :
|
|
+
|
|
+ stl $1, 0($16) # L : store
|
|
+ addl $16, 8, $16 # E : dest += 8
|
|
+ and $16, 63, $1 # E : get mod64 alignment
|
|
+ bne $1, $single_head_quad # U : still not fully aligned
|
|
+
|
|
+$do_unroll:
|
|
+ addl $16, 64, $7 # E : Initial (+1 trip) wh64 address
|
|
+ cmple $18, 127, $1 # E : Can we go through the unrolled loop?
|
|
+ bne $1, $tail_quads # U : Nope
|
|
+ nop # E :
|
|
+
|
|
+$unroll_body:
|
|
+ wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
|
|
+ # ($7) are about to be over-written
|
|
+ ldl $6, 0($17) # L0 : bytes 0..7
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
|
+ addl $7, 64, $7 # E : Update next wh64 address
|
|
+ nop # E :
|
|
+
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
|
+ addl $16, 64, $1 # E : fallback value for wh64
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ addl $17, 32, $17 # E : src += 32 bytes
|
|
+ stl $6, 0($16) # L : bytes 0..7
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ stl $4, 8($16) # L : bytes 8..15
|
|
+ stl $5, 16($16) # L : bytes 16..23
|
|
+ subl $18, 192, $2 # E : At least two more trips to go?
|
|
+ nop # E :
|
|
+
|
|
+ stl $3, 24($16) # L : bytes 24..31
|
|
+ addl $16, 32, $16 # E : dest += 32 bytes
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ ldl $6, 0($17) # L : bytes 0..7
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
|
+ sellt $2, $1, $7, $7 # E : Latency 2, extra map slot - Use
|
|
+ # fallback wh64 address if < 2 more trips
|
|
+ nop # E :
|
|
+
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
|
+ addl $16, 32, $16 # E : dest += 32
|
|
+ subl $18, 64, $18 # E : count -= 64
|
|
+ addl $17, 32, $17 # E : src += 32
|
|
+ stl $6, -32($16) # L : bytes 0..7
|
|
+ stl $4, -24($16) # L : bytes 8..15
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+
|
|
+ stl $5, -16($16) # L : bytes 16..23
|
|
+ stl $3, -8($16) # L : bytes 24..31
|
|
+ nop # E :
|
|
+ beq $1, $unroll_body
|
|
+
|
|
+$tail_quads:
|
|
+$no_unroll:
|
|
+ .align 4
|
|
+ subl $18, 8, $18 # E : At least a quad left?
|
|
+ blt $18, $less_than_8 # U : Nope
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+$move_a_quad:
|
|
+ ldl $1, 0($17) # L : fetch 8
|
|
+ subl $18, 8, $18 # E : count -= 8
|
|
+ addl $17, 8, $17 # E : src += 8
|
|
+ nop # E :
|
|
+
|
|
+ stl $1, 0($16) # L : store 8
|
|
+ addl $16, 8, $16 # E : dest += 8
|
|
+ bge $18, $move_a_quad # U :
|
|
+ nop # E :
|
|
+
|
|
+$less_than_8:
|
|
+ .align 4
|
|
+ addl $18, 8, $18 # E : add back for trailing bytes
|
|
+ ble $18, $nomoredata # U : All-done
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ /* Trailing bytes */
|
|
+$tail_bytes:
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ ldbu $1, 0($17) # L : fetch a byte
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ nop # E :
|
|
+
|
|
+ stb $1, 0($16) # L : store a byte
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ bgt $18, $tail_bytes # U : more to be done?
|
|
+ nop # E :
|
|
+
|
|
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+$misaligned:
|
|
+ mov $0, $4 # E : dest temp
|
|
+ and $0, 7, $1 # E : dest alignment mod8
|
|
+ beq $1, $dest_0mod8 # U : life doesn't totally suck
|
|
+ nop
|
|
+
|
|
+$aligndest:
|
|
+ ble $18, $nomoredata # U :
|
|
+ ldbu $1, 0($17) # L : fetch a byte
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ addl $17, 1, $17 # E : src++
|
|
+
|
|
+ stb $1, 0($4) # L : store it
|
|
+ addl $4, 1, $4 # E : dest++
|
|
+ and $4, 7, $1 # E : dest 0mod8 yet?
|
|
+ bne $1, $aligndest # U : go until we are aligned.
|
|
+
|
|
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
|
|
+$dest_0mod8:
|
|
+ subl $18, 8, $18 # E : At least a quad left?
|
|
+ blt $18, $misalign_tail # U : Nope
|
|
+ ldl_u $3, 0($17) # L : seed (rotating load) of 8 bytes
|
|
+ nop # E :
|
|
+
|
|
+$mis_quad:
|
|
+ ldl_u $16, 8($17) # L : Fetch next 8
|
|
+ ext3b $3, $17, $3 # U : masking
|
|
+ ext7b $16, $17, $1 # U : masking
|
|
+ bis $3, $1, $1 # E : merged bytes to store
|
|
+
|
|
+ subl $18, 8, $18 # E : count -= 8
|
|
+ addl $17, 8, $17 # E : src += 8
|
|
+ stl $1, 0($4) # L : store 8 (aligned)
|
|
+ mov $16, $3 # E : "rotate" source data
|
|
+
|
|
+ addl $4, 8, $4 # E : dest += 8
|
|
+ bge $18, $mis_quad # U : More quads to move
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+$misalign_tail:
|
|
+ addl $18, 8, $18 # E : account for tail stuff
|
|
+ ble $18, $nomoredata # U :
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+$misalign_byte:
|
|
+ ldbu $1, 0($17) # L : fetch 1
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ nop # E :
|
|
+
|
|
+ stb $1, 0($4) # L : store
|
|
+ addl $4, 1, $4 # E : dest++
|
|
+ bgt $18, $misalign_byte # U : more to go?
|
|
+ nop
|
|
+
|
|
+
|
|
+$nomoredata:
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+END(memcpy)
|
|
+libc_hidden_builtin_def (memcpy)
|
|
+
|
|
+#else
|
|
+
|
|
+#include <sysdep.h>
|
|
+#define NOCACHE
|
|
+ .set noreorder
|
|
+ .align 4
|
|
+ .globl memcpy
|
|
+ .ent memcpy
|
|
+
|
|
+ .type $jumppointh,@object
|
|
+$jumppointh:
|
|
+ .gprel32 $dest_0mod32
|
|
+ .gprel32 J$H01
|
|
+ .gprel32 J$H02
|
|
+ .gprel32 J$H03
|
|
+ .gprel32 J$H04
|
|
+ .gprel32 J$H05
|
|
+ .gprel32 J$H06
|
|
+ .gprel32 J$H07
|
|
+ .gprel32 J$H08
|
|
+ .gprel32 J$H09
|
|
+ .gprel32 J$H10
|
|
+ .gprel32 J$H11
|
|
+ .gprel32 J$H12
|
|
+ .gprel32 J$H13
|
|
+ .gprel32 J$H14
|
|
+ .gprel32 J$H15
|
|
+ .gprel32 J$H16
|
|
+ .gprel32 J$H17
|
|
+ .gprel32 J$H18
|
|
+ .gprel32 J$H19
|
|
+ .gprel32 J$H20
|
|
+ .gprel32 J$H21
|
|
+ .gprel32 J$H22
|
|
+ .gprel32 J$H23
|
|
+ .gprel32 J$H24
|
|
+ .gprel32 J$H25
|
|
+ .gprel32 J$H26
|
|
+ .gprel32 J$H27
|
|
+ .gprel32 J$H28
|
|
+ .gprel32 J$H29
|
|
+ .gprel32 J$H30
|
|
+ .gprel32 J$H31
|
|
+
|
|
+
|
|
+#memcpy:
|
|
+ENTRY(memcpy)
|
|
+ .prologue 1
|
|
+
|
|
+ ldgp $29, 0($27)
|
|
+ mov $16, $0
|
|
+ cmplt $18, 32, $1
|
|
+ bne $1, $less_than_32
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ and $16, 31, $1 # E : Are we at dest 0mod32 yet?
|
|
+ beq $1, $dest_0mod32 # U :
|
|
+
|
|
+$judge_align:
|
|
+ subl $16, $17, $1
|
|
+ and $1, 31, $1
|
|
+ bne $1,$align_32bytes # (src-dest)%32=0 ?
|
|
+ ldi $2, 32
|
|
+ and $16, 31, $1
|
|
+ subl $2, $1, $1
|
|
+ cmplt $18, $1, $3
|
|
+ bne $3,$align_32bytes
|
|
+
|
|
+$Headalign:
|
|
+ addl $16, $1, $16
|
|
+ addl $17, $1, $17
|
|
+ subl $18, $1, $18
|
|
+ ldih $2, $jumppointh($29) !gprelhigh
|
|
+ s4addl $1, $2, $2
|
|
+ ldw $2, $jumppointh($2) !gprellow
|
|
+ addl $2, $29, $2
|
|
+ jmp ($2)
|
|
+
|
|
+
|
|
+$align_32bytes:
|
|
+ ldbu $1, 0($17) # L : grab a byte
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ stb $1, 0($16) # L :
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ and $16, 31, $1 # E : Are we at 0mod32 yet?
|
|
+ ble $18, $nomoredata # U : done with the copy?
|
|
+ .align 4
|
|
+ bne $1, $align_32bytes # U :
|
|
+
|
|
+ .align 4
|
|
+$dest_0mod32:
|
|
+ and $17, 31, $1 # E : Are we both at 0mod32 yet?
|
|
+ bne $1, $misalign
|
|
+
|
|
+ cmple $18, 63, $1 # E
|
|
+ bne $1, $tail_simd # U :
|
|
+
|
|
+#ifdef NOCACHE
|
|
+ .align 4
|
|
+ ldih $1, 8($31)
|
|
+	cmple $18, $1, $1	# smaller than 512K
|
|
+ beq $1, $big_body_simd # U :
|
|
+#endif
|
|
+
|
|
+$body_simd:
|
|
+ fillcs 128*5($17)
|
|
+
|
|
+ vldd $f12, 0($17)
|
|
+ vldd $f13, 32($17)
|
|
+
|
|
+ vstd $f12, 0($16)
|
|
+ vstd $f13, 32($16)
|
|
+
|
|
+ #fillde 128*5($16) #gxw
|
|
+
|
|
+ addl $16, 64, $16
|
|
+ addl $17, 64, $17
|
|
+ subl $18, 64, $18
|
|
+
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+ beq $1, $body_simd
|
|
+
|
|
+ br $tail_simd
|
|
+
|
|
+#ifdef NOCACHE
|
|
+$big_body_simd:
|
|
+ fillcs 128*5($17)
|
|
+
|
|
+ vldd $f12, 0($17)
|
|
+ vldd $f13, 32($17)
|
|
+
|
|
+ vstd_nc $f12, 0($16)
|
|
+ vstd_nc $f13, 32($16)
|
|
+
|
|
+ addl $16, 64, $16
|
|
+ addl $17, 64, $17
|
|
+ subl $18, 64, $18
|
|
+
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+ beq $1, $big_body_simd
|
|
+
|
|
+ memb
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$tail_simd:
|
|
+ cmple $18, 31, $1 # E : At least one more trip?
|
|
+ bne $1, $before_tail_quads
|
|
+
|
|
+ vldd $f12, 0($17)
|
|
+ vstd $f12, 0($16)
|
|
+
|
|
+ subl $18, 32, $18
|
|
+ addl $16, 32, $16
|
|
+ addl $17, 32, $17
|
|
+
|
|
+$before_tail_quads:
|
|
+ ble $18, $nomoredata
|
|
+ vldd $f12, 0($17)
|
|
+
|
|
+ br $tail_quads
|
|
+
|
|
+$misalign:
|
|
+ ldi $2, 256($31)
|
|
+ andnot $17, 31, $3
|
|
+ vldd $f10, 0($3)
|
|
+ and $17, 31, $5
|
|
+ sll $5, 3, $5
|
|
+ subw $2, $5, $4
|
|
+ ifmovs $5, $f15
|
|
+ ifmovs $4, $f17
|
|
+
|
|
+ cmple $18, 63, $1 # E
|
|
+ bne $1, $misalign_tail_simd # U :
|
|
+
|
|
+#ifdef NOCACHE
|
|
+ .align 4
|
|
+ ldih $1, 8($31)
|
|
+	cmple $18, $1, $1	# smaller than 512K
|
|
+ beq $1, $big_misalign_body_simd # U :
|
|
+#endif
|
|
+
|
|
+$misalign_body_simd:
|
|
+ vldd $f11, 32($3)
|
|
+ fillcs 128*5($3)
|
|
+
|
|
+ srlow $f10, $f15, $f12
|
|
+ sllow $f11, $f17, $f13
|
|
+ fillde 128*5($16)
|
|
+ vlogfc $f12, $f13, $f31, $f12
|
|
+
|
|
+ vldd $f10, 64($3)
|
|
+ srlow $f11, $f15, $f22
|
|
+ sllow $f10, $f17, $f23
|
|
+ vlogfc $f22, $f23, $f31, $f22
|
|
+
|
|
+ vstd $f12, 0($16)
|
|
+ vstd $f22, 32($16)
|
|
+
|
|
+ addl $16, 64, $16
|
|
+ addl $3, 64, $3
|
|
+ subl $18, 64, $18
|
|
+
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+ beq $1, $misalign_body_simd
|
|
+ br $misalign_tail_simd
|
|
+
|
|
+#ifdef NOCACHE
|
|
+$big_misalign_body_simd:
|
|
+ vldd $f11, 32($3)
|
|
+ fillcs 128*5($3)
|
|
+
|
|
+ srlow $f10, $f15, $f12
|
|
+ sllow $f11, $f17, $f13
|
|
+ vlogfc $f12, $f13, $f31, $f12
|
|
+
|
|
+ vldd $f10, 64($3)
|
|
+ srlow $f11, $f15, $f22
|
|
+ sllow $f10, $f17, $f23
|
|
+ vlogfc $f22, $f23, $f31, $f22
|
|
+
|
|
+ vstd_nc $f12, 0($16)
|
|
+ vstd_nc $f22, 32($16)
|
|
+
|
|
+ addl $16, 64, $16
|
|
+ addl $3, 64, $3
|
|
+ subl $18, 64, $18
|
|
+
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+ beq $1, $big_misalign_body_simd
|
|
+ memb
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$misalign_tail_simd:
|
|
+ cmple $18, 31, $1 # E : At least one more trip?
|
|
+ bne $1, $before_misalign_tail_quads
|
|
+
|
|
+ vldd $f11, 32($3)
|
|
+ srlow $f10, $f15, $f12
|
|
+ sllow $f11, $f17, $f13
|
|
+ vlogfc $f12, $f13, $f31, $f12
|
|
+
|
|
+ vstd $f12, 0($16)
|
|
+
|
|
+ subl $18, 32, $18
|
|
+ addl $16, 32, $16
|
|
+ addl $3, 32, $3
|
|
+ vfmov $f11, $f10
|
|
+
|
|
+$before_misalign_tail_quads:
|
|
+ srlow $f10, $f15, $f12
|
|
+ s8subl $18, $4, $1
|
|
+ ble $1, $tail_quads
|
|
+
|
|
+ vldd $f11, 32($3)
|
|
+ sllow $f11, $f17, $f13
|
|
+ vlogfc $f12, $f13, $f31, $f12
|
|
+
|
|
+$tail_quads:
|
|
+ subl $18, 8, $1 # E : At least a quad left?
|
|
+ blt $1, $less_than_8 # U : Nope
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+$move_a_quad:
|
|
+ fimovd $f12, $1
|
|
+ srlow $f12, 64, $f12 # E : src += 8
|
|
+
|
|
+ stl $1, 0($16) # L : store 8
|
|
+ subl $18, 8, $18 # E : count -= 8
|
|
+ addl $16, 8, $16 # E : dest += 8
|
|
+ subl $18, 8, $1
|
|
+ bge $1, $move_a_quad # U :
|
|
+ nop # E :
|
|
+
|
|
+$less_than_8:
|
|
+ .align 4
|
|
+ beq $18, $nomoredata # U : All-done
|
|
+ fimovd $f12, $1
|
|
+
|
|
+
|
|
+$tail_bytes:
|
|
+ stb $1, 0($16) # L : store a byte
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ srl $1, 8, $1
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ bgt $18, $tail_bytes # U : more to be done?
|
|
+ nop # E :
|
|
+
|
|
+
|
|
+$nomoredata:
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+$less_than_32:
|
|
+ ble $18, $nomoredata # U : done with the copy?
|
|
+
|
|
+$tail_32bytes:
|
|
+ ldbu $1, 0($17) # L : grab a byte
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ stb $1, 0($16) # L :
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ bgt $18, $tail_32bytes # U : done with the copy?
|
|
+ br $nomoredata
|
|
+
|
|
+
|
|
+J$H01:
|
|
+ ldbu $1,-1($17)
|
|
+ stb $1,-1($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H02:
|
|
+ ldh $1,-2($17)
|
|
+ sth $1,-2($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H03:
|
|
+ ldh $1,-2($17)
|
|
+ ldbu $2,-3($17)
|
|
+ sth $1,-2($16)
|
|
+ stb $2,-3($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H04:
|
|
+ ldw $1,-4($17)
|
|
+ stw $1,-4($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H05:
|
|
+ ldw $1,-4($17)
|
|
+ ldbu $2,-5($17)
|
|
+ stw $1,-4($16)
|
|
+ stb $2,-5($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H06:
|
|
+ ldw $1,-4($17)
|
|
+ ldh $2,-6($17)
|
|
+ stw $1,-4($16)
|
|
+ sth $2,-6($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H07:
|
|
+ ldw $1,-4($17)
|
|
+ ldh $2,-6($17)
|
|
+ ldbu $3,-7($17)
|
|
+ stw $1,-4($16)
|
|
+ sth $2,-6($16)
|
|
+ stb $3,-7($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H08:
|
|
+ ldl $1,-8($17)
|
|
+ stl $1,-8($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H09:
|
|
+ ldl $1,-8($17)
|
|
+ ldbu $2,-9($17)
|
|
+ stl $1,-8($16)
|
|
+ stb $2,-9($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H10:
|
|
+ ldl $1,-8($17)
|
|
+ ldh $2,-10($17)
|
|
+ stl $1,-8($16)
|
|
+ sth $2,-10($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H11:
|
|
+ ldl $1,-8($17)
|
|
+ ldh $2,-10($17)
|
|
+ ldbu $3,-11($17)
|
|
+ stl $1,-8($16)
|
|
+ sth $2,-10($16)
|
|
+ stb $3,-11($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H12:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H13:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ ldbu $3,-13($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ stb $3,-13($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H14:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ ldh $3,-14($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ sth $3,-14($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H15:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ ldh $3,-14($17)
|
|
+ ldbu $4,-15($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ sth $3,-14($16)
|
|
+ stb $4,-15($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H16:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H17:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldbu $3,-17($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stb $3,-17($16)
|
|
+ br $dest_0mod32
|
|
+J$H18:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldh $3,-18($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ sth $3,-18($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H19:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldh $3,-18($17)
|
|
+ ldbu $4,-19($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ sth $3,-18($16)
|
|
+ stb $4,-19($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H20:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H21:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ ldbu $4,-21($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ stb $4,-21($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H22:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ ldh $4,-22($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ sth $4,-22($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H23:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ ldh $4,-22($17)
|
|
+ ldbu $5,-23($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ sth $4,-22($16)
|
|
+ stb $5,-23($16)
|
|
+ br $dest_0mod32
|
|
+J$H24:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H25:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldbu $4,-25($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stb $4,-25($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H26:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldh $4,-26($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ sth $4,-26($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H27:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldh $4,-26($17)
|
|
+ ldbu $5,-27($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ sth $4,-26($16)
|
|
+ stb $5,-27($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H28:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H29:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ ldbu $5,-29($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ stb $5,-29($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H30:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ ldh $5,-30($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ sth $5,-30($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+J$H31:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ ldh $5,-30($17)
|
|
+ ldbu $6,-31($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ sth $5,-30($16)
|
|
+ stb $6,-31($16)
|
|
+ br $dest_0mod32
|
|
+
|
|
+
|
|
+END(memcpy)
|
|
+libc_hidden_builtin_def (memcpy)
|
|
+
|
|
+ .end memcpy
|
|
+
|
|
+#endif
|
|
diff --git a/sysdeps/sw_64/sw6a/memset.S b/sysdeps/sw_64/sw6a/memset.S
|
|
new file mode 100644
|
|
index 00000000..a1fed3bd
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw6a/memset.S
|
|
@@ -0,0 +1,415 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#ifdef __sw_64_sw6a__
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch sw6a
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+#ifndef STRING_OPT
|
|
+ENTRY(memset)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * Serious stalling happens. The only way to mitigate this is to
|
|
+ * undertake a major re-write to interleave the constant materialization
|
|
+ * with other parts of the fall-through code. This is important, even
|
|
+ * though it makes maintenance tougher.
|
|
+ * Do this later.
|
|
+ */
|
|
+ and $17, 255, $1 # E : 00000000000000ch
|
|
+ ins0b $17, 1, $2 # U : 000000000000ch00
|
|
+ mov $16, $0 # E : return value
|
|
+ ble $18, $end # U : zero length requested?
|
|
+
|
|
+ addl $18, $16, $6 # E : max address to write to
|
|
+ or $1, $2, $17 # E : 000000000000chch
|
|
+ ins0b $1, 2, $3 # U : 0000000000ch0000
|
|
+ ins0b $1, 3, $4 # U : 00000000ch000000
|
|
+
|
|
+ or $3, $4, $3 # E : 00000000chch0000
|
|
+ ins1b $17, 4, $5 # U : 0000chch00000000
|
|
+ xor $16, $6, $1 # E : will complete write be within one quadword?
|
|
+ ins1b $17, 6, $2 # U : chch000000000000
|
|
+
|
|
+ or $17, $3, $17 # E : 00000000chchchch
|
|
+ or $2, $5, $2 # E : chchchch00000000
|
|
+ bic $1, 7, $1 # E : fit within a single quadword?
|
|
+ and $16, 7, $3 # E : Target addr misalignment
|
|
+
|
|
+ or $17, $2, $17 # E : chchchchchchchch
|
|
+ beq $1, $within_quad # U :
|
|
+ nop # E :
|
|
+ beq $3, $aligned # U : target is 0mod8
|
|
+
|
|
+ /*
|
|
+ * Target address is misaligned, and won't fit within a quadword.
|
|
+ */
|
|
+ ldl_u $4, 0($16) # L : Fetch first partial
|
|
+ mov $16, $5 # E : Save the address
|
|
+ ins3b $17, $16, $2 # U : Insert new bytes
|
|
+ subl $3, 8, $3 # E : Invert (for addressing uses)
|
|
+
|
|
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
|
|
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
|
|
+ subl $16, $3, $16 # E : $16 is new aligned destination
|
|
+ or $2, $4, $1 # E : Final bytes
|
|
+
|
|
+ nop
|
|
+ stl_u $1,0($5) # L : Store result
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+$aligned:
|
|
+ /*
|
|
+ * We are now guaranteed to be quad aligned, with at least
|
|
+ * one partial quad to write.
|
|
+ */
|
|
+
|
|
+ sra $18, 3, $3 # U : Number of remaining quads to write
|
|
+ and $18, 7, $18 # E : Number of trailing bytes to write
|
|
+ mov $16, $5 # E : Save dest address
|
|
+ beq $3, $no_quad # U : tail stuff only
|
|
+
|
|
+ /*
|
|
+ * It's worth the effort to unroll this and use wh64 if possible.
|
|
+ * At this point, entry values are:
|
|
+ * $16 Current destination address
|
|
+ * $5 A copy of $16
|
|
+ * $6 The max quadword address to write to
|
|
+ * $18 Number trailer bytes
|
|
+ * $3 Number quads to write
|
|
+ */
|
|
+ and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
|
|
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
|
|
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
|
|
+ blt $4, $loop # U :
|
|
+
|
|
+ /*
|
|
+ * We know we've got at least 16 quads, minimum of one trip
|
|
+ * through unrolled loop. Do a quad at a time to get us 0mod64
|
|
+ * aligned.
|
|
+ */
|
|
+
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ beq $1, $bigalign # U :
|
|
+$alignmod32:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : For consistency later
|
|
+ addl $1, 8, $1 # E : Increment towards zero for alignment
|
|
+ addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ blt $1, $alignmod32 # U :
|
|
+
|
|
+$bigalign:
|
|
+ /*
|
|
+ * $3 - number quads left to go
|
|
+ * $5 - target address (aligned 0mod64)
|
|
+ * $17 - mask of stuff to store
|
|
+ * Scratch registers available: $7, $2, $4, $1
|
|
+ * We know that we'll be taking a minimum of one trip through.
|
|
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
|
|
+ * Assumes the wh64 needs to be for 2 trips through the loop in the
|
|
+	 * future. The wh64 is issued on for the starting destination address for
|
|
+ * trip +2 through the loop, and if there are less than two trips left,
|
|
+ * the target address will be for the current trip. */
|
|
+
|
|
+$do_wh64:
|
|
+ wh64 ($4) # L1 : memory subsystem write hint
|
|
+ subl $3, 24, $2 # E : For determining future wh64 addresses
|
|
+ stl $17, 0($5) # L :
|
|
+ nop # E :
|
|
+
|
|
+ addl $5, 128, $4 # E : speculative target of next wh64
|
|
+ stl $17, 8($5) # L :
|
|
+ stl $17, 16($5) # L :
|
|
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
|
|
+
|
|
+ stl $17, 24($5) # L :
|
|
+ stl $17, 32($5) # L :
|
|
+ sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
|
|
+ nop
|
|
+
|
|
+ stl $17, 40($5) # L :
|
|
+ stl $17, 48($5) # L :
|
|
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
|
|
+ nop
|
|
+
|
|
+ stl $17, 56($5) # L :
|
|
+ addl $5, 64, $5 # E :
|
|
+ subl $3, 8, $3 # E :
|
|
+ bge $2, $do_wh64 # U :
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ beq $3, $no_quad # U : Might have finished already
|
|
+
|
|
+ .align 4
|
|
+ /*
|
|
+ * Simple loop for trailing quadwords, or for small amounts
|
|
+ * of data (where we can't use an unrolled loop and wh64)
|
|
+ */
|
|
+$loop:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : Decrement number quads left
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ bne $3, $loop # U : more?
|
|
+
|
|
+$no_quad:
|
|
+ /*
|
|
+ * Write 0..7 trailing bytes.
|
|
+ */
|
|
+ nop # E :
|
|
+ beq $18, $end # U : All done?
|
|
+ ldl $7, 0($5) # L :
|
|
+ mask7b $7, $6, $2 # U : Mask final quad
|
|
+
|
|
+ ins7b $17, $6, $4 # U : New bits
|
|
+ or $2, $4, $1 # E : Put it all together
|
|
+ stl $1, 0($5) # L : And back to memory
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+$within_quad:
|
|
+ ldl_u $1, 0($16) # L :
|
|
+ ins3b $17, $16, $2 # U : New bits
|
|
+ mask3b $1, $16, $4 # U : Clear old
|
|
+ or $2, $4, $2 # E : New result
|
|
+
|
|
+ mask3b $2, $6, $4 # U :
|
|
+ mask7b $1, $6, $2 # U :
|
|
+ or $2, $4, $1 # E :
|
|
+ stl_u $1, 0($16) # L :
|
|
+
|
|
+$end:
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+ END(memset)
|
|
+libc_hidden_builtin_def (memset)
|
|
+
|
|
+#else
|
|
+ENTRY(memset)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * Serious stalling happens. The only way to mitigate this is to
|
|
+ * undertake a major re-write to interleave the constant materialization
|
|
+ * with other parts of the fall-through code. This is important, even
|
|
+ * though it makes maintenance tougher.
|
|
+ * Do this later.
|
|
+ */
|
|
+ and $17, 255, $1 # E : 00000000000000ch
|
|
+ ins0b $17, 1, $2 # U : 000000000000ch00
|
|
+ mov $16, $0 # E : return value
|
|
+ ble $18, $end # U : zero length requested?
|
|
+
|
|
+ addl $18, $16, $6 # E : max address to write to
|
|
+ or $1, $2, $17 # E : 000000000000chch
|
|
+ ins0b $1, 2, $3 # U : 0000000000ch0000
|
|
+ ins0b $1, 3, $4 # U : 00000000ch000000
|
|
+
|
|
+ or $3, $4, $3 # E : 00000000chch0000
|
|
+ ins1b $17, 4, $5 # U : 0000chch00000000
|
|
+ xor $16, $6, $1 # E : will complete write be within one quadword?
|
|
+ ins1b $17, 6, $2 # U : chch000000000000
|
|
+
|
|
+ or $17, $3, $17 # E : 00000000chchchch
|
|
+ or $2, $5, $2 # E : chchchch00000000
|
|
+ bic $1, 7, $1 # E : fit within a single quadword?
|
|
+ and $16, 7, $3 # E : Target addr misalignment
|
|
+
|
|
+ or $17, $2, $17 # E : chchchchchchchch
|
|
+ beq $1, $within_quad # U :
|
|
+ nop # E :
|
|
+ beq $3, $aligned # U : target is 0mod8
|
|
+
|
|
+ /*
|
|
+ * Target address is misaligned, and won't fit within a quadword.
|
|
+ */
|
|
+ ldl_u $4, 0($16) # L : Fetch first partial
|
|
+ mov $16, $5 # E : Save the address
|
|
+ ins3b $17, $16, $2 # U : Insert new bytes
|
|
+ subl $3, 8, $3 # E : Invert (for addressing uses)
|
|
+
|
|
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
|
|
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
|
|
+ subl $16, $3, $16 # E : $16 is new aligned destination
|
|
+ or $2, $4, $1 # E : Final bytes
|
|
+
|
|
+ nop
|
|
+ stl_u $1,0($5) # L : Store result
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+$aligned:
|
|
+ /*
|
|
+ * We are now guaranteed to be quad aligned, with at least
|
|
+ * one partial quad to write.
|
|
+ */
|
|
+
|
|
+ sra $18, 3, $3 # U : Number of remaining quads to write
|
|
+ and $18, 7, $18 # E : Number of trailing bytes to write
|
|
+ mov $16, $5 # E : Save dest address
|
|
+ beq $3, $no_quad # U : tail stuff only
|
|
+
|
|
+ /*
|
|
+ * It's worth the effort to unroll this and use wh64 if possible.
|
|
+ * At this point, entry values are:
|
|
+ * $16 Current destination address
|
|
+ * $5 A copy of $16
|
|
+ * $6 The max quadword address to write to
|
|
+ * $18 Number trailer bytes
|
|
+ * $3 Number quads to write
|
|
+ */
|
|
+	and	$16, 0x1f, $2	# E : Forward work (only useful for unrolled loop) : low 5 bits of dest addr
|
|
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
|
|
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
|
|
+ blt $4, $loop # U :
|
|
+
|
|
+ /*
|
|
+ * We know we've got at least 16 quads, minimum of one trip
|
|
+ * through unrolled loop. Do a quad at a time to get us 0mod64
|
|
+ * aligned.
|
|
+ */
|
|
+
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ beq $2, $bigalign
|
|
+$alignmod32:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : For consistency later
|
|
+ addl $1, 8, $1 # E : Increment towards zero for alignment
|
|
+ addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ blt $1, $alignmod32 # U :
|
|
+
|
|
+$bigalign:
|
|
+ /*
|
|
+ * $3 - number quads left to go
|
|
+ * $5 - target address (aligned 0mod64)
|
|
+ * $17 - mask of stuff to store
|
|
+ * Scratch registers available: $7, $2, $4, $1
|
|
+ * We know that we'll be taking a minimum of one trip through.
|
|
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
|
|
+ * Assumes the wh64 needs to be for 2 trips through the loop
|
|
+	 * in the future. The wh64 is issued on for the starting destination
|
|
+ * address for trip +2 through the loop, and if there are less than two
|
|
+ * trips left, the target address will be for the current trip. */
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ ifmovs $17, $f1
|
|
+ vcpyw $f1, $f1
|
|
+
|
|
+$do_wh64:
|
|
+ fillde 128*4($5)
|
|
+ subl $3, 16, $2
|
|
+ vstd $f1, 0($5)
|
|
+ vstd $f1, 32($5)
|
|
+
|
|
+ subl $3, 8, $3
|
|
+ addl $5, 64, $5
|
|
+ nop
|
|
+ bge $2, $do_wh64 # U :
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ beq $3, $no_quad # U : Might have finished already
|
|
+
|
|
+ .align 4
|
|
+ /*
|
|
+ * Simple loop for trailing quadwords, or for small amounts
|
|
+ * of data (where we can't use an unrolled loop and wh64)
|
|
+ */
|
|
+$loop:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : Decrement number quads left
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ bne $3, $loop # U : more?
|
|
+
|
|
+$no_quad:
|
|
+ /*
|
|
+ * Write 0..7 trailing bytes.
|
|
+ */
|
|
+ nop # E :
|
|
+ beq $18, $end # U : All done?
|
|
+ ldl $7, 0($5) # L :
|
|
+ mask7b $7, $6, $2 # U : Mask final quad
|
|
+
|
|
+ ins7b $17, $6, $4 # U : New bits
|
|
+ or $2, $4, $1 # E : Put it all together
|
|
+ stl $1, 0($5) # L : And back to memory
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+$within_quad:
|
|
+ ldl_u $1, 0($16) # L :
|
|
+ ins3b $17, $16, $2 # U : New bits
|
|
+ mask3b $1, $16, $4 # U : Clear old
|
|
+ or $2, $4, $2 # E : New result
|
|
+
|
|
+ mask3b $2, $6, $4 # U :
|
|
+ mask7b $1, $6, $2 # U :
|
|
+ or $2, $4, $1 # E :
|
|
+ stl_u $1, 0($16) # L :
|
|
+
|
|
+$end:
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+ END(memset)
|
|
+libc_hidden_builtin_def (memset)
|
|
+#endif //STRING_OPT
|
|
+#else
|
|
+#include <sysdeps/sw_64/memset.S>
|
|
+
|
|
+#endif
|
|
diff --git a/sysdeps/sw_64/sw6a/stxcpy.S b/sysdeps/sw_64/sw6a/stxcpy.S
|
|
new file mode 100644
|
|
index 00000000..16cfafef
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw6a/stxcpy.S
|
|
@@ -0,0 +1,314 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy a null-terminated string from SRC to DST.
|
|
+
|
|
+ This is an internal routine used by strcpy, stpcpy, and strcat.
|
|
+ As such, it uses special linkage conventions to make implementation
|
|
+ of these public functions more efficient.
|
|
+
|
|
+ On input:
|
|
+ t9 = return address
|
|
+ a0 = DST
|
|
+ a1 = SRC
|
|
+
|
|
+ On output:
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
|
+ a0 = unaligned address of the last *word* written
|
|
+
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
|
+*/
|
|
+
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch sw6a
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+ .type __stxcpy, @function
|
|
+ .globl __stxcpy
|
|
+ .usepv __stxcpy, no
|
|
+
|
|
+ cfi_startproc
|
|
+ cfi_return_column (t9)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word. */
|
|
+ .align 4
|
|
+stxcpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # E : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
|
|
+ mask7b t1, a1, t3 # U :
|
|
+ ornot t1, t2, t2 # E : (stall)
|
|
+
|
|
+ mask3b t0, a1, t0 # U : assemble the first output word
|
|
+ cmpgeb zero, t2, t10 # E : bits set iff null found
|
|
+ or t0, t3, t1 # E : (stall)
|
|
+ bne t10, $a_eos # U : (stall)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == a source word not containing a null. */
|
|
+ /* Nops here to separate store quads from load quads */
|
|
+
|
|
+$a_loop:
|
|
+ stl_u t1, 0(a0) # L :
|
|
+ addl a0, 8, a0 # E :
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ ldl_u t1, 0(a1) # L : Latency=3
|
|
+ addl a1, 8, a1 # E :
|
|
+ cmpgeb zero, t1, t10 # E : (3 cycle stall)
|
|
+ beq t10, $a_loop # U : (stall for t10)
|
|
+
|
|
+ /* Take care of the final (partial) word store.
|
|
+ On entry to this basic block we have:
|
|
+ t1 == the source word containing the null
|
|
+ t10 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t10, t6 # E : find low bit set
|
|
+ and t10, t6, t8 # E : (stall)
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # E : (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t0, 0(a0) # L : Latency=3
|
|
+ subl t8, 1, t6 # E :
|
|
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
|
|
+ or t8, t6, t10 # E : (stall)
|
|
+
|
|
+ zap t0, t10, t0 # E : clear dst bytes <= null
|
|
+ or t0, t1, t1 # E : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t1, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+__stxcpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ xor a0, a1, t0 # E :
|
|
+ unop # E :
|
|
+ and t0, 7, t0 # E : (stall)
|
|
+ bne t0, $unaligned # U : (stall)
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+ ldl_u t1, 0(a1) # L : load first src word
|
|
+ and a0, 7, t0 # E : take care not to load a word ...
|
|
+ addl a1, 8, a1 # E :
|
|
+	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)
|
|
+
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ br stxcpy_aligned # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 4
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, for masking back in, if needed else 0
|
|
+ t1 == the low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # L :
|
|
+ addl a1, 8, a1 # E :
|
|
+ ext3b t1, a1, t1 # U : (stall on a1)
|
|
+ ext7b t2, a1, t4 # U : (stall on a1)
|
|
+
|
|
+ mask3b t0, a0, t0 # U :
|
|
+ or t1, t4, t1 # E :
|
|
+ mask7b t1, a0, t1 # U : (stall on t1)
|
|
+ or t0, t1, t1 # E : (stall on t1)
|
|
+
|
|
+ or t1, t6, t6 # E :
|
|
+ cmpgeb zero, t6, t10 # E : (stall)
|
|
+ ldi t6, -1 # E : for masking just below
|
|
+ bne t10, $u_final # U : (stall)
|
|
+
|
|
+ mask3b t6, a1, t6 # U : mask out the bits we have
|
|
+ or t6, t2, t2 # E : already extracted before (stall)
|
|
+ cmpgeb zero, t2, t10 # E : testing eos (stall)
|
|
+ bne t10, $u_late_head_exit # U : (stall)
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+
|
|
+ stl_u t1, 0(a0) # L : store first output word
|
|
+ addl a0, 8, a0 # E :
|
|
+ ext3b t2, a1, t0 # U : position ho-bits of lo word
|
|
+ ldl_u t2, 8(a1) # U : read next high-order source word
|
|
+
|
|
+ addl a1, 8, a1 # E :
|
|
+ cmpgeb zero, t2, t10 # E : (stall for t2)
|
|
+ nop # E :
|
|
+ bne t10, $u_eos # U : (stall)
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ ext7b t2, a1, t1 # U : extract high bits for current word
|
|
+ addl a1, 8, a1 # E : (stall)
|
|
+ ext3b t2, a1, t3 # U : extract low bits for next time (stall)
|
|
+ addl a0, 8, a0 # E :
|
|
+
|
|
+ or t0, t1, t1 # E : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # L : Latency=3 load high word for next time
|
|
+ stl_u t1, -8(a0) # L : save the current word (stall)
|
|
+ mov t3, t0 # E :
|
|
+
|
|
+ cmpgeb zero, t2, t10 # E : test new word for eos
|
|
+ beq t10, $u_loop # U : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ ext7b t2, a1, t1 # U :
|
|
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
|
|
+ cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall)
|
|
+ bne t10, $u_final # U : (stall)
|
|
+
|
|
+$u_late_head_exit:
|
|
+ stl_u t1, 0(a0) # L : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # E :
|
|
+ ext3b t2, a1, t1 # U :
|
|
+ cmpgeb zero, t1, t10 # E : (stall)
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t1 == assembled source word
|
|
+ t10 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t10, t6 # E : isolate low bit set
|
|
+ and t6, t10, t8 # E : (stall)
|
|
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ ldl_u t0, 0(a0) # E :
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t6, t8, t10 # E : (stall)
|
|
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
|
|
+
|
|
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
|
|
+ or t0, t1, t1 # E : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t1, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 4
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # L : load first source word
|
|
+ and a0, 7, t4 # E : find dest misalignment
|
|
+ and a1, 7, t5 # E : find src misalignment
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+ mov zero, t0 # E :
|
|
+
|
|
+ mov zero, t6 # E :
|
|
+ beq t4, 1f # U :
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ ldi t6, -1 # E :
|
|
+
|
|
+ mask3b t6, a0, t6 # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+1:
|
|
+ subl a1, t4, a1 # E : sub dest misalignment from src addr
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+ cmplt t4, t5, t8 # E :
|
|
+ beq t8, $u_head # U :
|
|
+ ldi t2, -1 # E : mask out leading garbage in source
|
|
+
|
|
+ mask7b t2, t5, t2 # U :
|
|
+ ornot t1, t2, t3 # E : (stall)
|
|
+ cmpgeb zero, t3, t10 # E : is there a zero? (stall)
|
|
+ beq t10, $u_head # U : (stall)
|
|
+
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ negl t10, t6 # E : build bitmask of bytes <= zero
|
|
+ and t6, t10, t8 # E : (stall)
|
|
+ and a1, 7, t5 # E :
|
|
+
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t6, t8, t10 # E : (stall)
|
|
+ srl t8, t5, t8 # U : adjust final null return value
|
|
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
|
|
+
|
|
+ and t1, t2, t1 # E : to source validity mask
|
|
+ ext3b t2, a1, t2 # U :
|
|
+ ext3b t1, a1, t1 # U : (stall)
|
|
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
|
|
+
|
|
+ or t0, t1, t1 # e1 : and put it there
|
|
+ stl_u t1, 0(a0) # .. e0 : (stall)
|
|
+ ret (t9) # e1 :
|
|
+
|
|
+ cfi_endproc
|
|
diff --git a/sysdeps/sw_64/sw6a/stxncpy.S b/sysdeps/sw_64/sw6a/stxncpy.S
|
|
new file mode 100644
|
|
index 00000000..8b3681c2
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw6a/stxncpy.S
|
|
@@ -0,0 +1,392 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy no more than COUNT bytes of the null-terminated string from
|
|
+ SRC to DST.
|
|
+
|
|
+ This is an internal routine used by strncpy, stpncpy, and strncat.
|
|
+ As such, it uses special linkage conventions to make implementation
|
|
+ of these public functions more efficient.
|
|
+
|
|
+ On input:
|
|
+ t9 = return address
|
|
+ a0 = DST
|
|
+ a1 = SRC
|
|
+ a2 = COUNT
|
|
+
|
|
+ Furthermore, COUNT may not be zero.
|
|
+
|
|
+ On output:
|
|
+ t0 = last word written
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
|
+ t10 = bitmask (with one bit set) indicating the byte position of
|
|
+ the end of the range specified by COUNT
|
|
+ a0 = unaligned address of the last *word* written
|
|
+ a2 = the number of full words left in COUNT
|
|
+
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
|
+*/
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch sw6a
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+ .type __stxncpy, @function
|
|
+ .globl __stxncpy
|
|
+ .usepv __stxncpy, no
|
|
+
|
|
+ cfi_startproc
|
|
+ cfi_return_column (t9)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word. */
|
|
+ .align 4
|
|
+stxncpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # E : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
|
|
+ mask7b t1, a1, t3 # U :
|
|
+ ornot t1, t2, t2 # E : (stall)
|
|
+
|
|
+ mask3b t0, a1, t0 # U : assemble the first output word
|
|
+ cmpgeb zero, t2, t7 # E : bits set iff null found
|
|
+ or t0, t3, t0 # E : (stall)
|
|
+ beq a2, $a_eoc # U :
|
|
+
|
|
+ bne t7, $a_eos # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == a source word not containing a null. */
|
|
+
|
|
+ /*
|
|
+ * nops here to:
|
|
+ * separate store quads from load quads
|
|
+ * limit of 1 bcond/quad to permit training
|
|
+ */
|
|
+$a_loop:
|
|
+ stl_u t0, 0(a0) # L :
|
|
+ addl a0, 8, a0 # E :
|
|
+ subl a2, 1, a2 # E :
|
|
+ nop
|
|
+
|
|
+ ldl_u t0, 0(a1) # L :
|
|
+ addl a1, 8, a1 # E :
|
|
+ cmpgeb zero, t0, t7 # E :
|
|
+ beq a2, $a_eoc # U :
|
|
+
|
|
+ beq t7, $a_loop # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* Take care of the final (partial) word store. At this point
|
|
+ the end-of-count bit is set in t7 iff it applies.
|
|
+
|
|
+ On entry to this basic block we have:
|
|
+ t0 == the source word containing the null
|
|
+ t7 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t7, t8 # E : find low bit set
|
|
+ and t7, t8, t8 # E : (stall)
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # E : (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t1, 0(a0) # L :
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t8, t6, t7 # E : (stall)
|
|
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
|
|
+
|
|
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
|
|
+ or t0, t1, t0 # e1 : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t0, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* Add the end-of-count bit to the eos detection bitmask. */
|
|
+$a_eoc:
|
|
+ or t10, t7, t7 # E :
|
|
+ br $a_eos # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+__stxncpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ ldi t2, -1 # E :
|
|
+ xor a0, a1, t1 # E :
|
|
+ and a0, 7, t0 # E : find dest misalignment
|
|
+ nop # E :
|
|
+
|
|
+ srl t2, 1, t2 # U :
|
|
+ and t1, 7, t1 # E :
|
|
+ sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall)
|
|
+ nop # E :
|
|
+
|
|
+ addl a2, t0, a2 # E : bias count by dest misalignment
|
|
+ subl a2, 1, a2 # E : (stall)
|
|
+ and a2, 7, t2 # E : (stall)
|
|
+ ldi t10, 1 # E :
|
|
+
|
|
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
|
|
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
|
|
+ nop # E :
|
|
+ bne t1, $unaligned # U : (stall)
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+ ldl_u t1, 0(a1) # L : load first src word
|
|
+ addl a1, 8, a1 # E :
|
|
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+
|
|
+ br stxncpy_aligned # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 4
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, unmasked
|
|
+ t1 == the shifted low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # L : Latency=3 load second src word
|
|
+ addl a1, 8, a1 # E :
|
|
+ mask3b t0, a0, t0 # U : mask trailing garbage in dst
|
|
+ ext7b t2, a1, t4 # U : (3 cycle stall on t2)
|
|
+
|
|
+ or t1, t4, t1 # E : first aligned src word complete (stall)
|
|
+ mask7b t1, a0, t1 # U : mask leading garbage in src (stall)
|
|
+ or t0, t1, t0 # E : first output word complete (stall)
|
|
+ or t0, t6, t6 # E : mask original data for zero test (stall)
|
|
+
|
|
+ cmpgeb zero, t6, t7 # E :
|
|
+ beq a2, $u_eocfin # U :
|
|
+ ldi t6, -1 # E :
|
|
+ nop
|
|
+
|
|
+ bne t7, $u_final # U :
|
|
+ mask3b t6, a1, t6 # U : mask out bits already seen
|
|
+ stl_u t0, 0(a0) # L : store first output word
|
|
+ or t6, t2, t2 # E :
|
|
+
|
|
+ cmpgeb zero, t2, t7 # E : find nulls in second partial
|
|
+ addl a0, 8, a0 # E :
|
|
+ subl a2, 1, a2 # E :
|
|
+ bne t7, $u_late_head_exit # U :
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+ ext3b t2, a1, t1 # U : position hi-bits of lo word
|
|
+ beq a2, $u_eoc # U :
|
|
+ ldl_u t2, 8(a1) # L : read next high-order source word
|
|
+ addl a1, 8, a1 # E :
|
|
+
|
|
+ ext7b t2, a1, t0 # U : position lo-bits of hi word (stall)
|
|
+ cmpgeb zero, t2, t7 # E :
|
|
+ nop
|
|
+ bne t7, $u_eos # U :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 4
|
|
+$u_loop:
|
|
+ or t0, t1, t0 # E : current dst word now complete
|
|
+ subl a2, 1, a2 # E : decrement word count
|
|
+ ext3b t2, a1, t1 # U : extract high bits for next time
|
|
+ addl a0, 8, a0 # E :
|
|
+
|
|
+ stl_u t0, -8(a0) # L : save the current word
|
|
+ beq a2, $u_eoc # U :
|
|
+ ldl_u t2, 8(a1) # L : Latency=3 load high word for next time
|
|
+ addl a1, 8, a1 # E :
|
|
+
|
|
+ ext7b t2, a1, t0 # U : extract low bits (2 cycle stall)
|
|
+ cmpgeb zero, t2, t7 # E : test new word for eos
|
|
+ nop
|
|
+ beq t7, $u_loop # U :
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ or t0, t1, t0 # E : first (partial) source word complete
|
|
+ nop
|
|
+ cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall)
|
|
+ bne t7, $u_final # U : (stall)
|
|
+
|
|
+ stl_u t0, 0(a0) # L : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # E :
|
|
+ subl a2, 1, a2 # E :
|
|
+ nop
|
|
+
|
|
+$u_late_head_exit:
|
|
+ ext3b t2, a1, t0 # U :
|
|
+ cmpgeb zero, t0, t7 # E :
|
|
+ or t7, t10, t6 # E : (stall)
|
|
+ seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall)
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t0 == assembled source word
|
|
+ t7 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t7, t6 # E : isolate low bit set
|
|
+ and t6, t7, t8 # E : (stall)
|
|
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ ldl_u t1, 0(a0) # L :
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t6, t8, t7 # E : (stall)
|
|
+ zapnot t0, t7, t0 # U : kill source bytes > null
|
|
+
|
|
+ zap t1, t7, t1 # U : kill dest bytes <= null
|
|
+ or t0, t1, t0 # E : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t0, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+
|
|
+ /* Got to end-of-count before end of string.
|
|
+ On entry to this basic block:
|
|
+ t1 == the shifted high-order bits from the previous source word */
|
|
+$u_eoc:
|
|
+ and a1, 7, t6 # E :
|
|
+ sll t10, t6, t6 # U : (stall)
|
|
+ and t6, 0xff, t6 # E : (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ ldl_u t2, 8(a1) # L : load final src word
|
|
+ nop
|
|
+ ext7b t2, a1, t0 # U : extract low bits for last word (stall)
|
|
+ or t1, t0, t1 # E : (stall)
|
|
+
|
|
+1: cmpgeb zero, t1, t7 # E :
|
|
+ mov t1, t0
|
|
+
|
|
+$u_eocfin: # end-of-count, final word
|
|
+ or t10, t7, t7 # E :
|
|
+ br $u_final # L0 : Latency=3
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 4
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # L : load first source word
|
|
+ and a0, 7, t4 # E : find dest misalignment
|
|
+ and a1, 7, t5 # E : find src misalignment
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+ mov zero, t0 # E :
|
|
+
|
|
+ mov zero, t6 # E :
|
|
+ beq t4, 1f # U :
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ ldi t6, -1 # E :
|
|
+
|
|
+ mask3b t6, a0, t6 # U :
|
|
+ nop
|
|
+ nop
|
|
+1: subl a1, t4, a1 # E : sub dest misalignment from src addr
|
|
+
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # E :
|
|
+ ext3b t1, a1, t1 # U : shift src into place
|
|
+ ldi t2, -1 # E : for creating masks later
|
|
+ beq t8, $u_head # U : (stall)
|
|
+
|
|
+ mask7b t2, t5, t2 # U : begin src byte validity mask
|
|
+ cmpgeb zero, t1, t7 # E : is there a zero?
|
|
+ ext3b t2, a1, t2 # U :
|
|
+ or t7, t10, t5 # E : test for end-of-count too
|
|
+
|
|
+ cmpgeb zero, t2, t3 # E :
|
|
+ seleq a2, t5, t7, t7 # E : Latency=2, extra map slot
|
|
+ nop # E : keep with seleq
|
|
+ andnot t7, t3, t7 # E : (stall)
|
|
+
|
|
+ beq t7, $u_head # U :
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ negl t7, t6 # E : build bitmask of bytes <= zero
|
|
+ mask7b t1, t4, t1 # U :
|
|
+
|
|
+ and t6, t7, t8 # E :
|
|
+ subl t8, 1, t6 # E : (stall)
|
|
+ or t6, t8, t7 # E : (stall)
|
|
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
|
|
+
|
|
+ zapnot t1, t7, t1 # U : to source validity mask
|
|
+ andnot t0, t2, t0 # E : zero place for source to reside
|
|
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
|
|
+ stl_u t0, 0(a0) # L : (stall)
|
|
+
|
|
+ ret (t9) # L0 : Latency=3
|
|
+
|
|
+ cfi_endproc
|
|
diff --git a/sysdeps/sw_64/sw8a/memcpy.S b/sysdeps/sw_64/sw8a/memcpy.S
|
|
new file mode 100644
|
|
index 00000000..954a5fcf
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/memcpy.S
|
|
@@ -0,0 +1,320 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ sw6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/*
|
|
+ * Much of the information about 21264 scheduling/coding comes from:
|
|
+ * Compiler Writer's Guide for the Sw_64 21264
|
|
+ * abbreviated as 'CWG' in other comments here
|
|
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
|
|
+ * Scheduling notation:
|
|
+ * E - either cluster
|
|
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
|
|
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
|
|
+ *
|
|
+ * Temp usage notes:
|
|
+ * $0 - destination address
|
|
+ * $1,$2, - scratch
|
|
+ */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+ .align 4
|
|
+
|
|
+
|
|
+	.type $jumppointh,@object
|
|
+$jumppointh:
|
|
+ .gprel32 $both_0mod8
|
|
+ .gprel32 J$H01
|
|
+ .gprel32 J$H02
|
|
+ .gprel32 J$H03
|
|
+ .gprel32 J$H04
|
|
+ .gprel32 J$H05
|
|
+ .gprel32 J$H06
|
|
+ .gprel32 J$H07
|
|
+
|
|
+ENTRY(memcpy)
|
|
+#memcpy:
|
|
+ .prologue 1
|
|
+
|
|
+ ldgp $29, 0($27)
|
|
+
|
|
+ mov $16, $0 # E : copy dest to return
|
|
+# mov $16, $1
|
|
+ ble $18, $nomoredata # U : done with the copy?
|
|
+ cmplt $18, 8, $1
|
|
+ bne $1, $less_8
|
|
+
|
|
+ /* source and dest are same mod 8 address */
|
|
+ and $16, 7, $1 # E : Are both 0mod8?
|
|
+ beq $1, $both_0mod8 # U : Yes
|
|
+ nop # E :
|
|
+
|
|
+ /*
|
|
+ * source and dest are same misalignment. move a byte at a time
|
|
+ * until a 0mod8 alignment for both is reached.
|
|
+ * At least one byte more to move
|
|
+ */
|
|
+
|
|
+ ldi $2, 8
|
|
+ subl $2, $1, $1
|
|
+
|
|
+$head_align:
|
|
+ addl $16, $1, $16
|
|
+ addl $17, $1, $17
|
|
+ subl $18, $1, $18
|
|
+ ldih $2, $jumppointh($29) !gprelhigh
|
|
+ s4addl $1, $2, $2
|
|
+ ldw $2, $jumppointh($2) !gprellow
|
|
+ addl $2, $29, $2
|
|
+ jmp ($2)
|
|
+
|
|
+$both_0mod8:
|
|
+ cmple $18, 127, $1 # E : Can we unroll the loop?
|
|
+ bne $1, $no_unroll # U :
|
|
+
|
|
+
|
|
+$do_unroll:
|
|
+ ldih $1, 8($31) # big than 512K
|
|
+ cmple $18, $1, $1
|
|
+ beq $1, $unroll_body_512
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+$unroll_body:
|
|
+ ldl $6, 0($17) # L0 : bytes 0..7
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ stl $6, 0($16) # L : bytes 0..7
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ stl $4, 8($16) # L : bytes 8..15
|
|
+ stl $5, 16($16) # L : bytes 16..23
|
|
+ nop # E :
|
|
+
|
|
+ stl $3, 24($16) # L : bytes 24..31
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+	ldl $22, 32($17)	# L : bytes 32..39
|
|
+	ldl $23, 40($17)	# L : bytes 40..47
|
|
+ # fallback wh64 address if < 2 more trips
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+	ldl $24, 48($17)	# L : bytes 48..55
|
|
+	ldl $25, 56($17)	# L : bytes 56..63
|
|
+	stl $22, 32($16)	# L : bytes 32..39
|
|
+	stl $23, 40($16)	# L : bytes 40..47
|
|
+	stl $24, 48($16)	# L : bytes 48..55
|
|
+	stl $25, 56($16)	# L : bytes 56..63
|
|
+	addl $17, 64, $17	# E : src += 64 bytes
|
|
+	addl $16, 64, $16	# E : dest += 64
|
|
+ subl $18, 64, $18 # E : count -= 64
|
|
+
|
|
+
|
|
+ nop # E :
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+ beq $1, $unroll_body
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ br $tail_quads
|
|
+
|
|
+$unroll_body_512:
|
|
+# fillcs 128*4($17)
|
|
+ e_fillcs 128*20($17) #org
|
|
+
|
|
+ ldl $6, 0($17) # L0 : bytes 0..7
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ stl $6, 0($16) # L : bytes 0..7
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ stl $4, 8($16) # L : bytes 8..15
|
|
+ stl $5, 16($16) # L : bytes 16..23
|
|
+ nop # E :
|
|
+
|
|
+ stl $3, 24($16) # L : bytes 24..31
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+	ldl $22, 32($17)	# L : bytes 32..39
|
|
+	ldl $23, 40($17)	# L : bytes 40..47
|
|
+ # fallback wh64 address if < 2 more trips
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+	ldl $24, 48($17)	# L : bytes 48..55
|
|
+	ldl $25, 56($17)	# L : bytes 56..63
|
|
+	stl $22, 32($16)	# L : bytes 32..39
|
|
+	stl $23, 40($16)	# L : bytes 40..47
|
|
+	stl $24, 48($16)	# L : bytes 48..55
|
|
+	stl $25, 56($16)	# L : bytes 56..63
|
|
+	addl $17, 64, $17	# E : src += 64 bytes
|
|
+	addl $16, 64, $16	# E : dest += 64
|
|
+ subl $18, 64, $18 # E : count -= 64
|
|
+
|
|
+
|
|
+ nop # E :
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
|
+
|
|
+
|
|
+
|
|
+// e_fillcs 128*7($16)
|
|
+
|
|
+ nop # E :
|
|
+ beq $1, $unroll_body_512
|
|
+
|
|
+
|
|
+$tail_quads:
|
|
+$no_unroll:
|
|
+ .align 4
|
|
+ subl $18, 8, $18 # E : At least a quad left?
|
|
+ blt $18, $less_than_8 # U : Nope
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+$move_a_quad:
|
|
+ ldl $1, 0($17) # L : fetch 8
|
|
+ subl $18, 8, $18 # E : count -= 8
|
|
+ addl $17, 8, $17 # E : src += 8
|
|
+ nop # E :
|
|
+
|
|
+ stl $1, 0($16) # L : store 8
|
|
+ addl $16, 8, $16 # E : dest += 8
|
|
+ bge $18, $move_a_quad # U :
|
|
+ nop # E :
|
|
+
|
|
+$less_than_8:
|
|
+ .align 4
|
|
+ addl $18, 8, $18 # E : add back for trailing bytes
|
|
+ ble $18, $nomoredata # U : All-done
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+ /* Trailing bytes */
|
|
+$tail_bytes:
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ ldbu $1, 0($17) # L : fetch a byte
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ nop # E :
|
|
+
|
|
+ stb $1, 0($16) # L : store a byte
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ bgt $18, $tail_bytes # U : more to be done?
|
|
+ nop # E :
|
|
+
|
|
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+$less_8:
|
|
+ ldbu $1, 0($17) # L : fetch 1
|
|
+ subl $18, 1, $18 # E : count--
|
|
+ addl $17, 1, $17 # E : src++
|
|
+ nop # E :
|
|
+
|
|
+ stb $1, 0($16) # L : store
|
|
+ addl $16, 1, $16 # E : dest++
|
|
+ bgt $18, $less_8 # U : more to go?
|
|
+ nop
|
|
+
|
|
+$nomoredata:
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+
|
|
+J$H01:
|
|
+ ldbu $1,-1($17)
|
|
+ stb $1,-1($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+J$H02:
|
|
+ ldh $1,-2($17)
|
|
+ sth $1,-2($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+J$H03:
|
|
+ ldh $1,-2($17)
|
|
+ ldbu $2,-3($17)
|
|
+ sth $1,-2($16)
|
|
+ stb $2,-3($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+J$H04:
|
|
+ ldw $1,-4($17)
|
|
+ stw $1,-4($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+J$H05:
|
|
+ ldw $1,-4($17)
|
|
+ ldbu $2,-5($17)
|
|
+ stw $1,-4($16)
|
|
+ stb $2,-5($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+J$H06:
|
|
+ ldw $1,-4($17)
|
|
+ ldh $2,-6($17)
|
|
+ stw $1,-4($16)
|
|
+ sth $2,-6($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+J$H07:
|
|
+ ldw $1,-4($17)
|
|
+ ldh $2,-6($17)
|
|
+ ldbu $3,-7($17)
|
|
+ stw $1,-4($16)
|
|
+ sth $2,-6($16)
|
|
+ stb $3,-7($16)
|
|
+ br $both_0mod8
|
|
+
|
|
+END(memcpy)
|
|
+libc_hidden_builtin_def (memcpy)
|
|
+
|
|
+# .end memcpy
|
|
diff --git a/sysdeps/sw_64/sw8a/memmove.S b/sysdeps/sw_64/sw8a/memmove.S
|
|
new file mode 100644
|
|
index 00000000..22dda9ad
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/memmove.S
|
|
@@ -0,0 +1,1120 @@
|
|
+/* Copy memory to memory until the specified number of bytes
|
|
+ has been copied. Overlap is handled correctly.
|
|
+ Copyright (C) 1991-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Torbjorn Granlund (tege@sics.se).
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+/*
|
|
+ * Inputs:
|
|
+ * length in $18
|
|
+ * destination address in $16
|
|
+ * source address in $17
|
|
+ * return address in $26
|
|
+ *
|
|
+ * Outputs:
|
|
+ * bytes copied in $18
|
|
+ *
|
|
+ * Clobbers:
|
|
+ * $1,$2,$3,$4,$5,$6,$7,$8,$16,$17,$18,$24
|
|
+ * $f10,$f11,$f12,$f13,$f15,$f17,$f22,$f23,$f24,$f25,$f26,$f27,$f28,$f29
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Author:
|
|
+ * memmove simd version 1.0 (20190910) by Bao Zhaoling.
|
|
+ */
|
|
+
|
|
+
|
|
+#define __LABEL(x) x##:
|
|
+#define ENTRY(name) \
|
|
+ .globl name; \
|
|
+ .align 4; \
|
|
+ .ent name, 0; \
|
|
+ __LABEL(name) \
|
|
+ .frame sp, 0, ra
|
|
+#define END(sym) .end sym
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+	.type $jumppoint,@object
|
|
+$jumppoint:
|
|
+ .gprel32 $End
|
|
+ .gprel32 J$A01
|
|
+ .gprel32 J$A02
|
|
+ .gprel32 J$A03
|
|
+ .gprel32 J$A04
|
|
+ .gprel32 J$A05
|
|
+ .gprel32 J$A06
|
|
+ .gprel32 J$A07
|
|
+ .gprel32 J$A08
|
|
+ .gprel32 J$A09
|
|
+ .gprel32 J$A10
|
|
+ .gprel32 J$A11
|
|
+ .gprel32 J$A12
|
|
+ .gprel32 J$A13
|
|
+ .gprel32 J$A14
|
|
+ .gprel32 J$A15
|
|
+ .gprel32 J$A16
|
|
+ .gprel32 J$A17
|
|
+ .gprel32 J$A18
|
|
+ .gprel32 J$A19
|
|
+ .gprel32 J$A20
|
|
+ .gprel32 J$A21
|
|
+ .gprel32 J$A22
|
|
+ .gprel32 J$A23
|
|
+ .gprel32 J$A24
|
|
+ .gprel32 J$A25
|
|
+ .gprel32 J$A26
|
|
+ .gprel32 J$A27
|
|
+ .gprel32 J$A28
|
|
+ .gprel32 J$A29
|
|
+ .gprel32 J$A30
|
|
+ .gprel32 J$A31
|
|
+	.type $jumppointh,@object
|
|
+$jumppointh:
|
|
+ .gprel32 $Loopselect
|
|
+ .gprel32 J$H01
|
|
+ .gprel32 J$H02
|
|
+ .gprel32 J$H03
|
|
+ .gprel32 J$H04
|
|
+ .gprel32 J$H05
|
|
+ .gprel32 J$H06
|
|
+ .gprel32 J$H07
|
|
+ .gprel32 J$H08
|
|
+ .gprel32 J$H09
|
|
+ .gprel32 J$H10
|
|
+ .gprel32 J$H11
|
|
+ .gprel32 J$H12
|
|
+ .gprel32 J$H13
|
|
+ .gprel32 J$H14
|
|
+ .gprel32 J$H15
|
|
+ .gprel32 J$H16
|
|
+ .gprel32 J$H17
|
|
+ .gprel32 J$H18
|
|
+ .gprel32 J$H19
|
|
+ .gprel32 J$H20
|
|
+ .gprel32 J$H21
|
|
+ .gprel32 J$H22
|
|
+ .gprel32 J$H23
|
|
+ .gprel32 J$H24
|
|
+ .gprel32 J$H25
|
|
+ .gprel32 J$H26
|
|
+ .gprel32 J$H27
|
|
+ .gprel32 J$H28
|
|
+ .gprel32 J$H29
|
|
+ .gprel32 J$H30
|
|
+ .gprel32 J$H31
|
|
+	.type $jumppoint_o,@object
|
|
+$jumppoint_o:
|
|
+ .gprel32 $End
|
|
+ .gprel32 Jo$A01
|
|
+ .gprel32 Jo$A02
|
|
+ .gprel32 Jo$A03
|
|
+ .gprel32 Jo$A04
|
|
+ .gprel32 Jo$A05
|
|
+ .gprel32 Jo$A06
|
|
+ .gprel32 Jo$A07
|
|
+ .gprel32 Jo$A08
|
|
+ .gprel32 Jo$A09
|
|
+ .gprel32 Jo$A10
|
|
+ .gprel32 Jo$A11
|
|
+ .gprel32 Jo$A12
|
|
+ .gprel32 Jo$A13
|
|
+ .gprel32 Jo$A14
|
|
+ .gprel32 Jo$A15
|
|
+ .gprel32 Jo$A16
|
|
+ .gprel32 Jo$A17
|
|
+ .gprel32 Jo$A18
|
|
+ .gprel32 Jo$A19
|
|
+ .gprel32 Jo$A20
|
|
+ .gprel32 Jo$A21
|
|
+ .gprel32 Jo$A22
|
|
+ .gprel32 Jo$A23
|
|
+ .gprel32 Jo$A24
|
|
+ .gprel32 Jo$A25
|
|
+ .gprel32 Jo$A26
|
|
+ .gprel32 Jo$A27
|
|
+ .gprel32 Jo$A28
|
|
+ .gprel32 Jo$A29
|
|
+ .gprel32 Jo$A30
|
|
+ .gprel32 Jo$A31
|
|
+ENTRY(memmove)
|
|
+#memmove:
|
|
+ .prologue 1
|
|
+ ldgp $29, 0($27)
|
|
+ mov $16,$0
|
|
+ ldi $3,0($31)
|
|
+ ble $18,$End
|
|
+ cmple $16,$17,$2
|
|
+ beq $2,$L2
|
|
+
|
|
+$L1:
|
|
+# br $opp
|
|
+ call $at,memcpy
|
|
+ ret $31, ($26), 1
|
|
+$L2:
|
|
+ addl $16,$18,$16
|
|
+ addl $17,$18,$17
|
|
+ ldi $24,256($31)
|
|
+# subl $16,$17,$6
|
|
+# and $6,31,$6
|
|
+# bne $6,$Notaligned
|
|
+
|
|
+$Headalign:
|
|
+
|
|
+ and $16,7,$2
|
|
+ cmplt $18,$2,$6
|
|
+ bne $6,$Mvtail
|
|
+ subl $18,$2,$18
|
|
+ subl $16,$2,$16
|
|
+ subl $17,$2,$17
|
|
+ ldih $25, $jumppointh($29) !gprelhigh
|
|
+ s4addl $2,$25,$25
|
|
+ ldw $25, $jumppointh($25) !gprellow
|
|
+ addl $25,$29,$25
|
|
+ jmp ($25)
|
|
+
|
|
+$Loopselect:
|
|
+ cmple $18,255,$6
|
|
+ bne $6,$Endalign
|
|
+
|
|
+ and $17, 7, $6
|
|
+ bne $6, $beforeloop
|
|
+ and $17, 127, $6
|
|
+ beq $6, $beforeloop
|
|
+
|
|
+$align128:
|
|
+ subl $17,8,$17
|
|
+ subl $16,8,$16
|
|
+ ldl $1,0($17)
|
|
+ stl $1,0($16)
|
|
+ subl $18,8,$18
|
|
+ and $17,127,$6
|
|
+ bne $6,$align128
|
|
+ cmple $18,255,$6
|
|
+ bne $6,$Endalign
|
|
+
|
|
+$beforeloop:
|
|
+ ldi $6, 66060288
|
|
+ cmple $18, $6, $6
|
|
+ beq $6, $bigLoop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+
|
|
+$Loop:
|
|
+ subl $16,64,$16
|
|
+ subl $17,64,$17
|
|
+ fillcs -6*128($17)
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldl $4,24($17)
|
|
+ ldl $5,32($17)
|
|
+ ldl $6,40($17)
|
|
+ ldl $7,48($17)
|
|
+ ldl $8,56($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stl $4,24($16)
|
|
+ stl $5,32($16)
|
|
+ stl $6,40($16)
|
|
+ stl $7,48($16)
|
|
+ stl $8,56($16)
|
|
+ subl $18,64,$18
|
|
+ cmple $18,255,$6
|
|
+ beq $6,$Loop
|
|
+ br $Endalign
|
|
+
|
|
+$bigLoop:
|
|
+ subl $16,64,$16
|
|
+ subl $17,64,$17
|
|
+ fillcs -6*128($17)
|
|
+ e_fillcs -10*128($17)
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldl $4,24($17)
|
|
+ ldl $5,32($17)
|
|
+ ldl $6,40($17)
|
|
+ ldl $7,48($17)
|
|
+ ldl $8,56($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stl $4,24($16)
|
|
+ stl $5,32($16)
|
|
+ stl $6,40($16)
|
|
+ stl $7,48($16)
|
|
+ stl $8,56($16)
|
|
+ subl $18,64,$18
|
|
+ cmple $18,255,$6
|
|
+ beq $6,$bigLoop
|
|
+
|
|
+
|
|
+
|
|
+$Endalign:
|
|
+ cmplt $18,32,$6
|
|
+ beq $6,$Mvsimd
|
|
+ br $Alignedtail
|
|
+
|
|
+$Mvquad:
|
|
+ cmplt $18,8,$6
|
|
+ bne $6,$Mvgprel32
|
|
+ subl $17,8,$17
|
|
+ subl $16,8,$16
|
|
+ ldl $1,0($17)
|
|
+ stl $1,0($16)
|
|
+ subl $18,8,$18
|
|
+ br $Mvquad
|
|
+
|
|
+$Mvgprel32:
|
|
+ cmplt $18,4,$6
|
|
+ bne $6,$Mvhalf
|
|
+ subl $17,4,$17
|
|
+ subl $16,4,$16
|
|
+ ldw $1,0($17)
|
|
+ stw $1,0($16)
|
|
+ subl $18,4,$18
|
|
+ br $Mvgprel32
|
|
+
|
|
+$Mvhalf:
|
|
+ cmplt $18,2,$6
|
|
+ bne $6,$Mvbyte
|
|
+ subl $17,2,$17
|
|
+ subl $16,2,$16
|
|
+ ldh $1,0($17)
|
|
+ sth $1,0($16)
|
|
+ subl $18,2,$18
|
|
+ br $Mvhalf
|
|
+
|
|
+$Mvbyte:
|
|
+ beq $18,$End
|
|
+ subl $17,1,$17
|
|
+ subl $16,1,$16
|
|
+ ldbu $1,0($17)
|
|
+ stb $1,0($16)
|
|
+ subl $18,1,$18
|
|
+ br $Mvbyte
|
|
+
|
|
+$Mvsimd:
|
|
+ subl $17,8,$17
|
|
+ subl $16,8,$16
|
|
+
|
|
+ ldl $22, 0($17)
|
|
+
|
|
+ stl $22, 0($16)
|
|
+
|
|
+ subl $18,8,$18
|
|
+ cmplt $18,32,$6
|
|
+ beq $6,$Mvsimd
|
|
+
|
|
+
|
|
+
|
|
+$Alignedtail:
|
|
+	ldi $25,$jumppoint	# NOTE(review): looks dead -- $25 is immediately overwritten by the ldih below; confirm and drop
|
|
+ ldih $25, $jumppoint($29) !gprelhigh
|
|
+ s4addl $18,$25,$25
|
|
+ ldw $25, $jumppoint($25) !gprellow
|
|
+ addl $25,$29,$25
|
|
+ jmp ($25)
|
|
+
|
|
+
|
|
+$Mvtail:
|
|
+ and $2,7,$6
|
|
+ beq $6,$Alignedtail
|
|
+ and $2,3,$6
|
|
+ beq $6,$Mvgprel32
|
|
+ and $2,1,$6
|
|
+ beq $6,$Mvhalf
|
|
+ br $Mvbyte
|
|
+
|
|
+#################################################
|
|
+J$A01:
|
|
+ ldbu $1,-1($17)
|
|
+ stb $1,-1($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A02:
|
|
+ ldh $1,-2($17)
|
|
+ sth $1,-2($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A03:
|
|
+ ldh $1,-2($17)
|
|
+ ldbu $2,-3($17)
|
|
+ sth $1,-2($16)
|
|
+ stb $2,-3($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A04:
|
|
+ ldw $1,-4($17)
|
|
+ stw $1,-4($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A05:
|
|
+ ldw $1,-4($17)
|
|
+ ldbu $2,-5($17)
|
|
+ stw $1,-4($16)
|
|
+ stb $2,-5($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A06:
|
|
+ ldw $1,-4($17)
|
|
+ ldh $2,-6($17)
|
|
+ stw $1,-4($16)
|
|
+ sth $2,-6($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A07:
|
|
+ ldw $1,-4($17)
|
|
+ ldh $2,-6($17)
|
|
+ ldbu $3,-7($17)
|
|
+ stw $1,-4($16)
|
|
+ sth $2,-6($16)
|
|
+ stb $3,-7($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A08:
|
|
+ ldl $1,-8($17)
|
|
+ stl $1,-8($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A09:
|
|
+ ldl $1,-8($17)
|
|
+ ldbu $2,-9($17)
|
|
+ stl $1,-8($16)
|
|
+ stb $2,-9($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A10:
|
|
+ ldl $1,-8($17)
|
|
+ ldh $2,-10($17)
|
|
+ stl $1,-8($16)
|
|
+ sth $2,-10($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A11:
|
|
+ ldl $1,-8($17)
|
|
+ ldh $2,-10($17)
|
|
+ ldbu $3,-11($17)
|
|
+ stl $1,-8($16)
|
|
+ sth $2,-10($16)
|
|
+ stb $3,-11($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A12:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A13:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ ldbu $3,-13($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ stb $3,-13($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A14:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ ldh $3,-14($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ sth $3,-14($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A15:
|
|
+ ldl $1,-8($17)
|
|
+ ldw $2,-12($17)
|
|
+ ldh $3,-14($17)
|
|
+ ldbu $4,-15($17)
|
|
+ stl $1,-8($16)
|
|
+ stw $2,-12($16)
|
|
+ sth $3,-14($16)
|
|
+ stb $4,-15($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A16:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A17:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldbu $3,-17($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stb $3,-17($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A18:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldh $3,-18($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ sth $3,-18($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A19:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldh $3,-18($17)
|
|
+ ldbu $4,-19($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ sth $3,-18($16)
|
|
+ stb $4,-19($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A20:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A21:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ ldbu $4,-21($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ stb $4,-21($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A22:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ ldh $4,-22($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ sth $4,-22($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A23:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldw $3,-20($17)
|
|
+ ldh $4,-22($17)
|
|
+ ldbu $5,-23($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stw $3,-20($16)
|
|
+ sth $4,-22($16)
|
|
+ stb $5,-23($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A24:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A25:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldbu $4,-25($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stb $4,-25($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A26:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldh $4,-26($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ sth $4,-26($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A27:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldh $4,-26($17)
|
|
+ ldbu $5,-27($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ sth $4,-26($16)
|
|
+ stb $5,-27($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A28:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A29:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ ldbu $5,-29($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ stb $5,-29($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A30:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ ldh $5,-30($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ sth $5,-30($16)
|
|
+ ret $31, ($26), 1
|
|
+J$A31:
|
|
+ ldl $1,-8($17)
|
|
+ ldl $2,-16($17)
|
|
+ ldl $3,-24($17)
|
|
+ ldw $4,-28($17)
|
|
+ ldh $5,-30($17)
|
|
+ ldbu $6,-31($17)
|
|
+ stl $1,-8($16)
|
|
+ stl $2,-16($16)
|
|
+ stl $3,-24($16)
|
|
+ stw $4,-28($16)
|
|
+ sth $5,-30($16)
|
|
+ stb $6,-31($16)
|
|
+ ret $31, ($26), 1
|
|
+
|
|
+
|
|
+J$H01:
|
|
+ ldbu $1,0($17)
|
|
+ stb $1,0($16)
|
|
+ br $Loopselect
|
|
+J$H02:
|
|
+ ldh $1,0($17)
|
|
+ sth $1,0($16)
|
|
+ br $Loopselect
|
|
+J$H03:
|
|
+ ldh $1,0($17)
|
|
+ ldbu $2,2($17)
|
|
+ sth $1,0($16)
|
|
+ stb $2,2($16)
|
|
+ br $Loopselect
|
|
+J$H04:
|
|
+ ldw $1,0($17)
|
|
+ stw $1,0($16)
|
|
+ br $Loopselect
|
|
+J$H05:
|
|
+ ldw $1,0($17)
|
|
+ ldbu $2,4($17)
|
|
+ stw $1,0($16)
|
|
+ stb $2,4($16)
|
|
+ br $Loopselect
|
|
+J$H06:
|
|
+ ldw $1,0($17)
|
|
+ ldh $2,4($17)
|
|
+ stw $1,0($16)
|
|
+ sth $2,4($16)
|
|
+ br $Loopselect
|
|
+J$H07:
|
|
+ ldw $1,0($17)
|
|
+ ldh $2,4($17)
|
|
+ ldbu $3,6($17)
|
|
+ stw $1,0($16)
|
|
+ sth $2,4($16)
|
|
+ stb $3,6($16)
|
|
+ br $Loopselect
|
|
+J$H08:
|
|
+ ldl $1,0($17)
|
|
+ stl $1,0($16)
|
|
+ br $Loopselect
|
|
+J$H09:
|
|
+ ldl $1,0($17)
|
|
+ ldbu $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ stb $2,8($16)
|
|
+ br $Loopselect
|
|
+J$H10:
|
|
+ ldl $1,0($17)
|
|
+ ldh $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ sth $2,8($16)
|
|
+ br $Loopselect
|
|
+J$H11:
|
|
+ ldl $1,0($17)
|
|
+ ldh $2,8($17)
|
|
+ ldbu $3,10($17)
|
|
+ stl $1,0($16)
|
|
+ sth $2,8($16)
|
|
+ stb $3,10($16)
|
|
+ br $Loopselect
|
|
+J$H12:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ br $Loopselect
|
|
+J$H13:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ ldbu $3,12($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ stb $3,12($16)
|
|
+ br $Loopselect
|
|
+J$H14:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ ldh $3,12($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ sth $3,12($16)
|
|
+ br $Loopselect
|
|
+J$H15:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ ldh $3,12($17)
|
|
+ ldbu $4,14($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ sth $3,12($16)
|
|
+ stb $4,14($16)
|
|
+ br $Loopselect
|
|
+J$H16:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ br $Loopselect
|
|
+J$H17:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldbu $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stb $3,16($16)
|
|
+ br $Loopselect
|
|
+J$H18:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldh $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ sth $3,16($16)
|
|
+ br $Loopselect
|
|
+J$H19:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldh $3,16($17)
|
|
+ ldbu $4,18($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ sth $3,16($16)
|
|
+ stb $4,18($16)
|
|
+ br $Loopselect
|
|
+J$H20:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ br $Loopselect
|
|
+J$H21:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ ldbu $4,20($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ stb $4,20($16)
|
|
+ br $Loopselect
|
|
+J$H22:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ ldh $4,20($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ sth $4,20($16)
|
|
+ br $Loopselect
|
|
+J$H23:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ ldh $4,20($17)
|
|
+ ldbu $5,22($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ sth $4,20($16)
|
|
+ stb $5,22($16)
|
|
+ br $Loopselect
|
|
+J$H24:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ br $Loopselect
|
|
+J$H25:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldbu $4,24($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stb $4,24($16)
|
|
+ br $Loopselect
|
|
+J$H26:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldh $4,24($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ sth $4,24($16)
|
|
+ br $Loopselect
|
|
+J$H27:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldh $4,24($17)
|
|
+ ldbu $5,26($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ sth $4,24($16)
|
|
+ stb $5,26($16)
|
|
+ br $Loopselect
|
|
+J$H28:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ br $Loopselect
|
|
+J$H29:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ ldbu $5,28($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ stb $5,28($16)
|
|
+ br $Loopselect
|
|
+J$H30:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ ldh $5,28($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ sth $5,28($16)
|
|
+ br $Loopselect
|
|
+J$H31:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ ldh $5,28($17)
|
|
+ ldbu $6,30($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ sth $5,28($16)
|
|
+ stb $6,30($16)
|
|
+ br $Loopselect
|
|
+
|
|
+######################################
|
|
+Jo$A01:
|
|
+ ldbu $1,0($17)
|
|
+ stb $1,0($16)
|
|
+ br $End
|
|
+Jo$A02:
|
|
+
|
|
+ ldh $1,0($17)
|
|
+ sth $1,0($16)
|
|
+ br $End
|
|
+Jo$A03:
|
|
+ ldh $1,0($17)
|
|
+ ldbu $2,2($17)
|
|
+ sth $1,0($16)
|
|
+ stb $2,2($16)
|
|
+ br $End
|
|
+Jo$A04:
|
|
+ ldw $1,0($17)
|
|
+ stw $1,0($16)
|
|
+ br $End
|
|
+Jo$A05:
|
|
+ ldw $1,0($17)
|
|
+ ldbu $2,4($17)
|
|
+ stw $1,0($16)
|
|
+ stb $2,4($16)
|
|
+ br $End
|
|
+Jo$A06:
|
|
+ ldw $1,0($17)
|
|
+ ldh $2,4($17)
|
|
+ stw $1,0($16)
|
|
+ sth $2,4($16)
|
|
+ br $End
|
|
+Jo$A07:
|
|
+ ldw $1,0($17)
|
|
+ ldh $2,4($17)
|
|
+ ldbu $3,6($17)
|
|
+ stw $1,0($16)
|
|
+ sth $2,4($16)
|
|
+ stb $3,6($16)
|
|
+ br $End
|
|
+Jo$A08:
|
|
+ ldl $1,0($17)
|
|
+ stl $1,0($16)
|
|
+ br $End
|
|
+Jo$A09:
|
|
+ ldl $1,0($17)
|
|
+ ldbu $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ stb $2,8($16)
|
|
+ br $End
|
|
+Jo$A10:
|
|
+ ldl $1,0($17)
|
|
+ ldh $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ sth $2,8($16)
|
|
+ br $End
|
|
+Jo$A11:
|
|
+ ldl $1,0($17)
|
|
+ ldh $2,8($17)
|
|
+ ldbu $3,10($17)
|
|
+ stl $1,0($16)
|
|
+ sth $2,8($16)
|
|
+ stb $3,10($16)
|
|
+ br $End
|
|
+Jo$A12:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ br $End
|
|
+Jo$A13:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ ldbu $3,12($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ stb $3,12($16)
|
|
+ br $End
|
|
+Jo$A14:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ ldh $3,12($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ sth $3,12($16)
|
|
+ br $End
|
|
+Jo$A15:
|
|
+ ldl $1,0($17)
|
|
+ ldw $2,8($17)
|
|
+ ldh $3,12($17)
|
|
+ ldbu $4,14($17)
|
|
+ stl $1,0($16)
|
|
+ stw $2,8($16)
|
|
+ sth $3,12($16)
|
|
+ stb $4,14($16)
|
|
+ br $End
|
|
+Jo$A16:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ br $End
|
|
+Jo$A17:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldbu $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stb $3,16($16)
|
|
+ br $End
|
|
+Jo$A18:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldh $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ sth $3,16($16)
|
|
+ br $End
|
|
+Jo$A19:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldh $3,16($17)
|
|
+ ldbu $4,18($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ sth $3,16($16)
|
|
+ stb $4,18($16)
|
|
+ br $End
|
|
+Jo$A20:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ br $End
|
|
+Jo$A21:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ ldbu $4,20($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ stb $4,20($16)
|
|
+ br $End
|
|
+Jo$A22:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ ldh $4,20($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ sth $4,20($16)
|
|
+ br $End
|
|
+Jo$A23:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldw $3,16($17)
|
|
+ ldh $4,20($17)
|
|
+ ldbu $5,22($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stw $3,16($16)
|
|
+ sth $4,20($16)
|
|
+ stb $5,22($16)
|
|
+ br $End
|
|
+Jo$A24:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ br $End
|
|
+Jo$A25:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldbu $4,24($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stb $4,24($16)
|
|
+ br $End
|
|
+Jo$A26:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldh $4,24($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ sth $4,24($16)
|
|
+ br $End
|
|
+Jo$A27:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldh $4,24($17)
|
|
+ ldbu $5,26($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ sth $4,24($16)
|
|
+ stb $5,26($16)
|
|
+ br $End
|
|
+Jo$A28:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ br $End
|
|
+Jo$A29:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ ldbu $5,28($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ stb $5,28($16)
|
|
+ br $End
|
|
+Jo$A30:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ ldh $5,28($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ sth $5,28($16)
|
|
+ br $End
|
|
+Jo$A31:
|
|
+ ldl $1,0($17)
|
|
+ ldl $2,8($17)
|
|
+ ldl $3,16($17)
|
|
+ ldw $4,24($17)
|
|
+ ldh $5,28($17)
|
|
+ ldbu $6,30($17)
|
|
+ stl $1,0($16)
|
|
+ stl $2,8($16)
|
|
+ stl $3,16($16)
|
|
+ stw $4,24($16)
|
|
+ sth $5,28($16)
|
|
+ stb $6,30($16)
|
|
+ br $End
|
|
+$End:
|
|
+ ret $31, ($26), 1
|
|
+END(memmove)
|
|
+libc_hidden_builtin_def (memmove)
|
|
+
|
|
+ #.end memmove
|
|
diff --git a/sysdeps/sw_64/sw8a/memset.S b/sysdeps/sw_64/sw8a/memset.S
|
|
new file mode 100644
|
|
index 00000000..eb47b2c4
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/memset.S
|
|
@@ -0,0 +1,332 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+# .arch sw6b
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ENTRY(memset)
|
|
+#memset:
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * Serious stalling happens. The only way to mitigate this is to
|
|
+ * undertake a major re-write to interleave the constant materialization
|
|
+ * with other parts of the fall-through code. This is important, even
|
|
+ * though it makes maintenance tougher.
|
|
+ * Do this later.
|
|
+ */
|
|
+# mov $17, $22
|
|
+ and $17, 255, $1 # E : 00000000000000ch
|
|
+ ins0b $17, 1, $2 # U : 000000000000ch00
|
|
+ mov $16, $0 # E : return value
|
|
+ ble $18, $end # U : zero length requested?
|
|
+
|
|
+ addl $18, $16, $6 # E : max address to write to
|
|
+ or $1, $2, $17 # E : 000000000000chch
|
|
+ ins0b $1, 2, $3 # U : 0000000000ch0000
|
|
+ ins0b $1, 3, $4 # U : 00000000ch000000
|
|
+
|
|
+ or $3, $4, $3 # E : 00000000chch0000
|
|
+ ins1b $17, 4, $5 # U : 0000chch00000000
|
|
+ xor $16, $6, $1 # E : will complete write be within one quadword?
|
|
+ ins1b $17, 6, $2 # U : chch000000000000
|
|
+
|
|
+ or $17, $3, $17 # E : 00000000chchchch
|
|
+ or $2, $5, $2 # E : chchchch00000000
|
|
+ bic $1, 7, $1 # E : fit within a single quadword?
|
|
+ and $16, 7, $3 # E : Target addr misalignment
|
|
+
|
|
+ or $17, $2, $17 # E : chchchchchchchch
|
|
+
|
|
+
|
|
+ mov $16, $5
|
|
+ beq $1, $within_quad # U :
|
|
+ nop # E :
|
|
+ beq $3, $aligned # U : target is 0mod8
|
|
+
|
|
+
|
|
+ /*
|
|
+ * Target address is misaligned, and won't fit within a quadword.
|
|
+ */
|
|
+
|
|
+
|
|
+ ldi $2, 8
|
|
+ subl $2, $3, $3
|
|
+
|
|
+
|
|
+
|
|
+$misaligned:
|
|
+ stb $17, 0($16)
|
|
+ subl $18, 1, $18
|
|
+ addl $16, 1, $16
|
|
+ subl $3, 1, $3
|
|
+ bne $3, $misaligned
|
|
+
|
|
+
|
|
+
|
|
+$aligned:
|
|
+ /*
|
|
+ * We are now guaranteed to be quad aligned, with at least
|
|
+ * one partial quad to write.
|
|
+ */
|
|
+
|
|
+ sra $18, 3, $3 # U : Number of remaining quads to write
|
|
+ and $18, 7, $18 # E : Number of trailing bytes to write
|
|
+ mov $16, $5 # E : Save dest address
|
|
+ beq $3, $no_quad # U : tail stuff only
|
|
+
|
|
+ /*
|
|
+ * It's worth the effort to unroll this and use wh64 if possible.
|
|
+ * At this point, entry values are:
|
|
+ * $16 Current destination address
|
|
+ * $5 A copy of $16
|
|
+ * $6 The max quadword address to write to
|
|
+ * $18 Number trailer bytes
|
|
+ * $3 Number quads to write
|
|
+ */
|
|
+# and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
|
|
+ and $16, 0x1f, $2 # E : Forward work (only useful for unrolled loop)
|
|
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
|
|
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
|
|
+ blt $4, $loop # U :
|
|
+
|
|
+ /*
|
|
+ * We know we've got at least 16 quads, minimum of one trip
|
|
+ * through unrolled loop. Do a quad at a time to get us 0mod64
|
|
+ * aligned.
|
|
+ */
|
|
+
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+ nop # E :
|
|
+# beq $1, $bigalign # U :
|
|
+ beq $2, $do_wh64 # U :
|
|
+$alignmod32:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : For consistency later
|
|
+ addl $1, 8, $1 # E : Increment towards zero for alignment
|
|
+# addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ blt $1, $alignmod32 # U :
|
|
+
|
|
+
|
|
+
|
|
+ /*
|
|
+ * $3 - number quads left to go
|
|
+ * $5 - target address (aligned 0mod64)
|
|
+ * $17 - mask of stuff to store
|
|
+ * Scratch registers available: $7, $2, $4, $1
|
|
+ * We know that we'll be taking a minimum of one trip through.
|
|
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
|
|
+ * Assumes the wh64 needs to be for 2 trips through the loop in the future.
|
|
+ * The wh64 is issued on for the starting destination address for trip +2
|
|
+ * through the loop, and if there are less than two trips left, the target
|
|
+ * address will be for the current trip.
|
|
+ */
|
|
+
|
|
+$do_wh64:
|
|
+# wh64 ($4) # L1 : memory subsystem write hint
|
|
+ subl $3, 24, $2 # E : For determining future wh64 addresses
|
|
+ stl $17, 0($5) # L :
|
|
+ nop # E :
|
|
+
|
|
+# addl $5, 128, $4 # E : speculative target of next wh64
|
|
+ stl $17, 8($5) # L :
|
|
+ stl $17, 16($5) # L :
|
|
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
|
|
+
|
|
+ stl $17, 24($5) # L :
|
|
+ stl $17, 32($5) # L :
|
|
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
|
|
+ nop
|
|
+
|
|
+ stl $17, 40($5) # L :
|
|
+ stl $17, 48($5) # L :
|
|
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
|
|
+ nop
|
|
+
|
|
+ stl $17, 56($5) # L :
|
|
+ addl $5, 64, $5 # E :
|
|
+ subl $3, 8, $3 # E :
|
|
+ bge $2, $do_wh64 # U :
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ beq $3, $no_quad # U : Might have finished already
|
|
+
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ .align 4
|
|
+ /*
|
|
+ * Simple loop for trailing quadwords, or for small amounts
|
|
+ * of data (where we can't use an unrolled loop and wh64)
|
|
+ */
|
|
+$loop:
|
|
+ stl $17, 0($5) # L :
|
|
+ subl $3, 1, $3 # E : Decrement number quads left
|
|
+ addl $5, 8, $5 # E : Inc address
|
|
+ bne $3, $loop # U : more?
|
|
+
|
|
+#$no_quad:
|
|
+ /*
|
|
+ * Write 0..7 trailing bytes.
|
|
+ */
|
|
+# nop # E :
|
|
+# beq $18, $end # U : All done?
|
|
+# ldl $7, 0($5) # L :
|
|
+# mask7b $7, $6, $2 # U : Mask final quad
|
|
+#
|
|
+# ins7b $17, $6, $4 # U : New bits
|
|
+# or $2, $4, $1 # E : Put it all together
|
|
+# stl $1, 0($5) # L : And back to memory
|
|
+# ret $31,($26),1 # L0 :
|
|
+
|
|
+# nop # E :
|
|
+# beq $18, $end # U : All done?
|
|
+# stb $22, 0($5)
|
|
+# subl $18, 1, $18 # E : Decrement number quads left
|
|
+# addl $5, 1, $5 # E : Inc address
|
|
+# bne $18, $no_quad # U : more?
|
|
+# ret $31, ($26), 1 # L0 :
|
|
+
|
|
+
|
|
+# nop # E :
|
|
+# beq $18, $end # U : All done?
|
|
+# cmpeq $18, 1, $6
|
|
+# bne $6, J$H01
|
|
+# cmpeq $18, 2, $6
|
|
+# bne $6, J$H02
|
|
+# cmpeq $18, 3, $6
|
|
+# bne $6, J$H03
|
|
+# cmpeq $18, 4, $6
|
|
+# bne $6, J$H04
|
|
+# cmpeq $18, 5, $6
|
|
+# bne $6, J$H05
|
|
+# cmpeq $18, 6, $6
|
|
+# bne $6, J$H06
|
|
+# br J$H07
|
|
+
|
|
+
|
|
+$no_quad:
|
|
+ /*
|
|
+ * Write 0..7 trailing bytes.
|
|
+ */
|
|
+ nop # E :
|
|
+ beq $18, $end # U : All done?
|
|
+
|
|
+
|
|
+$within_quad1:
|
|
+
|
|
+ stb $17, 0($5)
|
|
+ subl $18, 1, $18
|
|
+ addl $5, 1, $5
|
|
+ bne $18, $within_quad1
|
|
+
|
|
+/* ldl $7, 0($5) # L :
|
|
+ mask7b $7, $6, $2 # U : Mask final quad
|
|
+
|
|
+ ins7b $17, $6, $4 # U : New bits
|
|
+ or $2, $4, $1 # E : Put it all together
|
|
+ stl $1, 0($5) # L : And back to memory
|
|
+*/
|
|
+
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+
|
|
+$within_quad:
|
|
+
|
|
+ stb $17, 0($16)
|
|
+ subl $18, 1, $18
|
|
+ addl $16, 1, $16
|
|
+ bne $18, $within_quad
|
|
+
|
|
+
|
|
+$end:
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+ ret $31,($26),1 # L0 :
|
|
+
|
|
+J$H01:
|
|
+# ldbu $1, 0($22)
|
|
+ stb $22, 0($5)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+
|
|
+J$H02:
|
|
+ ldh $1, 0($17)
|
|
+ sth $1, 0($16)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+J$H03:
|
|
+ ldh $1, 0($17)
|
|
+ ldbu $2, 2($17)
|
|
+ sth $1, 0($16)
|
|
+ stb $2, 2($16)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+J$H04:
|
|
+ ldw $1, 0($17)
|
|
+ stw $1, 0($16)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+J$H05:
|
|
+ ldw $1, 0($17)
|
|
+ ldbu $2, 4($17)
|
|
+ stw $1, 0($16)
|
|
+ stb $2, 4($16)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+J$H06:
|
|
+ ldw $1, 0($17)
|
|
+ ldh $2, 4($17)
|
|
+ stw $1, 0($16)
|
|
+ sth $2, 4($16)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+J$H07:
|
|
+ ldw $1, 0($17)
|
|
+ ldh $2, 4($17)
|
|
+ ldbu $3, 6($17)
|
|
+ stw $1, 0($16)
|
|
+ sth $2, 4($16)
|
|
+ stb $3, 6($16)
|
|
+ ret $31, ($26), 1 # L0 :
|
|
+
|
|
+
|
|
+ END(memset)
|
|
+libc_hidden_builtin_def (memset)
|
|
+# .end memset
|
|
diff --git a/sysdeps/sw_64/sw8a/strcat.S b/sysdeps/sw_64/sw8a/strcat.S
|
|
new file mode 100644
|
|
index 00000000..02037980
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/strcat.S
|
|
@@ -0,0 +1,669 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Append a null-terminated string from SRC to DST. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strcat)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ .set noat
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .set at
|
|
+#endif
|
|
+ .prologue 1
|
|
+ # we know the return reg is v0 and v1
|
|
+
|
|
+ mov a0, v0 # set up return value
|
|
+
|
|
+ /* Find the end of the string. */
|
|
+
|
|
+ ldl_u t0, 0(a0) # load first quadword (a0 may be misaligned)
|
|
+ # so we must use ldl_u; note the extra cost of the unaligned load path
|
|
+ ldi t1, -1(zero)
|
|
+ ins7b t1, a0, t1
|
|
+ andnot a0, 7, a0
|
|
+ or t1, t0, t0
|
|
+ cmpgeb zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
|
|
+ bne t1, $found #
|
|
+
|
|
+#cp string
|
|
+$loop:
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ #bne t1,$found
|
|
+ beq t1, $loop
|
|
+
|
|
+$found:
|
|
+ cttz t1,t2
|
|
+ addl a0, t2, a0
|
|
+
|
|
+ /* Now do the append. */
|
|
+ /*which means copy char in the s1 tail*/
|
|
+#a0 = DST a1=src
|
|
+ cfi_startproc
|
|
+ #cfi_return_column (t9)
|
|
+
|
|
+
|
|
+ .align 3
|
|
+__stxcpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ xor a0, a1, t0 # e0 :
|
|
+ unop # :
|
|
+ and t0, 7, t0 # e0 :
|
|
+ bne t0, $unaligned # .. e1 :
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+ ldl_u t1, 0(a1) # e0 : load first src word
|
|
+ and a0, 7, t0 # .. e1 : take care not to load a word ...
|
|
+ addl a1, 8, a1 # e0 :
|
|
+ beq t0, stxcpy_aligned # .. e1 : ... if we wont need it
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ br stxcpy_aligned # .. e1 :
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 3
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, for masking back in, if needed else 0
|
|
+ t1 == the low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+
|
|
+ ext3b t1, a1, t1 # e0 :
|
|
+ ext7b t2, a1, t4 # e0 :
|
|
+ mask3b t0, a0, t0 # e0 :
|
|
+ or t1, t4, t1 # .. e1 :
|
|
+ mask7b t1, a0, t1 # e0 :
|
|
+ or t0, t1, t1 # e1 :
|
|
+
|
|
+ or t1, t6, t6 # e0 :
|
|
+ cmpgeb zero, t6, t7 # .. e1 :
|
|
+ ldi t6, -1 # e0 : for masking just below
|
|
+ bne t7, $u_final # .. e1 :
|
|
+
|
|
+ mask3b t6, a1, t6 # e0 : mask out the bits we have
|
|
+ or t6, t2, t2 # e1 : already extracted before
|
|
+ cmpgeb zero, t2, t7 # e0 : testing eos
|
|
+ bne t7, $u_late_head_exit # .. e1 (zdb)
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 : store first output word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t0 # e0 : position ho-bits of lo word
|
|
+ ldl_u t2, 8(a1) # .. e1 : read next high-order source word
|
|
+ addl a1, 8, a1 # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 :
|
|
+ nop # e0 :
|
|
+ bne t7, $u_eos # .. e1 :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ ext7b t2, a1, t1 # e0 : extract high bits for current word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext3b t2, a1, t3 # e0 : extract low bits for next time
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ or t0, t1, t1 # e0 : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # .. e1 : load high word for next time
|
|
+ stl_u t1, -8(a0) # e0 : save the current word
|
|
+ mov t3, t0 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : test new word for eos
|
|
+ #bne t7, $u_eos
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ beq t7, $u_loop # .. e1 :
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ ext7b t2, a1, t1 # e0 :
|
|
+ or t0, t1, t1 # e1 : first (partial) source word complete
|
|
+
|
|
+ cmpgeb zero, t1, t7 # e0 : is the null in this first bit?
|
|
+ bne t7, $u_final # .. e1 (zdb)
|
|
+
|
|
+$u_late_head_exit:
|
|
+ stl_u t1, 0(a0) # e0 : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 :
|
|
+ cmpgeb zero, t1, t7 # .. e1 :
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t1 == assembled source word
|
|
+ t7 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t7, t6 # e0 : isolate low bit set
|
|
+ and t6, t7, t8 # e1 :
|
|
+
|
|
+ and t8, 0x80, t6 # e0 : avoid dest word load if we can
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ or t6, t8, t7 # e0 :
|
|
+ zapnot t1, t6, t1 # .. e1 : kill source bytes >= null
|
|
+ zap t0, t7, t0 # e0 : kill dest bytes <= null
|
|
+ or t0, t1, t1 # e1 :
|
|
+
|
|
+1: stl_u t1, 0(a0) # e0 :
|
|
+ ret
|
|
+ #ret (t9) # .. e1 :
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 3
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # e0 : load first source word
|
|
+
|
|
+ and a0, 7, t4 # .. e1 : find dest misalignment
|
|
+ and a1, 7, t5 # e0 : find src misalignment
|
|
+
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+
|
|
+ mov zero, t0 # .. e1 :
|
|
+ mov zero, t6 # e0 :
|
|
+ beq t4, 1f # .. e1 :
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ ldi t6, -1 # .. e1 :
|
|
+ mask3b t6, a0, t6 # e0 :
|
|
+1:
|
|
+ subl a1, t4, a1 # .. e1 : sub dest misalignment from src addr
|
|
+
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # e0 :
|
|
+ beq t8, $u_head # .. e1 (zdb)
|
|
+
|
|
+ ldi t2, -1 # e1 : mask out leading garbage in source
|
|
+ mask7b t2, t5, t2 # e0 :
|
|
+ nop # e0 :
|
|
+ ornot t1, t2, t3 # .. e1 :
|
|
+ cmpgeb zero, t3, t7 # e0 : is there a zero?
|
|
+ beq t7, $u_head # .. e1 (zdb)
|
|
+
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+
|
|
+ negl t7, t6 # .. e1 : build bitmask of bytes <= zero
|
|
+ and t6, t7, t8 # e0 :
|
|
+ and a1, 7, t5 # .. e1 :
|
|
+ subl t8, 1, t6 # e0 :
|
|
+ or t6, t8, t7 # e1 :
|
|
+ srl t8, t5, t8 # e0 : adjust final null return value
|
|
+
|
|
+ zapnot t2, t7, t2 # .. e1 : prepare source word; mirror changes
|
|
+ and t1, t2, t1 # e1 : to source validity mask
|
|
+ ext3b t2, a1, t2 # .. e0 :
|
|
+ ext3b t1, a1, t1 # e0 :
|
|
+
|
|
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside
|
|
+ or t0, t1, t1 # e1 : and put it there
|
|
+ stl_u t1, 0(a0) # .. e0 :
|
|
+ ret
|
|
+ #ret (t9)
|
|
+
|
|
+ cfi_endproc
|
|
+
|
|
+.align 3
|
|
+stxcpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # e1 : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # e0 : detection in the src word
|
|
+ mask7b t1, a1, t3 # e0 :
|
|
+ ornot t1, t2, t2 # .. e1 :
|
|
+ mask3b t0, a1, t0 # e0 : assemble the first output word
|
|
+ cmpgeb zero, t2, t7 # .. e1 : bits set iff null found
|
|
+ or t0, t3, t1 # e0 :
|
|
+ bne t7, $a_eos # .. e1 :
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == a source word not containing a null. */
|
|
+$a_loop:
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t1, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t1, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ cmpgeb zero, t1, t7 # e0 (stall)
|
|
+# bne t7, $a_eos
|
|
+ beq t7, $a_loop # .. e1 (zdb)
|
|
+
|
|
+ /* Take care of the final (partial) word store.
|
|
+ On entry to this basic block we have:
|
|
+ t1 == the source word containing the null
|
|
+ t7 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t7, t6 # e0 : find low bit set
|
|
+ and t7, t6, t8 # e1 (stall)
|
|
+
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # e0 :
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ zapnot t1, t6, t1 # e0 : clear src bytes >= null
|
|
+ or t8, t6, t7 # .. e1 :
|
|
+ zap t0, t7, t0 # e0 : clear dst bytes <= null
|
|
+ or t0, t1, t1 # e1 :
|
|
+
|
|
+1: stl_u t1, 0(a0) # e0 :
|
|
+ ret
|
|
+ #ret (t9) # .. e1 :
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+# mov ra, t9
|
|
+# jmp $31, __stxcpy
|
|
+
|
|
+ END(strcat)
|
|
+libc_hidden_builtin_def (strcat)
|
|
diff --git a/sysdeps/sw_64/sw8a/strlen.S b/sysdeps/sw_64/sw8a/strlen.S
|
|
new file mode 100644
|
|
index 00000000..8f3d53db
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/strlen.S
|
|
@@ -0,0 +1,112 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ Contributed by David Mosberger (davidm@cs.arizona.edu).
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Finds length of a 0-terminated string. Optimized for the Sw_64
|
|
+ architecture:
|
|
+
|
|
+ - memory accessed as aligned quadwords only
|
|
+ - uses cmpgeb to compare 8 bytes in parallel
|
|
+ - does binary search to find 0 byte in last quadword (HAKMEM
|
|
+ needed 12 instructions to do this instead of the 8 instructions
|
|
+ that the binary search needs).
|
|
+*/
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .set noreorder
|
|
+ .set noat
|
|
+ENTRY(strlen)
|
|
+#ifdef PROF
|
|
+ ldgp gp, 0(pv)
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ ldl_u $1, 0($16) # load first quadword ($16 may be misaligned)
|
|
+ ldi $2, -1($31)
|
|
+ ins7b $2, $16, $2
|
|
+ andnot $16, 7, $0
|
|
+ or $2, $1, $1
|
|
+ nop # dual issue the next two on sw_64
|
|
+ cmpgeb $31, $1, $2 # $2 <- bitmask: bit i == 1 <==> i-th byte == 0
|
|
+ bne $2, $found
|
|
+
|
|
+$loop: ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ bne $2, $found
|
|
+
|
|
+ ldl $1, 8($0)
|
|
+ cmpgeb $31, $1, $2
|
|
+ addl $0, 8, $0 # addr += 8
|
|
+ beq $2, $loop
|
|
+
|
|
+$found:
|
|
+ cttz $2, $3
|
|
+ addl $0, $3, $0
|
|
+ subl $0, $16, $0
|
|
+ /*negl $2, $3 # clear all but least set bit
|
|
+ and $2, $3, $2
|
|
+
|
|
+ and $2, 0xf0, $3 # binary search for that set bit
|
|
+ and $2, 0xcc, $4
|
|
+ and $2, 0xaa, $5
|
|
+ selne $3, 4, $3, $3
|
|
+ selne $4, 2, $4, $4
|
|
+ selne $5, 1, $5, $5
|
|
+ addl $3, $4, $3
|
|
+ addl $0, $5, $0
|
|
+ addl $0, $3, $0
|
|
+ nop
|
|
+
|
|
+ subl $0, $16, $0*/
|
|
+ ret
|
|
+END(strlen)
|
|
+libc_hidden_builtin_def (strlen)
|
|
diff --git a/sysdeps/sw_64/sw8a/strncat.S b/sysdeps/sw_64/sw8a/strncat.S
|
|
new file mode 100644
|
|
index 00000000..18893dc4
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/strncat.S
|
|
@@ -0,0 +1,829 @@
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Append no more than COUNT characters from the null-terminated string SRC
|
|
+ to the null-terminated string DST. Always null-terminate the new DST. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .text
|
|
+
|
|
+ENTRY(strncat)
|
|
+ ldgp gp, 0(pv)
|
|
+#ifdef PROF
|
|
+ .set noat
|
|
+ ldi AT, _mcount
|
|
+ call AT, (AT), _mcount
|
|
+ .set at
|
|
+#endif
|
|
+ .prologue 1
|
|
+
|
|
+ mov a0, v0 # set up return value
|
|
+ beq a2, $zerocount
|
|
+
|
|
+ /* Find the end of the string. */
|
|
+
|
|
+ ldl_u t0, 0(a0) # load first quadword (a0 may be misaligned)
|
|
+ ldi t1, -1(zero)
|
|
+ ins7b t1, a0, t1
|
|
+ andnot a0, 7, a0
|
|
+ or t1, t0, t0
|
|
+ cmpgeb zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
|
|
+ bne t1, $found
|
|
+
|
|
+$loop:
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+ bne t1,$found
|
|
+
|
|
+
|
|
+ ldl t0, 8(a0)
|
|
+ addl a0, 8, a0 # addr += 8
|
|
+ cmpgeb zero, t0, t1
|
|
+# bne t1,$found
|
|
+
|
|
+
|
|
+ beq t1, $loop
|
|
+
|
|
+
|
|
+$found:
|
|
+ cttz t1,t2
|
|
+ addl a0, t2, a0
|
|
+
|
|
+ /* Now do the append. */
|
|
+
|
|
+# call t9, __stxncpy
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+
|
|
+
|
|
+ .align 3
|
|
+#__stxncpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ ldi t2, -1
|
|
+ xor a0, a1, t1
|
|
+ srl t2, 1, t2
|
|
+ and a0, 7, t0 # find dest misalignment
|
|
+ sellt a2, t2, a2, a2 # bound neg count to LONG_MAX
|
|
+ and t1, 7, t1
|
|
+ addl a2, t0, a2 # bias count by dest misalignment
|
|
+ subl a2, 1, a2
|
|
+ and a2, 7, t2
|
|
+ srl a2, 3, a2 # a2 = loop counter = (count - 1)/8
|
|
+ addl zero, 1, t10
|
|
+ sll t10, t2, t10 # t10 = bitmask of last count byte
|
|
+ bne t1, $unaligned
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+
|
|
+ ldl_u t1, 0(a1) # e0 : load first src word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+
|
|
+ beq t0, stxncpy_aligned # avoid loading dest word if not needed
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ br stxncpy_aligned # .. e1 :
|
|
+
|
|
+
|
|
+
|
|
+ .align 3
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, unmasked
|
|
+ t1 == the shifted low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 : load second src word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ mask3b t0, a0, t0 # e0 : mask trailing garbage in dst
|
|
+ ext7b t2, a1, t4 # e0 :
|
|
+ or t1, t4, t1 # e1 : first aligned src word complete
|
|
+ mask7b t1, a0, t1 # e0 : mask leading garbage in src
|
|
+ or t0, t1, t0 # e0 : first output word complete
|
|
+ or t0, t6, t6 # e1 : mask original data for zero test
|
|
+ cmpgeb zero, t6, t7 # e0 :
|
|
+ beq a2, $u_eocfin # .. e1 :
|
|
+ ldi t6, -1 # e0 :
|
|
+ bne t7, $u_final # .. e1 :
|
|
+
|
|
+ mask3b t6, a1, t6 # e0 : mask out bits already seen
|
|
+ nop # .. e1 :
|
|
+ stl_u t0, 0(a0) # e0 : store first output word
|
|
+ or t6, t2, t2 # .. e1 :
|
|
+ cmpgeb zero, t2, t7 # e0 : find nulls in second partial
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ bne t7, $u_late_head_exit # .. e1 :
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+
|
|
+ ext3b t2, a1, t1 # e0 : position hi-bits of lo word
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : read next high-order source word
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ ext7b t2, a1, t0 # e0 : position lo-bits of hi word
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ nop # e0 :
|
|
+ bne t7, $u_eos # .. e1 :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+ bne t7, $u_eos
|
|
+
|
|
+ or t0, t1, t0 # e0 : current dst word now complete
|
|
+ subl a2, 1, a2 # .. e1 : decrement word count
|
|
+ stl_u t0, 0(a0) # e0 : save the current word
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ext3b t2, a1, t1 # e0 : extract high bits for next time
|
|
+ beq a2, $u_eoc # .. e1 :
|
|
+ ldl_u t2, 8(a1) # e0 : load high word for next time
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ nop # e0 :
|
|
+ cmpgeb zero, t2, t7 # .. e1 : test new word for eos
|
|
+ ext7b t2, a1, t0 # e0 : extract low bits for current word
|
|
+# bne t7, $u_eos
|
|
+
|
|
+ beq t7, $u_loop # .. e1 :
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ or t0, t1, t0 # e0 : first (partial) source word complete
|
|
+ cmpgeb zero, t0, t7 # e0 : is the null in this first bit?
|
|
+ bne t7, $u_final # .. e1 (zdb)
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+
|
|
+$u_late_head_exit:
|
|
+ ext3b t2, a1, t0 # e0 :
|
|
+ cmpgeb zero, t0, t7 # e0 :
|
|
+ or t7, t10, t6 # e1 :
|
|
+ seleq a2, t6, t7, t7 # e0 :
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t0 == assembled source word
|
|
+ t7 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t7, t6 # e0 : isolate low bit set
|
|
+ and t6, t7, t8 # e1 :
|
|
+
|
|
+ and t8, 0x80, t6 # e0 : avoid dest word load if we can
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ ldl_u t1, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ or t6, t8, t7 # e0 :
|
|
+ zapnot t0, t7, t0 # .. e1 : kill source bytes > null
|
|
+ zap t1, t7, t1 # e0 : kill dest bytes <= null
|
|
+ or t0, t1, t0 # e1 :
|
|
+
|
|
+1: stl_u t0, 0(a0) # e0 :
|
|
+ #ret
|
|
+ br $process_tail
|
|
+ #ret (t9) # .. e1 :
|
|
+
|
|
+ /* Got to end-of-count before end of string.
|
|
+ On entry to this basic block:
|
|
+ t1 == the shifted high-order bits from the previous source word */
|
|
+$u_eoc:
|
|
+ and a1, 7, t6 # e1 :
|
|
+ sll t10, t6, t6 # e0 :
|
|
+ and t6, 0xff, t6 # e0 :
|
|
+ bne t6, 1f # e1 : avoid src word load if we can
|
|
+
|
|
+ ldl_u t2, 8(a1) # e0 : load final src word
|
|
+ nop # .. e1 :
|
|
+ ext7b t2, a1, t0 # e0 : extract high bits for last word
|
|
+ or t1, t0, t1 # e1 :
|
|
+
|
|
+1: cmpgeb zero, t1, t7
|
|
+ mov t1, t0
|
|
+
|
|
+$u_eocfin: # end-of-count, final word
|
|
+ or t10, t7, t7
|
|
+ br $u_final
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 3
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # e0 : load first source word
|
|
+
|
|
+ and a0, 7, t4 # .. e1 : find dest misalignment
|
|
+ and a1, 7, t5 # e0 : find src misalignment
|
|
+
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+
|
|
+ mov zero, t0 # .. e1 :
|
|
+ mov zero, t6 # e0 :
|
|
+ beq t4, 1f # .. e1 :
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ ldi t6, -1 # .. e1 :
|
|
+ mask3b t6, a0, t6 # e0 :
|
|
+1:
|
|
+ subl a1, t4, a1 # .. e1 : sub dest misalignment from src addr
|
|
+
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # e1 :
|
|
+ ext3b t1, a1, t1 # .. e0 : shift src into place
|
|
+ ldi t2, -1 # e0 : for creating masks later
|
|
+ beq t8, $u_head # e1 :
|
|
+
|
|
+ mask7b t2, t5, t2 # e0 : begin src byte validity mask
|
|
+ cmpgeb zero, t1, t7 # .. e1 : is there a zero?
|
|
+ ext3b t2, a1, t2 # e0 :
|
|
+ or t7, t10, t5 # .. e1 : test for end-of-count too
|
|
+ cmpgeb zero, t2, t3 # e0 :
|
|
+ seleq a2, t5, t7, t7 # .. e1 :
|
|
+ andnot t7, t3, t7 # e0 :
|
|
+ beq t7, $u_head # .. e1 (zdb)
|
|
+
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+
|
|
+ ldl_u t0, 0(a0) # e0 :
|
|
+ negl t7, t6 # .. e1 : build bitmask of bytes <= zero
|
|
+ mask7b t1, t4, t1 # e0 :
|
|
+ and t6, t7, t8 # .. e1 :
|
|
+ subl t8, 1, t6 # e0 :
|
|
+ or t6, t8, t7 # e1 :
|
|
+
|
|
+ zapnot t2, t7, t2 # e0 : prepare source word; mirror changes
|
|
+ zapnot t1, t7, t1 # .. e1 : to source validity mask
|
|
+
|
|
+ andnot t0, t2, t0 # e0 : zero place for source to reside
|
|
+ or t0, t1, t0 # e1 : and put it there
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ #ret (t9) # .. e1 :
|
|
+# ret
|
|
+ br $process_tail
|
|
+
|
|
+
|
|
+ .align 3
|
|
+stxncpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # e1 : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # e0 : detection in the src word
|
|
+ mask7b t1, a1, t3 # e0 :
|
|
+ ornot t1, t2, t2 # .. e1 :
|
|
+ mask3b t0, a1, t0 # e0 : assemble the first output word
|
|
+ cmpgeb zero, t2, t7 # .. e1 : bits set iff null found
|
|
+ or t0, t3, t0 # e0 :
|
|
+ beq a2, $a_eoc # .. e1 :
|
|
+ bne t7, $a_eos # .. e1 :
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == a source word not containing a null. */
|
|
+$a_loop:
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ bne t7, $a_eos
|
|
+
|
|
+ stl_u t0, 0(a0) # e0 :
|
|
+ addl a0, 8, a0 # .. e1 :
|
|
+ ldl_u t0, 0(a1) # e0 :
|
|
+ addl a1, 8, a1 # .. e1 :
|
|
+ subl a2, 1, a2 # e0 :
|
|
+ cmpgeb zero, t0, t7 # .. e1 (stall)
|
|
+ beq a2, $a_eoc # e1 :
|
|
+ #bne t7, $a_eos
|
|
+ beq t7, $a_loop # e1 :
|
|
+
|
|
+ /* Take care of the final (partial) word store. At this point
|
|
+ the end-of-count bit is set in t7 iff it applies.
|
|
+
|
|
+ On entry to this basic block we have:
|
|
+ t0 == the source word containing the null
|
|
+ t7 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t7, t8 # e0 : find low bit set
|
|
+ and t7, t8, t8 # e1 (stall)
|
|
+
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # e0 :
|
|
+ bne t6, 1f # .. e1 (zdb)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t1, 0(a0) # e0 :
|
|
+ subl t8, 1, t6 # .. e1 :
|
|
+ or t8, t6, t7 # e0 :
|
|
+ unop #
|
|
+ zapnot t0, t7, t0 # e0 : clear src bytes > null
|
|
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
|
|
+ or t0, t1, t0 # e1 :
|
|
+
|
|
+1: stl_u t0, 0(a0) # e0 :
|
|
+ #ret (t9) # e1 :
|
|
+ #ret
|
|
+ br $process_tail
|
|
+
|
|
+ /* Add the end-of-count bit to the eos detection bitmask. */
|
|
+$a_eoc:
|
|
+ or t10, t7, t7
|
|
+ br $a_eos
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+# cfi_endproc
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+$process_tail:
|
|
+ /* Worry about the null termination. */
|
|
+
|
|
+ zapnot t0, t8, t1 # was last byte a null?
|
|
+ bne t1, 0f
|
|
+ ret
|
|
+
|
|
+0: and t10, 0x80, t1
|
|
+ bne t1, 1f
|
|
+
|
|
+ /* Here there are bytes left in the current word. Clear one. */
|
|
+ addl t10, t10, t10 # end-of-count bit <<= 1
|
|
+ zap t0, t10, t0
|
|
+ stl_u t0, 0(a0)
|
|
+ ret
|
|
+
|
|
+1: /* Here we must read the next DST word and clear the first byte. */
|
|
+ ldl_u t0, 8(a0)
|
|
+ zap t0, 1, t0
|
|
+ stl_u t0, 8(a0)
|
|
+
|
|
+$zerocount:
|
|
+ ret
|
|
+
|
|
+ END(strncat)
|
|
diff --git a/sysdeps/sw_64/sw8a/stxcpy.S b/sysdeps/sw_64/sw8a/stxcpy.S
|
|
new file mode 100644
|
|
index 00000000..cf07eb8e
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/stxcpy.S
|
|
@@ -0,0 +1,314 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+   SW6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy a null-terminated string from SRC to DST.
|
|
+
|
|
+ This is an internal routine used by strcpy, stpcpy, and strcat.
|
|
+ As such, it uses special linkage conventions to make implementation
|
|
+ of these public functions more efficient.
|
|
+
|
|
+ On input:
|
|
+ t9 = return address
|
|
+ a0 = DST
|
|
+ a1 = SRC
|
|
+
|
|
+ On output:
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
|
+ a0 = unaligned address of the last *word* written
|
|
+
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
|
+*/
|
|
+
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch ev6
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+ .type __stxcpy, @function
|
|
+ .globl __stxcpy
|
|
+ .usepv __stxcpy, no
|
|
+
|
|
+ cfi_startproc
|
|
+ cfi_return_column (t9)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word. */
|
|
+ .align 4
|
|
+stxcpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # E : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
|
|
+ mask7b t1, a1, t3 # U :
|
|
+ ornot t1, t2, t2 # E : (stall)
|
|
+
|
|
+ mask3b t0, a1, t0 # U : assemble the first output word
|
|
+ cmpgeb zero, t2, t10 # E : bits set iff null found
|
|
+ or t0, t3, t1 # E : (stall)
|
|
+ bne t10, $a_eos # U : (stall)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == a source word not containing a null. */
|
|
+ /* Nops here to separate store quads from load quads */
|
|
+
|
|
+$a_loop:
|
|
+ stl_u t1, 0(a0) # L :
|
|
+ addl a0, 8, a0 # E :
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ ldl_u t1, 0(a1) # L : Latency=3
|
|
+ addl a1, 8, a1 # E :
|
|
+ cmpgeb zero, t1, t10 # E : (3 cycle stall)
|
|
+ beq t10, $a_loop # U : (stall for t10)
|
|
+
|
|
+ /* Take care of the final (partial) word store.
|
|
+ On entry to this basic block we have:
|
|
+ t1 == the source word containing the null
|
|
+ t10 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t10, t6 # E : find low bit set
|
|
+ and t10, t6, t8 # E : (stall)
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # E : (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t0, 0(a0) # L : Latency=3
|
|
+ subl t8, 1, t6 # E :
|
|
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
|
|
+ or t8, t6, t10 # E : (stall)
|
|
+
|
|
+ zap t0, t10, t0 # E : clear dst bytes <= null
|
|
+ or t0, t1, t1 # E : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t1, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+__stxcpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ xor a0, a1, t0 # E :
|
|
+ unop # E :
|
|
+ and t0, 7, t0 # E : (stall)
|
|
+ bne t0, $unaligned # U : (stall)
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+ ldl_u t1, 0(a1) # L : load first src word
|
|
+ and a0, 7, t0 # E : take care not to load a word ...
|
|
+ addl a1, 8, a1 # E :
|
|
+	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)
|
|
+
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ br stxcpy_aligned # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 4
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, for masking back in, if needed else 0
|
|
+ t1 == the low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # L :
|
|
+ addl a1, 8, a1 # E :
|
|
+ ext3b t1, a1, t1 # U : (stall on a1)
|
|
+ ext7b t2, a1, t4 # U : (stall on a1)
|
|
+
|
|
+ mask3b t0, a0, t0 # U :
|
|
+ or t1, t4, t1 # E :
|
|
+ mask7b t1, a0, t1 # U : (stall on t1)
|
|
+ or t0, t1, t1 # E : (stall on t1)
|
|
+
|
|
+ or t1, t6, t6 # E :
|
|
+ cmpgeb zero, t6, t10 # E : (stall)
|
|
+ ldi t6, -1 # E : for masking just below
|
|
+ bne t10, $u_final # U : (stall)
|
|
+
|
|
+ mask3b t6, a1, t6 # U : mask out the bits we have
|
|
+ or t6, t2, t2 # E : already extracted before (stall)
|
|
+ cmpgeb zero, t2, t10 # E : testing eos (stall)
|
|
+ bne t10, $u_late_head_exit # U : (stall)
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+
|
|
+ stl_u t1, 0(a0) # L : store first output word
|
|
+ addl a0, 8, a0 # E :
|
|
+ ext3b t2, a1, t0 # U : position ho-bits of lo word
|
|
+ ldl_u t2, 8(a1) # U : read next high-order source word
|
|
+
|
|
+ addl a1, 8, a1 # E :
|
|
+ cmpgeb zero, t2, t10 # E : (stall for t2)
|
|
+ nop # E :
|
|
+ bne t10, $u_eos # U : (stall)
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 3
|
|
+$u_loop:
|
|
+ ext7b t2, a1, t1 # U : extract high bits for current word
|
|
+ addl a1, 8, a1 # E : (stall)
|
|
+ ext3b t2, a1, t3 # U : extract low bits for next time (stall)
|
|
+ addl a0, 8, a0 # E :
|
|
+
|
|
+ or t0, t1, t1 # E : current dst word now complete
|
|
+ ldl_u t2, 0(a1) # L : Latency=3 load high word for next time
|
|
+ stl_u t1, -8(a0) # L : save the current word (stall)
|
|
+ mov t3, t0 # E :
|
|
+
|
|
+ cmpgeb zero, t2, t10 # E : test new word for eos
|
|
+ beq t10, $u_loop # U : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ ext7b t2, a1, t1 # U :
|
|
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
|
|
+ cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall)
|
|
+ bne t10, $u_final # U : (stall)
|
|
+
|
|
+$u_late_head_exit:
|
|
+ stl_u t1, 0(a0) # L : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # E :
|
|
+ ext3b t2, a1, t1 # U :
|
|
+ cmpgeb zero, t1, t10 # E : (stall)
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t1 == assembled source word
|
|
+ t10 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t10, t6 # E : isolate low bit set
|
|
+ and t6, t10, t8 # E : (stall)
|
|
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ ldl_u t0, 0(a0) # E :
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t6, t8, t10 # E : (stall)
|
|
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
|
|
+
|
|
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
|
|
+ or t0, t1, t1 # E : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t1, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 4
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # L : load first source word
|
|
+ and a0, 7, t4 # E : find dest misalignment
|
|
+ and a1, 7, t5 # E : find src misalignment
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+ mov zero, t0 # E :
|
|
+
|
|
+ mov zero, t6 # E :
|
|
+ beq t4, 1f # U :
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ ldi t6, -1 # E :
|
|
+
|
|
+ mask3b t6, a0, t6 # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+1:
|
|
+ subl a1, t4, a1 # E : sub dest misalignment from src addr
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+ cmplt t4, t5, t8 # E :
|
|
+ beq t8, $u_head # U :
|
|
+ ldi t2, -1 # E : mask out leading garbage in source
|
|
+
|
|
+ mask7b t2, t5, t2 # U :
|
|
+ ornot t1, t2, t3 # E : (stall)
|
|
+ cmpgeb zero, t3, t10 # E : is there a zero? (stall)
|
|
+ beq t10, $u_head # U : (stall)
|
|
+
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ negl t10, t6 # E : build bitmask of bytes <= zero
|
|
+ and t6, t10, t8 # E : (stall)
|
|
+ and a1, 7, t5 # E :
|
|
+
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t6, t8, t10 # E : (stall)
|
|
+ srl t8, t5, t8 # U : adjust final null return value
|
|
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
|
|
+
|
|
+ and t1, t2, t1 # E : to source validity mask
|
|
+ ext3b t2, a1, t2 # U :
|
|
+ ext3b t1, a1, t1 # U : (stall)
|
|
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
|
|
+
|
|
+ or t0, t1, t1 # e1 : and put it there
|
|
+ stl_u t1, 0(a0) # .. e0 : (stall)
|
|
+ ret (t9) # e1 :
|
|
+
|
|
+ cfi_endproc
|
|
diff --git a/sysdeps/sw_64/sw8a/stxncpy.S b/sysdeps/sw_64/sw8a/stxncpy.S
|
|
new file mode 100644
|
|
index 00000000..c47029ea
|
|
--- /dev/null
|
|
+++ b/sysdeps/sw_64/sw8a/stxncpy.S
|
|
@@ -0,0 +1,392 @@
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
|
+   SW6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* Copy no more than COUNT bytes of the null-terminated string from
|
|
+ SRC to DST.
|
|
+
|
|
+ This is an internal routine used by strncpy, stpncpy, and strncat.
|
|
+ As such, it uses special linkage conventions to make implementation
|
|
+ of these public functions more efficient.
|
|
+
|
|
+ On input:
|
|
+ t9 = return address
|
|
+ a0 = DST
|
|
+ a1 = SRC
|
|
+ a2 = COUNT
|
|
+
|
|
+ Furthermore, COUNT may not be zero.
|
|
+
|
|
+ On output:
|
|
+ t0 = last word written
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
|
+ t10 = bitmask (with one bit set) indicating the byte position of
|
|
+ the end of the range specified by COUNT
|
|
+ a0 = unaligned address of the last *word* written
|
|
+ a2 = the number of full words left in COUNT
|
|
+
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
|
+*/
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+ .arch ev6
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+ .text
|
|
+ .type __stxncpy, @function
|
|
+ .globl __stxncpy
|
|
+ .usepv __stxncpy, no
|
|
+
|
|
+ cfi_startproc
|
|
+ cfi_return_column (t9)
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == the first destination word for masking back in
|
|
+ t1 == the first source word. */
|
|
+ .align 4
|
|
+stxncpy_aligned:
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
|
+ ldi t2, -1 # E : build a mask against false zero
|
|
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
|
|
+ mask7b t1, a1, t3 # U :
|
|
+ ornot t1, t2, t2 # E : (stall)
|
|
+
|
|
+ mask3b t0, a1, t0 # U : assemble the first output word
|
|
+ cmpgeb zero, t2, t7 # E : bits set iff null found
|
|
+ or t0, t3, t0 # E : (stall)
|
|
+ beq a2, $a_eoc # U :
|
|
+
|
|
+ bne t7, $a_eos # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* On entry to this basic block:
|
|
+ t0 == a source word not containing a null. */
|
|
+
|
|
+ /*
|
|
+ * nops here to:
|
|
+ * separate store quads from load quads
|
|
+ * limit of 1 bcond/quad to permit training
|
|
+ */
|
|
+$a_loop:
|
|
+ stl_u t0, 0(a0) # L :
|
|
+ addl a0, 8, a0 # E :
|
|
+ subl a2, 1, a2 # E :
|
|
+ nop
|
|
+
|
|
+ ldl_u t0, 0(a1) # L :
|
|
+ addl a1, 8, a1 # E :
|
|
+ cmpgeb zero, t0, t7 # E :
|
|
+ beq a2, $a_eoc # U :
|
|
+
|
|
+ beq t7, $a_loop # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* Take care of the final (partial) word store. At this point
|
|
+ the end-of-count bit is set in t7 iff it applies.
|
|
+
|
|
+ On entry to this basic block we have:
|
|
+ t0 == the source word containing the null
|
|
+ t7 == the cmpgeb mask that found it. */
|
|
+$a_eos:
|
|
+ negl t7, t8 # E : find low bit set
|
|
+ and t7, t8, t8 # E : (stall)
|
|
+ /* For the sake of the cache, don't read a destination word
|
|
+ if we're not going to need it. */
|
|
+ and t8, 0x80, t6 # E : (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ /* We're doing a partial word store and so need to combine
|
|
+ our source and original destination words. */
|
|
+ ldl_u t1, 0(a0) # L :
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t8, t6, t7 # E : (stall)
|
|
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
|
|
+
|
|
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
|
|
+ or t0, t1, t0 # e1 : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t0, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ /* Add the end-of-count bit to the eos detection bitmask. */
|
|
+$a_eoc:
|
|
+ or t10, t7, t7 # E :
|
|
+ br $a_eos # L0 : Latency=3
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+ .align 4
|
|
+__stxncpy:
|
|
+ /* Are source and destination co-aligned? */
|
|
+ ldi t2, -1 # E :
|
|
+ xor a0, a1, t1 # E :
|
|
+ and a0, 7, t0 # E : find dest misalignment
|
|
+ nop # E :
|
|
+
|
|
+ srl t2, 1, t2 # U :
|
|
+ and t1, 7, t1 # E :
|
|
+ sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall)
|
|
+ nop # E :
|
|
+
|
|
+ addl a2, t0, a2 # E : bias count by dest misalignment
|
|
+ subl a2, 1, a2 # E : (stall)
|
|
+ and a2, 7, t2 # E : (stall)
|
|
+ ldi t10, 1 # E :
|
|
+
|
|
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
|
|
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
|
|
+ nop # E :
|
|
+ bne t1, $unaligned # U : (stall)
|
|
+
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
|
+ ldl_u t1, 0(a1) # L : load first src word
|
|
+ addl a1, 8, a1 # E :
|
|
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+
|
|
+ br stxncpy_aligned # U :
|
|
+ nop
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+
|
|
+
|
|
+/* The source and destination are not co-aligned. Align the destination
|
|
+ and cope. We have to be very careful about not reading too much and
|
|
+ causing a SEGV. */
|
|
+
|
|
+ .align 4
|
|
+$u_head:
|
|
+ /* We know just enough now to be able to assemble the first
|
|
+ full source word. We can still find a zero at the end of it
|
|
+ that prevents us from outputting the whole thing.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the first dest word, unmasked
|
|
+ t1 == the shifted low bits of the first source word
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
|
+
|
|
+ ldl_u t2, 8(a1) # L : Latency=3 load second src word
|
|
+ addl a1, 8, a1 # E :
|
|
+ mask3b t0, a0, t0 # U : mask trailing garbage in dst
|
|
+ ext7b t2, a1, t4 # U : (3 cycle stall on t2)
|
|
+
|
|
+ or t1, t4, t1 # E : first aligned src word complete (stall)
|
|
+ mask7b t1, a0, t1 # U : mask leading garbage in src (stall)
|
|
+ or t0, t1, t0 # E : first output word complete (stall)
|
|
+ or t0, t6, t6 # E : mask original data for zero test (stall)
|
|
+
|
|
+ cmpgeb zero, t6, t7 # E :
|
|
+ beq a2, $u_eocfin # U :
|
|
+ ldi t6, -1 # E :
|
|
+ nop
|
|
+
|
|
+ bne t7, $u_final # U :
|
|
+ mask3b t6, a1, t6 # U : mask out bits already seen
|
|
+ stl_u t0, 0(a0) # L : store first output word
|
|
+ or t6, t2, t2 # E :
|
|
+
|
|
+ cmpgeb zero, t2, t7 # E : find nulls in second partial
|
|
+ addl a0, 8, a0 # E :
|
|
+ subl a2, 1, a2 # E :
|
|
+ bne t7, $u_late_head_exit # U :
|
|
+
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
|
+ of and we can set up to enter the main loop. */
|
|
+ ext3b t2, a1, t1 # U : position hi-bits of lo word
|
|
+ beq a2, $u_eoc # U :
|
|
+ ldl_u t2, 8(a1) # L : read next high-order source word
|
|
+ addl a1, 8, a1 # E :
|
|
+
|
|
+ ext7b t2, a1, t0 # U : position lo-bits of hi word (stall)
|
|
+ cmpgeb zero, t2, t7 # E :
|
|
+ nop
|
|
+ bne t7, $u_eos # U :
|
|
+
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
|
+ the loop is structured to detect zeros in aligned source words.
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
|
+ iteration out into the head and half into the tail, but it does
|
|
+ prevent nastiness from accumulating in the very thing we want
|
|
+ to run as fast as possible.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word
|
|
+
|
|
+ We further know that t2 does not contain a null terminator. */
|
|
+
|
|
+ .align 4
|
|
+$u_loop:
|
|
+ or t0, t1, t0 # E : current dst word now complete
|
|
+ subl a2, 1, a2 # E : decrement word count
|
|
+ ext3b t2, a1, t1 # U : extract high bits for next time
|
|
+ addl a0, 8, a0 # E :
|
|
+
|
|
+ stl_u t0, -8(a0) # L : save the current word
|
|
+ beq a2, $u_eoc # U :
|
|
+ ldl_u t2, 8(a1) # L : Latency=3 load high word for next time
|
|
+ addl a1, 8, a1 # E :
|
|
+
|
|
+ ext7b t2, a1, t0 # U : extract low bits (2 cycle stall)
|
|
+ cmpgeb zero, t2, t7 # E : test new word for eos
|
|
+ nop
|
|
+ beq t7, $u_loop # U :
|
|
+
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
|
+ If it resides in the lower half, we have one (probably partial)
|
|
+ word to write out, and if it resides in the upper half, we
|
|
+ have one full and one partial word left to write out.
|
|
+
|
|
+ On entry to this basic block:
|
|
+ t0 == the shifted low-order bits from the current source word
|
|
+ t1 == the shifted high-order bits from the previous source word
|
|
+ t2 == the unshifted current source word. */
|
|
+$u_eos:
|
|
+ or t0, t1, t0 # E : first (partial) source word complete
|
|
+ nop
|
|
+ cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall)
|
|
+ bne t7, $u_final # U : (stall)
|
|
+
|
|
+ stl_u t0, 0(a0) # L : the null was in the high-order bits
|
|
+ addl a0, 8, a0 # E :
|
|
+ subl a2, 1, a2 # E :
|
|
+ nop
|
|
+
|
|
+$u_late_head_exit:
|
|
+ ext3b t2, a1, t0 # U :
|
|
+ cmpgeb zero, t0, t7 # E :
|
|
+ or t7, t10, t6 # E : (stall)
|
|
+ seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall)
|
|
+
|
|
+ /* Take care of a final (probably partial) result word.
|
|
+ On entry to this basic block:
|
|
+ t0 == assembled source word
|
|
+ t7 == cmpgeb mask that found the null. */
|
|
+$u_final:
|
|
+ negl t7, t6 # E : isolate low bit set
|
|
+ and t6, t7, t8 # E : (stall)
|
|
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ ldl_u t1, 0(a0) # L :
|
|
+ subl t8, 1, t6 # E :
|
|
+ or t6, t8, t7 # E : (stall)
|
|
+ zapnot t0, t7, t0 # U : kill source bytes > null
|
|
+
|
|
+ zap t1, t7, t1 # U : kill dest bytes <= null
|
|
+ or t0, t1, t0 # E : (stall)
|
|
+ nop
|
|
+ nop
|
|
+
|
|
+1: stl_u t0, 0(a0) # L :
|
|
+ ret (t9) # L0 : Latency=3
|
|
+
|
|
+ /* Got to end-of-count before end of string.
|
|
+ On entry to this basic block:
|
|
+ t1 == the shifted high-order bits from the previous source word */
|
|
+$u_eoc:
|
|
+ and a1, 7, t6 # E :
|
|
+ sll t10, t6, t6 # U : (stall)
|
|
+ and t6, 0xff, t6 # E : (stall)
|
|
+ bne t6, 1f # U : (stall)
|
|
+
|
|
+ ldl_u t2, 8(a1) # L : load final src word
|
|
+ nop
|
|
+ ext7b t2, a1, t0 # U : extract low bits for last word (stall)
|
|
+ or t1, t0, t1 # E : (stall)
|
|
+
|
|
+1: cmpgeb zero, t1, t7 # E :
|
|
+ mov t1, t0
|
|
+
|
|
+$u_eocfin: # end-of-count, final word
|
|
+ or t10, t7, t7 # E :
|
|
+ br $u_final # L0 : Latency=3
|
|
+
|
|
+ /* Unaligned copy entry point. */
|
|
+ .align 4
|
|
+$unaligned:
|
|
+
|
|
+ ldl_u t1, 0(a1) # L : load first source word
|
|
+ and a0, 7, t4 # E : find dest misalignment
|
|
+ and a1, 7, t5 # E : find src misalignment
|
|
+ /* Conditionally load the first destination word and a bytemask
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
|
+ mov zero, t0 # E :
|
|
+
|
|
+ mov zero, t6 # E :
|
|
+ beq t4, 1f # U :
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ ldi t6, -1 # E :
|
|
+
|
|
+ mask3b t6, a0, t6 # U :
|
|
+ nop
|
|
+ nop
|
|
+1: subl a1, t4, a1 # E : sub dest misalignment from src addr
|
|
+
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
|
+ extra startup checks to avoid SEGV. */
|
|
+
|
|
+ cmplt t4, t5, t8 # E :
|
|
+ ext3b t1, a1, t1 # U : shift src into place
|
|
+ ldi t2, -1 # E : for creating masks later
|
|
+ beq t8, $u_head # U : (stall)
|
|
+
|
|
+ mask7b t2, t5, t2 # U : begin src byte validity mask
|
|
+ cmpgeb zero, t1, t7 # E : is there a zero?
|
|
+ ext3b t2, a1, t2 # U :
|
|
+ or t7, t10, t5 # E : test for end-of-count too
|
|
+
|
|
+ cmpgeb zero, t2, t3 # E :
|
|
+ seleq a2, t5, t7, t7 # E : Latency=2, extra map slot
|
|
+ nop # E : keep with seleq
|
|
+ andnot t7, t3, t7 # E : (stall)
|
|
+
|
|
+ beq t7, $u_head # U :
|
|
+ /* At this point we've found a zero in the first partial word of
|
|
+ the source. We need to isolate the valid source data and mask
|
|
+ it into the original destination data. (Incidentally, we know
|
|
+ that we'll need at least one byte of that original dest word.) */
|
|
+ ldl_u t0, 0(a0) # L :
|
|
+ negl t7, t6 # E : build bitmask of bytes <= zero
|
|
+ mask7b t1, t4, t1 # U :
|
|
+
|
|
+ and t6, t7, t8 # E :
|
|
+ subl t8, 1, t6 # E : (stall)
|
|
+ or t6, t8, t7 # E : (stall)
|
|
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
|
|
+
|
|
+ zapnot t1, t7, t1 # U : to source validity mask
|
|
+ andnot t0, t2, t0 # E : zero place for source to reside
|
|
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
|
|
+ stl_u t0, 0(a0) # L : (stall)
|
|
+
|
|
+ ret (t9) # L0 : Latency=3
|
|
+
|
|
+ cfi_endproc
|
|
--
|
|
2.25.1
|
|
|