diff --git a/Makefile b/Makefile index fc021a9..c33edd9 100644 --- a/Makefile +++ b/Makefile @@ -158,18 +158,18 @@ tests : shared ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS - $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + $(MAKE) -C test all endif endif ifneq ($(ONLY_CBLAS), 1) - $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + #$(MAKE) -C utest all endif ifneq ($(NO_CBLAS), 1) ifneq ($(ONLY_CBLAS), 1) - $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + $(MAKE) -C ctest all endif ifeq ($(CPP_THREAD_SAFETY_TEST), 1) - $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + $(MAKE) -C cpp_thread_test all endif endif diff --git a/Makefile.sw_64 b/Makefile.sw_64 new file mode 100644 index 0000000..b4542ce --- /dev/null +++ b/Makefile.sw_64 @@ -0,0 +1,35 @@ +CPP = $(CC) -E +RANLIB = ranlib + +ifeq ($(LIBSUBARCH), SW6) +LIBNAME = $(LIBPREFIX)_sw6.a +LIBNAME_P = $(LIBPREFIX)_sw6_p.a +endif + +ifneq ($(COMPILER), NATIVE) +# GCC User +ifeq ($(LIBSUBARCH), SW6) +OPTION += -DSW6 -mcpu=sw6 +endif +else +# Compaq Compiler User +ifeq ($(LIBSUBARCH), SW6) +OPTION += -DSW6 -tune sw6 -arch sw6 +endif +endif + +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mieee +endif + +ifeq ($(F_COMPILER), G77) +FCOMMON_OPT += -mieee +endif + +ifndef SMP +LIBCXML = -lcxml -lots -lm +LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm +else +LIBCXML = -lcxmlp -lots -lm +LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif diff --git a/Makefile.system b/Makefile.system index 3be47c6..ae90af3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -42,6 +42,8 @@ else ifeq ($(ARCH), mips64el) override ARCH=mips64 else ifeq ($(ARCH), zarch) override ARCH=zarch +else ifeq ($(ARCH), sw_64) +override ARCH=sw_64 endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib @@ -809,6 +811,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), sw_64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 diff --git a/Makefile.system.libname b/Makefile.system.libname deleted file mode 100644 index 1b84195..0000000 --- a/Makefile.system.libname +++ /dev/null @@ -1,1860 +0,0 @@ -# -# Include user definition -# - -# TO suppress recursive includes -INCLUDED = 1 - -ifndef TOPDIR -TOPDIR = . -endif - -ifndef RELAPACK_REPLACE -RELAPACK_REPLACE=0 -endif - -# we need to use the host system's architecture for getarch compile options even especially when cross-compiling -HOSTARCH := $(shell uname -m) -ifeq ($(HOSTARCH), amd64) -HOSTARCH=x86_64 -endif - -# Catch conflicting usage of ARCH in some BSD environments -ifeq ($(ARCH), amd64) -override ARCH=x86_64 -else ifeq ($(ARCH), powerpc64) -override ARCH=power -else ifeq ($(ARCH), powerpc64le) -override ARCH=power -else ifeq ($(ARCH), powerpc) -override ARCH=power -else ifeq ($(ARCH), i386) -override ARCH=x86 -else ifeq ($(ARCH), armv6) -override ARCH=arm -else ifeq ($(ARCH), armv7) -override ARCH=arm -else ifeq ($(ARCH), aarch64) -override ARCH=arm64 -else ifeq ($(ARCH), mipsel) -override ARCH=mips -else ifeq ($(ARCH), mips64el) -override ARCH=mips64 -else ifeq ($(ARCH), zarch) -override ARCH=zarch -endif - -NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib - -# Default C compiler -# - Only set if not specified on the command line or inherited from the environment. 
-# - CC is an implicit variable so neither '?=' or 'ifndef' can be used. -# http://stackoverflow.com/questions/4029274/mingw-and-make-variables -# - Default value is 'cc' which is not always a valid command (e.g. MinGW). -ifeq ($(origin CC),default) - -# Check if $(CC) refers to a valid command and set the value to gcc if not -ifneq ($(findstring cmd.exe,$(SHELL)),) -ifeq ($(shell where $(CC) 2>NUL),) -CC = gcc -endif -else # POSIX-ish -ifeq ($(shell command -v $(CC) 2>/dev/null),) -ifeq ($(shell uname -s),Darwin) -CC = clang -# EXTRALIB += -Wl,-no_compact_unwind -else -CC = gcc -endif # Darwin -endif # CC exists -endif # Shell is sane - -endif # CC is set to default - -# Default Fortran compiler (FC) is selected by f_check. - -ifndef MAKEFILE_RULE -include $(TOPDIR)/Makefile.rule -else -include $(TOPDIR)/$(MAKEFILE_RULE) -endif - -# -# Beginning of system configuration -# -ifneq ($(BUILD_SINGLE),1) -ifneq ($(BUILD_DOUBLE),1) -ifneq ($(BUILD_COMPLEX),1) -ifneq ($(BUILD_COMPLEX16),1) -override BUILD_SINGLE=1 -override BUILD_DOUBLE=1 -override BUILD_COMPLEX=1 -override BUILD_COMPLEX16=1 -endif -endif -endif -endif - -ifndef HOSTCC -HOSTCC = $(CC) -endif - -ifdef TARGET -GETARCH_FLAGS := -DFORCE_$(TARGET) -GETARCH_FLAGS += -DUSER_TARGET -ifeq ($(TARGET), GENERIC) -ifeq ($(DYNAMIC_ARCH), 1) -override NO_EXPRECISION=1 -export NO_EXPRECISION -endif -endif -endif - -# Force fallbacks for 32bit - -ifeq ($(BINARY), 32) -ifeq ($(TARGET), HASWELL) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET), SKYLAKEX) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET), COOPERLAKE) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET), SAPPHIRERAPIDS) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET), SANDYBRIDGE) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET), BULLDOZER) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET), PILEDRIVER) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET), STEAMROLLER) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET), EXCAVATOR) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET), ZEN) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET), ARMV8) -GETARCH_FLAGS := -DFORCE_ARMV7 -endif -ifeq ($(TARGET), POWER8) -GETARCH_FLAGS := -DFORCE_POWER6 -endif -ifeq ($(TARGET), POWER9) -GETARCH_FLAGS := -DFORCE_POWER6 -endif -ifeq ($(TARGET), POWER10) -GETARCH_FLAGS := -DFORCE_POWER6 -endif -endif - -#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. -# -ifdef TARGET_CORE -GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) -endif - -# Force fallbacks for 32bit - -ifeq ($(BINARY), 32) -ifeq ($(TARGET_CORE), HASWELL) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET_CORE), SKYLAKEX) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET_CORE), COOPERLAKE) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET_CORE), SANDYBRIDGE) -GETARCH_FLAGS := -DFORCE_NEHALEM -endif -ifeq ($(TARGET_CORE), BULLDOZER) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET_CORE), PILEDRIVER) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET_CORE), STEAMROLLER) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET_CORE), EXCAVATOR) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -ifeq ($(TARGET_CORE), ZEN) -GETARCH_FLAGS := -DFORCE_BARCELONA -endif -endif - - -# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. 
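The comment above refers to getarch, the small helper program that the build compiles with HOSTCC and runs on the build host to identify the CPU; -march=native lets it use the AVX-512 instructions needed for that probe. Below is a minimal stand-alone sketch of this style of detection, using the GCC/Clang CPU-model builtins rather than OpenBLAS's actual getarch source; the has_avx512f() helper is illustrative only.

#include <stdio.h>

/* Report whether the build host supports AVX-512 Foundation.
 * __builtin_cpu_init()/__builtin_cpu_supports() are GCC/Clang builtins
 * for x86; on other compilers this sketch simply reports "no". */
static int has_avx512f(void) {
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx512f");
#else
    return 0;
#endif
}

int main(void) {
    printf("AVX512F: %s\n", has_avx512f() ? "yes" : "no");
    return 0;
}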
-ifeq ($(HOSTARCH), x86_64) -ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) -GETARCH_FLAGS += -march=native -endif -endif - -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -GETARCH_FLAGS += -DUSE64BITINT -endif -endif - -ifndef GEMM_MULTITHREAD_THRESHOLD -GEMM_MULTITHREAD_THRESHOLD=4 -endif -GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) - -ifeq ($(NO_AVX), 1) -GETARCH_FLAGS += -DNO_AVX -endif - -ifeq ($(BINARY), 32) -GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 -NO_AVX512 = 1 -endif - -ifeq ($(NO_AVX2), 1) -GETARCH_FLAGS += -DNO_AVX2 -endif - -ifeq ($(NO_AVX512), 1) -GETARCH_FLAGS += -DNO_AVX512 -endif - -ifeq ($(DEBUG), 1) -GETARCH_FLAGS += -g -endif - -ifeq ($(QUIET_MAKE), 1) -MAKE += -s -endif - -ifndef NO_PARALLEL_MAKE -NO_PARALLEL_MAKE=0 -endif -GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) - -ifdef MAKE_NB_JOBS -GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS) -endif - -ifeq ($(HOSTCC), loongcc) -GETARCH_FLAGS += -static -endif - -#if don't use Fortran, it will only compile CBLAS. -ifeq ($(ONLY_CBLAS), 1) -NO_LAPACK = 1 -else -ONLY_CBLAS = 0 -endif - -#For small matrix optimization -ifeq ($(ARCH), x86_64) -SMALL_MATRIX_OPT = 1 -else ifeq ($(ARCH), power) -SMALL_MATRIX_OPT = 1 -BUILD_BFLOAT16 = 1 -endif -ifeq ($(SMALL_MATRIX_OPT), 1) -CCOMMON_OPT += -DSMALL_MATRIX_OPT -endif - -# This operation is expensive, so execution should be once. -ifndef GOTOBLAS_MAKEFILE -export GOTOBLAS_MAKEFILE = 1 - -# Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) - -endif - -ifndef TARGET_CORE --include $(TOPDIR)/Makefile.conf -else -HAVE_NEON= -HAVE_VFP= -HAVE_VFPV3= -HAVE_VFPV4= -HAVE_MMX= -HAVE_SSE= -HAVE_SSE2= -HAVE_SSE3= -HAVE_SSSE3= -HAVE_SSE4_1= -HAVE_SSE4_2= -HAVE_SSE4A= -HAVE_SSE5= -HAVE_AVX= -HAVE_AVX2= -HAVE_FMA3= -include $(TOPDIR)/Makefile_kernel.conf -endif - - -ifndef NUM_PARALLEL -NUM_PARALLEL = 1 -endif - -ifndef NUM_THREADS -NUM_THREADS = $(NUM_CORES) -endif - -ifeq ($(NUM_THREADS), 1) -override USE_THREAD = 0 -override USE_OPENMP = 0 -endif - -ifdef USE_THREAD -ifeq ($(USE_THREAD), 0) -SMP = -else -SMP = 1 -endif -else -ifeq ($(NUM_THREADS), 1) -SMP = -else -SMP = 1 -endif -endif - -ifeq ($(SMP), 1) -USE_LOCKING = -endif - -ifndef NEED_PIC -NEED_PIC = 1 -endif - -ARFLAGS = -CPP = $(COMPILER) -E -AR ?= $(CROSS_SUFFIX)ar -AS ?= $(CROSS_SUFFIX)as -LD ?= $(CROSS_SUFFIX)ld -RANLIB ?= $(CROSS_SUFFIX)ranlib -NM = $(CROSS_SUFFIX)nm -DLLWRAP = $(CROSS_SUFFIX)dllwrap -OBJCOPY = $(CROSS_SUFFIX)objcopy -OBJCONV = $(CROSS_SUFFIX)objconv - - -# When fortran support was either not detected or actively deselected, only build BLAS. 
-ifeq ($(NOFORTRAN), 1) -C_LAPACK = 1 -override FEXTRALIB = -endif - -ifeq ($(C_COMPILER), GCC) -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) -GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -# Note that the behavior of -dumpversion is compile-time-configurable for -# gcc-7.x and newer. Use -dumpfullversion there -ifeq ($(GCCVERSIONGTEQ7),1) - GCCDUMPVERSION_PARAM := -dumpfullversion -else - GCCDUMPVERSION_PARAM := -dumpversion -endif -GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) -GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) -GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) -endif - -ifeq ($(C_COMPILER), CLANG) -CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) -endif - -# -# OS dependent settings -# - -ifeq ($(OSNAME), Darwin) -ifndef MACOSX_DEPLOYMENT_TARGET -ifeq ($(ARCH), arm64) -export MACOSX_DEPLOYMENT_TARGET=11.0 -ifeq ($(C_COMPILER), GCC) -export NO_SVE = 1 -endif -else -export MACOSX_DEPLOYMENT_TARGET=10.8 -endif -endif -MD5SUM = md5 -r -XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) -ifeq (x$(XCVER)x,xx) -XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) -endif -ifeq (x$(XCVER), x 15) -CCOMMON_OPT += -Wl,-ld_classic -endif -endif - -ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) -MD5SUM = md5 -r -endif - -ifeq ($(OSNAME), NetBSD) -MD5SUM = md5 -n -endif - -ifeq ($(OSNAME), Linux) -EXTRALIB += -lm -NO_EXPRECISION = 1 -endif - -ifeq ($(OSNAME), Android) -EXTRALIB += -lm -endif - -ifeq ($(OSNAME), AIX) -EXTRALIB += -lm -endif - -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) -ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) -EXTRALIB += -lm -endif -endif - -ifeq ($(OSNAME), WINNT) -NEED_PIC = 0 -NO_EXPRECISION = 1 - -EXTRALIB += -defaultlib:advapi32 - -SUFFIX = obj -PSUFFIX = pobj -LIBSUFFIX = a - -ifeq ($(C_COMPILER), CLANG) -CCOMMON_OPT += -DMS_ABI -endif - -#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) -ifeq ($(GCCVERSIONGT4), 1) -# GCC Major version > 4 -# It is compatible with MSVC ABI. -CCOMMON_OPT += -DMS_ABI -endif - -ifeq ($(GCCVERSIONGTEQ4), 1) -ifeq ($(GCCMINORVERSIONGTEQ7), 1) -# GCC Version >=4.7 -# It is compatible with MSVC ABI. 
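The MS_ABI define set just below tells OpenBLAS's C and assembly sources that the Microsoft x86-64 calling convention is in effect, which GCC 4.7 and later can interoperate with. For illustration only (this is not OpenBLAS code), GCC on x86-64 exposes both conventions per function through attributes; the add_msabi/add_sysv names are made up for this sketch.

/* Compile on x86-64 GCC or Clang to see both calling conventions emitted
 * side by side: the MS variant passes the first integer arguments in
 * rcx/rdx, the SysV variant in rdi/rsi. */
#if defined(__GNUC__) && defined(__x86_64__)
__attribute__((ms_abi))   long add_msabi(long a, long b) { return a + b; }
__attribute__((sysv_abi)) long add_sysv(long a, long b)  { return a + b; }
#endif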
-CCOMMON_OPT += -DMS_ABI -endif -endif - -# Ensure the correct stack alignment on Win32 -# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 -ifeq ($(ARCH), x86) -CCOMMON_OPT += -mincoming-stack-boundary=2 -FCOMMON_OPT += -mincoming-stack-boundary=2 -endif - -endif - -ifeq ($(OSNAME), Interix) -NEED_PIC = 0 -NO_EXPRECISION = 1 - -INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin -endif - -ifeq ($(OSNAME), CYGWIN_NT) -NEED_PIC = 0 -NO_EXPRECISION = 1 -OS_CYGWIN_NT = 1 -endif - -ifneq ($(OSNAME), WINNT) -ifneq ($(OSNAME), CYGWIN_NT) -ifneq ($(OSNAME), Interix) -ifneq ($(OSNAME), Android) -ifdef SMP -EXTRALIB += -lpthread -endif -endif -endif -endif -endif - -# ifeq logical or -ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) -OS_WINDOWS=1 -endif - -ifdef QUAD_PRECISION -CCOMMON_OPT += -DQUAD_PRECISION -NO_EXPRECISION = 1 -endif - -ifneq ($(ARCH), x86) -ifneq ($(ARCH), x86_64) -NO_EXPRECISION = 1 -endif -endif - -ifdef UTEST_CHECK -CCOMMON_OPT += -DUTEST_CHECK -SANITY_CHECK = 1 -endif - -ifdef SANITY_CHECK -CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) -endif - -MAX_STACK_ALLOC ?= 2048 -ifneq ($(MAX_STACK_ALLOC), 0) -CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) -endif - -ifdef USE_LOCKING -ifneq ($(USE_LOCKING), 0) -CCOMMON_OPT += -DUSE_LOCKING -endif -endif - -# -# Architecture dependent settings -# - -ifeq ($(ARCH), x86) -ifndef BINARY -NO_BINARY_MODE = 1 -endif - -ifeq ($(CORE), generic) -NO_EXPRECISION = 1 -endif - -ifndef NO_EXPRECISION -ifeq ($(F_COMPILER), GFORTRAN) -# ifeq logical or. GCC or LSB -ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) -EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION -m128bit-long-double -FCOMMON_OPT += -m128bit-long-double -endif -ifeq ($(C_COMPILER), CLANG) -EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION -FCOMMON_OPT += -m128bit-long-double -endif -endif -endif -endif - -ifeq ($(ARCH), x86_64) - -ifeq ($(CORE), generic) -NO_EXPRECISION = 1 -endif - -ifndef NO_EXPRECISION -ifeq ($(F_COMPILER), GFORTRAN) -# ifeq logical or. GCC or LSB -ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) -EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION -m128bit-long-double -FCOMMON_OPT += -m128bit-long-double -endif -ifeq ($(C_COMPILER), CLANG) -EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION -FCOMMON_OPT += -m128bit-long-double -endif -endif -endif -endif - -ifeq ($(C_COMPILER), INTEL) -CCOMMON_OPT += -wd981 -endif - - -ifeq ($(USE_OPENMP), 1) - -#check -ifeq ($(USE_THREAD), 0) -$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) -endif - -# ifeq logical or. 
GCC or LSB -ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) -CCOMMON_OPT += -fopenmp -endif - -ifeq ($(C_COMPILER), CLANG) -CCOMMON_OPT += -fopenmp -ifeq ($(F_COMPILER), GFORTRAN) -FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB)) -endif -endif - -ifeq ($(C_COMPILER), INTEL) -CCOMMON_OPT += -fopenmp -endif - -ifeq ($(C_COMPILER), PGI) -CCOMMON_OPT += -mp -endif - -ifeq ($(C_COMPILER), OPEN64) -CCOMMON_OPT += -mp -CEXTRALIB += -lstdc++ -endif - -ifeq ($(C_COMPILER), PATHSCALE) -CCOMMON_OPT += -mp -endif -endif - - -ifeq ($(DYNAMIC_ARCH), 1) -ifeq ($(ARCH), x86) -DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO -endif - -ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 -ifeq ($(DYNAMIC_OLDER), 1) -DYNAMIC_CORE += PENRYN DUNNINGTON -endif -DYNAMIC_CORE += NEHALEM -ifeq ($(DYNAMIC_OLDER), 1) -DYNAMIC_CORE += OPTERON OPTERON_SSE3 -endif -DYNAMIC_CORE += BARCELONA -ifeq ($(DYNAMIC_OLDER), 1) -DYNAMIC_CORE += BOBCAT ATOM NANO -endif -ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR -endif -ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN -endif -ifneq ($(NO_AVX512), 1) -ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS -endif -endif -endif - -ifdef DYNAMIC_LIST -override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) -XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT -XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) -CCOMMON_OPT += $(XCCOMMON_OPT) -#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' -endif - -ifeq ($(ARCH), arm64) -DYNAMIC_CORE = ARMV8 -DYNAMIC_CORE += CORTEXA53 -DYNAMIC_CORE += CORTEXA57 -DYNAMIC_CORE += CORTEXA72 -DYNAMIC_CORE += CORTEXA73 -DYNAMIC_CORE += NEOVERSEN1 -ifneq ($(NO_SVE), 1) -DYNAMIC_CORE += NEOVERSEV1 -DYNAMIC_CORE += NEOVERSEN2 -DYNAMIC_CORE += ARMV8SVE -endif -DYNAMIC_CORE += CORTEXA55 -DYNAMIC_CORE += FALKOR -DYNAMIC_CORE += THUNDERX -DYNAMIC_CORE += THUNDERX2T99 -DYNAMIC_CORE += TSV110 -DYNAMIC_CORE += EMAG8180 -DYNAMIC_CORE += THUNDERX3T110 -ifdef DYNAMIC_LIST -override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) -XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 -XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) -endif -endif - -ifeq ($(ARCH), mips64) -DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC -ifdef DYNAMIC_LIST -override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST) -XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC -XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) -endif -endif - -ifeq ($(ARCH), loongarch64) -DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC -endif - -ifeq ($(ARCH), zarch) -DYNAMIC_CORE = ZARCH_GENERIC - -# if the compiler accepts -march=arch11 or -march=z13 and can compile a file -# with z13-specific inline assembly, then we can include support for Z13. -# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases -# only support one or the other. -# note: LLVM version 6.x supported -march=z13 yet could not handle vector -# registers in inline assembly, so the check for supporting the -march flag is -# not enough. 
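The probe just described works by attempting to compile a real z13 kernel (kernel/zarch/damin_z13.c) and treating a successful compile as proof of support, as the ZARCH_TEST_COMPILE lines below show. Here is a stripped-down sketch of such a probe file; the __ARCH__ check is an assumption about GCC's s390x predefines, whereas the real file relies on z13 vector inline assembly.

/* probe_z13.c: illustrative stand-in for kernel/zarch/damin_z13.c.
 * It refuses to compile unless the compiler targets at least
 * architecture level 11 (z13), so "did it compile?" doubles as the
 * feature test driven from the Makefile. */
#if !defined(__s390x__) || !defined(__ARCH__) || (__ARCH__ < 11)
#error "requires -march=z13 or -march=arch11"
#endif

int zarch_probe_level(void) { return __ARCH__; }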
-ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null -ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) -ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) - -ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) -DYNAMIC_CORE += Z13 -CCOMMON_OPT += -DDYN_Z13 -else -$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) -endif - -# as above for z13, check for -march=arch12 and z14 support in the compiler. -ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) -ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) -ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) -DYNAMIC_CORE += Z14 -CCOMMON_OPT += -DDYN_Z14 -else -$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) -endif - -endif # ARCH zarch - -ifeq ($(ARCH), power) -ifneq ($(C_COMPILER), PGI) -DYNAMIC_CORE = POWER6 -DYNAMIC_CORE += POWER8 -ifneq ($(C_COMPILER), GCC) -DYNAMIC_CORE += POWER9 -DYNAMIC_CORE += POWER10 -CCOMMON_OPT += -DHAVE_P10_SUPPORT -endif -ifeq ($(C_COMPILER), GCC) -ifeq ($(GCCVERSIONGT5), 1) -DYNAMIC_CORE += POWER9 -else -$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) -endif -ifeq ($(OSNAME), AIX) -LDVERSIONGTEQ35 := 1 -else -LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) -endif -ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) -DYNAMIC_CORE += POWER10 -CCOMMON_OPT += -DHAVE_P10_SUPPORT -else ifeq ($(GCCVERSIONGTEQ10), 1) -ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11) -DYNAMIC_CORE += POWER10 -CCOMMON_OPT += -DHAVE_P10_SUPPORT -endif -else -$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) -endif -endif -else -DYNAMIC_CORE = POWER8 -DYNAMIC_CORE += POWER9 -endif -endif - -# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty -ifndef DYNAMIC_CORE -override DYNAMIC_ARCH= -endif -endif - -ifeq ($(ARCH), ia64) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 - -ifeq ($(F_COMPILER), GFORTRAN) -ifeq ($(C_COMPILER), GCC) -# EXPRECISION = 1 -# CCOMMON_OPT += -DEXPRECISION -endif -endif -endif - -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) -NO_BINARY_MODE = 1 -endif - -ifeq ($(ARCH), alpha) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 -endif - -ifeq ($(ARCH), arm) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 - -CCOMMON_OPT += -marm -FCOMMON_OPT += -marm - -# If softfp abi is mentioned on the command line, force it. 
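For context on the option forced below: -mfloat-abi=softfp still allows VFP instructions but passes floating-point arguments and results in core registers, while the hard-float ABI keeps them in VFP registers (hence the separate -lm_hard on Android in the hard-float case). A small sketch, assuming the ACLE predefines __ARM_PCS and __ARM_PCS_VFP, of how code can report which variant it was built for; OpenBLAS itself does not contain this snippet.

#include <stdio.h>

/* Print the ARM procedure-call-standard variant this object was
 * compiled for, using the ACLE feature-test macros. */
int main(void) {
#if defined(__ARM_PCS_VFP)
    puts("hard-float ABI: FP arguments passed in VFP registers");
#elif defined(__ARM_PCS)
    puts("soft/softfp ABI: FP arguments passed in core registers");
#else
    puts("no ARM PCS macro defined (non-ARM target or other compiler)");
#endif
    return 0;
}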
-ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfloat-abi=softfp -FCOMMON_OPT += -mfloat-abi=softfp -endif - -ifeq ($(OSNAME), Android) -ifeq ($(ARM_SOFTFP_ABI), 1) -EXTRALIB += -lm -else -EXTRALIB += -Wl,-lm_hard -endif -endif -endif - -ifeq ($(ARCH), arm64) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -ifeq ($(F_COMPILER), GFORTRAN) -FCOMMON_OPT += -fdefault-integer-8 -endif -ifeq ($(F_COMPILER), FLANG) -FCOMMON_OPT += -i8 -endif -endif -endif -endif - -ifeq ($(ARCH), riscv64) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -ifeq ($(F_COMPILER), GFORTRAN) -FCOMMON_OPT += -fdefault-integer-8 -endif -ifeq ($(F_COMPILER), FLANG) -FCOMMON_OPT += -i8 -endif -endif -endif -endif - -ifeq ($(ARCH), loongarch64) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -ifeq ($(F_COMPILER), GFORTRAN) -FCOMMON_OPT += -fdefault-integer-8 -endif -ifeq ($(F_COMPILER), FLANG) -FCOMMON_OPT += -i8 -endif -endif -endif -endif - -# -# C Compiler dependent settings -# - - -# ifeq logical or. GCC or CLANG or LSB -# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB)) -CCOMMON_OPT += -Wall -COMMON_PROF += -fno-inline -NO_UNINITIALIZED_WARN = -Wno-uninitialized - -ifeq ($(QUIET_MAKE), 1) -CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused -endif - -ifdef NO_BINARY_MODE - -ifeq ($(ARCH), $(filter $(ARCH),mips64)) -ifdef BINARY64 -CCOMMON_OPT += -mabi=64 -else -CCOMMON_OPT += -mabi=n32 -endif -BINARY_DEFINED = 1 -else ifeq ($(ARCH), $(filter $(ARCH),mips)) -CCOMMON_OPT += -mabi=32 -BINARY_DEFINED = 1 -endif - -ifneq (, $(filter $(CORE), MIPS64_GENERIC)) -CCOMMON_OPT += -DNO_MSA -FCOMMON_OPT += -DNO_MSA -endif - -ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) -CCOMMON_OPT += -march=loongson3a -FCOMMON_OPT += -march=loongson3a -endif - -ifeq ($(CORE), MIPS24K) -CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) -FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) -endif - -ifeq ($(CORE), MIPS1004K) -CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) -endif - -ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) -endif - -ifeq ($(CORE), I6400) -CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) -FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) -endif - -ifeq ($(CORE), P6600) -CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) -FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) -endif - -ifeq ($(CORE), I6500) -CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) -FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) -endif - -ifeq ($(OSNAME), AIX) -BINARY_DEFINED = 1 -endif - -ifeq ($(ARCH), loongarch64) -LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) -ifneq ($(LA64_ABI), lp64d) -LA64_ABI=lp64 -endif -CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) -FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) -endif - -endif - -ifndef BINARY_DEFINED -ifneq ($(OSNAME), AIX) -ifdef BINARY64 -ifneq ($(ARCH), riscv64) -CCOMMON_OPT += -m64 -endif -else -CCOMMON_OPT += -m32 -endif -endif -endif - -endif - -ifeq ($(C_COMPILER), PGI) -PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." 
-f 1` \> 20) -PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) -PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) -PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) -ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) -NEWPGI := 1 -PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) -PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) -PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) -ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) -NEWPGI2 := 1 -endif -endif -ifdef BINARY64 -ifeq ($(ARCH), x86_64) -ifeq (,$(findstring tp,$(CFLAGS))) -ifneq ($(NEWPGI2),1) -CCOMMON_OPT += -tp p7-64 -else -CCOMMON_OPT += -tp px -endif -endif -ifneq ($(NEWPGI),1) -CCOMMON_OPT += -D__MMX__ -Mnollvm -endif -else -ifeq ($(ARCH), power) -ifeq (,$(findstring tp,$(CFLAGS))) -ifeq ($(CORE), POWER8) -CCOMMON_OPT += -tp pwr8 -endif -ifeq ($(CORE), POWER9) -CCOMMON_OPT += -tp pwr9 -endif -endif -endif -endif -else -ifneq ($(NEWPGI2),1) -ifeq (,$(findstring tp,$(CFLAGS))) -CCOMMON_OPT += -tp p7 -else -CCOMMON_OPT += -tp px -endif -endif -endif -endif - -ifeq ($(C_COMPILER), PATHSCALE) -ifdef BINARY64 -CCOMMON_OPT += -m64 -else -CCOMMON_OPT += -m32 -endif -endif - -# -# Fortran Compiler dependent settings -# - -ifeq ($(F_COMPILER), NAG) -FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -i8 -endif -endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp -endif -endif - -ifeq ($(F_COMPILER), FLANG) -CCOMMON_OPT += -DF_INTERFACE_FLANG -FCOMMON_OPT += -Mrecursive -Kieee -ifeq ($(OSNAME), Linux) -ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") -ifeq ($(FLANG_VENDOR), AMD) -FCOMMON_OPT += -fno-unroll-loops -endif -endif -endif -ifdef BINARY64 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -i8 -endif -endif -FCOMMON_OPT += -Wall -else -FCOMMON_OPT += -Wall -endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -fopenmp -endif -endif - -ifeq ($(F_COMPILER), G77) -CCOMMON_OPT += -DF_INTERFACE_G77 -FCOMMON_OPT += -Wall -ifndef NO_BINARY_MODE -ifneq ($(OSNAME), AIX) -ifdef BINARY64 -FCOMMON_OPT += -m64 -else -FCOMMON_OPT += -m32 -endif -endif -endif -endif - -ifeq ($(F_COMPILER), G95) -CCOMMON_OPT += -DF_INTERFACE_G95 -FCOMMON_OPT += -Wall -ifneq ($(OSNAME), AIX) -ifndef NO_BINARY_MODE -ifdef BINARY64 -FCOMMON_OPT += -m64 -else -FCOMMON_OPT += -m32 -endif -endif -ifneq ($(NO_LAPACKE), 1) -FCOMMON_OPT += -fno-second-underscore -endif -endif -endif - -ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) -CCOMMON_OPT += -DF_INTERFACE_GFORT -ifeq ($(F_COMPILER), GFORTRAN) -FCOMMON_OPT += -Wall -# make single-threaded LAPACK calls thread-safe #1847 -FCOMMON_OPT += -frecursive -# work around ABI problem with passing single-character arguments -FCOMMON_OPT += -fno-optimize-sibling-calls -#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc -ifneq ($(NOFORTRAN), 1) -ifneq ($(NOFORTRAN), 2) -ifneq ($(NO_LAPACK), 1) -EXTRALIB += -lgfortran -endif -endif -endif -endif -ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64)) -ifdef BINARY64 -FCOMMON_OPT += -mabi=64 -else -FCOMMON_OPT += -mabi=n32 -endif -else ifeq ($(ARCH), $(filter $(ARCH),mips)) -FCOMMON_OPT += 
-mabi=32 -endif -else -ifdef BINARY64 -ifneq ($(OSNAME), AIX) -ifneq ($(ARCH), riscv64) -FCOMMON_OPT += -m64 -endif -endif -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -fdefault-integer-8 -endif -endif -else -ifneq ($(OSNAME), AIX) -FCOMMON_OPT += -m32 -endif -endif -endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -fopenmp -endif -endif - -ifeq ($(F_COMPILER), INTEL) -CCOMMON_OPT += -DF_INTERFACE_INTEL -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -i8 -endif -endif -FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -fopenmp -endif -endif - -ifeq ($(F_COMPILER), FUJITSU) -CCOMMON_OPT += -DF_INTERFACE_FUJITSU -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp -endif -endif - -ifeq ($(F_COMPILER), IBM) -CCOMMON_OPT += -DF_INTERFACE_IBM -FEXTRALIB += -lxlf90 -ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) -FCOMMON_OPT += -qextname -endif -# FCOMMON_OPT += -qarch=440 -ifdef BINARY64 -FCOMMON_OPT += -q64 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -qintsize=8 -endif -endif -else -FCOMMON_OPT += -q32 -endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp -endif -endif - -ifeq ($(F_COMPILER), PGI) -CCOMMON_OPT += -DF_INTERFACE_PGI -COMMON_PROF += -DPGICOMPILER -ifdef BINARY64 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -i8 -endif -endif -ifeq ($(ARCH), x86_64) -ifneq ($(NEWPGI2),1) -FCOMMON_OPT += -tp p7-64 -else -FCOMMON_OPT += -tp px -endif -else -ifeq ($(ARCH), power) -ifeq ($(CORE), POWER6) -$(warning NVIDIA HPC compilers do not support POWER6.) -endif -ifeq ($(CORE), POWER8) -FCOMMON_OPT += -tp pwr8 -endif -ifeq ($(CORE), POWER9) -FCOMMON_OPT += -tp pwr9 -endif -ifeq ($(CORE), POWER10) -$(warning NVIDIA HPC compilers do not support POWER10.) 
-endif -endif -endif -else -FCOMMON_OPT += -tp p7 -endif -FCOMMON_OPT += -Mrecursive -Kieee -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -mp -endif -endif - -ifeq ($(F_COMPILER), PATHSCALE) -CCOMMON_OPT += -DF_INTERFACE_PATHSCALE -ifdef BINARY64 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -i8 -endif -endif -endif - -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -mp -endif -endif - -ifeq ($(F_COMPILER), OPEN64) -CCOMMON_OPT += -DF_INTERFACE_OPEN64 -ifdef BINARY64 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -i8 -endif -endif -endif -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) -ifndef BINARY64 -FCOMMON_OPT += -n32 -else -FCOMMON_OPT += -n64 -endif -ifeq ($(CORE), LOONGSON3R3) -FCOMMON_OPT += -loongson3 -static -endif -ifeq ($(CORE), LOONGSON3R4) -FCOMMON_OPT += -loongson3 -static -endif -else -ifndef BINARY64 -FCOMMON_OPT += -m32 -else -FCOMMON_OPT += -m64 -endif -endif -ifeq ($(USE_OPENMP), 1) -FEXTRALIB += -lstdc++ -FCOMMON_OPT += -mp -endif -endif - -ifeq ($(C_COMPILER), OPEN64) -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) -ifndef BINARY64 -CCOMMON_OPT += -n32 -else -CCOMMON_OPT += -n64 -endif -ifeq ($(CORE), LOONGSON3R3) -CCOMMON_OPT += -loongson3 -static -endif -ifeq ($(CORE), LOONGSON3R4) -CCOMMON_OPT += -loongson3 -static -endif -else -ifndef BINARY64 -CCOMMON_OPT += -m32 -else -CCOMMON_OPT += -m64 -endif -endif -endif - -ifeq ($(C_COMPILER), SUN) -CCOMMON_OPT += -w -ifeq ($(ARCH), x86) -CCOMMON_OPT += -m32 -else -ifdef BINARY64 -CCOMMON_OPT += -m64 -else -CCOMMON_OPT += -m32 -endif -endif -endif - -ifeq ($(F_COMPILER), SUN) -CCOMMON_OPT += -DF_INTERFACE_SUN -FCOMMON_OPT += -ftrap=%none -xrecursive -ifeq ($(ARCH), x86) -FCOMMON_OPT += -m32 -else -ifdef BINARY64 -FCOMMON_OPT += -m64 -else -FCOMMON_OPT += -m32 -endif -endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -xopenmp=parallel -endif -endif - -ifeq ($(F_COMPILER), COMPAQ) -CCOMMON_OPT += -DF_INTERFACE_COMPAQ -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp -endif -endif - -ifeq ($(F_COMPILER), CRAY) -CCOMMON_OPT += -DF_INTERFACE_INTEL -FCOMMON_OPT += -hnopattern -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -FCOMMON_OPT += -s integer64 -endif -endif -ifneq ($(USE_OPENMP), 1) -FCOMMON_OPT += -O noomp -endif -endif - -ifdef BINARY64 -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -CCOMMON_OPT += -#-DUSE64BITINT -endif -endif -endif - -ifeq ($(NEED_PIC), 1) -ifeq ($(C_COMPILER), IBM) -CCOMMON_OPT += -qpic=large -else -CCOMMON_OPT += -fPIC -endif -ifeq ($(F_COMPILER), SUN) -FCOMMON_OPT += -pic -else ifeq ($(F_COMPILER), NAG) -FCOMMON_OPT += -PIC -else ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -qpic=large -else -FCOMMON_OPT += -fPIC -endif -endif - -ifeq ($(DYNAMIC_ARCH), 1) -CCOMMON_OPT += -DDYNAMIC_ARCH -endif - -ifeq ($(DYNAMIC_OLDER), 1) -CCOMMON_OPT += -DDYNAMIC_OLDER -endif - -ifeq ($(C_LAPACK), 1) -CCOMMON_OPT += -DC_LAPACK -endif - -ifeq ($(NO_LAPACK), 1) -CCOMMON_OPT += -DNO_LAPACK -#Disable LAPACK C interface -NO_LAPACKE = 1 -endif - -ifeq ($(NO_LAPACKE), 1) -CCOMMON_OPT += -DNO_LAPACKE -endif - -ifeq ($(NO_AVX), 1) -CCOMMON_OPT += -DNO_AVX -endif - -ifeq ($(ARCH), x86) -CCOMMON_OPT += -DNO_AVX -endif - -ifeq ($(NO_AVX2), 1) -CCOMMON_OPT += -DNO_AVX2 -endif - -ifeq ($(NO_AVX512), 1) -CCOMMON_OPT += -DNO_AVX512 -endif - -ifeq ($(NO_SVE), 1) -CCOMMON_OPT += -DNO_SVE -endif - -ifdef SMP -CCOMMON_OPT += -DSMP_SERVER - -ifeq ($(ARCH), mips64) -USE_SIMPLE_THREADED_LEVEL3 = 1 -endif - -ifeq ($(USE_OPENMP), 1) -# USE_SIMPLE_THREADED_LEVEL3 = 1 -# NO_AFFINITY = 1 -CCOMMON_OPT += -DUSE_OPENMP -endif - -ifeq 
($(BIGNUMA), 1) -CCOMMON_OPT += -DBIGNUMA -endif - -endif - -ifeq ($(NO_WARMUP), 1) -CCOMMON_OPT += -DNO_WARMUP -endif - -ifeq ($(CONSISTENT_FPCSR), 1) -CCOMMON_OPT += -DCONSISTENT_FPCSR -endif - -# Only for development -# CCOMMON_OPT += -DPARAMTEST -# CCOMMON_OPT += -DPREFETCHTEST -# CCOMMON_OPT += -DNO_SWITCHING -# USE_PAPI = 1 - -ifdef USE_PAPI -CCOMMON_OPT += -DUSE_PAPI -EXTRALIB += -lpapi -lperfctr -endif - -ifdef BUFFERSIZE -CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) -endif - -ifdef DYNAMIC_THREADS -CCOMMON_OPT += -DDYNAMIC_THREADS -endif - -CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) - -CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) - -ifdef USE_SIMPLE_THREADED_LEVEL3 -CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 -endif - -ifeq ($(USE_TLS), 1) -CCOMMON_OPT += -DUSE_TLS -endif - -ifeq ($(BUILD_BFLOAT16), 1) -CCOMMON_OPT += -DBUILD_BFLOAT16 -endif -ifeq ($(BUILD_SINGLE), 1) -CCOMMON_OPT += -DBUILD_SINGLE=1 -endif -ifeq ($(BUILD_DOUBLE), 1) -CCOMMON_OPT += -DBUILD_DOUBLE=1 -endif -ifeq ($(BUILD_COMPLEX), 1) -CCOMMON_OPT += -DBUILD_COMPLEX=1 -endif -ifeq ($(BUILD_COMPLEX16), 1) -CCOMMON_OPT += -DBUILD_COMPLEX16=1 -endif - -CCOMMON_OPT += -DVERSION=\"$(VERSION)\" - -ifndef SYMBOLPREFIX -SYMBOLPREFIX = -endif - -ifndef SYMBOLSUFFIX -SYMBOLSUFFIX = -endif - -ifndef LIBSONAMEBASE -LIBSONAMEBASE = openblas -endif - -ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) -else -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) -endif - -ifeq ($(OSNAME), CYGWIN_NT) -LIBPREFIX = cyg$(LIBNAMEBASE) -else -LIBPREFIX = lib$(LIBNAMEBASE) -endif - -KERNELDIR = $(TOPDIR)/kernel/$(ARCH) - -include $(TOPDIR)/Makefile.$(ARCH) - -ifneq ($(C_COMPILER), PGI) -ifneq ($(C_COMPILER), SUN) -CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME -endif -endif -CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" - -ifeq ($(CORE), PPC440) -CCOMMON_OPT += -DALLOC_QALLOC -endif - -ifeq ($(CORE), PPC440FP2) -STATIC_ALLOCATION = 1 -endif - -ifneq ($(OSNAME), Linux) -NO_AFFINITY = 1 -endif - -ifneq ($(ARCH), x86_64) -ifneq ($(ARCH), x86) -NO_AFFINITY = 1 -endif -endif - -ifdef NO_AFFINITY -ifeq ($(NO_AFFINITY), 0) -override undefine NO_AFFINITY -else -CCOMMON_OPT += -DNO_AFFINITY -endif -endif - -ifdef FUNCTION_PROFILE -CCOMMON_OPT += -DFUNCTION_PROFILE -endif - -ifdef HUGETLB_ALLOCATION -CCOMMON_OPT += -DALLOC_HUGETLB -endif - -ifdef HUGETLBFILE_ALLOCATION -CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) -endif - -ifdef STATIC_ALLOCATION -CCOMMON_OPT += -DALLOC_STATIC -endif - -ifdef DEVICEDRIVER_ALLOCATION -CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\" -endif - -ifdef MIXED_MEMORY_ALLOCATION -CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION -endif - -ifeq ($(OSNAME), SunOS) -TAR = gtar -PATCH = gpatch -GREP = ggrep -AWK = nawk -else -TAR = tar -PATCH = patch -GREP = grep -AWK = awk -endif - -ifndef MD5SUM -MD5SUM = md5sum -endif - - -REVISION = -r$(VERSION) -MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) - -ifeq ($(DEBUG), 1) -COMMON_OPT += -g -endif - -ifeq ($(DEBUG), 1) -FCOMMON_OPT += -g -endif - -ifndef COMMON_OPT -COMMON_OPT = -O2 -endif - -ifndef FCOMMON_OPT -FCOMMON_OPT = -O2 -frecursive -endif - -override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -override FFLAGS += $(COMMON_OPT) 
$(FCOMMON_OPT) -override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) -#MAKEOVERRIDES = - -ifeq ($(NEED_PIC), 1) -ifeq (,$(findstring PIC,$(FFLAGS))) -ifneq ($(F_COMPILER),IBM) -override FFLAGS += -fPIC -endif -endif -endif - -#For LAPACK Fortran codes. -#Disable -fopenmp for LAPACK Fortran codes on Windows. -ifdef OS_WINDOWS -LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) -LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) -else -LAPACK_FFLAGS := $(FFLAGS) -LAPACK_FPFLAGS := $(FPFLAGS) -endif - -ifeq ($(F_COMPILER),NAG) -LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -endif -ifeq ($(F_COMPILER),CRAY) -LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -endif - -LAPACK_CFLAGS = $(CFLAGS) -LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) -LAPACK_CFLAGS += -DLAPACK_ILP64 -endif -endif - -ifdef OS_WINDOWS -LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS -LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE -endif -ifeq ($(C_COMPILER), LSB) -LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE -endif - -ifndef SUFFIX -SUFFIX = o -endif - -ifndef PSUFFIX -PSUFFIX = po -endif - -ifndef LIBSUFFIX -LIBSUFFIX = a -endif - -ifneq ($(DYNAMIC_ARCH), 1) -ifndef SMP -LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) -LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) -else -LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) -LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) -endif -else -ifndef SMP -LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) -LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) -else -LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) -LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) -endif -endif - - -LIBDLLNAME = $(LIBPREFIX).dll -IMPLIBNAME = lib$(LIBNAMEBASE).dll.a -ifneq ($(OSNAME), AIX) -LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -else -LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) -endif -LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) -LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) -LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) -LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) - -LIBS = $(TOPDIR)/$(LIBNAME) -LIBS_P = $(TOPDIR)/$(LIBNAME_P) - - -LIB_COMPONENTS = BLAS -ifneq ($(NO_CBLAS), 1) -LIB_COMPONENTS += CBLAS -endif - -ifneq ($(NO_LAPACK), 1) -LIB_COMPONENTS += LAPACK -ifneq ($(NO_LAPACKE), 1) -LIB_COMPONENTS += LAPACKE -endif -ifeq ($(BUILD_RELAPACK), 1) -LIB_COMPONENTS += ReLAPACK -endif -endif - -ifeq ($(ONLY_CBLAS), 1) -LIB_COMPONENTS = CBLAS -endif - -export OSNAME -export ARCH -export CORE -export LIBCORE -export __BYTE_ORDER__ -export ELF_VERSION -export PGCPATH -export CONFIG -export CC -export FC -export BU -export FU -export NEED2UNDERSCORES -export USE_THREAD -export NUM_THREADS -export NUM_CORES -export SMP -export MAKEFILE_RULE -export NEED_PIC -export BINARY -export BINARY32 -export BINARY64 -export F_COMPILER -export C_COMPILER -export USE_OPENMP -export CROSS -export CROSS_SUFFIX -export NOFORTRAN -export C_LAPACK -export NO_FBLAS -export EXTRALIB -export CEXTRALIB -export FEXTRALIB -export HAVE_SSE -export HAVE_SSE2 -export HAVE_SSE3 -export HAVE_SSSE3 -export HAVE_SSE4_1 -export HAVE_SSE4_2 -export HAVE_SSE4A -export HAVE_SSE5 -export HAVE_AVX -export HAVE_AVX2 -export HAVE_FMA3 
-export HAVE_VFP -export HAVE_VFPV3 -export HAVE_VFPV4 -export HAVE_NEON -ifndef NO_MSA - export HAVE_MSA - export MSA_FLAGS -endif -export KERNELDIR -export FUNCTION_PROFILE -export TARGET_CORE -export NO_AVX512 -export NO_AVX2 -export BUILD_BFLOAT16 -export NO_LSX -export NO_LASX - -export SBGEMM_UNROLL_M -export SBGEMM_UNROLL_N -export SGEMM_UNROLL_M -export SGEMM_UNROLL_N -export DGEMM_UNROLL_M -export DGEMM_UNROLL_N -export QGEMM_UNROLL_M -export QGEMM_UNROLL_N -export CGEMM_UNROLL_M -export CGEMM_UNROLL_N -export ZGEMM_UNROLL_M -export ZGEMM_UNROLL_N -export XGEMM_UNROLL_M -export XGEMM_UNROLL_N -export CGEMM3M_UNROLL_M -export CGEMM3M_UNROLL_N -export ZGEMM3M_UNROLL_M -export ZGEMM3M_UNROLL_N -export XGEMM3M_UNROLL_M -export XGEMM3M_UNROLL_N - - -ifdef USE_CUDA -export CUDADIR -export CUCC -export CUFLAGS -export CULIB -endif - -.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f - -.f.$(SUFFIX): - $(FC) $(FFLAGS) -c $< -o $(@F) - -.f.$(PSUFFIX): - $(FC) $(FPFLAGS) -pg -c $< -o $(@F) - - -ifdef BINARY64 -PATHSCALEPATH = /opt/pathscale/lib/3.1 -PGIPATH = /opt/pgi/linux86-64/7.1-5/lib -else -PATHSCALEPATH = /opt/pathscale/lib/3.1/32 -PGIPATH = /opt/pgi/linux86/7.1-5/lib -endif - -ACMLPATH = /opt/acml/4.3.0 -ifneq ($(OSNAME), Darwin) -MKLPATH = /opt/intel/mkl/10.2.2.025/lib -else -MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib -endif -ATLASPATH = /opt/atlas/3.9.17/opteron -FLAMEPATH = $(HOME)/flame/lib -ifneq ($(OSNAME), SunOS) -SUNPATH = /opt/sunstudio12.1 -else -SUNPATH = /opt/SUNWspro -endif diff --git a/Makefile.tail b/Makefile.tail index 54ba649..f73a86d 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -583,7 +583,7 @@ gen_insn_flash.c : echo 'int i;' >> gen_insn_flash.c echo '#ifdef __alpha' >> gen_insn_flash.c echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c - echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c + echo 'printf(".arch sw6;.text;.align 5\n");' >> gen_insn_flash.c echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c echo 'printf("insn_flash:\n");' >> gen_insn_flash.c diff --git a/Makefile.tests b/Makefile.tests deleted file mode 100644 index b344abc..0000000 --- a/Makefile.tests +++ /dev/null @@ -1,435 +0,0 @@ -TOPDIR = . -include ./Makefile.system - -BLASDIRS = interface driver/level2 driver/level3 driver/others - -ifneq ($(DYNAMIC_ARCH), 1) -BLASDIRS += kernel -endif - -ifdef SANITY_CHECK -BLASDIRS += reference -endif - -SUBDIRS = $(BLASDIRS) -ifneq ($(NO_LAPACK), 1) -SUBDIRS += lapack -endif - -RELA = -ifeq ($(BUILD_RELAPACK), 1) -RELA = re_lapack -endif - -ifeq ($(NO_FORTRAN), 1) -define NOFORTRAN -1 -endef -ifneq ($(NO_LAPACK), 1) -define C_LAPACK -1 -endef -endif -export NOFORTRAN -export NO_LAPACK -export C_LAPACK -endif - -ifeq ($(F_COMPILER),CRAY) -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) -else -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) -endif - -SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test - -.PHONY : all libs netlib $(RELA) test ctest shared install -.NOTPARALLEL : shared - -all :: tests - @echo - @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" - @echo - @echo " OS ... $(OSNAME) " - @echo " Architecture ... $(ARCH) " -ifndef BINARY64 - @echo " BINARY ... 32bit " -else - @echo " BINARY ... 
64bit " -endif - -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) - @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " -endif -endif - @$(CC) --version > /dev/null 2>&1;\ - if [ $$? -eq 0 ]; then \ - cverinfo=`$(CC) --version | sed -n '1p'`; \ - if [ -z "$${cverinfo}" ]; then \ - cverinfo=`$(CC) --version | sed -n '2p'`; \ - fi; \ - echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ - else \ - echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ - fi -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - @$(FC) --version > /dev/null 2>&1;\ - if [ $$? -eq 0 ]; then \ - fverinfo=`$(FC) --version | sed -n '1p'`; \ - if [ -z "$${fverinfo}" ]; then \ - fverinfo=`$(FC) --version | sed -n '2p'`; \ - fi; \ - echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ - else \ - echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ - fi -endif -ifneq ($(OSNAME), AIX) - @echo -n " Library Name ... $(LIBNAME)" -else - @echo " Library Name ... $(LIBNAME)" -endif - -ifndef SMP - @echo " (Single-threading) " -else - @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" -endif - -ifeq ($(DYNAMIC_ARCH), 1) - @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)" -endif - -ifeq ($(USE_OPENMP), 1) - @echo - @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " - @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads." - @echo -endif - -ifeq ($(OSNAME), Darwin) - @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:" - @echo - @echo "\"make PREFIX=/your_installation_path/ install\"." - @echo - @echo "(or set PREFIX in Makefile.rule and run make install." - @echo - @echo "Note that any flags passed to make during build should also be passed to make install" - @echo "to circumvent any install errors." - @echo - @echo "If you want to move the .dylib to a new location later, make sure you change" - @echo "the internal name of the dylib with:" - @echo - @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)" -endif - @echo - @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." - @echo - @echo "Note that any flags passed to make during build should also be passed to make install" - @echo "to circumvent any install errors." 
- @echo - -shared : libs netlib $(RELA) -ifneq ($(NO_SHARED), 1) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) -endif -ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), Darwin) - @$(MAKE) -C exports dyn - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib -endif -ifeq ($(OSNAME), WINNT) - @$(MAKE) -C exports dll -endif -ifeq ($(OSNAME), CYGWIN_NT) - @$(MAKE) -C exports dll -endif -endif - -tests : shared -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - touch $(LIBNAME) -ifndef NO_FBLAS - $(MAKE) -C test all -endif -endif -ifneq ($(ONLY_CBLAS), 1) - $(MAKE) -C utest all -endif -ifneq ($(NO_CBLAS), 1) -ifneq ($(ONLY_CBLAS), 1) - $(MAKE) -C ctest all -endif -ifeq ($(CPP_THREAD_SAFETY_TEST), 1) - $(MAKE) -C cpp_thread_test all -endif -endif - -libs : -ifeq ($(CORE), UNKNOWN) - $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) -endif -ifeq ($(NOFORTRAN), 1) - $(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.) -endif -ifeq ($(NO_STATIC), 1) -ifeq ($(NO_SHARED), 1) - $(error OpenBLAS: neither static nor shared are enabled.) -endif -endif - @for d in $(SUBDIRS) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done -#Save the config files for installation - @cp Makefile.conf Makefile.conf_last - @cp config.h config_last.h -ifdef QUAD_PRECISION - @echo "#define QUAD_PRECISION">> config_last.h -endif -ifeq ($(EXPRECISION), 1) - @echo "#define EXPRECISION">> config_last.h -endif -## -ifeq ($(DYNAMIC_ARCH), 1) - @$(MAKE) -C kernel commonlibs || exit 1 - @for d in $(DYNAMIC_CORE) ; \ - do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ - done - @echo DYNAMIC_ARCH=1 >> Makefile.conf_last -ifeq ($(DYNAMIC_OLDER), 1) - @echo DYNAMIC_OLDER=1 >> Makefile.conf_last -endif -endif - @echo TARGET=$(CORE) >> Makefile.conf_last -ifdef USE_THREAD - @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last -endif -ifdef SMP -ifdef NUM_THREADS - @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last -else - @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last -endif -endif -ifeq ($(USE_OPENMP),1) - @echo USE_OPENMP=1 >> Makefile.conf_last -endif -ifeq ($(INTERFACE64),1) - @echo INTERFACE64=1 >> Makefile.conf_last -endif - @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last - @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) - @touch lib.grd - -prof : prof_blas prof_lapack - -prof_blas : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) - for d in $(SUBDIRS) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d prof || exit 1 ; \ - fi; \ - done -ifeq ($(DYNAMIC_ARCH), 1) - $(MAKE) -C kernel commonprof || exit 1 -endif - -blas : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) - for d in $(BLASDIRS) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d libs || exit 1 ; \ - fi; \ - done - -hpl : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) - for d in $(BLASDIRS) ../laswp exports ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done -ifeq ($(DYNAMIC_ARCH), 1) - $(MAKE) -C kernel commonlibs || exit 1 - for d in $(DYNAMIC_CORE) ; \ - do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel 
TARGET_CORE=$$d kernel || exit 1 ;\ - done -endif - -hpl_p : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) - for d in $(SUBDIRS) ../laswp exports ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done - -netlib : lapack_prebuild -ifneq ($(NO_LAPACK), 1) - @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib - @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib -endif -ifneq ($(NO_LAPACKE), 1) - @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib -endif - -ifeq ($(NO_LAPACK), 1) -re_lapack : - -else -re_lapack : - @$(MAKE) -C relapack -endif - -prof_lapack : lapack_prebuild - @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof - -lapack_prebuild : -ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) - -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(F_COMPILER), GFORTRAN) - -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc -else - -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -endif - -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) - -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc -else - -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -endif - -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(F_COMPILER), GFORTRAN) - -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc -ifdef SMP -ifeq ($(OSNAME), WINNT) - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -else ifeq ($(OSNAME), Haiku) - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -else - -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -else - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -else - -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_LAPACK_DEPRECATED), 1) - -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_SINGLE), 1) - -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_DOUBLE), 1) - -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_COMPLEX), 1) - -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_COMPLEX16), 1) - -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif - -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc - -@cat make.inc >> 
$(NETLIB_LAPACK_DIR)/make.inc -endif - -large.tgz : -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - if [ ! -a $< ]; then - -wget http://www.netlib.org/lapack/timing/large.tgz; - fi -endif - -timing.tgz : -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - if [ ! -a $< ]; then - -wget http://www.netlib.org/lapack/timing/timing.tgz; - fi -endif - -lapack-timing : large.tgz timing.tgz -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) - (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) - $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING -endif - - -lapack-test : - (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc -ifneq ($(CROSS), 1) - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \ - ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) -endif - -lapack-runtest: lapack-test - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ - ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING ) - - -blas-test: - (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing - (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) - - -dummy : - -install : - $(MAKE) -f Makefile.install install - -clean :: - @for d in $(SUBDIRS_ALL) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done -#ifdef DYNAMIC_ARCH - @$(MAKE) -C kernel clean -#endif - @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 -ifeq ($(OSNAME), Darwin) - @rm -rf getarch.dSYM getarch_2nd.dSYM -endif - @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib - @rm -f cblas.tmp cblas.tmp2 - @touch $(NETLIB_LAPACK_DIR)/make.inc - @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean - @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h - @$(MAKE) -C relapack clean - @rm -f *.grd Makefile.conf_last config_last.h - @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) - @echo Done. 
diff --git a/c_check b/c_check index b018c10..13a7086 100755 --- a/c_check +++ b/c_check @@ -84,6 +84,7 @@ case "$data" in *ARCH_MIPS64*) architecture=mips64 ;; *ARCH_MIPS*) architecture=mips ;; *ARCH_ALPHA*) architecture=alpha ;; + *ARCH_SW_64*) architecture=sw_64 ;; *ARCH_SPARC*) architecture=sparc ;; *ARCH_IA64*) architecture=ia64 ;; *ARCH_ARM64*) architecture=arm64 ;; @@ -124,7 +125,7 @@ case "$architecture" in defined=1 ;; arm|arm64) defined=1 ;; - zarch|e2k|alpha|ia64|riscv64|loonarch64) + zarch|e2k|alpha|ia64|riscv64|loonarch64|sw_64) defined=1 BINARY=64 ;; @@ -232,6 +233,7 @@ case "$data" in *ARCH_MIPS64*) architecture=mips64 ;; *ARCH_MIPS*) architecture=mips ;; *ARCH_ALPHA*) architecture=alpha ;; + *ARCH_SW_64*) architecture=sw_64 ;; *ARCH_SPARC*) architecture=sparc ;; *ARCH_IA64*) architecture=ia64 ;; *ARCH_ARM64*) architecture=arm64 ;; diff --git a/common.h b/common.h index 4074df0..309c3f9 100644 --- a/common.h +++ b/common.h @@ -420,6 +420,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_alpha.h" #endif +#ifdef ARCH_SW_64 +#include "common_sw_64.h" +#endif + #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) #if __has_include() #include diff --git a/common_sw_64.h b/common_sw_64.h new file mode 100644 index 0000000..e14268e --- /dev/null +++ b/common_sw_64.h @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_SW_64 +#define COMMON_SW_64 + +#ifndef ASSEMBLER + +#define MB asm("memb") +#define WMB asm("memb") +#define RMB asm("memb") + +static void __inline blas_lock(unsigned long *address){ +#ifndef __DECC + unsigned long tmp1, tmp2,tmp3; + asm volatile( + "1: ldl %1, %0\n" + " bne %1, 2f\n" + " ldi %3, %0 \n" + " lldl %1, 0(%3)\n" + " ldi %2, 1 \n" + " wr_f %2 \n" + " or %1, 1, %2\n" + " memb\n " + " lstl %2, 0(%3)\n" + " rd_f %2\n" + " bne %1, 2f\n" + " beq %2, 2f\n" + " memb\n " + " br $31, 3f\n" + "2: br $31, 1b\n" + "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2),"=&r"(tmp3) : : "memory"); +#else + asm ( + "10:" + " ldl %t0, 0(%a0); " + " bne %t0, 20f; " + " ldi %t2, %a0" + " lldl %t0, 0(%t2); " + " ldi %t1, 1" + " wr_f %t1" + " or %t0, 1, %t1;" + " memb; " + " lstl %t1, 0(%t2); " + " rd_f %t1" + " bne %t0, 20f; " + " beq %t1, 20f; " + " memb; " + " br %r31,30f; " + "20: " + " br %r31,10b; " + "30:", address); +#endif +} +#define BLAS_LOCK_DEFINED + +static __inline unsigned int rpcc(void){ + + unsigned int r0; + +#ifndef __DECC + asm __volatile__("rtc %0" : "=r"(r0) : : "memory"); +#else + r0 = asm("rtc %v0"); +#endif + + return r0; +} +#define RPCC_DEFINED + + +#define HALT ldl $0, 0($0) + +#ifndef __DECC +#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) res = dasm("fmov $f1, %f0") +#endif + +#ifdef SMP +#ifdef USE64BITINT +static __inline long blas_quickdivide(long x, long y){ + return x/y; +} +#else +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + if (y <= 1) return x; + return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); +} +#endif +#endif + +#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13)) + +#ifndef PAGESIZE +#define PAGESIZE ( 8UL << 10) +#define HUGE_PAGESIZE ( 4 << 20) +#endif +#define BUFFER_SIZE (32UL << 20) + +#else + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#define PROLOGUE \ + .arch sw6; \ + .set noat; \ + .set noreorder; \ +.text; \ + .align 5; \ + .globl REALNAME; \ + .ent REALNAME; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE \ + ldgp $gp, 0($27); \ + ldi $28, _mcount; \ + jsr $28, ($28), _mcount; \ + .prologue 1 +#else +#define PROFCODE .prologue 0 +#endif + +#if defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif + +#define EPILOGUE \ + .end REALNAME; \ + .ident VERSION; \ + GNUSTACK + +#endif + +#ifdef DOUBLE +#define SXADDQ s8addl +#define SXSUBL s8subl +#define LD fldd +#define ST fstd +#define STQ stq +#define ADD faddd +#define SUB fsubd +#define MUL fmuld +#define DIV fdivd +#else +#define SXADDQ s4addl +#define SXSUBL s4subl +#define LD flds +#define ST fsts +#define STQ stl +#define ADD fadds +#define SUB fsubs +#define MUL fmuls +#define DIV fdivs +#endif +#endif diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index be8313e..1ab9bb8 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -1,14 +1,13 @@ -TOPDIR = .. 
-include $(TOPDIR)/Makefile.system +include ../Makefile.rule all :: dgemv_tester dgemm_tester dgemv_tester : - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester ./dgemv_tester dgemm_tester : dgemv_tester - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester ./dgemm_tester clean :: diff --git a/cpuid_sw_64.c b/cpuid_sw_64.c new file mode 100644 index 0000000..61ed28a --- /dev/null +++ b/cpuid_sw_64.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#if defined(__sw_64__) && defined(__DECC) +#include +#endif + +int implver(void){ + int arch; + +#ifndef __DECC + asm __volatile__("implver %0" : "=r"(arch) : : "memory"); +#else + arch = asm("implver %v0"); +#endif + return arch; +} + +void get_architecture(void){ + printf("SW_64"); +} + +void get_subarchitecture(void){ + printf("sw%d", implver() + 4); +} + +void get_subdirname(void){ + printf("sw_64"); +} + +char *get_corename(void){ + return "sw_64"; +} + +void get_cpuconfig(void){ + printf("#define SW%d\n", implver() + 4); + + switch (implver()){ + case 0: + printf("#define L1_DATA_SIZE 16384\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 32\n"); + printf("#define DTB_SIZE 8192\n"); + break; + + case 1: + printf("#define L1_DATA_SIZE 16384\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 8192\n"); + break; + + case 2: + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 8192\n"); + break; + } +} + +void get_libname(void){ + printf("sw%d\n", implver() + 4); +} diff --git a/ctest.c b/ctest.c index 2ccae8d..6b21d3a 100644 --- a/ctest.c +++ b/ctest.c @@ -137,6 +137,10 @@ ARCH_MIPS ARCH_ALPHA #endif +#ifdef __sw_64__ +ARCH_SW_64 +#endif + #if defined(__sparc) || defined(__sparc__) ARCH_SPARC #endif diff --git a/getarch.c b/getarch.c index 87384c0..306c389 100644 --- a/getarch.c +++ b/getarch.c @@ -1766,6 +1766,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __sw_64__ +#include "cpuid_sw_64.c" +#define OPENBLAS_SUPPORTED +#endif #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." 
@@ -1831,7 +1835,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1979,7 +1983,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__sw_64__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/interface/gbmv.c b/interface/gbmv.c index 1d58ba8..18aa50e 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -236,7 +236,12 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP } else { - +//ZYX20220118 +#ifndef TRANSA + memset(buffer, 0, nthreads*m*sizeof(FLOAT)); +#else + memset(buffer, 0, nthreads*n*sizeof(FLOAT)); +#endif (gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 0933736..111924b 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -398,12 +398,16 @@ ifndef DSWAPKERNEL DSWAPKERNEL = swap.S endif +#ZYX20220301 ifndef CSWAPKERNEL -CSWAPKERNEL = zswap.S +CSWAPKERNEL = zswap.c +#CSWAPKERNEL = zswap.S endif +#ZYX20220301 ifndef ZSWAPKERNEL -ZSWAPKERNEL = zswap.S +ZSWAPKERNEL = zswap.c +#ZSWAPKERNEL = zswap.S endif ifndef QSWAPKERNEL diff --git a/kernel/sw_64/KERNEL b/kernel/sw_64/KERNEL new file mode 100644 index 0000000..d10504b --- /dev/null +++ b/kernel/sw_64/KERNEL @@ -0,0 +1,176 @@ +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = max.S +endif + +ifndef DMINKERNEL +DMINKERNEL = max.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +#ZYX20220301 +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +#ZYX20220120 +ifndef ISMINKERNEL +ISMINKERNEL = amax.S +#ISMINKERNEL = imin.c +endif + +#ZYX20220120 +#ifndef ISMAXKERNEL +#ISMAXKERNEL = imax.c +#endif + +ifndef IDMINKERNEL +IDMINKERNEL = amax.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = copy.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = copy.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SGEMMKERNEL +SGEMMKERNEL = gemm_kernel_4x4.S +SGEMM_BETA = gemm_beta.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) 
+SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) +endif + +ifndef DGEMMKERNEL +DGEMMKERNEL = gemm_kernel_4x4.S +DGEMM_BETA = gemm_beta.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) +endif + +ifndef CGEMMKERNEL +CGEMMKERNEL = zgemm_kernel_2x2.S +CGEMM_BETA = zgemm_beta.S +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) +endif + +ifndef ZGEMMKERNEL +ZGEMMKERNEL = zgemm_kernel_2x2.S +ZGEMM_BETA = zgemm_beta.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) +endif + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = trsm_kernel_4x4_LN.S +endif +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = trsm_kernel_4x4_LT.S +endif +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = trsm_kernel_4x4_LT.S +endif +ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = trsm_kernel_4x4_RT.S +endif + +ifndef DTRSMKERNEL_LN +DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S +endif +ifndef DTRSMKERNEL_LT +DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S +endif +ifndef DTRSMKERNEL_RN +DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S +endif +ifndef DTRSMKERNEL_RT +DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S +endif +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S +endif +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S +endif +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S +endif + +ifndef ZTRSMKERNEL_LN +ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S +endif +ifndef ZTRSMKERNEL_LT +ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S +endif +ifndef ZTRSMKERNEL_RN +ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S +endif +ifndef ZTRSMKERNEL_RT +ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S +endif diff --git a/kernel/sw_64/Makefile b/kernel/sw_64/Makefile new file mode 100644 index 0000000..efae70d --- /dev/null +++ b/kernel/sw_64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/sw_64/amax.S b/kernel/sw_64/amax.S new file mode 100644 index 0000000..300a2f7 --- /dev/null +++ b/kernel/sw_64/amax.S @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + nop + .align 4 + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + fstd $f6, 32($sp) + fclr $f0 + sra N, 3, $1 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + unop + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addl X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addl X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addl X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addl X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addl X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addl X, INCX, X + ldi $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fselne $f16, $f12, $f4, $f4 + unop + fabs $f20, $f29 + fillcs 56 * SIZE(X) + + fselne $f17, $f13, $f5, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addl X, INCX, X + + fselne $f18, $f14, $f6, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + fselne $f19, $f15, $f28, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addl X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addl X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addl X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addl X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addl X, INCX, X + + fselne $f16, $f29, $f0, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addl X, INCX, X + + fselne $f17, $f30, $f1, $f1 + unop + CMPLT($f5, $f13), $f17 + ldi $1, -1($1) # i -- + + fselne $f18, $f10, $f2, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fselne $f19, $f11, $f3, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + fselne $f16, $f12, $f4, $f4 + fabs $f20, $f29 + fselne $f17, $f13, $f5, $f5 + fabs $f21, $f30 + + fselne $f18, $f14, $f6, $f6 + fabs $f22, $f10 + fselne $f19, $f15, $f28, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs 
$f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fselne $f16, $f29, $f0, $f0 + CMPLT($f4, $f12), $f16 + fselne $f17, $f30, $f1, $f1 + CMPLT($f5, $f13), $f17 + + fselne $f18, $f10, $f2, $f2 + CMPLT($f6, $f14), $f18 + fselne $f19, $f11, $f3, $f3 + CMPLT($f28, $f15), $f19 + + fselne $f16, $f12, $f4, $f4 + CMPLT($f0, $f1), $f16 + fselne $f17, $f13, $f5, $f5 + CMPLT($f2, $f3), $f17 + + fselne $f18, $f14, $f6, $f6 + CMPLT($f4, $f5), $f18 + fselne $f19, $f15, $f28, $f28 + CMPLT($f6, $f28), $f19 + + fselne $f16, $f1, $f0, $f0 + fselne $f17, $f3, $f2, $f2 + fselne $f18, $f5, $f4, $f4 + fselne $f19, $f28, $f6, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fselne $f16, $f2, $f0, $f0 + fselne $f17, $f6, $f4, $f0 + + CMPLT($f0, $f4), $f16 + fselne $f16, $f4, $f0, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fselne $f16, $f29, $f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/asum.S b/kernel/sw_64/asum.S new file mode 100644 index 0000000..54e7fcb --- /dev/null +++ b/kernel/sw_64/asum.S @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, $f24 + fmov $f24,s0 + ldw $31, PREFETCHSIZE * 2 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2,$f24 + fmov $f24,s2 + LD a7, 0 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3,$f24 + fmov $f24,s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, $f24 + fmov $f24,s0 + LD a1, 0 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2,$f24 + fmov $f24,s2 + LD a3, 0 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, $f24 + fmov $f24,s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0,$f24 + fmov $f24,s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1,$f24 + fmov $f24,s1 + LD a7, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, $f24 + fmov $f24,s2 + fabs a2, t2 + ADD s3, t3, $f24 + fmov $f24,s3 + fabs a3, t3 + + ADD s0, t0, $f24 + fmov $f24,s0 + fabs a4, t0 + ADD s1, t1,$f24 + fmov $f24,s1 + fabs a5, t1 + ADD s2, t2, $f24 + fmov $f24,s2 + fabs a6, t2 + ADD s3, t3, $f24 + fmov $f24,s3 + fabs a7, t3 + + ADD s1, t1,$f24 + fmov $f24,s1 + ADD s2, t2, $f24 + fmov $f24,s2 + ADD s3, t3, $f24 + fmov $f24,s3 + + ADD s0, s1, $f24 + fmov $f24,s0 + ADD s2, s3, $f24 + fmov $f24,s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2,$f24 + fmov $f24,s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, a0 + fmov a0,s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fabs a0, t0 + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0,$f24 + fmov $f24,s0 + ret + EPILOGUE diff --git a/kernel/sw_64/asum.S.bak b/kernel/sw_64/asum.S.bak new file mode 100644 index 0000000..faf7827 --- /dev/null +++ b/kernel/sw_64/asum.S.bak @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + fillcs PREFETCHSIZE * 2 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD 
a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fabs a0, t0 + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/sw_64/asum_simd.S b/kernel/sw_64/asum_simd.S new file mode 100644 index 0000000..f9152ec --- /dev/null +++ b/kernel/sw_64/asum_simd.S @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + cmpeq INCX, 1, $3 + beq $3, $Sub + .align 4 + +/* + Unloop 16 +*/ + +/** + test the address of X +**/ + and X, (VEC_LEN*SIZE-1), $4 + nop + nop + beq $4, $Align + +/** + process the unalign address of X +**/ + +/*if N is too small(less then unroll size), don't need process unalign X. 
Just jump to remain section.*/ + sra N, 4, I + fclr s1 + fclr s2 + ble I, $Remain + + sra $4, BASE_SHIFT, $4 + ldi $3, VEC_LEN + subl $3, $4, $4 + nop + +$UnAlign_X_Loop: + LD a0, 0 * SIZE(X) + addl X, SIZE, X + fabs a0, t0 + subl $4, 1, $4 + + ADD s0, t0, s0 + subl N, 1, N + nop + bgt $4, $UnAlign_X_Loop + +$Align: + sra N, 4, I + fclr s1 + fclr s2 + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t0 + VLD a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t1 + + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t2 + VLD a3, 3*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t3 + + subl I, 1, I + addl X, 16*SIZE, X + unop + ble I, $MainLoopEnd + +$MainLoop: + + vcpys $f31, a0, a4 + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, a1, a5 + VLD a1, 1*VEC_LEN*SIZE(X) + + vcpys $f31, a2, a6 + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, a3, a7 + VLD a3, 3*VEC_LEN*SIZE(X) + + VADD t0, a4, t0 + subl I, 1, I + VADD t1, a5, t1 + fillcs PREFETCHSIZE * SIZE(X) + + VADD t2, a6, t2 + addl X, 16*SIZE, X + VADD t3, a7, t3 + bgt I, $MainLoop + +$MainLoopEnd: + /*fabs*/ + + vcpys $f31, a0, a4 + vcpys $f31, a1, a5 + vcpys $f31, a2, a6 + vcpys $f31, a3, a7 + + VADD t0, a4, t0 + VADD t1, a5, t1 + VADD t2, a6, t2 + VADD t3, a7, t3 + + VADD t0, t1, t0 + VADD t2, t3, t2 + VADD t0, t2, t0 + nop + + vextf t0, 1, s1 + vextf t0, 2, s2 + vextf t0, 3, s3 + nop + + /*sum*/ + ADD t0, s1, t0 + ADD s2, s3, s2 + ADD s0, t0, s0 + nop +$Remain: + and N, 15, I + ADD s0, s2, s0 + unop + ble I, $End + .align 4 + +$RemainLoop: + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fabs a0, t0 + ldi I, -1(I) + + ADD s0, t0, s0 + bne I, $RemainLoop + .align 4 + +$End: + ret + + +$Sub: + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + fillcs PREFETCHSIZE * 2 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fabs a0, t0 + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/sw_64/axpy.S b/kernel/sw_64/axpy.S new file mode 100644 index 0000000..70e97d6 --- /dev/null +++ b/kernel/sw_64/axpy.S @@ -0,0 +1,428 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldl $24, 0($sp) + fmov $f19, $f30 + ldl $23, 8($sp) + ldi $sp, -16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + nop + sra $16, 3, $1 + fstd $f2, 0($sp) + cmpeq $21, 1, $3 + + fstd $f3, 8($sp) + cmpeq $23, 1, $4 + and $16, 7, $2 + ble $16, $End + + and $3, $4, $3 + fbeq $f30, $End + + beq $3, $Sub + ble $1, $Remain + .align 4 + + LD $f10, 0*SIZE($20) + LD $f11, 1*SIZE($20) + LD $f12, 2*SIZE($20) + LD $f13, 3*SIZE($20) + + LD $f18, 0*SIZE($24) + LD $f19, 1*SIZE($24) + LD $f20, 2*SIZE($24) + LD $f21, 3*SIZE($24) + + LD $f14, 4*SIZE($20) + LD $f15, 5*SIZE($20) + LD $f16, 6*SIZE($20) + LD $f17, 7*SIZE($20) + + LD $f22, 4*SIZE($24) + LD $f23, 5*SIZE($24) + LD $f24, 6*SIZE($24) + LD $f25, 7*SIZE($24) + + subl $1, 1, $1 + addl $20, 8*SIZE, $20 + unop + ble $1, $LoopEnd + .align 4 + +$Loop: + fillcs PREFETCHSIZE * SIZE($24) + fillcs PREFETCHSIZE * SIZE($20) + + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0*SIZE($20) + MUL $f30, $f11, $f27 + LD $f11, 1*SIZE($20) + + MUL $f30, $f12, $f28 + LD $f12, 2*SIZE($20) + MUL $f30, $f13, $f29 + LD $f13, 3*SIZE($20) + + ADD $f18, $f26, $f0 + LD $f18, 8*SIZE($24) + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 4*SIZE($20) + + ADD $f19, $f27, $f1 + LD $f19, 9*SIZE($24) + MUL $f30, $f15, $f27 + LD $f15, 5*SIZE($20) + + ADD $f20, $f28, $f2 + LD $f20, 10*SIZE($24) + MUL $f30, $f16, $f28 + LD $f16, 6*SIZE($20) + + ADD $f21, $f29, $f3 + LD $f21, 11*SIZE($24) + MUL $f30, $f17, $f29 + LD $f17, 7*SIZE($20) + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + LD $f22, 12*SIZE($24) + LD $f23, 13*SIZE($24) + LD $f24, 14*SIZE($24) + LD $f25, 15*SIZE($24) + + ST $f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) + + subl $1, 1, $1 + addl $24, 8*SIZE, $24 + addl $20, 8*SIZE, $20 + bgt $1, $Loop + .align 4 + +$LoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + ST $f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) + addl $24, 8*SIZE, $24 + .align 4 + +$Remain: + ble $2, $End + .align 4 + +$RemainLoop: + LD $f10, 0*SIZE($20) + LD $f11, 0*SIZE($24) + addl $20, SIZE, $20 + addl $24, SIZE, $24 + + MUL $f30, $f10, $f12 + subl $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, -1*SIZE($24) + bgt $2, $RemainLoop + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + ldi $sp, 16($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + subl $1, 1, $4 + ble $1, $SubRemain + .align 4 + + LD $f10, 0($20) + SXADDQ $21, $20, $20 + + LD $f11, 0($20) + SXADDQ $21, $20, $20 + LD $f12, 0($20) + SXADDQ $21, $20, $20 + + LD $f13, 0($20) + SXADDQ $21, $20, $20 + LD $f18, 0($24) + SXADDQ $23, $24, $22 + + LD $f19, 0($22) + SXADDQ $23, $22, $22 + LD $f20, 0($22) + SXADDQ $23, $22, $22 + + LD $f21, 
0($22) + SXADDQ $23, $22, $22 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + LD $f15, 0($20) + SXADDQ $21, $20, $20 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + LD $f17, 0($20) + SXADDQ $21, $20, $20 + LD $f22, 0($22) + SXADDQ $23, $22, $22 + + LD $f23, 0($22) + SXADDQ $23, $22, $22 + LD $f24, 0($22) + SXADDQ $23, $22, $22 + + LD $f25, 0($22) + SXADDQ $23, $22, $22 + unop + ble $4, $SubLoopEnd + .align 4 + +$SubLoop: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f11, $f27 + LD $f11, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f12, $f28 + LD $f12, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f13, $f29 + LD $f13, 0($20) + unop + SXADDQ $21, $20, $20 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + LD $f15, 0($20) + SXADDQ $21, $20, $20 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + LD $f17, 0($20) + SXADDQ $21, $20, $20 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ADD $f22, $f26, $f0 + unop + + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ADD $f23, $f27, $f1 + unop + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ADD $f24, $f28, $f2 + unop + + ST $f3, 0($24) + SXADDQ $23, $24, $24 + ADD $f25, $f29, $f3 + unop + + LD $f18, 0($22) + SXADDQ $23, $22, $22 + LD $f19, 0($22) + SXADDQ $23, $22, $22 + + LD $f20, 0($22) + SXADDQ $23, $22, $22 + LD $f21, 0($22) + SXADDQ $23, $22, $22 + + LD $f22, 0($22) + SXADDQ $23, $22, $22 + LD $f23, 0($22) + SXADDQ $23, $22, $22 + + LD $f24, 0($22) + SXADDQ $23, $22, $22 + LD $f25, 0($22) + SXADDQ $23, $22, $22 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + subl $4, 1, $4 + bgt $4, $SubLoop + .align 4 + +$SubLoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + ADD $f22, $f26, $f0 + ADD $f23, $f27, $f1 + ADD $f24, $f28, $f2 + ADD $f25, $f29, $f3 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + .align 4 + +$SubRemain: + ble $2, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0($20) + LD $f11, 0($24) + SXADDQ $21, $20, $20 + + MUL $f30, $f10, $f12 + subl $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, 0($24) + SXADDQ $23, $24, $24 + + bgt $2, $SubRemainLoop + .align 4 + +$SubEnd: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + ldi $sp, 16($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/axpy_simd.S b/kernel/sw_64/axpy_simd.S new file mode 100644 index 0000000..3a2219c --- /dev/null +++ b/kernel/sw_64/axpy_simd.S @@ -0,0 +1,655 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + +#define PREFETCHSIZE 80 +// #define PREFETCH_DISTANCE_BYTES 384 + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldl $24, 0($sp) + fmov $f19, $f30 + ldl $23, 8($sp) + ldi $sp, -16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fstd $f2, 0($sp) + cmpeq $21, 1, $3 + fstd $f3, 8($sp) + cmpeq $23, 1, $4 + + ble $16, $End + fbeq $f30, $End + and $3, $4, $3 + beq $3, $Sub + +/** + test the address of Y +**/ + and $24, (VEC_LEN*SIZE-1), $4 + nop + nop + beq $4, $Align_Y_Access + .align 4 +/** + process the unalign address of Y +**/ + + sra $16, 4, $1 + and $16, 15, $2 + sra $4, BASE_SHIFT, $4 + ble $1, $Remain /*if N is too small(less then unroll size), don't need process unalign Y. 
Just jump to remain section.*/ + + ldi $3, VEC_LEN + subl $3, $4, $4 + +$UnAlign_Y_Loop: + LD $f10, 0*SIZE($20) + LD $f11, 0*SIZE($24) + addl $20, SIZE, $20 + addl $24, SIZE, $24 + + MAD $f30, $f10, $f11, $f13 + subl $4, 1, $4 + subl $16, 1, $16 + ST $f13, -1*SIZE($24) + bgt $4, $UnAlign_Y_Loop + .align 4 + + +$Align_Y_Access: + + nop + sra $16, 4, $1 + and $16, 15, $2 + ble $1, $Remain + +/** + test the address of X +**/ + + and $20, (VEC_LEN*SIZE-1), $3 + nop + nop + bne $3, $UnAlign_X_Access + + .align 4 +$Align_Access: +/*** + extern alpha from $f30 to vector 4 in $f13 + unloop 16 +***/ + vcpyf $f30, $f13 + + VLD $f10, 0*VEC_LEN*SIZE($20) +/* + LD $f10, 0*SIZE($20) + LD $f11, 1*SIZE($20) + LD $f12, 2*SIZE($20) + LD $f13, 3*SIZE($20) +*/ + VLD $f18, 0*VEC_LEN*SIZE($24) +/* + LD $f18, 0*SIZE($24) + LD $f19, 1*SIZE($24) + LD $f20, 2*SIZE($24) + LD $f21, 3*SIZE($24) +*/ + VLD $f14, 1*VEC_LEN*SIZE($20) + VLD $f15, 2*VEC_LEN*SIZE($20) + VLD $f16, 3*VEC_LEN*SIZE($20) +/* + LD $f14, 4*SIZE($20) + LD $f15, 5*SIZE($20) + LD $f16, 6*SIZE($20) + LD $f17, 7*SIZE($20) +*/ + VLD $f22, 1*VEC_LEN*SIZE($24) + VLD $f23, 2*VEC_LEN*SIZE($24) + VLD $f24, 3*VEC_LEN*SIZE($24) +/* + LD $f22, 4*SIZE($24) + LD $f23, 5*SIZE($24) + LD $f24, 6*SIZE($24) + LD $f25, 7*SIZE($24) +*/ + + subl $1, 1, $1 + addl $20, 16*SIZE, $20 + unop + ble $1, $LoopEnd + .align 4 + +$Loop: + + fillcs PREFETCHSIZE * SIZE($24) + fillcs PREFETCHSIZE * SIZE($20) +/* + fillcs PREFETCH_DISTANCE_BYTES($24) + fillcs PREFETCH_DISTANCE_BYTES($20) +*/ + + VMAD $f13, $f10, $f18, $f0 + VLD $f10, 0*VEC_LEN*SIZE($20) + VLD $f18, 4*VEC_LEN*SIZE($24) +/* + MAD $f30, $f10, $f18, $f0 # y += alpha * x + LD $f10, 0*SIZE($20) + MAD $f30, $f11, $f19, $f1 + LD $f11, 1*SIZE($20) + + MAD $f30, $f12, $f20, $f2 + LD $f12, 2*SIZE($20) + MAD $f30, $f13, $f21, $f3 + LD $f13, 3*SIZE($20) +*/ + + VMAD $f13, $f14, $f22, $f26 + VLD $f14, 1*VEC_LEN*SIZE($20) + VLD $f22, 5*VEC_LEN*SIZE($24) + + VMAD $f13, $f15, $f23, $f27 + VLD $f15, 2*VEC_LEN*SIZE($20) + VLD $f23, 6*VEC_LEN*SIZE($24) + + VMAD $f13, $f16, $f24, $f28 + VLD $f16, 3*VEC_LEN*SIZE($20) + VLD $f24, 7*VEC_LEN*SIZE($24) +/* + MAD $f30, $f14, $f22, $f26 # y += alpha * x + LD $f14, 4*SIZE($20) + MAD $f30, $f15, $f23, $f27 + LD $f15, 5*SIZE($20) + + MAD $f30, $f16, $f24, $f28 + LD $f16, 6*SIZE($20) + MAD $f30, $f17, $f25, $f29 + LD $f17, 7*SIZE($20) +*/ + +/* + LD $f18, 8*SIZE($24) + LD $f19, 9*SIZE($24) + LD $f20, 10*SIZE($24) + LD $f21, 11*SIZE($24) + + LD $f22, 12*SIZE($24) + LD $f23, 13*SIZE($24) + LD $f24, 14*SIZE($24) + LD $f25, 15*SIZE($24) +*/ + + + + VST $f0, 0*VEC_LEN*SIZE($24) + VST $f26, 1*VEC_LEN*SIZE($24) + VST $f27, 2*VEC_LEN*SIZE($24) + VST $f28, 3*VEC_LEN*SIZE($24) +/* + ST $f0, 0*SIZE($24) + ST $f1, 1*SIZE($24) + ST $f2, 2*SIZE($24) + ST $f3, 3*SIZE($24) + + ST $f26, 4*SIZE($24) + ST $f27, 5*SIZE($24) + ST $f28, 6*SIZE($24) + ST $f29, 7*SIZE($24) +*/ + subl $1, 1, $1 + addl $24, 16*SIZE, $24 + addl $20, 16*SIZE, $20 + bgt $1, $Loop + .align 4 + +$LoopEnd: + VMAD $f13, $f10, $f18, $f0 + VST $f0, 0*VEC_LEN*SIZE($24) + VMAD $f13, $f14, $f22, $f26 + VST $f26, 1*VEC_LEN*SIZE($24) + VMAD $f13, $f15, $f23, $f27 + VST $f27, 2*VEC_LEN*SIZE($24) + VMAD $f13, $f16, $f24, $f28 + VST $f28, 3*VEC_LEN*SIZE($24) + +/* + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + 
MUL $f30, $f17, $f29 + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + ST $f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) +*/ + addl $24, 16*SIZE, $24 + + .align 4 + +$Remain: + ble $2, $End + + .align 4 + +$RemainLoop: + LD $f10, 0*SIZE($20) + LD $f11, 0*SIZE($24) + addl $20, SIZE, $20 + addl $24, SIZE, $24 + + MAD $f30, $f10, $f11, $f13 + subl $2, 1, $2 + ST $f13, -1*SIZE($24) + bgt $2, $RemainLoop + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + ldi $sp, 16($sp) + ret + .align 4 + +$UnAlign_X_Access: +/*** + extern alpha from $f30 to vector 4 in $f13 + unloop 16 + unalign access X + align access Y +***/ + vcpyf $f30, $f13 + VLD_UL $f10, 0*VEC_LEN*SIZE($20) + VLD_UH $f2, 1*VEC_LEN*SIZE($20) + + VLD_UL $f14, 1*VEC_LEN*SIZE($20) + VLD_UH $f3, 2*VEC_LEN*SIZE($20) + + VLD_UL $f15, 2*VEC_LEN*SIZE($20) + VLD_UH $f11, 3*VEC_LEN*SIZE($20) + + VLD_UL $f16, 3*VEC_LEN*SIZE($20) + VLD_UH $f12, 4*VEC_LEN*SIZE($20) + + VLD $f18, 0*VEC_LEN*SIZE($24) + VLD $f22, 1*VEC_LEN*SIZE($24) + VLD $f23, 2*VEC_LEN*SIZE($24) + VLD $f24, 3*VEC_LEN*SIZE($24) + + vbisw $f10, $f2, $f10 + vbisw $f14, $f3, $f14 + vbisw $f15, $f11, $f15 + vbisw $f16, $f12, $f16 + + + subl $1, 1, $1 + addl $20, 16*SIZE, $20 + unop + ble $1, $UnAlign_X_LoopEnd + .align 4 + +$UnAlign_X_Loop: + + fillcs PREFETCHSIZE * SIZE($24) + fillcs PREFETCHSIZE * SIZE($20) + + VMAD $f13, $f10, $f18, $f0 + VLD_UL $f10, 0*VEC_LEN*SIZE($20) + VLD_UH $f2, 1*VEC_LEN*SIZE($20) + + + VMAD $f13, $f14, $f22, $f26 + VLD_UL $f14, 1*VEC_LEN*SIZE($20) + VLD_UH $f3, 2*VEC_LEN*SIZE($20) + + VMAD $f13, $f15, $f23, $f27 + VLD_UL $f15, 2*VEC_LEN*SIZE($20) + VLD_UH $f11, 3*VEC_LEN*SIZE($20) + + VMAD $f13, $f16, $f24, $f28 + VLD_UL $f16, 3*VEC_LEN*SIZE($20) + VLD_UH $f12, 4*VEC_LEN*SIZE($20) + + + + + VLD $f18, 4*VEC_LEN*SIZE($24) + vbisw $f10, $f2, $f10 + VLD $f22, 5*VEC_LEN*SIZE($24) + vbisw $f14, $f3, $f14 + VLD $f23, 6*VEC_LEN*SIZE($24) + vbisw $f15, $f11, $f15 + VLD $f24, 7*VEC_LEN*SIZE($24) + vbisw $f16, $f12, $f16 + + + VST $f0, 0*VEC_LEN*SIZE($24) + VST $f26, 1*VEC_LEN*SIZE($24) + VST $f27, 2*VEC_LEN*SIZE($24) + VST $f28, 3*VEC_LEN*SIZE($24) + + + subl $1, 1, $1 + addl $24, 16*SIZE, $24 + addl $20, 16*SIZE, $20 + bgt $1, $UnAlign_X_Loop + .align 4 + +$UnAlign_X_LoopEnd: + VMAD $f13, $f10, $f18, $f0 + VST $f0, 0*VEC_LEN*SIZE($24) + VMAD $f13, $f14, $f22, $f26 + VST $f26, 1*VEC_LEN*SIZE($24) + VMAD $f13, $f15, $f23, $f27 + VST $f27, 2*VEC_LEN*SIZE($24) + VMAD $f13, $f16, $f24, $f28 + VST $f28, 3*VEC_LEN*SIZE($24) + + addl $24, 16*SIZE, $24 + + .align 4 + +$UnAlign_X_Remain: + ble $2, $UnAlign_X_End + + .align 4 + +$UnAlign_X_RemainLoop: + LD $f10, 0*SIZE($20) + LD $f11, 0*SIZE($24) + addl $20, SIZE, $20 + addl $24, SIZE, $24 + + MAD $f30, $f10, $f11, $f13 + subl $2, 1, $2 + ST $f13, -1*SIZE($24) + bgt $2, $UnAlign_X_RemainLoop + .align 4 + +$UnAlign_X_End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + ldi $sp, 16($sp) + ret + .align 4 + + +$Sub: + sra $16, 3, $1 + and $16, 7, $2 + SXSUBL $16, SIZE, $22 + subl $1, 1, $4 + + ble $1, $SubRemain + .align 4 + + LD $f10, 0($20) + SXADDQ $21, $20, $20 + + LD $f11, 0($20) + SXADDQ $21, $20, $20 + LD $f12, 0($20) + SXADDQ $21, $20, $20 + + LD $f13, 0($20) + SXADDQ $21, $20, $20 + LD $f18, 0($24) + SXADDQ $23, $24, $22 + + LD $f19, 0($22) + SXADDQ $23, $22, $22 + LD $f20, 0($22) + SXADDQ $23, $22, $22 + + LD $f21, 0($22) + SXADDQ $23, $22, $22 + LD $f14, 
0($20) + SXADDQ $21, $20, $20 + + LD $f15, 0($20) + SXADDQ $21, $20, $20 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + LD $f17, 0($20) + SXADDQ $21, $20, $20 + LD $f22, 0($22) + SXADDQ $23, $22, $22 + + LD $f23, 0($22) + SXADDQ $23, $22, $22 + LD $f24, 0($22) + SXADDQ $23, $22, $22 + + LD $f25, 0($22) + SXADDQ $23, $22, $22 + unop + ble $4, $SubLoopEnd + .align 4 + +$SubLoop: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f11, $f27 + LD $f11, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f12, $f28 + LD $f12, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f13, $f29 + LD $f13, 0($20) + unop + SXADDQ $21, $20, $20 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + LD $f15, 0($20) + SXADDQ $21, $20, $20 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + LD $f17, 0($20) + SXADDQ $21, $20, $20 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ADD $f22, $f26, $f0 + unop + + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ADD $f23, $f27, $f1 + unop + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ADD $f24, $f28, $f2 + unop + + ST $f3, 0($24) + SXADDQ $23, $24, $24 + ADD $f25, $f29, $f3 + unop + + LD $f18, 0($22) + SXADDQ $23, $22, $22 + LD $f19, 0($22) + SXADDQ $23, $22, $22 + + LD $f20, 0($22) + SXADDQ $23, $22, $22 + LD $f21, 0($22) + SXADDQ $23, $22, $22 + + LD $f22, 0($22) + SXADDQ $23, $22, $22 + LD $f23, 0($22) + SXADDQ $23, $22, $22 + + LD $f24, 0($22) + SXADDQ $23, $22, $22 + LD $f25, 0($22) + SXADDQ $23, $22, $22 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + subl $4, 1, $4 + bgt $4, $SubLoop + .align 4 + +$SubLoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + ADD $f22, $f26, $f0 + ADD $f23, $f27, $f1 + ADD $f24, $f28, $f2 + ADD $f25, $f29, $f3 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + .align 4 + +$SubRemain: + ble $2, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0($20) + LD $f11, 0($24) + SXADDQ $21, $20, $20 + + MUL $f30, $f10, $f12 + subl $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, 0($24) + SXADDQ $23, $24, $24 + + bgt $2, $SubRemainLoop + .align 4 + +$SubEnd: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + ldi $sp, 16($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S new file mode 100644 index 0000000..3f9ed2c --- /dev/null +++ b/kernel/sw_64/cabs.S @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl NAME + .ent NAME +NAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $28, _mcount + jsr $28, ($28), _mcount +#endif + + LD $f10, 0($16) + LD $f11, SIZE($16) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fabs $f10, $f12 + fabs $f11, $f0 + ADD $f12, $f0, $f29 + fmov $f29, $f0 + ret + .end NAME + .ident VERSION diff --git a/kernel/sw_64/cabs.S.bak b/kernel/sw_64/cabs.S.bak new file mode 100644 index 0000000..5fa27af --- /dev/null +++ b/kernel/sw_64/cabs.S.bak @@ -0,0 +1,71 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl NAME + .ent NAME +NAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + + LD $f10, 0($16) + LD $f11, SIZE($16) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fabs $f10, $f12 + fabs $f11, $f0 + ADD $f12, $f0, $f0 + ret + .end NAME + .ident VERSION diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S new file mode 100644 index 0000000..25eab03 --- /dev/null +++ b/kernel/sw_64/cnrm2.S @@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stl $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, $f25 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, $f26 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, $f27 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, $f28 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd $f25, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd $f26, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd $f27, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd $f28, t3, a3 + unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, $f25 + unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1, $f26 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, $f27 + unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, $f28 + unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd $f25, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd $f26, t1, a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd $f27, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd $f28, t3, a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0, $f25 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, $f26 + unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, $f27 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, $f28 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd $f25, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd $f26, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd $f27, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd $f28, t3, a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, $f25 + fmuld x0, x0, t0 + faddd a1, t1, $f26 + fmuld x1, x1, t1 + + faddd a2, t2, $f27 + fmuld x2, x2, t2 + faddd a3, t3, $f28 + fmuld x3, x3, t3 + + faddd $f25, t0, a0 + fmuld x4, x4, t0 + faddd $f26, t1, a1 + fmuld x5, x5, t1 + + faddd $f27, t2, a2 + fmuld x6, x6, t2 + faddd $f28, t3, a2 + fmuld x7, x7, t3 + + faddd a2, t2, $f27 + fmov $f27, a2 + faddd a3, t3, $f28 + fmov $f28, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + 
ldi X, 2 * SIZE(X) + + faddd a0, t0, $f25 + fmov $f25, a0 + fmuld x0, x0, t0 + faddd a1, t1, $f26 + fmov $f26, a1 + fmuld x1, x1, t1 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + ldi I, -1(I) + LD x5, 1 * SIZE(X) + addl X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, $f25 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, $f26 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + unop + + faddd a2, t2, $f27 + LD x1, 1 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, $f28 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + unop + + faddd $f25, t0, a0 + LD x3, 1 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd $f26, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + ldi I, -1(I) + + faddd $f27, t2, a2 + LD x5, 1 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd $f28, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, $f25 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, $f26 + fmuld x1, x1, t1 + faddd a2, t2, $f27 + fmuld x2, x2, t2 + + faddd a3, t3, $f28 + fmuld x3, x3, t3 + faddd $f25, t0, a0 + fmuld x4, x4, t0 + + faddd $f26, t1, a1 + fmuld x5, x5, t1 + faddd $f27, t2, a2 + fmuld x6, x6, t2 + + faddd $f28, t3, a3 + fmuld x7, x7, t3 + faddd a2, t2, $f27 + fmov $f27, a2 + faddd a3, t3, $f28 + fmov $f28, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + ldi I, -1(I) + LD x1, 1 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, $f25 + fmov $f25, a0 + fmuld x0, x0, t0 + faddd a1, t1, $f26 + fmov $f26, a1 + fmuld x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, $f25 + fmov $f25, a0 + faddd a1, t1, $f26 + fmov $f26, a1 + + faddd a0, a1, $f25 + fmov $f25, a0 + faddd a2, a3, $f26 + fmov $f26, a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2, $f25 + fmov $f25, a0 + fsqrtd a0, $f25 + fmov $f25, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/cnrm2.S.bak b/kernel/sw_64/cnrm2.S.bak new file mode 100644 index 0000000..b2e80e0 --- /dev/null +++ b/kernel/sw_64/cnrm2.S.bak @@ -0,0 +1,426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, a0 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1, a1 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd a1, t1, a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd a3, t3, a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + 
faddd a0, t0, a0 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd a3, t3, a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + faddd a2, t2, a2 + fmuld x2, x2, t2 + faddd a3, t3, a3 + fmuld x3, x3, t3 + + faddd a0, t0, a0 + fmuld x4, x4, t0 + faddd a1, t1, a1 + fmuld x5, x5, t1 + + faddd a2, t2, a2 + fmuld x6, x6, t2 + faddd a3, t3, a3 + fmuld x7, x7, t3 + + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + ldi X, 2 * SIZE(X) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + ldi I, -1(I) + LD x5, 1 * SIZE(X) + addl X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, a0 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + unop + + faddd a2, t2, a2 + LD x1, 1 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + unop + + faddd a0, t0, a0 + LD x3, 1 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + ldi I, -1(I) + + faddd a2, t2, a2 + LD x5, 1 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, a0 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + fmuld x1, x1, t1 + faddd a2, t2, a2 + fmuld x2, x2, t2 + + faddd a3, t3, a3 + fmuld x3, x3, t3 + faddd a0, t0, a0 + fmuld x4, x4, t0 + + faddd a1, t1, a1 + fmuld x5, x5, t1 + faddd a2, t2, a2 + fmuld x6, x6, t2 + + faddd a3, t3, a3 + fmuld x7, x7, t3 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + ldi I, -1(I) + LD x1, 1 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, a0 + faddd a1, t1, a1 + + faddd a0, a1, a0 + faddd a2, a3, a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2, a0 + fsqrtd a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S new file mode 100644 index 0000000..c960ac1 --- /dev/null +++ b/kernel/sw_64/copy.S @@ -0,0 +1,379 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + cmpeq INCX, 1, $0 + ble N, $End +#ifndef COMPLEX + sra N, 4, $4 +#else + sra N, 3, $4 +#endif + cmpeq INCY, 1, $1 + + and $0, $1, $0 + beq $0, $Sub +#ifndef COMPLEX + and N, 15, $5 +#else + and N, 7, $5 +#endif + ble $4, $Remain + + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + LD $f12, 2*SIZE(X) + LD $f13, 3*SIZE(X) + LD $f14, 4*SIZE(X) + LD $f15, 5*SIZE(X) + LD $f16, 6*SIZE(X) + LD $f17, 7*SIZE(X) + + LD $f18, 8*SIZE(X) + LD $f19, 9*SIZE(X) + LD $f20, 10*SIZE(X) + LD $f21, 11*SIZE(X) + LD $f22, 12*SIZE(X) + LD $f23, 13*SIZE(X) + LD $f24, 14*SIZE(X) + LD $f25, 15*SIZE(X) + + subl $4, 1, $4 + ldi X, 16*SIZE(X) + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ST $f12, 2*SIZE(Y) + ST $f13, 3*SIZE(Y) + + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + LD $f12, 2*SIZE(X) + LD $f13, 3*SIZE(X) + + ST $f14, 4*SIZE(Y) + ST $f15, 5*SIZE(Y) + ST $f16, 6*SIZE(Y) + ST $f17, 7*SIZE(Y) + + LD $f14, 4*SIZE(X) + LD $f15, 5*SIZE(X) + LD $f16, 6*SIZE(X) + LD $f17, 7*SIZE(X) + + ST $f18, 8*SIZE(Y) + ST $f19, 9*SIZE(Y) + ST $f20, 10*SIZE(Y) + ST $f21, 11*SIZE(Y) + + LD $f18, 8*SIZE(X) + LD $f19, 9*SIZE(X) + LD $f20, 10*SIZE(X) + LD $f21, 11*SIZE(X) + + ST $f22, 12*SIZE(Y) + ST $f23, 13*SIZE(Y) + ST $f24, 14*SIZE(Y) + ST $f25, 15*SIZE(Y) + + LD $f22, 12*SIZE(X) + LD $f23, 13*SIZE(X) + LD $f24, 14*SIZE(X) + LD $f25, 15*SIZE(X) + + subl $4, 1, $4 + ldi Y, 16*SIZE(Y) + ldi X, 16*SIZE(X) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ST 
$f12, 2*SIZE(Y) + ST $f13, 3*SIZE(Y) + ST $f14, 4*SIZE(Y) + ST $f15, 5*SIZE(Y) + ST $f16, 6*SIZE(Y) + ST $f17, 7*SIZE(Y) + + ST $f18, 8*SIZE(Y) + ST $f19, 9*SIZE(Y) + ST $f20, 10*SIZE(Y) + ST $f21, 11*SIZE(Y) + ST $f22, 12*SIZE(Y) + ST $f23, 13*SIZE(Y) + ST $f24, 14*SIZE(Y) + ST $f25, 15*SIZE(Y) + + ldi Y, 16*SIZE(Y) + .align 4 + +$Remain: + ble $5, $End + .align 4 + +$RemainLoop: +#ifndef COMPLEX + LD $f10, 0*SIZE(X) + ldi X, 1*SIZE(X) + ST $f10, 0*SIZE(Y) + ldi Y, 1*SIZE(Y) +#else + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + ldi X, 2*SIZE(X) + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ldi Y, 2*SIZE(Y) +#endif + subl $5, 1, $5 + bgt $5, $RemainLoop + .align 4 +$End: + ret + .align 4 + +$Sub: +#ifdef COMPLEX + addl INCX, INCX, INCX + addl INCY, INCY, INCY + and N, 7, $5 +#else + and N, 15, $5 +#endif + ble $4, $SubRemain + .align 4 + +$SubMainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + LD $f11, 0(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + SXADDQ INCX, X, X + LD $f13, 0(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + SXADDQ INCX, X, X + LD $f15, 0(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + SXADDQ INCX, X, X + LD $f17, 0(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + SXADDQ INCX, X, X + LD $f19, 0(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + SXADDQ INCX, X, X + LD $f21, 0(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + SXADDQ INCX, X, X + LD $f23, 0(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + SXADDQ INCX, X, X + LD $f25, 0(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + SXADDQ INCY, Y, Y + ST $f11, 0(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + SXADDQ INCY, Y, Y + ST $f13, 0(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + SXADDQ INCY, Y, Y + ST $f15, 0(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + SXADDQ INCY, Y, Y + ST $f17, 0(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + SXADDQ INCY, Y, Y + ST $f19, 0(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + SXADDQ INCY, Y, Y + ST $f21, 0(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + SXADDQ INCY, Y, Y + ST $f23, 0(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + SXADDQ INCY, Y, Y + ST $f25, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + LD $f13, SIZE(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + LD $f15, SIZE(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + LD $f17, SIZE(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + LD $f19, SIZE(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + LD $f21, SIZE(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + LD $f23, SIZE(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + LD $f25, SIZE(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + ST $f13, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + ST $f15, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + ST $f17, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + ST $f19, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + ST $f21, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + ST $f23, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + ST $f25, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subl $4, 1, $4 + bgt $4, $SubMainLoop + .align 4 + +$SubRemain: + ble $5, $SubEnd + .align 4 + + $SubRemainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subl $5, 1, $5 + bgt $5, $SubRemainLoop + .align 4 + +$SubEnd: + ret + EPILOGUE diff --git a/kernel/sw_64/copy_simd.S b/kernel/sw_64/copy_simd.S new file mode 100644 index 0000000..84e96a9 --- /dev/null +++ b/kernel/sw_64/copy_simd.S @@ 
-0,0 +1,563 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + cmpeq INCX, 1, $0 + ble N, $End +#ifndef COMPLEX + sra N, 4, $4 +#else + sra N, 3, $4 +#endif + cmpeq INCY, 1, $1 + + and $0, $1, $0 + beq $0, $Sub +#ifndef COMPLEX + and N, 15, $5 +#else + and N, 7, $5 +#endif + ble $4, $Remain + +/** + test the address of X & Y +**/ + + and Y, (VEC_LEN*SIZE-1), $6 + and X, (VEC_LEN*SIZE-1), $7 + bgt $6, $UnAlign_Y_ACCESS + bgt $7, $UnAlign_X_ACCESS + + .align 4 + +$Align: + VLD $f10, 0*VEC_LEN*SIZE(X) + VLD $f11, 1*VEC_LEN*SIZE(X) + VLD $f12, 2*VEC_LEN*SIZE(X) + VLD $f13, 3*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + ldi X, 16*SIZE(X) + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + fillcs PREFETCHSIZE * SIZE(X) + fillcs PREFETCHSIZE * SIZE(Y) + + VST $f10, 0*VEC_LEN*SIZE(Y) + VST $f11, 1*VEC_LEN*SIZE(Y) + VST $f12, 2*VEC_LEN*SIZE(Y) + VST $f13, 3*VEC_LEN*SIZE(Y) + + VLD $f10, 0*VEC_LEN*SIZE(X) + VLD $f11, 1*VEC_LEN*SIZE(X) + VLD $f12, 2*VEC_LEN*SIZE(X) + VLD $f13, 3*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + ldi Y, 16*SIZE(Y) + ldi X, 16*SIZE(X) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + + VST $f10, 0*VEC_LEN*SIZE(Y) + VST $f11, 1*VEC_LEN*SIZE(Y) + VST $f12, 2*VEC_LEN*SIZE(Y) + VST $f13, 3*VEC_LEN*SIZE(Y) + + ldi Y, 16*SIZE(Y) + .align 4 + +$Remain: + ble $5, $End + .align 4 + +$RemainLoop: +#ifndef COMPLEX + LD $f10, 0*SIZE(X) + ldi X, 1*SIZE(X) + ST $f10, 0*SIZE(Y) + ldi Y, 1*SIZE(Y) +#else + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + ldi X, 2*SIZE(X) + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ldi Y, 2*SIZE(Y) +#endif + subl $5, 1, $5 + bgt $5, $RemainLoop + .align 4 +$End: + ret + .align 4 + +$UnAlign_X_ACCESS: + and Y, (VEC_LEN*SIZE-1), $7 + nop + nop + bgt $7, $UnAlign_XY_ACCESS + .align 4 + + VLD_UL $f10, 0*VEC_LEN*SIZE(X) + VLD_UH $f14, 1*VEC_LEN*SIZE(X) + + VLD_UL $f11, 1*VEC_LEN*SIZE(X) + VLD_UH $f15, 2*VEC_LEN*SIZE(X) + + VLD_UL $f12, 2*VEC_LEN*SIZE(X) + VLD_UH $f16, 3*VEC_LEN*SIZE(X) + + + VLD_UL $f13, 3*VEC_LEN*SIZE(X) + VLD_UH $f17, 4*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + vbisw $f10, $f14, $f10 + ldi X, 16*SIZE(X) + vbisw $f11, $f15, $f11 + + vbisw $f12, $f16, $f12 + vbisw $f13, $f17, $f13 + nop + ble $4, $UnAlign_X_MainLoopEnd + .align 4 + +$UnAlign_X_MainLoop: + fillcs PREFETCHSIZE * SIZE(X) + fillcs PREFETCHSIZE * SIZE(Y) + + VST $f10, 0*VEC_LEN*SIZE(Y) + VST $f11, 1*VEC_LEN*SIZE(Y) + VST $f12, 2*VEC_LEN*SIZE(Y) + VST $f13, 3*VEC_LEN*SIZE(Y) + + VLD_UL $f10, 0*VEC_LEN*SIZE(X) + VLD_UH $f14, 1*VEC_LEN*SIZE(X) + VLD_UL $f11, 1*VEC_LEN*SIZE(X) + VLD_UH $f15, 2*VEC_LEN*SIZE(X) + + VLD_UL $f12, 2*VEC_LEN*SIZE(X) + VLD_UH $f16, 3*VEC_LEN*SIZE(X) + VLD_UL $f13, 3*VEC_LEN*SIZE(X) + VLD_UH $f17, 4*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + vbisw $f10, $f14, $f10 + ldi Y, 16*SIZE(Y) + vbisw $f11, $f15, $f11 + + vbisw $f12, $f16, $f12 + ldi X, 16*SIZE(X) + vbisw $f13, $f17, $f13 + bgt $4, $UnAlign_X_MainLoop + .align 4 + +$UnAlign_X_MainLoopEnd: + + VST $f10, 0*VEC_LEN*SIZE(Y) + VST $f11, 1*VEC_LEN*SIZE(Y) + VST $f12, 2*VEC_LEN*SIZE(Y) + VST $f13, 3*VEC_LEN*SIZE(Y) + + ldi Y, 16*SIZE(Y) + ble $5, $End + jmp $RemainLoop + + .align 4 + +$UnAlign_Y_ACCESS: + and X, (VEC_LEN*SIZE-1), $7 + nop + nop + bgt $7, $UnAlign_XY_ACCESS + .align 4 + + VLD $f10, 0*VEC_LEN*SIZE(X) + VLD $f11, 1*VEC_LEN*SIZE(X) + VLD 
$f12, 2*VEC_LEN*SIZE(X) + VLD $f13, 3*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + ldi X, 16*SIZE(X) + ble $4, $UnAlign_Y_MainLoopEnd + .align 4 + +$UnAlign_Y_MainLoop: + fillcs PREFETCHSIZE * SIZE(X) + fillcs PREFETCHSIZE * SIZE(Y) + + VST_UL $f10, 0*VEC_LEN*SIZE(Y) + VST_UH $f10, 1*VEC_LEN*SIZE(Y) + + VST_UL $f11, 1*VEC_LEN*SIZE(Y) + VST_UH $f11, 2*VEC_LEN*SIZE(Y) + + VST_UL $f12, 2*VEC_LEN*SIZE(Y) + VST_UH $f12, 3*VEC_LEN*SIZE(Y) + + VST_UL $f13, 3*VEC_LEN*SIZE(Y) + VST_UH $f13, 4*VEC_LEN*SIZE(Y) + + VLD $f10, 0*VEC_LEN*SIZE(X) + VLD $f11, 1*VEC_LEN*SIZE(X) + VLD $f12, 2*VEC_LEN*SIZE(X) + VLD $f13, 3*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + ldi Y, 16*SIZE(Y) + ldi X, 16*SIZE(X) + bgt $4, $UnAlign_Y_MainLoop + .align 4 + +$UnAlign_Y_MainLoopEnd: + + VST_UL $f10, 0*VEC_LEN*SIZE(Y) + VST_UH $f10, 1*VEC_LEN*SIZE(Y) + + VST_UL $f11, 1*VEC_LEN*SIZE(Y) + VST_UH $f11, 2*VEC_LEN*SIZE(Y) + + VST_UL $f12, 2*VEC_LEN*SIZE(Y) + VST_UH $f12, 3*VEC_LEN*SIZE(Y) + + VST_UL $f13, 3*VEC_LEN*SIZE(Y) + VST_UH $f13, 4*VEC_LEN*SIZE(Y) + + ldi Y, 16*SIZE(Y) + ble $5, $End + jmp $RemainLoop + + .align 4 + +$UnAlign_XY_ACCESS: + + VLD_UL $f10, 0*VEC_LEN*SIZE(X) + VLD_UH $f14, 1*VEC_LEN*SIZE(X) + + VLD_UL $f11, 1*VEC_LEN*SIZE(X) + VLD_UH $f15, 2*VEC_LEN*SIZE(X) + + VLD_UL $f12, 2*VEC_LEN*SIZE(X) + VLD_UH $f16, 3*VEC_LEN*SIZE(X) + + + VLD_UL $f13, 3*VEC_LEN*SIZE(X) + VLD_UH $f17, 4*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + vbisw $f10, $f14, $f10 + ldi X, 16*SIZE(X) + vbisw $f11, $f15, $f11 + + vbisw $f12, $f16, $f12 + vbisw $f13, $f17, $f13 + nop + ble $4, $UnAlign_XY_MainLoopEnd + .align 4 + +$UnAlign_XY_MainLoop: + fillcs PREFETCHSIZE * SIZE(X) + fillcs PREFETCHSIZE * SIZE(Y) + + VST_UL $f10, 0*VEC_LEN*SIZE(Y) + VST_UH $f10, 1*VEC_LEN*SIZE(Y) + + VST_UL $f11, 1*VEC_LEN*SIZE(Y) + VST_UH $f11, 2*VEC_LEN*SIZE(Y) + + VST_UL $f12, 2*VEC_LEN*SIZE(Y) + VST_UH $f12, 3*VEC_LEN*SIZE(Y) + + VST_UL $f13, 3*VEC_LEN*SIZE(Y) + VST_UH $f13, 4*VEC_LEN*SIZE(Y) + + + VLD_UL $f10, 0*VEC_LEN*SIZE(X) + VLD_UH $f14, 1*VEC_LEN*SIZE(X) + VLD_UL $f11, 1*VEC_LEN*SIZE(X) + VLD_UH $f15, 2*VEC_LEN*SIZE(X) + + VLD_UL $f12, 2*VEC_LEN*SIZE(X) + VLD_UH $f16, 3*VEC_LEN*SIZE(X) + VLD_UL $f13, 3*VEC_LEN*SIZE(X) + VLD_UH $f17, 4*VEC_LEN*SIZE(X) + + subl $4, 1, $4 + vbisw $f10, $f14, $f10 + ldi Y, 16*SIZE(Y) + vbisw $f11, $f15, $f11 + + vbisw $f12, $f16, $f12 + ldi X, 16*SIZE(X) + vbisw $f13, $f17, $f13 + bgt $4, $UnAlign_XY_MainLoop + .align 4 + +$UnAlign_XY_MainLoopEnd: + + VST_UL $f10, 0*VEC_LEN*SIZE(Y) + VST_UH $f10, 1*VEC_LEN*SIZE(Y) + + VST_UL $f11, 1*VEC_LEN*SIZE(Y) + VST_UH $f11, 2*VEC_LEN*SIZE(Y) + + VST_UL $f12, 2*VEC_LEN*SIZE(Y) + VST_UH $f12, 3*VEC_LEN*SIZE(Y) + + VST_UL $f13, 3*VEC_LEN*SIZE(Y) + VST_UH $f13, 4*VEC_LEN*SIZE(Y) + + ldi Y, 16*SIZE(Y) + ble $5, $End + jmp $RemainLoop + + .align 4 + +$Sub: +#ifdef COMPLEX + addl INCX, INCX, INCX + addl INCY, INCY, INCY + and N, 7, $5 +#else + and N, 15, $5 +#endif + ble $4, $SubRemain + .align 4 + +$SubMainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + LD $f11, 0(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + SXADDQ INCX, X, X + LD $f13, 0(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + SXADDQ INCX, X, X + LD $f15, 0(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + SXADDQ INCX, X, X + LD $f17, 0(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + SXADDQ INCX, X, X + LD $f19, 0(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + SXADDQ INCX, X, X + LD $f21, 0(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + SXADDQ INCX, X, X + LD $f23, 0(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + SXADDQ INCX, X, X + LD $f25, 0(X) + SXADDQ INCX, X, X + 
+ ST $f10, 0(Y) + SXADDQ INCY, Y, Y + ST $f11, 0(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + SXADDQ INCY, Y, Y + ST $f13, 0(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + SXADDQ INCY, Y, Y + ST $f15, 0(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + SXADDQ INCY, Y, Y + ST $f17, 0(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + SXADDQ INCY, Y, Y + ST $f19, 0(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + SXADDQ INCY, Y, Y + ST $f21, 0(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + SXADDQ INCY, Y, Y + ST $f23, 0(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + SXADDQ INCY, Y, Y + ST $f25, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + LD $f13, SIZE(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + LD $f15, SIZE(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + LD $f17, SIZE(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + LD $f19, SIZE(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + LD $f21, SIZE(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + LD $f23, SIZE(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + LD $f25, SIZE(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + ST $f13, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + ST $f15, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + ST $f17, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + ST $f19, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + ST $f21, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + ST $f23, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + ST $f25, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subl $4, 1, $4 + bgt $4, $SubMainLoop + .align 4 + +$SubRemain: + ble $5, $SubEnd + .align 4 + + $SubRemainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subl $5, 1, $5 + bgt $5, $SubRemainLoop + .align 4 + +$SubEnd: + ret + EPILOGUE diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S new file mode 100644 index 0000000..bba3137 --- /dev/null +++ b/kernel/sw_64/cscal.S @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + .set noat + .set noreorder + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + + .globl NAME + .ent NAME + +NAME: +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + +#ifndef C_INTERFACE + ldl $16, 0($16) # n + mov $18, $20 # Store Address + ldl $19, 0($19) # incx + nop + + LD $f1, 0($17) # alpha +#else + mov $18, $20 # Store Address + fmov $f17, $f1 # alpha +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + sra $16, 1, $21 # 4-unrolling + ble $16, $End + + lda $23, -1($19) + ble $19, $End + + bgt $23, $INC_NOT_1 + .align 4 + + ble $21, $Sub + lda $21, -1($21) + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + + LD $f12, 2*SIZE($18) + LD $f13, 3*SIZE($18) + lda $18, 4*SIZE($18) + ble $21, $MainRemain + .align 4 + +$MainLoop: + MUL $f10, $f1, $f20 + LD $f10, 0*SIZE($18) + MUL $f11, $f1, $f21 + LD $f11, 1*SIZE($18) + + MUL $f12, $f1, $f22 + LD $f12, 2*SIZE($18) + MUL $f13, $f1, $f23 + LD $f13, 3*SIZE($18) + + lda $18, 4*SIZE($18) + lda $21, -1($21) + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + ST $f22, 2*SIZE($20) + ST $f23, 3*SIZE($20) + lda $20, 4*SIZE($20) + + bgt $21, $MainLoop + .align 4 + +$MainRemain: + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + MUL $f12, $f1, $f22 + MUL $f13, $f1, $f23 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + ST $f22, 2*SIZE($20) + ST $f23, 3*SIZE($20) + lda $20, 4*SIZE($20) + .align 4 + +$Sub: + blbc $16, $End + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + .align 4 + +$End: + ret + .align 4 + +$INC_NOT_1: + addl $19, $19, $19 + ble $21, $INC_Sub + lda $21, -1($21) + + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f12, 0*SIZE($18) + LD $f13, 1*SIZE($18) + SXADDQ $19, $18, $18 + ble $21, $INC_MainRemain + .align 4 + +$INC_MainLoop: + MUL $f10, $f1, $f20 + LD $f10, 0*SIZE($18) + MUL $f11, $f1, $f21 + LD $f11, 1*SIZE($18) + + SXADDQ $19, $18, $18 + + MUL $f12, $f1, $f22 + LD $f12, 0*SIZE($18) + MUL $f13, $f1, $f23 + LD $f13, 1*SIZE($18) + + SXADDQ $19, $18, $18 + + ST $f20, 0*SIZE($20) + lda $21, -1($21) + ST $f21, 1*SIZE($20) + SXADDQ $19, $20, $20 + + ST $f22, 0*SIZE($20) + ST $f23, 1*SIZE($20) + SXADDQ $19, $20, $20 + unop + bgt $21, $INC_MainLoop + .align 4 + +$INC_MainRemain: + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + MUL $f12, $f1, $f22 + MUL $f13, $f1, $f23 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + SXADDQ $19, $20, $20 + + ST $f22, 0*SIZE($20) + ST $f23, 1*SIZE($20) + SXADDQ $19, $20, $20 + .align 4 + +$INC_Sub: + blbc $16, $INC_End 
+ + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + .align 4 + +$INC_End: + ret + .end NAME + .ident VERSION diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S new file mode 100644 index 0000000..89cf787 --- /dev/null +++ b/kernel/sw_64/dnrm2.S @@ -0,0 +1,490 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stl $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0,$f24 + fmov $f24,a0 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1,$f24 + fmov $f24,a1 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, $f24 + fmov $f24,a2 + #unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, $f24 + fmov $f24,a3 + #unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, $f24 + fmov $f24,a0 + #unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd a1, t1, $f24 + fmov $f24,a1 + #unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd a2, t2, $f24 + fmov $f24,a2 + #unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd a3, t3, $f24 + fmov $f24,a3 + #unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, $f24 + fmov $f24,a0 + #unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1, $f24 + fmov $f24,a1 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, $f24 + fmov $f24,a2 + #unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, $f24 + fmov $f24,a3 + #unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd a0, t0, $f24 + fmov $f24,a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd a1, t1, $f24 + fmov $f24,a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd a2, t2, $f24 + fmov $f24,a2 + #unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd a3, t3, $f24 + fmov $f24,a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0,$f24 + fmov $f24,a0 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, $f24 + fmov $f24,a1 + #unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, $f24 + fmov $f24,a2 + #unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, $f24 + fmov $f24,a3 + #unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, $f24 + fmov $f24,a0 + #unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd a1, t1,$f24 + fmov $f24,a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd a2, t2,$f24 + fmov $f24,a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd a3, t3, $f24 + fmov $f24,a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, $f24 + fmov $f24,a0 + fmuld x0, x0, t0 + faddd a1, t1, $f24 + fmov $f24,a1 + fmuld x1, x1, t1 + + faddd a2, t2, $f24 + fmov $f24,a2 + fmuld x2, 
x2, t2 + faddd a3, t3, $f24 + fmov $f24,a3 + fmuld x3, x3, t3 + + faddd a0, t0, $f24 + fmov $f24,a0 + fmuld x4, x4, t0 + faddd a1, t1, $f24 + fmov $f24,a1 + fmuld x5, x5, t1 + + faddd a2, t2, $f24 + fmov $f24,a2 + fmuld x6, x6, t2 + faddd a3, t3, $f24 + fmov $f24,a3 + fmuld x7, x7, t3 + + faddd a1, t1, $f24 + fmov $f24,a1 + faddd a2, t2, $f24 + fmov $f24,a2 + faddd a3, t3, $f24 + fmov $f24,a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + ldi X, 1 * SIZE(X) + + faddd a0, t0, $f24 + fmov $f24,a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addl X, INCX, X + LD x1, 0 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + addl X, INCX, X + LD x3, 0 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + addl X, INCX, X + LD x5, 0 * SIZE(X) + addl X, INCX, X + LD x6, 0 * SIZE(X) + addl X, INCX, X + + ldi I, -1(I) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0,$f24 + fmov $f24,a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, $f24 + fmov $f24,a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + addl X, INCX, X + + faddd a2, t2, $f24 + fmov $f24,a2 + LD x1, 0 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, $f24 + fmov $f24,a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + addl X, INCX, X + + faddd a0, t0, $f24 + fmov $f24,a0 + LD x3, 0 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1, $f24 + fmov $f24,a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + addl X, INCX, X + + faddd a2, t2, $f24 + fmov $f24,a2 + LD x5, 0 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, $f24 + fmov $f24,a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + addl X, INCX, X + + ldi I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, $f24 + fmov $f24,a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, $f24 + fmov $f24,a1 + unop + fmuld x1, x1, t1 + unop + + faddd a2, t2, $f24 + fmov $f24,a2 + fmuld x2, x2, t2 + faddd a3, t3, $f24 + fmov $f24,a3 + fmuld x3, x3, t3 + + faddd a0, t0, $f24 + fmov $f24,a0 + fmuld x4, x4, t0 + faddd a1, t1, $f24 + fmov $f24,a1 + fmuld x5, x5, t1 + + faddd a2, t2, $f24 + fmov $f24,a2 + fmuld x6, x6, t2 + faddd a3, t3, $f24 + fmov $f24,a3 + fmuld x7, x7, t3 + + faddd a1, t1, $f24 + fmov $f24,a1 + faddd a2, t2, $f24 + fmov $f24,a2 + faddd a3, t3, $f24 + fmov $f24,a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addl X, INCX, X + + faddd a0, t0,$f24 + fmov $f24,a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, $f24 + fmov $f24,a0 + + faddd a0, a1, $f24 + fmov $f24,a1 + faddd a2, a3, $f24 + fmov $f24,a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2, $f24 + fsqrtd $f24, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/dnrm2.S.bak b/kernel/sw_64/dnrm2.S.bak new file mode 100644 index 0000000..753c90b --- /dev/null +++ b/kernel/sw_64/dnrm2.S.bak @@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, a0 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x0, x0, t0 + LD x0, 16 
* SIZE(X) + + faddd a1, t1, a1 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd a1, t1, a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd a3, t3, a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0, a0 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd a3, t3, a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + faddd a2, t2, a2 + fmuld x2, x2, t2 + faddd a3, t3, a3 + fmuld x3, x3, t3 + + faddd a0, t0, a0 + fmuld x4, x4, t0 + faddd a1, t1, a1 + fmuld x5, x5, t1 + + faddd a2, t2, a2 + fmuld x6, x6, t2 + faddd a3, t3, a3 + fmuld x7, x7, t3 + + faddd a1, t1, a1 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + ldi X, 1 * SIZE(X) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addl X, INCX, X + LD x1, 0 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + addl X, INCX, X + LD x3, 0 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + addl X, INCX, X + LD x5, 0 * SIZE(X) + addl X, INCX, X + LD x6, 0 * SIZE(X) + addl X, INCX, X + + ldi I, -1(I) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + addl X, INCX, X + + faddd a2, t2, a2 + LD x1, 0 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + addl X, INCX, X + + faddd a0, t0, a0 + LD x3, 0 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + addl X, INCX, X + + faddd a2, t2, a2 + LD x5, 0 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + addl X, INCX, X + + ldi I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + unop + fmuld x1, x1, t1 + unop + + faddd a2, t2, a2 + fmuld x2, x2, t2 + faddd a3, t3, a3 + fmuld x3, x3, t3 + + faddd a0, t0, a0 + fmuld x4, x4, t0 + faddd a1, t1, a1 + fmuld x5, x5, t1 + + faddd a2, t2, a2 + fmuld x6, x6, t2 + faddd a3, t3, a3 + fmuld x7, x7, t3 + + faddd a1, t1, a1 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, a0 + + faddd a0, a1, a0 + faddd a2, a3, a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), 
sqrt	!lituse_jsr!2
+
+	ldih	$29, 0($26)	!gpdisp!3
+	ldi	$29, 0($29)	!gpdisp!3
+#else
+	faddd	a0, a2, a0
+	fsqrtd	a0, a0
+#endif
+	.align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+	ldl	$26, 0($sp)
+	ldi	$sp, 16($sp)
+#endif
+	ret
+	EPILOGUE
diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S
new file mode 100644
index 0000000..513eada
--- /dev/null
+++ b/kernel/sw_64/dot.S
@@ -0,0 +1,607 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + +#define I $5 + +#define s0 $f0 +#define s1 $f30 +#define s2 $f1 +#define s3 $f2 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldi $sp, -16($sp) + fclr s0 + fstd $f2, 0($sp) +#ifndef ZYX20220111 + fstd $f3, 8($sp) +#endif + fclr s1 + + fclr s2 + nop + fclr s3 + ble N, $L999 + + fclr t0 + cmpeq INCX, 1, $21 + fclr t1 + cmpeq INCY, 1, $22 + fclr t2 + and $21, $22, $22 + fclr t3 + beq $22, $L20 + +#ifndef DOUBLE + srl N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addl X, 16 * SIZE, X + subl I, 1, I + + addl Y, 16 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + fillcs PREFETCHSIZE * 2 * SIZE(X) + subl I, 1, I + fillcs PREFETCHSIZE * 2 * SIZE(Y) + addl X, 16 * SIZE, X + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -24 * SIZE(X) + MUL a1, b1, t1 + LD a1, -23 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, $f3 + fmov $f3, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -22 * SIZE(X) + MUL a3, b3, t3 + LD a3, -21 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -20 * SIZE(X) + MUL a5, b5, t1 + LD a5, -19 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -18 * SIZE(X) + MUL a7, b7, t3 + LD a7, -17 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -16 * SIZE(X) + MUL a1, b1, t1 + LD a1, -15 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -14 * SIZE(X) + MUL a3, b3, t3 + LD a3, -13 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -12 * SIZE(X) + MUL a5, b5, t1 + LD a5, -11 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -10 * SIZE(X) + MUL a7, b7, t3 + LD a7, -9 * SIZE(X) + + addl Y, 16 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6,-10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + 
LD a3, -5 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a4, b4, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a5, b5, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a6, b6, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, $f3 + fmov $f3, s0 + and N, 15, I + ADD s1, t1, $f3 + fmov $f3, s1 + ble I, $L18 + .align 4 + +#else + + srl N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addl X, 8 * SIZE, X + subl I, 1, I + + addl Y, 8 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + fillcs PREFETCHSIZE * SIZE(X) + subl I, 1, I + fillcs PREFETCHSIZE * SIZE(Y) + addl X, 8 * SIZE, X + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + addl Y, 8 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a4, b4, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a5, b5, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a6, b6, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, $f3 + fmov $f3, s0 + and N, 7, I + ADD s1, t1, $f3 + fmov $f3, s1 + ble I, $L18 + .align 4 + +#endif + +$L16: + LD a0, 0 * SIZE(X) + addl X, SIZE, X + LD b0, 0 * SIZE(Y) + addl Y, SIZE, Y + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a0, b0, t2 + subl I, 1, I + bgt I, $L16 + .align 4 + +$L18: + ADD s2, t2, $f3 + fmov $f3, s2 + ADD s3, t3, $f3 + fmov $f3, s3 + br $L999 + .align 4 + +$L20: + srl N, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * 
SIZE(Y) + subl I, 1, I + + SXADDQ INCY, Y, Y + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a0, b0, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + subl I, 1, I + bgt I, $L22 + nop + fnop + .align 4 + +$L23: + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a0, b0, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + .align 4 + +$L25: + ADD s0, t0, $f3 + fmov $f3, s0 + and N, 3, I + ADD s1, t1, $f3 + fmov $f3, s1 + ble I, $L28 + .align 4 + +$L26: + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a0, b0, t2 + subl I, 1, I + bgt I, $L26 + .align 4 + +$L28: + ADD s2, t2, $f3 + fmov $f3, s2 + ADD s3, t3, $f3 + fmov $f3, s3 + .align 4 + +$L999: + ADD s2, s3, $f3 + fmov $f3, s2 + fldd $f2, 0($sp) + ADD s0, s1, $f3 + fmov $f3, s0 + ADD s0, s2, $f3 + fmov $f3, s0 +#ifndef ZYX20220111 + fldd $f3, 8($sp) + ldi $sp, 16($sp) +#endif + ret + + EPILOGUE diff --git a/kernel/sw_64/dot.S.bak b/kernel/sw_64/dot.S.bak new file mode 100644 index 0000000..cd96e21 --- /dev/null +++ b/kernel/sw_64/dot.S.bak @@ -0,0 +1,602 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + +#define I $5 + +#define s0 $f0 +#define s1 $f30 +#define s2 $f1 +#define s3 $f2 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldi $sp, -16($sp) + fclr s0 + fstd $f2, 0($sp) + fclr s1 + + fclr s2 + nop + fclr s3 + ble N, $L999 + + fclr t0 + cmpeq INCX, 1, $21 + fclr t1 + cmpeq INCY, 1, $22 + fclr t2 + and $21, $22, $22 + fclr t3 + beq $22, $L20 + +#ifndef DOUBLE + srl N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addl X, 16 * SIZE, X + subl I, 1, I + + addl Y, 16 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + fillcs PREFETCHSIZE * 2 * SIZE(X) + subl I, 1, I + fillcs PREFETCHSIZE * 2 * SIZE(Y) + addl X, 16 * SIZE, X + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -24 * SIZE(X) + MUL a1, b1, t1 + LD a1, -23 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, $f3 + fmov $f3, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -22 * SIZE(X) + MUL a3, b3, t3 + LD a3, -21 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -20 * SIZE(X) + MUL a5, b5, t1 + LD a5, -19 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -18 * SIZE(X) + MUL a7, b7, t3 + LD a7, -17 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -16 * SIZE(X) + MUL a1, b1, t1 + LD a1, -15 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -14 * SIZE(X) + MUL a3, b3, t3 + LD a3, -13 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -12 * SIZE(X) + MUL a5, b5, t1 + LD a5, -11 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -10 * SIZE(X) + MUL a7, b7, t3 + LD a7, -9 * SIZE(X) + + addl Y, 16 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6,-10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, $f3 + fmov 
$f3, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a4, b4, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a5, b5, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a6, b6, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, $f3 + fmov $f3, s0 + and N, 15, I + ADD s1, t1, $f3 + fmov $f3, s1 + ble I, $L18 + .align 4 + +#else + + srl N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addl X, 8 * SIZE, X + subl I, 1, I + + addl Y, 8 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + fillcs PREFETCHSIZE * SIZE(X) + subl I, 1, I + fillcs PREFETCHSIZE * SIZE(Y) + addl X, 8 * SIZE, X + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, $f3 + fmov $f3, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, $f3 + fmov $f3, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, $f3 + fmov $f3, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, $f3 + fmov $f3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + addl Y, 8 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, $f3 + fmov $f3, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a4, b4, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a5, b5, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a6, b6, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, $f3 + fmov $f3, s0 + and N, 7, I + ADD s1, t1, $f3 + fmov $f3, s1 + ble I, $L18 + .align 4 + +#endif + +$L16: + LD a0, 0 * SIZE(X) + addl X, SIZE, X + LD b0, 0 * SIZE(Y) + addl Y, SIZE, Y + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a0, b0, t2 + subl I, 1, I + bgt I, $L16 + .align 4 + +$L18: + ADD s2, t2, $f3 + fmov $f3, s2 + ADD s3, t3, $f3 + fmov $f3, s3 + br $L999 + .align 4 + +$L20: + srl N, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + subl I, 1, I + + SXADDQ INCY, Y, Y + ble 
I, $L23 + .align 4 + +$L22: + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a0, b0, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + subl I, 1, I + bgt I, $L22 + nop + fnop + .align 4 + +$L23: + ADD s0, t0, $f3 + fmov $f3, s0 + MUL a0, b0, t0 + ADD s1, t1, $f3 + fmov $f3, s1 + MUL a1, b1, t1 + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a2, b2, t2 + ADD s3, t3, $f3 + fmov $f3, s3 + MUL a3, b3, t3 + .align 4 + +$L25: + ADD s0, t0, $f3 + fmov $f3, s0 + and N, 3, I + ADD s1, t1, $f3 + fmov $f3, s1 + ble I, $L28 + .align 4 + +$L26: + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + ADD s2, t2, $f3 + fmov $f3, s2 + MUL a0, b0, t2 + subl I, 1, I + bgt I, $L26 + .align 4 + +$L28: + ADD s2, t2, $f3 + fmov $f3, s2 + ADD s3, t3, $f3 + fmov $f3, s3 + .align 4 + +$L999: + ADD s2, s3, $f3 + fmov $f3, s2 + fldd $f2, 0($sp) + ADD s0, s1, $f3 + fmov $f3, s0 + ldi $sp, 16($sp) + + ADD s0, s2, $f3 + fmov $f3, s0 + ret + + EPILOGUE diff --git a/kernel/sw_64/dot_simd.S b/kernel/sw_64/dot_simd.S new file mode 100644 index 0000000..3e2288d --- /dev/null +++ b/kernel/sw_64/dot_simd.S @@ -0,0 +1,634 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + +#define I $5 + +#define s0 $f0 +#define s1 $f30 +#define s2 $f1 +#define s3 $f2 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldi $sp, -16($sp) + fclr s0 + fstd $f2, 0($sp) + fclr s1 + + fclr s2 + nop + fclr s3 + ble N, $L999 + + fclr t0 + cmpeq INCX, 1, $21 + fclr t1 + cmpeq INCY, 1, $22 + fclr t2 + and $21, $22, $22 + fclr t3 + beq $22, $L20 + + +/* + test the address of Y & X +*/ + and Y, (VEC_LEN*SIZE-1), $4 + and X, (VEC_LEN*SIZE-1), $3 + or $3, $4, $4 + bne $4, $UnAlign_ACCESS + +/*Align Accessing*/ + sra N, 4, I + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, $f31, s0 #clear s0 vector + VLD a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, s1 + + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, $f31, s2 + VLD a3, 3*VEC_LEN*SIZE(X) + vcpys $f31, $f31, s3 + + VLD b0, 0*VEC_LEN*SIZE(Y) + VLD b1, 1*VEC_LEN*SIZE(Y) + VLD b2, 2*VEC_LEN*SIZE(Y) + VLD b3, 3*VEC_LEN*SIZE(Y) + + addl X, 16 * SIZE, X + addl Y, 16 * SIZE, Y + subl I, 1, I + ble I, $MainLoopEnd +$MainLoop: + VMAD a0, b0, s0, s0 + fillcs PREFETCHSIZE * SIZE(X) + VMAD a1, b1, s1, s1 + fillcs PREFETCHSIZE * SIZE(Y) + + subl I, 1, I + VMAD a2, b2, s2, s2 + addl X, 16 * SIZE, X + VMAD a3, b3, s3, s3 + + VLD a0, -4*VEC_LEN*SIZE(X) + VLD a1, -3*VEC_LEN*SIZE(X) + VLD a2, -2*VEC_LEN*SIZE(X) + VLD a3, -1*VEC_LEN*SIZE(X) + + VLD b0, 0*VEC_LEN*SIZE(Y) + VLD b1, 1*VEC_LEN*SIZE(Y) + VLD b2, 2*VEC_LEN*SIZE(Y) + VLD b3, 3*VEC_LEN*SIZE(Y) + + + addl Y, 16 * SIZE, Y + bgt I, $MainLoop + .align 4 + +$MainLoopEnd: + VMAD a0, b0, s0, s0 + VMAD a1, b1, s1, s1 + VMAD a2, b2, s2, s2 + VMAD a3, b3, s3, s3 + + VADD s0, s1, t0 + VADD s2, s3, t1 + nop + VADD t0, t1, s0 + + vextf s0, 1, s1 + vextf s0, 2, s2 + vextf s0, 3, s3 + nop + + ADD s0, s1, t2 + ADD s2, s3, t3 + nop + ADD t2, t3, s0 + + .align 4 +$Remain: + and N, 15, I + ble I, $End + .align 4 +$Remain_Loop: + LD a0, 0 * SIZE(X) + addl X, SIZE, X + LD b0, 0 * SIZE(Y) + addl Y, SIZE, Y + + MAD a0, b0, s0, s0 + subl I, 1, I + bgt I, $Remain_Loop + .align 4 +$End: + + fldd $f2, 0($sp) + ldi $sp, 16($sp) + ret + .align 4 + +/*UnAlign Accessing*/ +$UnAlign_ACCESS: + +#ifndef DOUBLE + srl N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addl X, 16 * SIZE, X + subl I, 1, I + + addl Y, 16 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + fillcs PREFETCHSIZE * 2 * SIZE(X) + subl I, 1, I + fillcs PREFETCHSIZE * 2 * SIZE(Y) + addl X, 16 * SIZE, X + + ADD s0, t0, s0 + LD b6, -10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -24 * SIZE(X) + MUL a1, b1, t1 + LD a1, -23 * SIZE(X) + + ADD s2, t2, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -22 * SIZE(X) + MUL a3, b3, t3 + LD a3, 
-21 * SIZE(X) + + ADD s0, t0, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -20 * SIZE(X) + MUL a5, b5, t1 + LD a5, -19 * SIZE(X) + + ADD s2, t2, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -18 * SIZE(X) + MUL a7, b7, t3 + LD a7, -17 * SIZE(X) + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -16 * SIZE(X) + MUL a1, b1, t1 + LD a1, -15 * SIZE(X) + + ADD s2, t2, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -14 * SIZE(X) + MUL a3, b3, t3 + LD a3, -13 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -12 * SIZE(X) + MUL a5, b5, t1 + LD a5, -11 * SIZE(X) + + ADD s2, t2, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -10 * SIZE(X) + MUL a7, b7, t3 + LD a7, -9 * SIZE(X) + + addl Y, 16 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, s0 + LD b6,-10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, s1 + MUL a1, b1, t1 + + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a5, b5, t1 + ADD s2, t2, s2 + MUL a6, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, s0 + and N, 15, I + ADD s1, t1, s1 + ble I, $L18 + .align 4 + +#else + + srl N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addl X, 8 * SIZE, X + subl I, 1, I + + addl Y, 8 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + fillcs PREFETCHSIZE * SIZE(X) + subl I, 1, I + fillcs PREFETCHSIZE * SIZE(Y) + addl X, 8 * SIZE, X + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + addl Y, 8 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, s1 + MUL a1, b1, t1 + + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL 
a4, b4, t0 + ADD s1, t1, s1 + MUL a5, b5, t1 + ADD s2, t2, s2 + MUL a6, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, s0 + and N, 7, I + ADD s1, t1, s1 + ble I, $L18 + .align 4 + +#endif + +$L16: + LD a0, 0 * SIZE(X) + addl X, SIZE, X + LD b0, 0 * SIZE(Y) + addl Y, SIZE, Y + + ADD s2, t2, s2 + MUL a0, b0, t2 + subl I, 1, I + bgt I, $L16 + .align 4 + +$L18: + ADD s2, t2, s2 + ADD s3, t3, s3 + br $L999 + .align 4 + +$L20: + srl N, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + subl I, 1, I + + SXADDQ INCY, Y, Y + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a1, b1, t1 + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + subl I, 1, I + bgt I, $L22 + nop + fnop + .align 4 + +$L23: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a1, b1, t1 + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + .align 4 + +$L25: + ADD s0, t0, s0 + and N, 3, I + ADD s1, t1, s1 + ble I, $L28 + .align 4 + +$L26: + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + MUL a0, b0, t2 + subl I, 1, I + bgt I, $L26 + .align 4 + +$L28: + ADD s2, t2, s2 + ADD s3, t3, s3 + .align 4 + +$L999: + ADD s2, s3, s2 + fldd $f2, 0($sp) + ADD s0, s1, s0 + ldi $sp, 16($sp) + + ADD s0, s2, s0 + ret + + EPILOGUE diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S new file mode 100644 index 0000000..d9ea890 --- /dev/null +++ b/kernel/sw_64/gemm_beta.S @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl CNAME + .ent CNAME +CNAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $28, _mcount + jsr $28, ($28), _mcount +#endif + + ldl $18, 16($sp) + ble $16, $End + ldl $19, 24($sp) + ble $17, $End +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) + .align 4 + +$BETA_NE_ZERO: + sra $16, 3, $2 # i = (m >> 3) + mov $18, $1 # c_offset = c + ldi $17, -1($17) # j -- + ble $2,$L52 + .align 4 + +$L51: + fillcs 64($1) + ldi $2, -1($2) + + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + LD $f16, 2*SIZE($1) + LD $f17, 3*SIZE($1) + LD $f18, 4*SIZE($1) + LD $f11, 5*SIZE($1) + LD $f21, 6*SIZE($1) + LD $f22, 7*SIZE($1) + + MUL $f19, $f14, $f23 + MUL $f19, $f15, $f24 + MUL $f19, $f16, $f25 + MUL $f19, $f17, $f26 + MUL $f19, $f18, $f27 + MUL $f19, $f11, $f28 + MUL $f19, $f21, $f29 + MUL $f19, $f22, $f30 + + ST $f23, 0*SIZE($1) + ST $f24, 1*SIZE($1) + ST $f25, 2*SIZE($1) + ST $f26, 3*SIZE($1) + ST $f27, 4*SIZE($1) + ST $f28, 5*SIZE($1) + ST $f29, 6*SIZE($1) + ST $f30, 7*SIZE($1) + + ldi $1,8*SIZE($1) + bgt $2,$L51 + .align 4 + +$L52: + and $16, 7, $2 + ble $2,$L54 + .align 4 + +$L53: + LD $f12, 0($1) + ldi $2, -1($2) + MUL $f19, $f12, $f23 + ST $f23, 0($1) + ldi $1, SIZE($1) + bgt $2,$L53 + .align 4 + +$L54: + SXADDQ $19, $18, $18 # c += ldc + bgt $17,$BETA_NE_ZERO + clr $0 + ret + .align 4 + +$BETA_EQ_ZERO: + sra $16, 3, $2 # i = (m >> 3) + ldi $4, 8*SIZE($18) + mov $18, $1 # c_offset = c + ldi $17, -1($17) # j -- + ble $2,$L42 + .align 4 + +$L41: + ST $f31, 0*SIZE($1) + ST $f31, 1*SIZE($1) + ST $f31, 2*SIZE($1) + ST $f31, 3*SIZE($1) + ST $f31, 4*SIZE($1) + ST $f31, 5*SIZE($1) + ST $f31, 6*SIZE($1) + ST $f31, 7*SIZE($1) + ldi $2, -1($2) + + ldi $4, 8*SIZE($4) + ldi $1, 8*SIZE($1) + bgt $2,$L41 + .align 4 + +$L42: + and $16, 7, $2 + ble $2,$L44 + .align 4 + +$L43: + ldi $2, -1($2) + ST $f31, 0($1) + ldi $1, SIZE($1) + bgt $2, $L43 + .align 4 + +$L44: + SXADDQ $19, $18, $18 # c += ldc + bgt $17,$BETA_EQ_ZERO + clr $0 + .align 4 + +$End: + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S new file mode 100644 index 0000000..dd17554 --- /dev/null +++ b/kernel/sw_64/gemm_kernel_4x4.S @@ -0,0 +1,3244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 96 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define tmp $9 + +#define ALPHA 64($sp) + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + ldl C, 0 + STACKSIZE($sp) + ldl LDC, 8 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldl OFFSET, 16 + STACKSIZE($sp) +#endif + + SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + stl $9, 80($sp) + fstd $f19, ALPHA + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subl $31, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: + mov C, C1 + addl C, LDC, C2 + mov A, AO + s4addl K, 0, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + addl C2, LDC, C3 + s4addl LDC, C, C + + SXADDQ BB, B, BB + fclr t1 + addl C3, LDC, C4 + fclr t2 + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(EV5) || defined(SW6A) + fillcs 0 * SIZE(BB) + fillcs 8 * SIZE(BB) + unop + ldi BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 4, TMP1 +#else + addl KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + +#else + sll KK, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + 
fclr c10 +#endif + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3,b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4,b5 + fmov b5, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp +/* 2 */ + ADD c01, t1,b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2,b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3,b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1,b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2,b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3,b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4,b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1,b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, b5 + fmov b5, c11 + unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, b5 + fmov b5, c01 + unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, b5 + fmov b5, c11 + fldd alpha, ALPHA + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + 
ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a1, t4 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C1) + FIMOVD b5, tmp +#else + unop +#endif + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL b1, a3, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C2) +#else + unop +#endif + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 +#ifndef TRMMKERNEL + LD a1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 2 * SIZE(C1) +#else + unop +#endif + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 3 * SIZE(C1) +#else + unop +#endif + + ADD c09, t1, b5 + fmov b5, c09 + ldi I, -1(I) + MUL b3, a3, t1 + unop + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C2) +#else + unop +#endif + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 2 * SIZE(C2) +#else + unop +#endif + + ADD c11, t1, b5 + fmov b5, c11 + unop + MUL alpha, c01, b5 + fmov b5, c01 +#ifndef TRMMKERNEL + LD b4, 3 * SIZE(C2) +#else + unop +#endif + + ADD c12, t2, b5 + fmov b5, c12 + unop + MUL alpha, c02, b5 + fmov b5, c02 +#ifndef TRMMKERNEL + LD t1, 1 * SIZE(C3) +#else + unop +#endif + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL alpha, c03, b5 + fmov b5, c03 +#ifndef TRMMKERNEL + LD t2, 2 * SIZE(C3) +#else + unop +#endif + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL alpha, c04, b5 + fmov b5, c04 +#ifndef TRMMKERNEL + LD t3, 3 * SIZE(C3) +#else + unop +#endif + + MUL alpha, c05, b5 + fmov b5, c05 + unop +#ifndef TRMMKERNEL + ADD c01, a5, b5 + fmov b5, c01 + LD t4, 1 * SIZE(C4) +#else + unop + unop +#endif + + MUL alpha, c06, b5 + fmov b5, c06 +#ifndef TRMMKERNEL + unop + IFMOVD tmp, b5 + fstd b1, 88($sp) +# FIMOVD b1, tmp + ADD c02, b5, b1 + fmov b1, c02 + fldd b1, 88($sp) +# IFMOVD tmp, b1 + LD a5, 2 * SIZE(C4) +#endif + + MUL alpha, c07, b5 + fmov b5, c07 +#ifndef TRMMKERNEL + unop + ADD c03, a2, b5 + fmov b5, c03 + LD b5, 3 * SIZE(C4) + FIMOVD b5, tmp +#endif + + MUL alpha, c08, b5 + fmov b5, c08 +#ifndef TRMMKERNEL + unop + ADD c04, b2, b5 + fmov b5, c04 + unop +#endif + + MUL alpha, c09, b5 + fmov b5, c09 + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c05, b1, b5 + fmov b5, c05 + unop +#endif + + MUL alpha, c10, b5 + fmov b5, c10 + ST c02, 1 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, a4, b5 + fmov b5, c06 + unop +#endif + + MUL alpha, c11, b5 + fmov b5, c11 + ST c03, 2 * 
SIZE(C1) +#ifndef TRMMKERNEL + ADD c07, a3, b5 + fmov b5, c07 + unop +#endif + + MUL alpha, c12, b5 + fmov b5, c12 + ST c04, 3 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, b4, b5 + fmov b5, c08 +#else + unop +#endif + ldi C1, 4 * SIZE(C1) + + MUL alpha, c13, b5 + fmov b5, c13 + ST c05, 0 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c09, a1, b5 + fmov b5, c09 + unop +#endif + + MUL alpha, c14, b5 + fmov b5, c14 + ST c06, 1 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c10, t1, b5 + fmov b5, c10 + unop +#endif + + MUL alpha, c15, b5 + fmov b5, c15 + ST c07, 2 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c11, t2, b5 + fmov b5, c11 + unop +#endif + + MUL alpha, c16, b5 + fmov b5, c16 + ST c08, 3 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c12, t3, b5 + fmov b5, c12 +#else + unop +#endif + ldi C2, 4 * SIZE(C2) + +#ifndef TRMMKERNEL + ADD c13, b3, b5 + fmov b5, c13 +#else + unop +#endif + ST c09, 0 * SIZE(C3) + fclr t1 + ldi C4, 4 * SIZE(C4) + +#ifndef TRMMKERNEL + ADD c14, t4, b5 + fmov b5, c14 +#else + unop +#endif + ST c10, 1 * SIZE(C3) + fclr t2 + unop + +#ifndef TRMMKERNEL + ADD c15, a5, b5 + fmov b5, c15 +#else + unop +#endif + ST c11, 2 * SIZE(C3) + fclr t3 + unop + +#ifndef TRMMKERNEL + IFMOVD tmp, b5 +# FIMOVD b1, tmp + fstd b1, 88($sp) + ADD c16, b5, b1 + fmov b1, c16 + fldd b1, 88($sp) +# IFMOVD tmp, b1 +#else + unop +#endif + ST c12, 3 * SIZE(C3) + fclr t4 + ldi C3, 4 * SIZE(C3) + + ST c13, -4 * SIZE(C4) + ST c14, -3 * SIZE(C4) + ST c15, -2 * SIZE(C4) + ST c16, -1 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 4, TMP1 +#else + subl TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 4, KK +#endif + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble L, $L25 + +#else + sll KK, BASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, 
c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, b5 + fmov b5, c09 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) +#else + unop +#endif + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C1) +#else + unop +#endif + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C2) + FIMOVD b5, tmp +#else + unop +#endif + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 +#ifndef TRMMKERNEL + LD b2, 1 * SIZE(C3) +#else + unop +#endif + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, t4 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL alpha, c01, b5 + fmov b5, c01 +#ifndef TRMMKERNEL + LD b4, 1 * SIZE(C4) +#else + unop +#endif + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL alpha, c02, b5 + fmov b5, c02 + unop + + ADD c13, t3, b5 + fmov b5, c13 + MUL alpha, c05, b5 + fmov b5, c05 + ADD c14, t4, b5 + fmov b5, c14 + MUL alpha, c06, b5 + fmov b5, c06 + + MUL alpha, c09, b5 + fmov b5, c09 +#ifndef TRMMKERNEL + ADD c01, a3, b5 + fmov b5, c01 +#endif + MUL alpha, c10, b5 + fmov b5, c10 +#ifndef TRMMKERNEL + ADD c02, a4, b5 + fmov b5, c02 +#endif + + MUL alpha, c13, b5 + fmov b5, c13 +#ifndef TRMMKERNEL + ADD c05, a5, b5 + fmov b5, c05 +#endif + MUL alpha, c14, b5 + fmov b5, c14 +#ifndef TRMMKERNEL + IFMOVD tmp, b5 + fstd b1, 88($sp) +# FIMOVD b1, tmp + ADD c06, b5, b1 + fmov b1, c06 + fldd b1, 88($sp) +# IFMOVD tmp, b1 +#endif + +#ifndef TRMMKERNEL + ADD c09, b1, b5 + fmov b5, c09 + unop +#endif + ST c01, 0 * SIZE(C1) + fclr t1 + +#ifndef TRMMKERNEL + ADD c10, b2, b5 + fmov b5, c10 + 
unop +#endif + ST c02, 1 * SIZE(C1) + fclr t2 + +#ifndef TRMMKERNEL + ADD c13, b3, b5 + fmov b5, c13 + unop +#endif + ST c05, 0 * SIZE(C2) + fclr t3 + +#ifndef TRMMKERNEL + ADD c14, b4, b5 + fmov b5, c14 + unop +#endif + ST c06, 1 * SIZE(C2) + fclr t4 + + ST c09, 0 * SIZE(C3) + ldi C1, 2 * SIZE(C1) + ST c10, 1 * SIZE(C3) + ldi C2, 2 * SIZE(C2) + + ST c13, 0 * SIZE(C4) + ldi C3, 2 * SIZE(C3) + ST c14, 1 * SIZE(C4) + ldi C4, 2 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble L, $L35 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b5, 3 * SIZE(BO) + FIMOVD b5, tmp + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + IFMOVD tmp, b5 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, b5 + fmov b5, c01 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L38 +#else + blbs TMP1, $L38 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L38: + ADD c05, t2, b5 + fmov b5, c05 + unop + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c09, t3, b5 + fmov b5, c09 + unop + MUL a1, b3, t3 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) + FIMOVD b5, tmp +#else + unop +#endif + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL a1, b4, t4 +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C3) +#else + unop +#endif 
+ + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL alpha, c01, b5 + fmov b5, c01 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c05, t2, b5 + fmov b5, c05 + unop + MUL alpha, c05, b5 + fmov b5, c05 + unop + + ADD c09, t3, b5 + fmov b5, c09 + MUL alpha, c09, b5 + fmov b5, c09 + ADD c13, t4, b5 + fmov b5, c13 + MUL alpha, c13, b5 + fmov b5, c13 + +#ifndef TRMMKERNEL + IFMOVD tmp, b5 + fstd b1, 88($sp) +# FIMOVD b1, tmp + ADD c01, a5, b1 + fmov b1, c01 + ADD c05, b5, b1 + fmov b1, c05 + ADD c09, a2, b1 + fmov b1, c09 + ADD c13, a3, b1 + fmov b1, c13 + fldd b1, 88($sp) +# IFMOVD tmp, b1 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 1, TMP1 +#else + subl TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif + .align 4 + +$L39: + mov BO, B + ldi J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 4, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + + mov C, C1 + addl C, LDC, C2 + mov A, AO + fclr t1 + addl C2, LDC, C + fclr t2 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 4, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + ble L, $L55 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, b5 
+ fmov b5, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, b5 + fmov b5, c05 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L58: + ADD c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c13, 0 * SIZE(C2) + unop +#endif + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 +#ifndef TRMMKERNEL + LD c14, 1 * SIZE(C2) +#else + unop +#endif + + ADD c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, t4 +#ifndef TRMMKERNEL + LD c15, 2 * SIZE(C2) +#else + unop +#endif + + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL alpha, c01, b5 + fmov b5, c01 +#ifndef TRMMKERNEL + LD c16, 3 * SIZE(C2) +#else + unop +#endif + + ADD c06, t2, b5 + fmov b5, c06 + ldi I, -1(I) + MUL alpha, c02, b5 + fmov b5, c02 + unop + + ADD c07, t3, b5 + fmov b5, c07 + MUL alpha, c03, b5 + fmov b5, c03 + ADD c08, t4, b5 + fmov b5, c08 + MUL alpha, c04, b5 + fmov b5, c04 + + MUL alpha, c05, b5 + fmov b5, c05 +#ifndef TRMMKERNEL + ADD c01, c09, b5 + fmov b5, c01 +#endif + MUL alpha, c06, b5 + fmov b5, c06 +#ifndef TRMMKERNEL + ADD c02, c10, b5 + fmov b5, c02 +#endif + + MUL alpha, c07, b5 + fmov b5, c07 +#ifndef TRMMKERNEL + ADD c03, c11, b5 + fmov b5, c03 +#endif + MUL alpha, c08, b5 + fmov b5, c08 +#ifndef TRMMKERNEL + ADD c04, c12, b5 + fmov b5, c04 +#endif + +#ifndef TRMMKERNEL + ADD c05, c13, b5 + fmov b5, c05 +#endif + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, c14, b5 + fmov b5, c06 +#endif + ST c02, 1 * SIZE(C1) + +#ifndef TRMMKERNEL + ADD c07, c15, b5 + fmov b5, c07 +#endif + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, c16, b5 + fmov b5, c08 +#endif + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + fclr t1 + ST c06, 1 * SIZE(C2) + fclr t2 + ST c07, 2 * SIZE(C2) + fclr t3 + ST c08, 3 * SIZE(C2) + fclr t4 + + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 
4, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 4, KK +#endif + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + ble L, $L65 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, b5 + fmov b5, c01 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L68 +#else + blbs TMP1, $L68 +#endif + .align 4 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L68: + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL alpha, c01, b5 + fmov b5, c01 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, b5 + fmov b5, c02 + ldi C1, 2 * SIZE(C1) + MUL alpha, c02, b5 + fmov b5, c02 + ldi C2, 2 * SIZE(C2) + + ADD c05, t3, b5 + fmov b5, c05 + MUL alpha, c05, b5 + fmov b5, c05 + ADD c06, t4, b5 + fmov b5, c06 + MUL alpha, c06, b5 + fmov b5, c06 + +#ifndef TRMMKERNEL + ADD c01, c09, b5 + fmov b5, c01 + ADD c02, c10, b5 + fmov b5, c02 + ADD c05, c11, b5 + fmov b5, c05 + ADD c06, c12, b5 + fmov b5, c06 
+#endif + + ST c01, -2 * SIZE(C1) + fclr t1 + ST c02, -1 * SIZE(C1) + fclr t2 + ST c05, -2 * SIZE(C2) + fclr t3 + ST c06, -1 * SIZE(C2) + fclr t4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + ble L, $L75 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, b5 + fmov b5, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, b5 + fmov b5, c01 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L78 +#else + blbs TMP1, $L78 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L78: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c02, t3, b5 + fmov b5, c02 + ADD c06, t4, b5 + fmov b5, c06 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) + FIMOVD b5, tmp +#else + unop +#endif + + ADD c01, c02, b5 + fmov b5, c01 + ADD c05, c06, b5 + fmov b5, c05 + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + + MUL alpha, c01, b5 + fmov b5, c01 + MUL alpha, c05, b5 + fmov b5, c05 + +#ifndef TRMMKERNEL + IFMOVD tmp ,b5 + fstd b1, 88($sp) +# FIMOVD b1, tmp + ADD c01, a5, b1 + fmov b1, c01 + ADD c05, b5, b1 + fmov b1, c05 + fldd b1, 88($sp) +# IFMOVD tmp ,b1 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 1, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + 
addl KK, 1, KK +#endif + .align 4 + +$L79: + mov BO, B +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2, KK +#else + unop +#endif + unop + unop + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 4, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L95 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + fldd alpha, ALPHA + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: +#ifndef TRMMKERNEL + ADD c01, t1, b5 + fmov b5, c01 + LD c05, 0 * SIZE(C1) + ADD c02, t2, b5 + fmov b5, c02 + 
LD c06, 1 * SIZE(C1) + ADD c03, t3, b5 + fmov b5, c03 + LD c07, 2 * SIZE(C1) + ADD c04, t4, b5 + fmov b5, c04 + LD c08, 3 * SIZE(C1) +#else + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 +#endif + + MUL alpha, c01, b5 + fmov b5, c01 + MUL alpha, c02, b5 + fmov b5, c02 + MUL alpha, c03, b5 + fmov b5, c03 + MUL alpha, c04, b5 + fmov b5, c04 + +#ifndef TRMMKERNEL + ADD c01, c05, b5 + fmov b5, c01 + ADD c02, c06, b5 + fmov b5, c02 + ADD c03, c07, b5 + fmov b5, c03 + ADD c04, c08, b5 + fmov b5, c04 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ldi C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 4, TMP1 +#else + subl TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + unop + unop + ble I, $L110 + .align 4 + +$L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L105 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + fldd alpha, ALPHA +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) + LD a4, 1 * SIZE(C1) +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * 
SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, b5 + fmov b5, c01 + fclr t1 + ADD c02, t2, b5 + fmov b5, c02 + fclr t2 + ADD c03, t3, b5 + fmov b5, c03 + fclr t3 + ADD c04, t4, b5 + fmov b5, c04 + fclr t4 + + ADD c01, c03, b5 + fmov b5, c01 + ADD c02, c04, b5 + fmov b5, c02 + + MUL alpha, c01, b5 + fmov b5, c01 + MUL alpha, c02, b5 + fmov b5, c02 + +#ifndef TRMMKERNEL + ADD c01, a3, b5 + fmov b5, c01 + ADD c02, a4, b5 + fmov b5, c02 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ldi C1, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L999 + .align 4 + +$L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L115 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + fldd alpha, ALPHA +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C1) +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c02, b5 + fmov b5, c01 + ADD c03, c04, b5 + fmov b5, c03 + ADD c01, c03, b5 + fmov b5, c01 + + MUL alpha, c01, b5 + fmov b5, c01 +#ifndef TRMMKERNEL + ADD c01, a2, b5 + fmov b5, c01 +#endif + ST c01, 0 * SIZE(C1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl $9, 80($sp) + clr $0 + ldi $sp, 
STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/gemm_kernel_4x4.S.bak b/kernel/sw_64/gemm_kernel_4x4.S.bak new file mode 100644 index 0000000..10dc98d --- /dev/null +++ b/kernel/sw_64/gemm_kernel_4x4.S.bak @@ -0,0 +1,2844 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(SW2B) +#error "Architecture is not specified." 
+#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP nop +#endif + + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA 64($sp) + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + ldl C, 0 + STACKSIZE($sp) + ldl LDC, 8 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldl OFFSET, 16 + STACKSIZE($sp) +#endif + + SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + fstd $f19, ALPHA + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subl $31, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: + mov C, C1 + addl C, LDC, C2 + mov A, AO + s4addl K, 0, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + addl C2, LDC, C3 + s4addl LDC, C, C + + SXADDQ BB, B, BB + fclr t1 + addl C3, LDC, C4 + fclr t2 + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(EV5) || defined(EV6) || defined(SW2B) + fillcs 0 * SIZE(BB) + fillcs 8 * SIZE(BB) + unop + ldi BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 4, TMP1 +#else + addl KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + +#else + sll KK, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 +#endif + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, 
t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + fldd alpha, ALPHA + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD c12, t2, c12 + unop + MUL b1, a2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL b1, a3, t1 + unop + + ADD c02, t2, c02 + unop + MUL 
b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C2) +#else + unop +#endif + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 +#ifndef TRMMKERNEL + LD a1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 2 * SIZE(C1) +#else + unop +#endif + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 3 * SIZE(C1) +#else + unop +#endif + + ADD c09, t1, c09 + ldi I, -1(I) + MUL b3, a3, t1 + unop + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C2) +#else + unop +#endif + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 2 * SIZE(C2) +#else + unop +#endif + + ADD c11, t1, c11 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD b4, 3 * SIZE(C2) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL alpha, c02, c02 +#ifndef TRMMKERNEL + LD t1, 1 * SIZE(C3) +#else + unop +#endif + + ADD c16, t3, c16 + unop + MUL alpha, c03, c03 +#ifndef TRMMKERNEL + LD t2, 2 * SIZE(C3) +#else + unop +#endif + + ADD c15, t4, c15 + unop + MUL alpha, c04, c04 +#ifndef TRMMKERNEL + LD t3, 3 * SIZE(C3) +#else + unop +#endif + + MUL alpha, c05, c05 + unop +#ifndef TRMMKERNEL + ADD c01, a5, c01 + LD t4, 1 * SIZE(C4) +#else + unop + unop +#endif + + MUL alpha, c06, c06 +#ifndef TRMMKERNEL + unop + ADD c02, b5, c02 + LD a5, 2 * SIZE(C4) +#endif + + MUL alpha, c07, c07 +#ifndef TRMMKERNEL + unop + ADD c03, a2, c03 + LD b5, 3 * SIZE(C4) +#endif + + MUL alpha, c08, c08 +#ifndef TRMMKERNEL + unop + ADD c04, b2, c04 + unop +#endif + + MUL alpha, c09, c09 + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c05, b1, c05 + unop +#endif + + MUL alpha, c10, c10 + ST c02, 1 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, a4, c06 + unop +#endif + + MUL alpha, c11, c11 + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c07, a3, c07 + unop +#endif + + MUL alpha, c12, c12 + ST c04, 3 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, b4, c08 +#else + unop +#endif + ldi C1, 4 * SIZE(C1) + + MUL alpha, c13, c13 + ST c05, 0 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c09, a1, c09 + unop +#endif + + MUL alpha, c14, c14 + ST c06, 1 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c10, t1, c10 + unop +#endif + + MUL alpha, c15, c15 + ST c07, 2 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c11, t2, c11 + unop +#endif + + MUL alpha, c16, c16 + ST c08, 3 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c12, t3, c12 +#else + unop +#endif + ldi C2, 4 * SIZE(C2) + +#ifndef TRMMKERNEL + ADD c13, b3, c13 +#else + unop +#endif + ST c09, 0 * SIZE(C3) + fclr t1 + ldi C4, 4 * SIZE(C4) + +#ifndef TRMMKERNEL + ADD c14, t4, c14 +#else + unop +#endif + ST c10, 1 * SIZE(C3) + fclr t2 + unop + +#ifndef TRMMKERNEL + ADD c15, a5, c15 +#else + unop +#endif + ST c11, 2 * SIZE(C3) + fclr t3 + unop + +#ifndef TRMMKERNEL + ADD c16, b5, c16 +#else + unop +#endif + ST c12, 3 * SIZE(C3) + fclr t4 + ldi C3, 4 * SIZE(C3) + + ST c13, -4 * SIZE(C4) + ST c14, -3 * SIZE(C4) + ST c15, -2 * SIZE(C4) + ST c16, -1 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 4, TMP1 +#else + subl TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addl KK, 4, KK +#endif + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble L, $L25 + +#else + sll KK, BASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD c10, t2, c10 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) +#else + unop +#endif + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C1) +#else + unop +#endif + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C3) +#else + unop 
+#endif + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 +#ifndef TRMMKERNEL + LD b2, 1 * SIZE(C3) +#else + unop +#endif + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c09, t1, c09 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD b4, 1 * SIZE(C4) +#else + unop +#endif + + ADD c10, t2, c10 + unop + MUL alpha, c02, c02 + unop + + ADD c13, t3, c13 + MUL alpha, c05, c05 + ADD c14, t4, c14 + MUL alpha, c06, c06 + + MUL alpha, c09, c09 +#ifndef TRMMKERNEL + ADD c01, a3, c01 +#endif + MUL alpha, c10, c10 +#ifndef TRMMKERNEL + ADD c02, a4, c02 +#endif + + MUL alpha, c13, c13 +#ifndef TRMMKERNEL + ADD c05, a5, c05 +#endif + MUL alpha, c14, c14 +#ifndef TRMMKERNEL + ADD c06, b5, c06 +#endif + +#ifndef TRMMKERNEL + ADD c09, b1, c09 + unop +#endif + ST c01, 0 * SIZE(C1) + fclr t1 + +#ifndef TRMMKERNEL + ADD c10, b2, c10 + unop +#endif + ST c02, 1 * SIZE(C1) + fclr t2 + +#ifndef TRMMKERNEL + ADD c13, b3, c13 + unop +#endif + ST c05, 0 * SIZE(C2) + fclr t3 + +#ifndef TRMMKERNEL + ADD c14, b4, c14 + unop +#endif + ST c06, 1 * SIZE(C2) + fclr t4 + + ST c09, 0 * SIZE(C3) + ldi C1, 2 * SIZE(C1) + ST c10, 1 * SIZE(C3) + ldi C2, 2 * SIZE(C2) + + ST c13, 0 * SIZE(C4) + ldi C3, 2 * SIZE(C3) + ST c14, 1 * SIZE(C4) + ldi C4, 2 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble L, $L35 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L38 +#else + blbs TMP1, $L38 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + 
LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L38: + ADD c05, t2, c05 + unop + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c09, t3, c09 + unop + MUL a1, b3, t3 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c13, t4, c13 + unop + MUL a1, b4, t4 +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C3) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c05, t2, c05 + unop + MUL alpha, c05, c05 + unop + + ADD c09, t3, c09 + MUL alpha, c09, c09 + ADD c13, t4, c13 + MUL alpha, c13, c13 + +#ifndef TRMMKERNEL + ADD c01, a5, c01 + ADD c05, b5, c05 + ADD c09, a2, c09 + ADD c13, a3, c13 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 1, TMP1 +#else + subl TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif + .align 4 + +$L39: + mov BO, B + ldi J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 4, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + + mov C, C1 + addl C, LDC, C2 + mov A, AO + fclr t1 + addl C2, LDC, C + fclr t2 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 4, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + ble L, $L55 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + 
MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L58: + ADD c06, t2, c06 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c13, 0 * SIZE(C2) + unop +#endif + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 +#ifndef TRMMKERNEL + LD c14, 1 * SIZE(C2) +#else + unop +#endif + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 +#ifndef TRMMKERNEL + LD c15, 2 * SIZE(C2) +#else + unop +#endif + + ADD c05, t1, c05 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD c16, 3 * SIZE(C2) +#else + unop +#endif + + ADD c06, t2, c06 + ldi I, -1(I) + MUL alpha, c02, c02 + unop + + ADD c07, t3, c07 + MUL alpha, c03, c03 + ADD c08, t4, c08 + MUL alpha, c04, c04 + + MUL alpha, c05, c05 +#ifndef TRMMKERNEL + ADD c01, c09, c01 +#endif + MUL alpha, c06, c06 +#ifndef TRMMKERNEL + ADD c02, c10, c02 +#endif + + MUL alpha, c07, c07 +#ifndef TRMMKERNEL + ADD c03, c11, c03 +#endif + MUL alpha, c08, c08 +#ifndef TRMMKERNEL + ADD c04, c12, c04 +#endif + +#ifndef TRMMKERNEL + ADD c05, c13, c05 +#endif + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, c14, c06 +#endif + ST c02, 1 * SIZE(C1) + +#ifndef TRMMKERNEL + ADD c07, c15, c07 +#endif + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, c16, c08 +#endif + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + fclr t1 + ST c06, 1 * SIZE(C2) + fclr t2 + ST c07, 2 * SIZE(C2) + fclr t3 + ST c08, 3 * SIZE(C2) + fclr t4 + + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 4, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 4, KK +#endif + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + ble L, $L65 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L68 +#else + blbs TMP1, $L68 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L68: + ADD c02, t2, c02 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c05, t3, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, c02 + ldi C1, 2 * SIZE(C1) + MUL alpha, c02, c02 + ldi C2, 2 * SIZE(C2) + + ADD c05, t3, c05 + MUL alpha, c05, c05 + ADD c06, t4, c06 + MUL alpha, c06, c06 + +#ifndef TRMMKERNEL + ADD c01, c09, c01 + ADD c02, c10, c02 + ADD c05, c11, c05 + ADD c06, c12, c06 +#endif + + ST c01, -2 * SIZE(C1) + fclr t1 + ST c02, -1 * SIZE(C1) + fclr t2 + ST c05, -2 * SIZE(C2) + fclr t3 + ST c06, -1 * SIZE(C2) + fclr t4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL 
+#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + ble L, $L75 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + fldd alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L78 +#else + blbs TMP1, $L78 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L78: + ADD c05, t2, c05 + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c02, t3, c02 + ADD c06, t4, c06 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, c02, c01 + ADD c05, c06, c05 + + ADD c01, t1, c01 + ADD c05, t2, c05 + + MUL alpha, c01, c01 + MUL alpha, c05, c05 + +#ifndef TRMMKERNEL + ADD c01, a5, c01 + ADD c05, b5, c05 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 1, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif + .align 4 + +$L79: + mov BO, B +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2, KK +#else + unop +#endif + unop + unop + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 4, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L95 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * 
SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + fldd alpha, ALPHA + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: +#ifndef TRMMKERNEL + ADD c01, t1, c01 + LD c05, 0 * SIZE(C1) + ADD c02, t2, c02 + LD c06, 1 * SIZE(C1) + ADD c03, t3, c03 + LD c07, 2 * SIZE(C1) + ADD c04, t4, c04 + LD c08, 3 * SIZE(C1) +#else + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 +#endif + + MUL alpha, c01, c01 + MUL alpha, c02, c02 + MUL alpha, c03, c03 + MUL alpha, c04, c04 + +#ifndef TRMMKERNEL + ADD c01, c05, c01 + ADD c02, c06, c02 + ADD c03, c07, c03 + ADD c04, c08, c04 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ldi C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 4, TMP1 +#else + subl TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + unop + unop + ble I, $L110 + .align 4 + +$L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) 
+ fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L105 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + fldd alpha, ALPHA +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) + LD a4, 1 * SIZE(C1) +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + fclr t1 + ADD c02, t2, c02 + fclr t2 + ADD c03, t3, c03 + fclr t3 + ADD c04, t4, c04 + fclr t4 + + ADD c01, c03, c01 + ADD c02, c04, c02 + + MUL alpha, c01, c01 + MUL alpha, c02, c02 + +#ifndef TRMMKERNEL + ADD c01, a3, c01 + ADD c02, a4, c02 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ldi C1, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L999 + .align 4 + +$L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L115 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD 
b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + fldd alpha, ALPHA +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C1) +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + ADD c01, a2, c01 +#endif + ST c01, 0 * SIZE(C1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/gemm_kernel_simd_16x4.S b/kernel/sw_64/gemm_kernel_simd_16x4.S new file mode 100644 index 0000000..1acf679 --- /dev/null +++ b/kernel/sw_64/gemm_kernel_simd_16x4.S @@ -0,0 +1,4054 @@ +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(SW2B) +#error "Architecture is not specified." 
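The new kernel/sw_64/gemm_kernel_simd_16x4.S introduced here implements a 16x4 register-blocked micro-kernel: each step of the K loop (unrolled by two in the assembly) loads 16 packed A elements into four vector registers (a0/a4/a8/a12) and four broadcast B values (b0..b3), accumulates into t00..t15, then scales by alpha and, on the plain GEMM path, adds the existing C tile. A rough scalar C sketch of that computation, with illustrative names (micro_16x4, packed_a, packed_b) and double used for concreteness:

static void micro_16x4(long k, double alpha,
                       const double *packed_a,   /* 16 elements per k step */
                       const double *packed_b,   /* 4 elements per k step  */
                       double *c, long ldc)
{
    double t[16][4] = {{0.0}};                   /* t00..t15 accumulators  */

    for (long p = 0; p < k; p++)                 /* K loop (unrolled by 2 in the assembly) */
        for (int j = 0; j < 4; j++)              /* b0..b3 (columns)       */
            for (int i = 0; i < 16; i++)         /* a0..a12 (16 rows, 4 lanes per vector) */
                t[i][j] += packed_a[16 * p + i] * packed_b[4 * p + j];

    for (int j = 0; j < 4; j++)                  /* CO, C1, C2, C3 = C + j*ldc */
        for (int i = 0; i < 16; i++)
#ifdef TRMMKERNEL
            c[j * ldc + i]  = alpha * t[i][j];   /* TRMM path: VMUL + VST        */
#else
            c[j * ldc + i] += alpha * t[i][j];   /* GEMM path: VLD + VMAD + VST  */
#endif
}

The fillcs instructions interleaved with the arithmetic only prefetch the C, A and B panels (PREA/PREB) and do not change the result.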
+#endif + + +#define STACKSIZE 336 + +#define CO $1 +#define C1 $2 +#define C2 $3 +#define C3 $4 + +#define LDM $5 + +#define PREB $7 +#define SPANA $8 +#define SPANB $9 +#define NC1 $10 +#define KC1 $11 +#define MC1 $12 +#define PREA $13 + +#define A $20 +#define B $21 +#define C $19 +#define MC $16 +#define NC $17 +#define KC $18 + +#define A1 $22 +#define B1 $23 + +#define ALPHA $f8 + +#define a0 $f0 +#define a4 $f1 +#define a8 $f2 +#define a12 $f3 + +#define b0 $f4 +#define b1 $f5 +#define b2 $f6 +#define b3 $f7 + +#define na0 $f0 +#define na4 $f8 +#define na8 $f9 +#define na12 $f10 + +#define nb0 $f11 +#define nb1 $f12 +#define nb2 $f13 +#define nb3 $f14 + +#define t00 $f15 +#define t01 $f16 +#define t02 $f17 +#define t03 $f18 +#define t04 $f19 +#define t05 $f20 +#define t06 $f21 +#define t07 $f22 +#define t08 $f23 +#define t09 $f24 +#define t10 $f25 +#define t11 $f26 +#define t12 $f27 +#define t13 $f28 +#define t14 $f29 +#define t15 $f30 + +#define c00 $f1 +#define c01 $f2 +#define c02 $f3 +#define c03 $f4 + +#define c04 $f5 +#define c05 $f6 +#define c06 $f7 +#define c07 $f9 + +#define c08 $f10 +#define c09 $f11 +#define c10 $f12 +#define c11 $f13 + +#define c12 $f1 +#define c13 $f2 +#define c14 $f3 +#define c15 $f4 + +#if defined(TRMMKERNEL) +#define TEMP $14 +#define KK $24 +#define OFFSET $25 +#endif + + PROLOGUE + PROFCODE + +.frame $30,STACKSIZE,$26,0 +ldi $sp,-STACKSIZE($sp) # # [2] + + stl $9,328($sp) # Integer Saved Register + stl $10,320($sp) + stl $11,312($sp) + stl $12,304($sp) + stl $13,296($sp) + stl $14,288($sp) + + + ST $f2,280($sp) # Float Saved Register + ST $f3,272($sp) + ST $f4,264($sp) + ST $f5,256($sp) + ST $f6,248($sp) + ST $f7,240($sp) + ST $f8,232($sp) + ST $f9,224($sp) + + + + .align 5 + +$Begin_NC_Unroll4: + ldl C, 0 + STACKSIZE($sp) # load C + ldl LDM, 8 + STACKSIZE($sp) # load ldm + +#ifdef TRMMKERNEL + ldl OFFSET, 16 + STACKSIZE($sp) # load offset + nop +#endif + + ST $f19, 192($sp) # store alpha + SXADDQ LDM, 0, LDM # ldm*X+0 + + mov NC, NC1 # backup nc + mov KC, KC1 # backup kc + mov MC, MC1 # backup mc + + mov B, B1 # backup the initial address of b + sra NC1,2,NC # NC=NC1/4 Unroll N 4 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subl $31, OFFSET, KK # when trmm at right + nop +#endif + + mov A, A1 # backup the initial address of a + sll KC1,1+BASE_SHIFT,SPANB # kc*2nr + + sll KC1,4+BASE_SHIFT,SPANA # kc*16mr + beq NC,$Begin_NC_Unroll2 + + + .align 5 + +.L0: + sra MC1,4,MC # MC=MC1/16 + mov C, CO # compute c pointer + + addl B1,SPANB,PREB # prefetch B + addl A1,SPANA,PREA # prefetch A + + addl C, LDM, C1 + addl C1,LDM, C2 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET,KK # Reset the left offset + nop +#endif + + subl PREA,16*SIZE,PREA # prea=kc1*mc-mc + addl C2,LDM, C3 + + s4addl LDM,C,C # C=ldm*4+C + beq MC,.L15 # MC=0:MC1<16 + + + .align 5 # nr=4,mr=4----------------------------- + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B # LL && RU reset B + nop +#else + sll KK, 4 + BASE_SHIFT, KC # KK*16 + sll KK, 2 + BASE_SHIFT, TEMP # KK*4 + + addl A, KC, A # mov A point to the data part + addl B1,TEMP,B # mov B point to the data part +#endif + + vcpys $f31,$f31,t00 # CLEAR Results Register + fillcs 0(CO) # prefetch C + fillcs 0(C1) + + vcpys $f31,$f31,t01 # 64 results + fillcs 0(C2) + fillcs 0(C3) + + vcpys $f31,$f31,t02 + LDDE b0,0*SIZE(B) + LDDE b1,1*SIZE(B) + + vcpys $f31,$f31,t03 + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + vcpys $f31,$f31,t04 + fillcs 4(CO) # prefetch C + 
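For the TRMM build, the setup just shown decides where the 16x4 tile starts reading inside the packed panels and how many K iterations it runs: when (LEFT && TRANSA) or (!LEFT && !TRANSA) only B is reset to the panel start (mov B1,B); otherwise A is advanced by KK*16 and B by KK*4 elements, and the trip count TEMP becomes K-KK, KK+16 or KK+4 depending on the same macros. A hedged C sketch of that bookkeeping (the helper name trmm_tile_setup and the MR/NR constants are illustrative):

#define MR 16   /* rows handled by this tile    */
#define NR 4    /* columns handled by this tile */

/* Positions the A/B read pointers and returns the K-loop trip count for
 * one 16x4 TRMM tile, mirroring the sll/addl sequences in the assembly. */
static long trmm_tile_setup(long K, long KK,
                            const double **ao, const double *a,
                            const double **bo, const double *b_panel)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    *ao = a;                      /* A unchanged, B reset: mov B1,B        */
    *bo = b_panel;
#else
    *ao = a       + KK * MR;      /* sll KK,4+BASE_SHIFT / addl A,KC,A     */
    *bo = b_panel + KK * NR;      /* sll KK,2+BASE_SHIFT / addl B1,TEMP,B  */
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    return K - KK;                /* subl KC1,KK,TEMP                      */
#elif defined(LEFT)
    return KK + MR;               /* addl KK,16,TEMP                       */
#else
    return KK + NR;               /* addl KK,4,TEMP                        */
#endif
}

After the tile is written, the matching $TRMMKERNEL_16x4 block moves A and B past the untouched remainder of the panels and, for LEFT, advances KK by 16.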
fillcs 4(C1) + + vcpys $f31,$f31,t05 + fillcs 4(C2) + fillcs 4(C3) + + vcpys $f31,$f31,t06 + VLD a0, 0*SIZE(A) + VLD a4, 4*SIZE(A) + + vcpys $f31,$f31,t07 + VLD a8, 8*SIZE(A) + VLD a12,12*SIZE(A) + + vcpys $f31,$f31,t08 + fillcs 8*SIZE(CO) + fillcs 8*SIZE(C1) + + vcpys $f31,$f31,t09 + fillcs 8*SIZE(C2) + fillcs 8*SIZE(C3) + + vcpys $f31,$f31,t10 + fillcs 12*SIZE(CO) + fillcs 12*SIZE(C1) + + vcpys $f31,$f31,t11 + fillcs 12*SIZE(C2) + fillcs 12*SIZE(C3) + + vcpys $f31,$f31,t12 + vcpys $f31,$f31,t13 + vcpys $f31,$f31,t14 + vcpys $f31,$f31,t15 + + +#if (defined(LEFT) && !defined(TRANSA)) \ + ||(!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP # temp is the length of data part +#elif defined(LEFT) + addl KK, 16, TEMP # mr=16 +#else + addl KK, 4, TEMP # right nr=4 +#endif + sra TEMP, 1, KC # KC=TEMP/2 + + nop + beq KC, $Rest_16x4x1 + +#else + + vcpys $f31,$f31,t00 # CLEAR Results Register + mov B1,B # Reset B + sra KC1,1,KC # Unroll Kr=2, KC=KC1/2 + + vcpys $f31,$f31,t01 # 64 results + fillcs 0(CO) # prefetch C + fillcs 0(C1) + + vcpys $f31,$f31,t02 + fillcs 0(C2) + fillcs 0(C3) + + vcpys $f31,$f31,t03 + LDDE b0,0*SIZE(B) + LDDE b1,1*SIZE(B) + + vcpys $f31,$f31,t04 + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + vcpys $f31,$f31,t05 + fillcs 4(CO) # prefetch C + fillcs 4(C1) + + vcpys $f31,$f31,t06 + fillcs 4(C2) + fillcs 4(C3) + + vcpys $f31,$f31,t07 + VLD a0, 0*SIZE(A) + VLD a4, 4*SIZE(A) + + vcpys $f31,$f31,t08 + VLD a8, 8*SIZE(A) + VLD a12,12*SIZE(A) + + vcpys $f31,$f31,t09 + fillcs 8(CO) # prefetch C + fillcs 8(C1) + + vcpys $f31,$f31,t10 + fillcs 8(C2) + fillcs 8(C3) + + vcpys $f31,$f31,t11 + fillcs 12*SIZE(CO) + fillcs 12*SIZE(C1) + + vcpys $f31,$f31,t12 + fillcs 12*SIZE(C2) + fillcs 12*SIZE(C3) + + vcpys $f31,$f31,t13 + vcpys $f31,$f31,t14 + + vcpys $f31,$f31,t15 + beq KC,$Rest_16x4x1 # KC1<2 goto $Rest_16x4x1 + +#endif + + .align 5 + +$Panel_16x4x2: # nr=4,mr=4,kr=2------------------------ + + VMAD a0,b0,t00,t00 + addl A,16*SIZE,A # 16a*1k + LDDE nb0,4*SIZE(B) # get next 4b + + VMAD a0,b1,t04,t04 + LDDE nb1,5*SIZE(B) + + VMAD a4,b0,t01,t01 + VLD na12,12*SIZE(A) + + VMAD a4,b1,t05,t05 + VLD na8,8*SIZE(A) + + VMAD a0,b2,t08,t08 + LDDE nb2,6*SIZE(B) + + VMAD a0,b3,t12,t12 + LDDE nb3,7*SIZE(B) + + VMAD a8,b0,t02,t02 + VMAD a8,b1,t06,t06 + + VMAD a4,b2,t09,t09 + addl B,8*SIZE,B # 4b*2k + VLD na0,0*SIZE(A) # carefule na0=a0 use the same register + + VMAD a4,b3,t13,t13 + VLD na4,4*SIZE(A) # get next 16a + + VMAD a12,b0,t03,t03 + VMAD a12,b1,t07,t07 + + VMAD a8,b2,t10,t10 + fillcs 0(PREB) + + VMAD a8,b3,t14,t14 + fillcs 0(PREA) + + VMAD a12,b2,t11,t11 + fillcs 8*SIZE(PREA) + + VMAD a12,b3,t15,t15 + subl KC,1,KC # loop k -- + + + VMAD na12,nb0,t03,t03 + addl A,16*SIZE,A # ### next k ### + LDDE b0,0(B) # get 3rd 4b + + VMAD na12,nb1,t07,t07 + LDDE b1,1*SIZE(B) + + VMAD na8,nb0,t02,t02 + VLD a12,12*SIZE(A) + + VMAD na8,nb1,t06,t06 + VLD a8,8*SIZE(A) + + VMAD na0,nb0,t00,t00 + subl PREA,16*SIZE,PREA # prea-=16 + LDDE b2,2*SIZE(B) + + VMAD na0,nb1,t04,t04 + LDDE b3,3*SIZE(B) + + VMAD na12,nb2,t11,t11 + VMAD na12,nb3,t15,t15 + VMAD na8,nb2,t10,t10 + VMAD na8,nb3,t14,t14 + + VMAD na0,nb2,t08,t08 + fillcs 0(PREA) + + VMAD na0,nb3,t12,t12 + fillcs 4*SIZE(PREB) + + VMAD na4,nb0,t01,t01 + VLD a0,0(A) # get 3rd 16a + + VMAD na4,nb1,t05,t05 + VLD a4,4*SIZE(A) + + VMAD na4,nb2,t09,t09 + fillcs 8*SIZE(PREA) + addl PREB,8*SIZE,PREB # preb+=8 + + VMAD na4,nb3,t13,t13 + subl PREA,16*SIZE,PREA # prea-=16 + bne KC,$Panel_16x4x2 + + +$Rest_16x4x1: + LDDE ALPHA, 192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1, 
$Write_16x4 +#else + blbc TEMP,$Write_16x4 +#endif + + VMAD a0,b0,t00,t00 + addl A,16*SIZE,A # 16a*1k + + VMAD a0,b1,t04,t04 + addl B,4*SIZE,B # 4b*1k + + VMAD a0,b2,t08,t08 + VMAD a0,b3,t12,t12 + + + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + VMAD a4,b2,t09,t09 + VMAD a4,b3,t13,t13 + + VMAD a8,b0,t02,t02 + VMAD a8,b1,t06,t06 + VMAD a8,b2,t10,t10 + VMAD a8,b3,t14,t14 + + VMAD a12,b0,t03,t03 + VMAD a12,b1,t07,t07 + VMAD a12,b2,t11,t11 + VMAD a12,b3,t15,t15 + + + .align 5 + +$Write_16x4: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1), $6 ### gemm part #### + bne $6, $UnAlign_CO_Access_16x4 + +$Align_CO_Access_16x4: + VLD c00,0(CO) + VLD c01,4*SIZE(CO) + VLD c02,8*SIZE(CO) + VLD c03,12*SIZE(CO) + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + VMAD t02,ALPHA,c02,t02 + VMAD t03,ALPHA,c03,t03 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + VST t02,8*SIZE(CO) + VST t03,12*SIZE(CO) + jmp $Access_C1_16x4 + +$UnAlign_CO_Access_16x4: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c04, 1*VEC_LEN*SIZE(CO) + + VLD_UL c01, 1*VEC_LEN*SIZE(CO) + VLD_UH c05, 2*VEC_LEN*SIZE(CO) + + vbisw c00,c04,c00 + VLD_UL c02, 2*VEC_LEN*SIZE(CO) + VLD_UH c06, 3*VEC_LEN*SIZE(CO) + + vbisw c01,c05,c01 + VLD_UL c03, 3*VEC_LEN*SIZE(CO) + VLD_UH c07, 4*VEC_LEN*SIZE(CO) + + vbisw c02,c06,c02 + vbisw c03,c07,c03 + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VMAD t02,ALPHA,c02,t02 + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VMAD t03,ALPHA,c03,t03 + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + VST_UL t02, 2*VEC_LEN*SIZE(CO) + VST_UH t02, 3*VEC_LEN*SIZE(CO) + + VST_UL t03, 3*VEC_LEN*SIZE(CO) + VST_UH t03, 4*VEC_LEN*SIZE(CO) + + +$Access_C1_16x4: + and C1, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C1_Access_16x4 + +$Align_C1_Access_16x4: + VLD c04,0(C1) + VLD c05,4*SIZE(C1) + VLD c06,8*SIZE(C1) + VLD c07,12*SIZE(C1) + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + VMAD t06,ALPHA,c06,t06 + VMAD t07,ALPHA,c07,t07 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + VST t06,8*SIZE(C1) + VST t07,12*SIZE(C1) + jmp $Access_C2_16x4 + +$UnAlign_C1_Access_16x4: + VLD_UL c04, 0*VEC_LEN*SIZE(C1) + VLD_UH t00, 1*VEC_LEN*SIZE(C1) + + VLD_UL c05, 1*VEC_LEN*SIZE(C1) + VLD_UH t01, 2*VEC_LEN*SIZE(C1) + + vbisw c04,t00,c04 + VLD_UL c06, 2*VEC_LEN*SIZE(C1) + VLD_UH t02, 3*VEC_LEN*SIZE(C1) + + vbisw c05,t01,c05 + VLD_UL c07, 3*VEC_LEN*SIZE(C1) + VLD_UH t03, 4*VEC_LEN*SIZE(C1) + + vbisw c06,t02,c06 + vbisw c07,t03,c07 + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + + VMAD t06,ALPHA,c06,t06 + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VMAD t07,ALPHA,c07,t07 + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + + VST_UL t06, 2*VEC_LEN*SIZE(C1) + VST_UH t06, 3*VEC_LEN*SIZE(C1) + + VST_UL t07, 3*VEC_LEN*SIZE(C1) + VST_UH t07, 4*VEC_LEN*SIZE(C1) + + +$Access_C2_16x4: + and C2, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C2_Access_16x4 + + $Align_C2_Access_16x4: + VLD c08,0(C2) + VLD c09,4*SIZE(C2) + VLD c10,8*SIZE(C2) + VLD c11,12*SIZE(C2) + + VMAD t08,ALPHA,c08,t08 + VMAD t09,ALPHA,c09,t09 + VMAD t10,ALPHA,c10,t10 + VMAD t11,ALPHA,c11,t11 + + VST t08,0(C2) + VST t09,4*SIZE(C2) + VST t10,8*SIZE(C2) + VST t11,12*SIZE(C2) + jmp $Access_C3_16x4 + +$UnAlign_C2_Access_16x4: + VLD_UL c08, 0*VEC_LEN*SIZE(C2) + VLD_UH t00, 1*VEC_LEN*SIZE(C2) + + VLD_UL c09, 1*VEC_LEN*SIZE(C2) + VLD_UH t01, 2*VEC_LEN*SIZE(C2) + + vbisw c08,t00,c08 + VLD_UL c10, 2*VEC_LEN*SIZE(C2) + VLD_UH t02, 3*VEC_LEN*SIZE(C2) + + vbisw c09,t01,c09 + VLD_UL c11, 3*VEC_LEN*SIZE(C2) + VLD_UH t03, 4*VEC_LEN*SIZE(C2) + + 
vbisw c10,t02,c10 + vbisw c11,t03,c11 + + VMAD t08,ALPHA,c08,t08 + VMAD t09,ALPHA,c09,t09 + + VMAD t10,ALPHA,c10,t10 + VST_UL t08, 0*VEC_LEN*SIZE(C2) + VST_UH t08, 1*VEC_LEN*SIZE(C2) + + VMAD t11,ALPHA,c11,t11 + VST_UL t09, 1*VEC_LEN*SIZE(C2) + VST_UH t09, 2*VEC_LEN*SIZE(C2) + + VST_UL t10, 2*VEC_LEN*SIZE(C2) + VST_UH t10, 3*VEC_LEN*SIZE(C2) + + VST_UL t11, 3*VEC_LEN*SIZE(C2) + VST_UH t11, 4*VEC_LEN*SIZE(C2) + + +$Access_C3_16x4: + and C3, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C3_Access_16x4 + +$Align_C3_Access_16x4: + VLD c12,0(C3) + VLD c13,4*SIZE(C3) + VLD c14,8*SIZE(C3) + VLD c15,12*SIZE(C3) + + VMAD t12,ALPHA,c12,t12 + VMAD t13,ALPHA,c13,t13 + VMAD t14,ALPHA,c14,t14 + VMAD t15,ALPHA,c15,t15 + + VST t12,0(C3) + VST t13,4*SIZE(C3) + VST t14,8*SIZE(C3) + VST t15,12*SIZE(C3) + jmp $End_NC_Unroll4 + +$UnAlign_C3_Access_16x4: + VLD_UL c12, 0*VEC_LEN*SIZE(C3) + VLD_UH t04, 1*VEC_LEN*SIZE(C3) + + VLD_UL c13, 1*VEC_LEN*SIZE(C3) + VLD_UH t05, 2*VEC_LEN*SIZE(C3) + + vbisw c12,t04,c12 + VLD_UL c14, 2*VEC_LEN*SIZE(C3) + VLD_UH t06, 3*VEC_LEN*SIZE(C3) + + vbisw c13,t05,c13 + VLD_UL c15, 3*VEC_LEN*SIZE(C3) + VLD_UH t07, 4*VEC_LEN*SIZE(C3) + + vbisw c14,t06,c14 + vbisw c15,t07,c15 + + VMAD t12,ALPHA,c12,t12 + VMAD t13,ALPHA,c13,t13 + + VMAD t14,ALPHA,c14,t14 + VST_UL t12, 0*VEC_LEN*SIZE(C3) + VST_UH t12, 1*VEC_LEN*SIZE(C3) + + VMAD t15,ALPHA,c15,t15 + VST_UL t13, 1*VEC_LEN*SIZE(C3) + VST_UH t13, 2*VEC_LEN*SIZE(C3) + + VST_UL t14, 2*VEC_LEN*SIZE(C3) + VST_UH t14, 3*VEC_LEN*SIZE(C3) + + VST_UL t15, 3*VEC_LEN*SIZE(C3) + VST_UH t15, 4*VEC_LEN*SIZE(C3) + jmp $End_NC_Unroll4 + +#else + and CO, (VEC_LEN*SIZE-1),$6 ### trmm part ### + bne $6,$UnAlign_CO_Access_16x4 + +$Align_CO_Access_16x4: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + VMUL t02,ALPHA,t02 + VMUL t03,ALPHA,t03 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + VST t02,8*SIZE(CO) + VST t03,12*SIZE(CO) + jmp $Access_C1_16x4 + +$UnAlign_CO_Access_16x4: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VMUL t02,ALPHA,t02 + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VMUL t03,ALPHA,t03 + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + VST_UL t02, 2*VEC_LEN*SIZE(CO) + VST_UH t02, 3*VEC_LEN*SIZE(CO) + + VST_UL t03, 3*VEC_LEN*SIZE(CO) + VST_UH t03, 4*VEC_LEN*SIZE(CO) + + +$Access_C1_16x4: + and C1, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C1_Access_16x4 + +$Align_C1_Access_16x4: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + VMUL t06,ALPHA,t06 + VMUL t07,ALPHA,t07 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + VST t06,8*SIZE(C1) + VST t07,12*SIZE(C1) + jmp $Access_C2_16x4 + +$UnAlign_C1_Access_16x4: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + + VMUL t06,ALPHA,t06 + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VMUL t07,ALPHA,t07 + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + + VST_UL t06, 2*VEC_LEN*SIZE(C1) + VST_UH t06, 3*VEC_LEN*SIZE(C1) + + VST_UL t07, 3*VEC_LEN*SIZE(C1) + VST_UH t07, 4*VEC_LEN*SIZE(C1) + + +$Access_C2_16x4: + and C2, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C2_Access_16x4 + +$Align_C2_Access_16x4: + VMUL t08,ALPHA,t08 + VMUL t09,ALPHA,t09 + VMUL t10,ALPHA,t10 + VMUL t11,ALPHA,t11 + + VST t08,0(C2) + VST t09,4*SIZE(C2) + VST t10,8*SIZE(C2) + VST t11,12*SIZE(C2) + jmp $Access_C3_16x4 + +$UnAlign_C2_Access_16x4: + VMUL t08,ALPHA,t08 + VMUL t09,ALPHA,t09 + + VMUL t10,ALPHA,t10 + VST_UL t08, 0*VEC_LEN*SIZE(C2) + VST_UH t08, 1*VEC_LEN*SIZE(C2) + + VMUL t11,ALPHA,t11 + VST_UL t09, 1*VEC_LEN*SIZE(C2) + VST_UH t09, 2*VEC_LEN*SIZE(C2) + + VST_UL t10, 2*VEC_LEN*SIZE(C2) + VST_UH t10, 
3*VEC_LEN*SIZE(C2) + + VST_UL t11, 3*VEC_LEN*SIZE(C2) + VST_UH t11, 4*VEC_LEN*SIZE(C2) + + +$Access_C3_16x4: + and C3, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C3_Access_16x4 + +$Align_C3_Access_16x4: + VMUL t12,ALPHA,t12 + VMUL t13,ALPHA,t13 + VMUL t14,ALPHA,t14 + VMUL t15,ALPHA,t15 + + VST t12,0(C3) + VST t13,4*SIZE(C3) + VST t14,8*SIZE(C3) + VST t15,12*SIZE(C3) + jmp $TRMMKERNEL_16x4 + +$UnAlign_C3_Access_16x4: + VMUL t12,ALPHA,t12 + VMUL t13,ALPHA,t13 + + VMUL t14,ALPHA,t14 + VST_UL t12, 0*VEC_LEN*SIZE(C3) + VST_UH t12, 1*VEC_LEN*SIZE(C3) + + VMUL t15,ALPHA,t15 + VST_UL t13, 1*VEC_LEN*SIZE(C3) + VST_UH t13, 2*VEC_LEN*SIZE(C3) + + VST_UL t14, 2*VEC_LEN*SIZE(C3) + VST_UH t14, 3*VEC_LEN*SIZE(C3) + + VST_UL t15, 3*VEC_LEN*SIZE(C3) + VST_UH t15, 4*VEC_LEN*SIZE(C3) + + +$TRMMKERNEL_16x4: +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP # nodata length +#ifdef LEFT + subl TEMP, 16, TEMP # mr=16 +#else + subl TEMP, 4, TEMP # nr=4 +#endif + + sll TEMP, 4 + BASE_SHIFT,KC # mr=16 + sll TEMP, 2 + BASE_SHIFT,TEMP # nr=4 + + addl A, KC, A # mov A to the end of this panel + addl B, TEMP,B # mov B to the end of this panel +#endif + +#ifdef LEFT + addl KK, 16 ,KK +#endif + nop + jmp $End_NC_Unroll4 +#endif + + + .align 5 + +.L15: # n=4,m=8----------------------------- + and MC1,8,MC + sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc + nop + beq MC,.L16 + + addl A1,SPANA,PREA + subl PREA,8*SIZE,PREA # PREA-=MC + + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B # set B + nop +#else + sll KK, 3 + BASE_SHIFT,KC # mr=8 + sll KK, 2 + BASE_SHIFT,TEMP # nr=4 + + addl A,KC,A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # clear (32 results) + vcpys $f31,$f31,t01 + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t05 + + LDDE b0,0(B) + LDDE b1,1*SIZE(B) + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + vcpys $f31,$f31,t08 + vcpys $f31,$f31,t09 + vcpys $f31,$f31,t12 + vcpys $f31,$f31,t13 + + VLD a0,0(A) # get 8 A + VLD a4,4*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + + fillcs 4*SIZE(CO) # + fillcs 4*SIZE(C1) + fillcs 4*SIZE(C2) + fillcs 4*SIZE(C3) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP # temp is the length of the data part +#elif defined(LEFT) + addl KK, 8, TEMP # mr=8 +#else + addl KK, 4, TEMP # nr=4 +#endif + sra TEMP,1, KC # kc/2 + beq KC,$Rest_8x4x1 + +#else + + mov B1,B # Reset B + sra KC1,1,KC # unroll kc as 2, kc=kc1/2 + vcpys $f31,$f31,t00 # clear (32 results) + vcpys $f31,$f31,t01 + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t05 + + LDDE b0,0(B) + LDDE b1,1*SIZE(B) + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + vcpys $f31,$f31,t08 + vcpys $f31,$f31,t09 + vcpys $f31,$f31,t12 + vcpys $f31,$f31,t13 + + VLD a0,0(A) # get 8 A + VLD a4,4*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + + fillcs 4*SIZE(CO) # + fillcs 4*SIZE(C1) + fillcs 4*SIZE(C2) + fillcs 4*SIZE(C3) + + beq KC,$Rest_8x4x1 +#endif + + .align 5 + +$Panel_8x4x2: + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + VMAD a0,b2,t08,t08 + VMAD a0,b3,t12,t12 + + LDDE nb0,4*SIZE(B) # get next 4b + LDDE nb1,5*SIZE(B) + LDDE nb2,6*SIZE(B) + LDDE nb3,7*SIZE(B) + + addl B,8*SIZE,B # 4n*2k + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + VMAD a4,b2,t09,t09 + VMAD a4,b3,t13,t13 + + VLD na8,8*SIZE(A) # get next 8a + VLD na12,12*SIZE(A) + + fillcs 0(PREA) + fillcs 4*SIZE(PREA) + subl PREA,8*SIZE,PREA # prea -= 8 + + subl KC,1,KC + addl 
A,16*SIZE,A # ### next k ###8m*2k + VMAD na8,nb0,t00,t00 + VMAD na8,nb1,t04,t04 + VMAD na8,nb2,t08,t08 + VMAD na8,nb3,t12,t12 + + LDDE b0,0(B) # get 3rd 4b + LDDE b1,1*SIZE(B) + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + VMAD na12,nb0,t01,t01 + VMAD na12,nb1,t05,t05 + VMAD na12,nb2,t09,t09 + VMAD na12,nb3,t13,t13 + + VLD a0,0(A) # get 3rd 8a + VLD a4,4*SIZE(A) + + fillcs 0(PREA) + fillcs 4*SIZE(PREA) + subl PREA,8*SIZE,PREA # prea -= mc + bne KC,$Panel_8x4x2 # loop k-- + +$Rest_8x4x1: + LDDE ALPHA, 192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1, $Write_8x4 +#else + blbc TEMP, $Write_8x4 +#endif + + addl A,8*SIZE,A # 8a*1k + addl B,4*SIZE,B # 4b*1K + + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + VMAD a0,b2,t08,t08 + VMAD a0,b3,t12,t12 + + fillcs 0(PREA) + fillcs 4*SIZE(PREA) + subl PREA,8*SIZE,PREA + + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + VMAD a4,b2,t09,t09 + VMAD a4,b3,t13,t13 + +$Write_8x4: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_8x4 + +$Align_CO_Access_8x4: + VLD c00,0(CO) # get 1st colum of 16c + VLD c01,4*SIZE(CO) + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + jmp $Access_C1_8x4 + +$UnAlign_CO_Access_8x4: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c02, 1*VEC_LEN*SIZE(CO) + + VLD_UL c01, 1*VEC_LEN*SIZE(CO) + VLD_UH c03, 2*VEC_LEN*SIZE(CO) + + vbisw c00,c02,c00 + vbisw c01,c03,c01 + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + +$Access_C1_8x4: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,8*SIZE,CO + nop + bne $6,$UnAlign_C1_Access_8x4 + +$Align_C1_Access_8x4: + VLD c04,0(C1) + VLD c05,4*SIZE(C1) + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + jmp $Access_C2_8x4 + +$UnAlign_C1_Access_8x4: + VLD_UL c04, 0*VEC_LEN*SIZE(C1) + VLD_UH c06, 1*VEC_LEN*SIZE(C1) + + VLD_UL c05, 1*VEC_LEN*SIZE(C1) + VLD_UH c07, 2*VEC_LEN*SIZE(C1) + + vbisw c04,c06,c04 + vbisw c05,c07,c05 + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + + +$Access_C2_8x4: + and C2, (VEC_LEN*SIZE-1),$6 + addl C1,8*SIZE,C1 + nop + bne $6,$UnAlign_C2_Access_8x4 + +$Align_C2_Access_8x4: + VLD c08,0(C2) + VLD c09,4*SIZE(C2) + + VMAD t08,ALPHA,c08,t08 + VMAD t09,ALPHA,c09,t09 + + VST t08,0(C2) + VST t09,4*SIZE(C2) + jmp $Access_C3_8x4 + +$UnAlign_C2_Access_8x4: + VLD_UL c08, 0*VEC_LEN*SIZE(C2) + VLD_UH c10, 1*VEC_LEN*SIZE(C2) + + VLD_UL c09, 1*VEC_LEN*SIZE(C2) + VLD_UH c11, 2*VEC_LEN*SIZE(C2) + + vbisw c08,c10,c08 + vbisw c09,c11,c09 + + VMAD t08,ALPHA,c08,t08 + VMAD t09,ALPHA,c09,t09 + + VST_UL t08, 0*VEC_LEN*SIZE(C2) + VST_UH t08, 1*VEC_LEN*SIZE(C2) + + VST_UL t09, 1*VEC_LEN*SIZE(C2) + VST_UH t09, 2*VEC_LEN*SIZE(C2) + + +$Access_C3_8x4: + and C3, (VEC_LEN*SIZE-1),$6 + addl C2,8*SIZE,C2 + nop + bne $6,$UnAlign_C3_Access_8x4 + +$Align_C3_Access_8x4: + VLD c12,0(C3) + VLD c13,4*SIZE(C3) + + VMAD t12,ALPHA,c12,t12 + VMAD t13,ALPHA,c13,t13 + + VST t12,0(C3) + VST t13,4*SIZE(C3) + addl C3,8*SIZE,C3 + jmp .L16 + + +$UnAlign_C3_Access_8x4: + VLD_UL c12, 0*VEC_LEN*SIZE(C3) + VLD_UH c14, 1*VEC_LEN*SIZE(C3) + + VLD_UL c13, 1*VEC_LEN*SIZE(C3) + VLD_UH c15, 2*VEC_LEN*SIZE(C3) + + vbisw c12,c14,c12 + vbisw c13,c15,c13 + + VMAD t12,ALPHA,c12,t12 + VMAD t13,ALPHA,c13,t13 + + VST_UL t12, 0*VEC_LEN*SIZE(C3) + VST_UH t12, 1*VEC_LEN*SIZE(C3) + + 
VST_UL t13, 1*VEC_LEN*SIZE(C3) + VST_UH t13, 2*VEC_LEN*SIZE(C3) + addl C3,8*SIZE,C3 + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_8x4 + +$Align_CO_Access_8x4: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + jmp $Access_C1_8x4 + +$UnAlign_CO_Access_8x4: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + +$Access_C1_8x4: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,8*SIZE,CO # 8c + nop + bne $6,$UnAlign_C1_Access_8x4 + +$Align_C1_Access_8x4: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + jmp $Access_C2_8x4 + +$UnAlign_C1_Access_8x4: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + + +$Access_C2_8x4: + and C2, (VEC_LEN*SIZE-1),$6 + addl C1,8*SIZE,C1 # 8c + nop + bne $6,$UnAlign_C2_Access_8x4 + +$Align_C2_Access_8x4: + VMUL t08,ALPHA,t08 + VMUL t09,ALPHA,t09 + + VST t08,0(C2) + VST t09,4*SIZE(C2) + jmp $Access_C3_8x4 + +$UnAlign_C2_Access_8x4: + VMUL t08,ALPHA,t08 + VMUL t09,ALPHA,t09 + + VST_UL t08, 0*VEC_LEN*SIZE(C2) + VST_UH t08, 1*VEC_LEN*SIZE(C2) + + VST_UL t09, 1*VEC_LEN*SIZE(C2) + VST_UH t09, 2*VEC_LEN*SIZE(C2) + + +$Access_C3_8x4: + and C3, (VEC_LEN*SIZE-1),$6 + addl C2,8*SIZE,C2 # 8c + nop + bne $6,$UnAlign_C3_Access_8x4 + +$Align_C3_Access_8x4: + VMUL t12,ALPHA,t12 + VMUL t13,ALPHA,t13 + + VST t12,0(C3) + VST t13,4*SIZE(C3) + addl C3,8*SIZE,C3 + jmp $TRMMKERNEL_8x4 + +$UnAlign_C3_Access_8x4: + VMUL t12,ALPHA,t12 + VMUL t13,ALPHA,t13 + + VST_UL t12, 0*VEC_LEN*SIZE(C3) + VST_UH t12, 1*VEC_LEN*SIZE(C3) + + VST_UL t13, 1*VEC_LEN*SIZE(C3) + VST_UH t13, 2*VEC_LEN*SIZE(C3) + addl C3,8*SIZE,C3 + +$TRMMKERNEL_8x4: +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 8,TEMP # mr=8 +#else + subl TEMP, 4,TEMP # nr=4 +#endif + + sll TEMP, 3 + BASE_SHIFT,KC + sll TEMP, 2 + BASE_SHIFT,TEMP + + addl A, KC, A # move A, B to the end of this panel + addl B, TEMP, B +#endif + +#ifdef LEFT + addl KK, 8, KK +#endif +#endif + + + + .align 5 + +.L16: + and MC1,4,MC # nr=4,mr=4---------------------------- + sll KC1,2+BASE_SHIFT,SPANA # spana=kc1*mc + nop + beq MC,.L17 + + addl A1,SPANA,PREA + subl PREA,4*SIZE,PREA # PREA-=MC + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1,B # Set B + nop +#else + sll KK, 2 + BASE_SHIFT,KC # mr=nr=4 + nop + + addl A, KC, A + addl B1,KC, B +#endif + + vcpys $f31,$f31,t00 # clear 16 register + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t08 + vcpys $f31,$f31,t12 + + LDDE b0,0(B) # get 4b + LDDE b1,1*SIZE(B) + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + VLD a0,0(A) # get 4a + + fillcs 0(CO) # prefetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#else + addl KK, 4, TEMP +#endif + sra TEMP,1,KC + nop + beq KC,$Rest_4x4x1 + +#else + mov B1,B # Reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + vcpys $f31,$f31,t00 # clear 16 register + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t08 + vcpys $f31,$f31,t12 + + LDDE b0,0(B) # get 4b + LDDE b1,1*SIZE(B) + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + + VLD a0,0(A) # get 4a + + fillcs 0(CO) # prefetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + + beq 
KC,$Rest_4x4x1 + +#endif + + +$Panel_4x4x2: + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + VMAD a0,b2,t08,t08 + VMAD a0,b3,t12,t12 + + VLD a4,4*SIZE(A) + LDDE nb0,4*SIZE(B) # get next 4b and 4a + LDDE nb1,5*SIZE(B) + LDDE nb2,6*SIZE(B) + LDDE nb3,7*SIZE(B) + addl B,8*SIZE,B # 4b*2k + + fillcs 0(PREA) + subl PREA,4*SIZE,PREA + + subl KC,1,KC + VMAD a4,nb0,t00,t00 + VMAD a4,nb1,t04,t04 + VMAD a4,nb2,t08,t08 + VMAD a4,nb3,t12,t12 + + addl A,8*SIZE,A # 4a*2k + LDDE b0,0(B) # get 3rd 4b and 4a + LDDE b1,1*SIZE(B) + LDDE b2,2*SIZE(B) + LDDE b3,3*SIZE(B) + VLD a0,0(A) + + fillcs 0(PREA) + subl PREA,4*SIZE,PREA + bne KC,$Panel_4x4x2 + + +$Rest_4x4x1: + LDDE ALPHA, 192($sp) # Get ALPHA +#ifndef TRMMKERNEL + blbc KC1, $Write_4x4 +#else + blbc TEMP, $Write_4x4 +#endif + + addl A,4*SIZE,A # 4a*1k + addl B,4*SIZE,B # 4b*1K + + fillcs 0(PREA) + subl PREA,4*SIZE,PREA + + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + VMAD a0,b2,t08,t08 + VMAD a0,b3,t12,t12 + + +$Write_4x4: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_4x4 + +$Align_CO_Access_4x4: + VLD c00,0(CO) # get 1st colum of 16c + VMAD t00,ALPHA,c00,t00 + VST t00,0(CO) + jmp $Access_C1_4x4 + +$UnAlign_CO_Access_4x4: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c02, 1*VEC_LEN*SIZE(CO) + + vbisw c00,c02,c00 + + VMAD t00,ALPHA,c00,t00 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + +$Access_C1_4x4: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,4*SIZE,CO # 4c + nop + bne $6,$UnAlign_C1_Access_4x4 + +$Align_C1_Access_4x4: + VLD c04,0(C1) + VMAD t04,ALPHA,c04,t04 + VST t04,0(C1) + jmp $Access_C2_4x4 + +$UnAlign_C1_Access_4x4: + VLD_UL c04, 0*VEC_LEN*SIZE(C1) + VLD_UH c06, 1*VEC_LEN*SIZE(C1) + + vbisw c04,c06,c04 + + VMAD t04,ALPHA,c04,t04 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + +$Access_C2_4x4: + and C2, (VEC_LEN*SIZE-1),$6 + addl C1,4*SIZE,C1 # 4c + nop + bne $6,$UnAlign_C2_Access_4x4 + +$Align_C2_Access_4x4: + VLD c08,0(C2) + VMAD t08,ALPHA,c08,t08 + VST t08,0(C2) + jmp $Access_C3_4x4 + +$UnAlign_C2_Access_4x4: + VLD_UL c08, 0*VEC_LEN*SIZE(C2) + VLD_UH c10, 1*VEC_LEN*SIZE(C2) + + vbisw c08,c10,c08 + + VMAD t08,ALPHA,c08,t08 + + VST_UL t08, 0*VEC_LEN*SIZE(C2) + VST_UH t08, 1*VEC_LEN*SIZE(C2) + + +$Access_C3_4x4: + and C3, (VEC_LEN*SIZE-1),$6 + addl C2,4*SIZE,C2 # 4c + nop + bne $6,$UnAlign_C3_Access_4x4 + +$Align_C3_Access_4x4: + VLD c12,0(C3) + VMAD t12,ALPHA,c12,t12 + VST t12,0(C3) + addl C3,4*SIZE,C3 + jmp .L17 + +$UnAlign_C3_Access_4x4: + VLD_UL c12, 0*VEC_LEN*SIZE(C3) + VLD_UH c14, 1*VEC_LEN*SIZE(C3) + + vbisw c12,c14,c12 + + VMAD t12,ALPHA,c12,t12 + + VST_UL t12, 0*VEC_LEN*SIZE(C3) + VST_UH t12, 1*VEC_LEN*SIZE(C3) + addl C3,4*SIZE,C3 + + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_4x4 + +$Align_CO_Access_4x4: + VMUL t00,ALPHA,t00 + VST t00,0(CO) + jmp $Access_C1_4x4 + +$UnAlign_CO_Access_4x4: + VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + +$Access_C1_4x4: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,4*SIZE,CO # 4c + nop + bne $6,$UnAlign_C1_Access_4x4 + +$Align_C1_Access_4x4: + VMUL t04,ALPHA,t04 + VST t04,0(C1) + jmp $Access_C2_4x4 + +$UnAlign_C1_Access_4x4: + VMUL t04,ALPHA,t04 + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + +$Access_C2_4x4: + and C2, (VEC_LEN*SIZE-1),$6 + addl C1,4*SIZE,C1 # 4c + nop + bne $6,$UnAlign_C2_Access_4x4 + +$Align_C2_Access_4x4: + VMUL t08,ALPHA,t08 + VST t08,0(C2) + jmp $Access_C3_4x4 + +$UnAlign_C2_Access_4x4: + VMUL 
t08,ALPHA,t08 + VST_UL t08, 0*VEC_LEN*SIZE(C2) + VST_UH t08, 1*VEC_LEN*SIZE(C2) + + +$Access_C3_4x4: + and C3, (VEC_LEN*SIZE-1),$6 + addl C2,4*SIZE,C2 # 4c + nop + bne $6,$UnAlign_C3_Access_4x4 + +$Align_C3_Access_4x4: + VMUL t12,ALPHA,t12 + VST t12,0(C3) + addl C3,4*SIZE,C3 + jmp $TRMMKERNEL_4x4 + +$UnAlign_C3_Access_4x4: + VMUL t12,ALPHA,t12 + VST_UL t12, 0*VEC_LEN*SIZE(C3) + VST_UH t12, 1*VEC_LEN*SIZE(C3) + addl C3,4*SIZE,C3 + +$TRMMKERNEL_4x4: +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP + subl TEMP, 4, TEMP # mr=nr=4 + + sll TEMP, 2 + BASE_SHIFT,KC + nop + + addl A, KC, A # move A B to the end of this panel + addl B, KC, B +#endif + +#ifdef LEFT + addl KK, 4, KK +#endif +#endif + + + + + .align 5 +.L17: # nr=4,mr=2-------------------- + and MC1,2,MC + beq MC,.L18 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, 1 + BASE_SHIFT, KC # mr=2 + sll KK, 2 + BASE_SHIFT, TEMP # nr=4 + + addl A, KC, A + addl B1,TEMP, B +#endif + + fclr t00 # CLEAR 8 register + fclr t01 + fclr t04 + fclr t05 + fclr t08 + fclr t09 + fclr t12 + fclr t13 + + LD b0,0(B) # get 4b + LD b1,1*SIZE(B) + LD a0,0(A) # get 2a + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + LD a4,1*SIZE(A) + + fillcs 0(CO) # prefetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 2, TEMP # mr=2 +#else + addl KK, 4, TEMP # nr=4 +#endif + sra TEMP, 1, KC + beq KC,$Rest_2x4x1 + +#else + mov B1,B # reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + fclr t00 # CLEAR 8 register + fclr t01 + fclr t04 + fclr t05 + fclr t08 + fclr t09 + fclr t12 + fclr t13 + + LD b0,0(B) # get 4b + LD b1,1*SIZE(B) + LD a0,0(A) # get 2a + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + LD a4,1*SIZE(A) + + fillcs 0(CO) # prefetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + + beq KC,$Rest_2x4x1 +#endif + + +$Panel_2x4x2: + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + MAD a0,b2,t08,t08 + MAD a0,b3,t12,t12 + + LD nb0,4*SIZE(B) # get next 4b and 2a + LD nb1,5*SIZE(B) + LD a8,2*SIZE(A) + LD nb2,6*SIZE(B) + LD nb3,7*SIZE(B) + LD a12,3*SIZE(A) + addl B,8*SIZE,B # 4b*2k + + MAD a4,b0,t01,t01 + MAD a4,b1,t05,t05 + MAD a4,b2,t09,t09 + MAD a4,b3,t13,t13 + + subl KC,1,KC + MAD a8,nb0,t00,t00 + MAD a8,nb1,t04,t04 + MAD a8,nb2,t08,t08 + MAD a8,nb3,t12,t12 + + addl A,4*SIZE,A # 2a*2k + LD b0,0(B) # get 3rd 4b and 2a + LD b1,1*SIZE(B) + LD a0,0(A) + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + LD a4,1*SIZE(A) + + MAD a12,nb0,t01,t01 + MAD a12,nb1,t05,t05 + MAD a12,nb2,t09,t09 + MAD a12,nb3,t13,t13 + + bne KC,$Panel_2x4x2 + + +$Rest_2x4x1: + LD ALPHA, 192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1, $Write_2x4 +#else + blbc TEMP, $Write_2x4 +#endif + + addl A,2*SIZE,A # 2a*1k + addl B,4*SIZE,B # 4b*1K + + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + MAD a0,b2,t08,t08 + MAD a0,b3,t12,t12 + + MAD a4,b0,t01,t01 + MAD a4,b1,t05,t05 + MAD a4,b2,t09,t09 + MAD a4,b3,t13,t13 + +$Write_2x4: +#ifndef TRMMKERNEL + LD c00,0(CO) + LD c01,1*SIZE(CO) + LD c04,0(C1) + LD c05,1*SIZE(C1) + + MAD t00,ALPHA,c00,t00 + MAD t01,ALPHA,c01,t01 + + LD c08,0(C2) + LD c09,1*SIZE(C2) + + MAD t04,ALPHA,c04,t04 + MAD t05,ALPHA,c05,t05 + + LD c12,0(C3) + LD c13,1*SIZE(C3) + + MAD t08,ALPHA,c08,t08 + MAD t09,ALPHA,c09,t09 + + addl CO,2*SIZE,CO # 2c + addl C1,2*SIZE,C1 + addl C2,2*SIZE,C2 + addl C3,2*SIZE,C3 + + ST t00,-2*SIZE(CO) # 2c + ST t01,-1*SIZE(CO) + + MAD 
t12,ALPHA,c12,t12 + MAD t13,ALPHA,c13,t13 + + ST t04,-2*SIZE(C1) + ST t05,-1*SIZE(C1) + + ST t08,-2*SIZE(C2) + ST t09,-1*SIZE(C2) + + ST t12,-2*SIZE(C3) + ST t13,-1*SIZE(C3) + +#else + MUL t00,ALPHA,t00 + MUL t01,ALPHA,t01 + + MUL t04,ALPHA,t04 + MUL t05,ALPHA,t05 + + MUL t08,ALPHA,t08 + MUL t09,ALPHA,t09 + + addl CO,2*SIZE,CO # 2c + addl C1,2*SIZE,C1 + addl C2,2*SIZE,C2 + addl C3,2*SIZE,C3 + + ST t00,-2*SIZE(CO) # 2c + ST t01,-1*SIZE(CO) + + MUL t12,ALPHA,t12 + MUL t13,ALPHA,t13 + + ST t04,-2*SIZE(C1) + ST t05,-1*SIZE(C1) + + ST t08,-2*SIZE(C2) + ST t09,-1*SIZE(C2) + + ST t12,-2*SIZE(C3) + ST t13,-1*SIZE(C3) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 2, TEMP +#else + subl TEMP, 4, TEMP +#endif + + sll TEMP, 1 + BASE_SHIFT,KC + sll TEMP, 2 + BASE_SHIFT,TEMP + + addl A, KC, A + addl B, TEMP, B +#endif + +#ifdef LEFT + addl KK,2,KK +#endif +#endif + + + +.align 5 +.L18: # nr=4,mr=1--------------------------- + and MC1,1,MC + beq MC,$End_NC_Unroll4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B + nop +#else + sll KK, BASE_SHIFT, KC # mr=1 + sll KK, 2 + BASE_SHIFT,TEMP # nr=4 + + addl A, KC, A + addl B1,TEMP, B +#endif + + fclr t00 # clear 4 regitster + fclr t04 + fclr t08 + fclr t12 + + LD b0,0(B) # get 4b + LD b1,1*SIZE(B) + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + + LD a0,0(A) # get 1 a + + fillcs 0(CO) # prefetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 1, TEMP # mr=1 +#else + addl KK, 4,TEMP # nr=4 +#endif + sra TEMP,1,KC + beq KC,$Rest_1x4x1 + +#else + mov B1,B # Reset B + fclr t00 # clear 4 regitster + fclr t04 + fclr t08 + fclr t12 + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + + LD b0,0(B) # get 4b + LD b1,1*SIZE(B) + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + + LD a0,0(A) # get 1 a + + fillcs 0(CO) # prefetch C + fillcs 0(C1) + fillcs 0(C2) + fillcs 0(C3) + + beq KC,$Rest_1x4x1 + +#endif + + +$Panel_1x4x2: + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + MAD a0,b2,t08,t08 + MAD a0,b3,t12,t12 + + LD a8,1*SIZE(A) + LD nb0,4*SIZE(B) + LD nb1,5*SIZE(B) + LD nb2,6*SIZE(B) + LD nb3,7*SIZE(B) + + addl B,8*SIZE,B # 4b*2k + + subl KC,1,KC + MAD a8,nb0,t00,t00 + MAD a8,nb1,t04,t04 + MAD a8,nb2,t08,t08 + MAD a8,nb3,t12,t12 + + addl A,2*SIZE,A # 1a*2k + LD a0,0(A) # get 3rd 4b and 1a + LD b0,0(B) + LD b1,1*SIZE(B) + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + bne KC,$Panel_1x4x2 + + +$Rest_1x4x1: + LD ALPHA,192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1, $Write_1x4 +#else + blbc TEMP, $Write_1x4 +#endif + + addl A,1*SIZE,A # 1m*1k*8Byte + addl B,4*SIZE,B # 4n*1K*8Byte + + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + MAD a0,b2,t08,t08 + MAD a0,b3,t12,t12 + + +$Write_1x4: +#ifndef TRMMKERNEL + LD c00,0(CO) + LD c04,0(C1) + MAD t00,ALPHA,c00,t00 + MAD t04,ALPHA,c04,t04 + LD c08,0(C2) + LD c12,0(C3) + MAD t08,ALPHA,c08,t08 + MAD t12,ALPHA,c12,t12 + ST t00,0(CO) + ST t04,0(C1) + ST t08,0(C2) + ST t12,0(C3) + +#else + MUL t00,ALPHA,t00 + MUL t04,ALPHA,t04 + MUL t08,ALPHA,t08 + MUL t12,ALPHA,t12 + + ST t00,0(CO) + ST t04,0(C1) + ST t08,0(C2) + ST t12,0(C3) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 1, TEMP +#else + subl TEMP, 4, TEMP +#endif + + sll TEMP, BASE_SHIFT, KC + sll TEMP, 2 + BASE_SHIFT, TEMP + + addl A, KC, A + addl B, TEMP,B +#endif + 
+#ifdef LEFT + addl KK, 1,KK +#endif +#endif + + + .align 5 + +$End_NC_Unroll4: + subl NC,1,NC # Loop N -- +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 4, KK + nop +#endif + mov A1,A # Reset A + mov B, B1 # mov B1 to the next panel + bne NC,.L0 + + + + + .align 5 +$Begin_NC_Unroll2: + + and NC1, 2, NC + beq NC, $Begin_NC_Unroll1 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK # reset KK +#endif + + mov C,CO + addl C,LDM,C1 + + sra MC1,4,MC # MC=MC1/16 + sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC + + addl A1,SPANA,PREA + subl PREA,16*SIZE,PREA + + addl C1,LDM,C # C=C1+LDM, Mov C to Next Panel + beq MC,.L25 # MC=0:MC1<16 + + + .align 5 +.L2: # nr=2,mr=16------------------- + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B1,B +#else + sll KK, 4 + BASE_SHIFT,KC # mr=16 + sll KK, 1 + BASE_SHIFT,TEMP # nr=2 + + addl A,KC,A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # CLEAR Results Register + vcpys $f31,$f31,t01 + vcpys $f31,$f31,t02 + vcpys $f31,$f31,t03 + + LDDE b0,0(B) + LDDE b1,1*SIZE(B) + + VLD a0,0(A) # Get 16 A and 2 B + VLD a4,4*SIZE(A) + VLD a8,8*SIZE(A) + VLD a12,12*SIZE(A) + + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t06 + vcpys $f31,$f31,t05 + vcpys $f31,$f31,t07 + + fillcs 0(CO) # fetch C + fillcs 0(C1) + fillcs 8*SIZE(CO) + fillcs 8*SIZE(C1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 16, TEMP # mr=16 +#else + addl KK, 2, TEMP # nr=2 +#endif + sra TEMP, 1, KC + nop + beq KC,$Rest_16x2x1 + +#else + + mov B1,B # Set B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + vcpys $f31,$f31,t00 # CLEAR Results Register + vcpys $f31,$f31,t01 + vcpys $f31,$f31,t02 + vcpys $f31,$f31,t03 + + LDDE b0,0(B) + LDDE b1,1*SIZE(B) + + VLD a0,0(A) # Get 16 A and 2 B + VLD a4,4*SIZE(A) + VLD a8,8*SIZE(A) + VLD a12,12*SIZE(A) + + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t06 + vcpys $f31,$f31,t05 + vcpys $f31,$f31,t07 + + fillcs 0(CO) # fetch C + fillcs 0(C1) + fillcs 8*SIZE(CO) + fillcs 8*SIZE(C1) + + beq KC,$Rest_16x2x1 + +#endif + + +$Panel_16x2x2: + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + + addl A,16*SIZE,A # 16m*1k + LDDE nb0,2*SIZE(B) + LDDE nb1,3*SIZE(B) + + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + + addl B,4*SIZE,B # 2n*2k + VLD na0,0(A) + VLD na4,4*SIZE(A) + VLD na8,8*SIZE(A) + VLD na12,12*SIZE(A) + + VMAD a8,b0,t02,t02 + VMAD a8,b1,t06,t06 + + VMAD a12,b0,t03,t03 + VMAD a12,b1,t07,t07 + + fillcs 0(PREA) + fillcs 8*SIZE(PREA) + subl PREA,16*SIZE,PREA + + subl KC,1,KC + VMAD na0,nb0,t00,t00 + VMAD na0,nb1,t04,t04 + + addl A,16*SIZE,A # 16m*1k + LDDE b0,0(B) + LDDE b1,1*SIZE(B) + + VMAD na4,nb0,t01,t01 + VMAD na4,nb1,t05,t05 + + VLD a0,0(A) # get 3rd 16a + VLD a4,4*SIZE(A) + VLD a8,8*SIZE(A) + VLD a12,12*SIZE(A) + + VMAD na8,nb0,t02,t02 + VMAD na8,nb1,t06,t06 + + VMAD na12,nb0,t03,t03 + VMAD na12,nb1,t07,t07 + + fillcs 0(PREA) + fillcs 8*SIZE(PREA) + subl PREA,16*SIZE,PREA + bne KC,$Panel_16x2x2 + + +$Rest_16x2x1: + LDDE ALPHA, 192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1, $Write_16x2 +#else + blbc TEMP, $Write_16x2 +#endif + + addl A,16*SIZE,A # 16m*1k + addl B,2*SIZE,B # 2n*1k + + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + + fillcs 0(PREA) + fillcs 8*SIZE(PREA) + subl PREA,16*SIZE,PREA + + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + VMAD a8,b0,t02,t02 + VMAD a8,b1,t06,t06 + VMAD a12,b0,t03,t03 + VMAD a12,b1,t07,t07 + + +$Write_16x2: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_16x2 + 
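Every write-back block ($Write_16x4, $Write_8x4, ..., and $Write_16x2 below) first tests whether the C pointer sits on a full vector boundary and branches to an aligned or an unaligned path. A minimal C equivalent of that test, assuming VEC_LEN lanes of SIZE-byte elements (names follow the assembly, the helper itself is illustrative):

#include <stdint.h>

#define VEC_LEN 4                 /* assumed SIMD width, lanes per VLD/VST */
#define SIZE    sizeof(double)    /* element size                          */

/* C-level form of "and CO, (VEC_LEN*SIZE-1), $6 / bne $6, $UnAlign_...":
 * nonzero low bits mean the unaligned path must be taken. */
static inline int c_vector_aligned(const void *p)
{
    return ((uintptr_t)p & (VEC_LEN * SIZE - 1)) == 0;
}

On the unaligned branch each C vector is rebuilt from a VLD_UL/VLD_UH pair merged with vbisw (bitwise OR of the low and high unaligned halves) and stored back through the matching VST_UL/VST_UH pair; the arithmetic is the same on both branches (VMAD against the loaded C for GEMM, VMUL by alpha for TRMM).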
+$Align_CO_Access_16x2: + VLD c00,0(CO) # get 1st colum of 16c + VLD c01,4*SIZE(CO) + VLD c02,8*SIZE(CO) + VLD c03,12*SIZE(CO) + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + VMAD t02,ALPHA,c02,t02 + VMAD t03,ALPHA,c03,t03 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + VST t02,8*SIZE(CO) + VST t03,12*SIZE(CO) + jmp $Access_C1_16x2 + +$UnAlign_CO_Access_16x2: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c04, 1*VEC_LEN*SIZE(CO) + + VLD_UL c01, 1*VEC_LEN*SIZE(CO) + VLD_UH c05, 2*VEC_LEN*SIZE(CO) + + VLD_UL c02, 2*VEC_LEN*SIZE(CO) + VLD_UH c06, 3*VEC_LEN*SIZE(CO) + + VLD_UL c03, 3*VEC_LEN*SIZE(CO) + VLD_UH c07, 4*VEC_LEN*SIZE(CO) + + vbisw c00,c04,c00 + vbisw c01,c05,c01 + vbisw c02,c06,c02 + vbisw c03,c07,c03 + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + VMAD t02,ALPHA,c02,t02 + VMAD t03,ALPHA,c03,t03 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + VST_UL t02, 2*VEC_LEN*SIZE(CO) + VST_UH t02, 3*VEC_LEN*SIZE(CO) + + VST_UL t03, 3*VEC_LEN*SIZE(CO) + VST_UH t03, 4*VEC_LEN*SIZE(CO) + + +$Access_C1_16x2: + and C1, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C1_Access_16x2 + +$Align_C1_Access_16x2: + VLD c04,0(C1) + VLD c05,4*SIZE(C1) + VLD c06,8*SIZE(C1) + VLD c07,12*SIZE(C1) + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + VMAD t06,ALPHA,c06,t06 + VMAD t07,ALPHA,c07,t07 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + VST t06,8*SIZE(C1) + VST t07,12*SIZE(C1) + jmp $End_NC_Unroll2 + +$UnAlign_C1_Access_16x2: + VLD_UL c04, 0*VEC_LEN*SIZE(C1) + VLD_UH t00, 1*VEC_LEN*SIZE(C1) + + VLD_UL c05, 1*VEC_LEN*SIZE(C1) + VLD_UH t01, 2*VEC_LEN*SIZE(C1) + + VLD_UL c06, 2*VEC_LEN*SIZE(C1) + VLD_UH t02, 3*VEC_LEN*SIZE(C1) + + VLD_UL c07, 3*VEC_LEN*SIZE(C1) + VLD_UH t03, 4*VEC_LEN*SIZE(C1) + + vbisw c04,t00,c04 + vbisw c05,t01,c05 + vbisw c06,t02,c06 + vbisw c07,t03,c07 + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + VMAD t06,ALPHA,c06,t06 + VMAD t07,ALPHA,c07,t07 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + + VST_UL t06, 2*VEC_LEN*SIZE(C1) + VST_UH t06, 3*VEC_LEN*SIZE(C1) + + VST_UL t07, 3*VEC_LEN*SIZE(C1) + VST_UH t07, 4*VEC_LEN*SIZE(C1) + jmp $End_NC_Unroll2 # loop m finished + + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_16x2 + +$Align_CO_Access_16x2: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + VMUL t02,ALPHA,t02 + VMUL t03,ALPHA,t03 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + VST t02,8*SIZE(CO) + VST t03,12*SIZE(CO) + jmp $Access_C1_16x2 + +$UnAlign_CO_Access_16x2: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + VMUL t02,ALPHA,t02 + VMUL t03,ALPHA,t03 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + VST_UL t02, 2*VEC_LEN*SIZE(CO) + VST_UH t02, 3*VEC_LEN*SIZE(CO) + + VST_UL t03, 3*VEC_LEN*SIZE(CO) + VST_UH t03, 4*VEC_LEN*SIZE(CO) + + +$Access_C1_16x2: + and C1, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_C1_Access_16x2 + +$Align_C1_Access_16x2: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + VMUL t06,ALPHA,t06 + VMUL t07,ALPHA,t07 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + VST t06,8*SIZE(C1) + VST t07,12*SIZE(C1) + jmp $TRMMKERNEL_16x2 + +$UnAlign_C1_Access_16x2: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + VMUL t06,ALPHA,t06 + VMUL t07,ALPHA,t07 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + + VST_UL t06, 2*VEC_LEN*SIZE(C1) + VST_UH t06, 
3*VEC_LEN*SIZE(C1) + + VST_UL t07, 3*VEC_LEN*SIZE(C1) + VST_UH t07, 4*VEC_LEN*SIZE(C1) + +$TRMMKERNEL_16x2: +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 16, TEMP +#else + subl TEMP, 2, TEMP +#endif + + sll TEMP, 4 + BASE_SHIFT,KC + sll TEMP, 1 + BASE_SHIFT,TEMP + + addl A, KC, A + addl B, TEMP,B +#endif + +#ifdef LEFT + addl KK, 16, KK + nop +#endif + + jmp $End_NC_Unroll2 # loop m finished +#endif + + + + .align 5 + +.L25: + and MC1,8,MC + sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc + nop + beq MC,.L26 + + addl A1,SPANA,PREA + subl PREA,8*SIZE,PREA # PREA-=MC + + + .align 5 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, 3 + BASE_SHIFT,KC # mr=8 + sll KK, 1 + BASE_SHIFT,TEMP # nr=2 + + addl A,KC, A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # clear 16 registers + vcpys $f31,$f31,t01 + + LDDE b0,0(B) # Get 2b + LDDE b1,1*SIZE(B) + + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t05 + + VLD a0,0(A) # Get 8a + VLD a4,4*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 0(C1) + fillcs 4*SIZE(CO) + fillcs 4*SIZE(C1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 8, TEMP # mr=8 +#else + addl KK, 2, TEMP # nr=2 +#endif + sra TEMP, 1,KC + nop + beq KC,$Rest_8x2x1 + +#else + + mov B1, B + sra KC1,1,KC + vcpys $f31,$f31,t00 # clear 16 registers + vcpys $f31,$f31,t01 + + LDDE b0,0(B) # Get 2b + LDDE b1,1*SIZE(B) + + vcpys $f31,$f31,t04 + vcpys $f31,$f31,t05 + + VLD a0,0(A) # Get 8a + VLD a4,4*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 0(C1) + fillcs 4*SIZE(CO) + fillcs 4*SIZE(C1) + + beq KC,$Rest_8x2x1 +#endif + + +$Panel_8x2x2: + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + + LDDE nb0,2*SIZE(B) # get next 2b + LDDE nb1,3*SIZE(B) + + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + + addl B,4*SIZE,B # 2n*2k + VLD na8,8*SIZE(A) # get next 8a + VLD na12,12*SIZE(A) + + fillcs 0(PREA) + fillcs 4*SIZE(PREA) + subl PREA,8*SIZE,PREA + + subl KC,1,KC + VMAD na8,nb0,t00,t00 + VMAD na8,nb1,t04,t04 + + addl A,16*SIZE,A # 8m*2k + LDDE b0,0(B) + LDDE b1,1*SIZE(B) # get 3rd 2b + + VMAD na12,nb0,t01,t01 + VMAD na12,nb1,t05,t05 + + VLD a0,0(A) # get 3rd 8a + VLD a4,4*SIZE(A) + + fillcs 0(PREA) + fillcs 4*SIZE(PREA) + subl PREA,8*SIZE,PREA + bne KC,$Panel_8x2x2 + + +$Rest_8x2x1: + LDDE ALPHA,192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1,$Write_8x2 +#else + blbc TEMP,$Write_8x2 +#endif + + addl A,8*SIZE,A # 8m*1k + addl B,2*SIZE,B # 2n*1K + + fillcs 0(PREA) + fillcs 4*SIZE(PREA) + subl PREA,8*SIZE,PREA + + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + VMAD a4,b0,t01,t01 + VMAD a4,b1,t05,t05 + + +$Write_8x2: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_8x2 + +$Align_CO_Access_8x2: + VLD c00,0(CO) # get 1st colum of 16c + VLD c01,4*SIZE(CO) + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + jmp $Access_C1_8x2 + +$UnAlign_CO_Access_8x2: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c02, 1*VEC_LEN*SIZE(CO) + + VLD_UL c01, 1*VEC_LEN*SIZE(CO) + VLD_UH c03, 2*VEC_LEN*SIZE(CO) + + vbisw c00,c02,c00 + vbisw c01,c03,c01 + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + +$Access_C1_8x2: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,8*SIZE,CO # 8c + nop + bne 
$6,$UnAlign_C1_Access_8x2 + +$Align_C1_Access_8x2: + VLD c04,0(C1) + VLD c05,4*SIZE(C1) + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + addl C1,8*SIZE,C1 + jmp .L26 + +$UnAlign_C1_Access_8x2: + VLD_UL c04, 0*VEC_LEN*SIZE(C1) + VLD_UH c06, 1*VEC_LEN*SIZE(C1) + + VLD_UL c05, 1*VEC_LEN*SIZE(C1) + VLD_UH c07, 2*VEC_LEN*SIZE(C1) + + vbisw c04,c06,c04 + vbisw c05,c07,c05 + + VMAD t04,ALPHA,c04,t04 + VMAD t05,ALPHA,c05,t05 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + addl C1,8*SIZE,C1 + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_8x2 + +$Align_CO_Access_8x2: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + jmp $Access_C1_8x2 + +$UnAlign_CO_Access_8x2: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + +$Access_C1_8x2: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,8*SIZE,CO # 8c + nop + bne $6,$UnAlign_C1_Access_8x2 + +$Align_C1_Access_8x2: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + + VST t04,0(C1) + VST t05,4*SIZE(C1) + addl C1,8*SIZE,C1 + jmp $TRMMKERNEL_8x2 + +$UnAlign_C1_Access_8x2: + VMUL t04,ALPHA,t04 + VMUL t05,ALPHA,t05 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + + VST_UL t05, 1*VEC_LEN*SIZE(C1) + VST_UH t05, 2*VEC_LEN*SIZE(C1) + addl C1,8*SIZE,C1 + +$TRMMKERNEL_8x2: + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK,TEMP +#ifdef LEFT + subl TEMP, 8,TEMP # mr=8 +#else + subl TEMP, 2,TEMP # nr=2 +#endif + + sll TEMP, 3 + BASE_SHIFT,KC + sll TEMP, 1 + BASE_SHIFT,TEMP + + addl A,KC,A + addl B,TEMP,B +#endif + +#ifdef LEFT + addl KK,8,KK + nop +#endif +#endif + + + + .align 5 + +.L26: # nr=2,mr=4------------------ + and MC1,4,MC # MC1&4 + beq MC,.L27 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B + nop +#else + sll KK, 2 + BASE_SHIFT,KC # mr=4 + sll KK, 1 + BASE_SHIFT,TEMP # nr=2 + + addl A,KC,A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # clear 2vector registers + vcpys $f31,$f31,t04 + + LDDE b0,0(B) # get 2b + LDDE b1,1*SIZE(B) + + VLD a0,0(A) # Get 4 a + + fillcs 0(CO) # fetch C + fillcs 0(C1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 4, TEMP +#else + addl KK, 2, TEMP +#endif + sra TEMP,1,KC + beq KC,$Rest_4x2x1 + +#else + + mov B1,B + sra KC1,1,KC + vcpys $f31,$f31,t00 # clear 2vector registers + vcpys $f31,$f31,t04 + + LDDE b0,0(B) # get 2b + LDDE b1,1*SIZE(B) + + VLD a0,0(A) # Get 4 a + + fillcs 0(CO) # fetch C + fillcs 0(C1) + + beq KC,$Rest_4x2x1 +#endif + +$Panel_4x2x2: + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + + LDDE nb0,2*SIZE(B) # get next 2b + LDDE nb1,3*SIZE(B) + + addl B,4*SIZE,B # 2n*2K + VLD a4,4*SIZE(A) # get next 4a + + subl KC,1,KC + VMAD a4,nb0,t00,t00 + VMAD a4,nb1,t04,t04 + + addl A,8*SIZE,A # 4m*2k + LDDE b0,0(B) # get 3rd 2b + LDDE b1,1*SIZE(B) + + VLD a0,0(A) # get 3rd 4a + bne KC,$Panel_4x2x2 + + +$Rest_4x2x1: + LDDE ALPHA,192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1,$Write_4x2 +#else + blbc TEMP,$Write_4x2 +#endif + + addl A,4*SIZE,A # 4m*1k + addl B,2*SIZE,B # 2n*1K + + VMAD a0,b0,t00,t00 + VMAD a0,b1,t04,t04 + + +$Write_4x2: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne 
$6,$UnAlign_CO_Access_4x2 + +$Align_CO_Access_4x2: + VLD c00,0(CO) # get 1st colum of 16c + VMAD t00,ALPHA,c00,t00 + VST t00,0(CO) + jmp $Access_C1_4x2 + +$UnAlign_CO_Access_4x2: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c01, 1*VEC_LEN*SIZE(CO) + + vbisw c00,c01,c00 + + VMAD t00,ALPHA,c00,t00 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + +$Access_C1_4x2: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,4*SIZE,CO # 4c + nop + bne $6,$UnAlign_C1_Access_4x2 + +$Align_C1_Access_4x2: + VLD c04,0(C1) + VMAD t04,ALPHA,c04,t04 + VST t04,0(C1) + addl C1,4*SIZE,C1 + jmp .L27 + +$UnAlign_C1_Access_4x2: + VLD_UL c04, 0*VEC_LEN*SIZE(C1) + VLD_UH c05, 1*VEC_LEN*SIZE(C1) + + vbisw c04,c05,c04 + + VMAD t04,ALPHA,c04,t04 + + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + addl C1,4*SIZE,C1 + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_4x2 + +$Align_CO_Access_4x2: + VMUL t00,ALPHA,t00 + VST t00,0(CO) + jmp $Access_C1_4x2 + +$UnAlign_CO_Access_4x2: + VMUL t00,ALPHA,t00 + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + +$Access_C1_4x2: + and C1, (VEC_LEN*SIZE-1),$6 + addl CO,4*SIZE,CO # 4c + nop + bne $6,$UnAlign_C1_Access_4x2 + +$Align_C1_Access_4x2: + VMUL t04,ALPHA,t04 + VST t04,0(C1) + addl C1,4*SIZE,C1 + jmp $TRMMKERNEL_4x2 + +$UnAlign_C1_Access_4x2: + VMUL t04,ALPHA,t04 + VST_UL t04, 0*VEC_LEN*SIZE(C1) + VST_UH t04, 1*VEC_LEN*SIZE(C1) + addl C1,4*SIZE,C1 + +$TRMMKERNEL_4x2: +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 4, TEMP +#else + subl TEMP, 2, TEMP +#endif + + sll TEMP, 2 + BASE_SHIFT,KC + sll TEMP, 1 + BASE_SHIFT,TEMP + + addl A, KC, A + addl B, TEMP, B +#endif + +#ifdef LEFT + addl KK, 4, KK + nop +#endif +#endif + + + + .align 5 + +.L27: # nr=2,mr=2-------------- + and MC1,2,MC + beq MC,.L28 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, 1 + BASE_SHIFT,KC # mr=nr=2 + nop + addl A,KC,A + addl B1,KC,B +#endif + + fclr t00 # clear 4 register + fclr t01 + fclr t04 + fclr t05 + + LD b0,0(B) # get 2b + LD b1,1*SIZE(B) + + LD a0,0(A) # get 2a + LD a4,1*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 0(C1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#else + addl KK, 2, TEMP # mr=nr=2 +#endif + sra TEMP,1, KC + nop + nop + beq KC,$Rest_2x2x1 + +#else + + mov B1,B # Reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + fclr t00 # clear 4 register + fclr t01 + fclr t04 + fclr t05 + + LD b0,0(B) # get 2b + LD b1,1*SIZE(B) + + LD a0,0(A) # get 2a + LD a4,1*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 0(C1) + beq KC,$Rest_2x2x1 + +#endif + + +$Panel_2x2x2: + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + + LD nb0,2*SIZE(B) # get next 2b + LD nb1,3*SIZE(B) + + MAD a4,b0,t01,t01 + MAD a4,b1,t05,t05 + + addl B,4*SIZE,B # 2(n)*2(k) + LD a8,2*SIZE(A) # get next 2a + LD a12,3*SIZE(A) + + subl KC,1,KC + MAD a8,nb0,t00,t00 + MAD a8,nb1,t04,t04 + + addl A,4*SIZE,A # 2m*2k + LD b0,0(B) + LD b1,1*SIZE(B) + + MAD a12,nb0,t01,t01 + MAD a12,nb1,t05,t05 + + LD a0,0(A) + LD a4,1*SIZE(A) + bne KC,$Panel_2x2x2 + + +$Rest_2x2x1: + LD ALPHA,192($sp) # Get ALPHA +#ifndef TRMMKERNEL + blbc KC1,$Write_2x2 +#else + blbc TEMP,$Write_2x2 +#endif + + addl A,2*SIZE,A # 2m*1k + addl B,2*SIZE,B # 2n*1K + + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + MAD a4,b0,t01,t01 + MAD a4,b1,t05,t05 + + +$Write_2x2: + +#ifndef TRMMKERNEL + LD c00,0(CO) + LD 
c04,0(C1) + LD c01,1*SIZE(CO) + LD c05,1*SIZE(C1) + + MAD t00,ALPHA,c00,t00 + MAD t04,ALPHA,c04,t04 + MAD t01,ALPHA,c01,t01 + MAD t05,ALPHA,c05,t05 + + ST t00,0(CO) + ST t04,0(C1) + ST t01,1*SIZE(CO) + ST t05,1*SIZE(C1) + + addl CO,2*SIZE,CO # 2c + addl C1,2*SIZE,C1 + +#else + + MUL t00,ALPHA,t00 + MUL t04,ALPHA,t04 + MUL t01,ALPHA,t01 + MUL t05,ALPHA,t05 + + ST t00,0(CO) + ST t04,0(C1) + ST t01,1*SIZE(CO) + ST t05,1*SIZE(C1) + + addl CO,2*SIZE,CO # 2c + addl C1,2*SIZE,C1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP + subl TEMP, 2, TEMP + + sll TEMP, 1 + BASE_SHIFT, KC + nop + + addl A,KC, A + addl B,KC, B +#endif + +#ifdef LEFT + addl KK, 2, KK +#endif +#endif + + + + .align 5 +.L28: + and MC1,1,MC # nr=2,mr=1------------------- + beq MC,$End_NC_Unroll2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, BASE_SHIFT,KC # mr=1 + sll KK, 1 + BASE_SHIFT,TEMP # nr=2 + + addl A,KC,A + addl B1,TEMP,B +#endif + + fclr t00 # clear 2 registers + fclr t04 + + LD b0,0(B) # 2b + LD b1,1*SIZE(B) + + LD a0,0(A) # 1a + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 1, TEMP +#else + addl KK, 2, TEMP +#endif + sra TEMP,1,KC + nop + beq KC,$Rest_1x2x1 + +#else + mov B1,B # Reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + fclr t00 # clear 2 registers + fclr t04 + + LD b0,0(B) # 2b + LD b1,1*SIZE(B) + + LD a0,0(A) # 1a + beq KC,$Rest_1x2x1 +#endif + + + .align 5 + +$Panel_1x2x2: + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + + LD nb0,2*SIZE(B) # get next 2b + LD nb1,3*SIZE(B) + + addl B,4*SIZE,B # 2(n)*2(k) + LD a8,1*SIZE(A) # get next 1a + + subl KC,1,KC + MAD a8,nb0,t00,t00 + MAD a8,nb1,t04,t04 + + addl A,2*SIZE,A # 1m*2k + LD b0,0(B) # get 3rd 2b + LD b1,1*SIZE(B) + + LD a0,0(A) # get 3rd 1a + bne KC,$Panel_1x2x2 + + +$Rest_1x2x1: + LD ALPHA,192($sp) # Get ALPHA +#ifndef TRMMKERNEL + blbc KC1,$Write_1x2 +#else + blbc TEMP,$Write_1x2 +#endif + + addl A,1*SIZE,A # 1m*1k + addl B,2*SIZE,B # 2n*1K + + MAD a0,b0,t00,t00 + MAD a0,b1,t04,t04 + + +$Write_1x2: # Write back 2 results +#ifndef TRMMKERNEL + LD c00,0(CO) + LD c04,0(C1) + + MAD t00,ALPHA,c00,t00 + MAD t04,ALPHA,c04,t04 + + ST t00,0(CO) + ST t04,0(C1) + +#else + + MUL t00,ALPHA,t00 + MUL t04,ALPHA,t04 + + ST t00,0(CO) + ST t04,0(C1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 1,TEMP +#else + subl TEMP, 2,TEMP +#endif + + sll TEMP, BASE_SHIFT,KC + sll TEMP, 1 + BASE_SHIFT,TEMP + + addl A,KC,A + addl B,TEMP,B +#endif + +#ifdef LEFT + addl KK,1,KK +#endif +#endif + + + .align 5 + +$End_NC_Unroll2: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2,KK +#endif + mov B, B1 + + + .align 5 +$Begin_NC_Unroll1: # Nr=1 + and NC1,1,NC # NC=NC1&1 + beq NC,$Kernel_End + + mov A1,A # Reset A + mov C,CO # Reset C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET,KK # reset offset +#endif + + sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC + subl PREA,16*SIZE,PREA + + sra MC1,4,MC # MC=MC1/16 + beq MC,.L35 # MC=0:MC1<16 + + +.L3: # nr=1,mr=16 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1,B +#else + sll KK, 4 + BASE_SHIFT, KC # mr=16 + sll KK, BASE_SHIFT,TEMP # nr=1 + + addl A,KC,A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # CLEAR 16 Register + vcpys $f31,$f31,t01 + vcpys 
$f31,$f31,t02 + vcpys $f31,$f31,t03 + + LDDE b0,0(B) # get 1b and 16a + + VLD a0,0(A) + VLD a4,4*SIZE(A) + VLD a8,8*SIZE(A) + VLD a12,12*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 4*SIZE(CO) + fillcs 8*SIZE(CO) + fillcs 12*SIZE(CO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 16, TEMP +#else + addl KK, 1, TEMP +#endif + sra TEMP, 1, KC + beq KC,$Rest_16x1x1 + +#else + + mov B1,B # Set B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + vcpys $f31,$f31,t00 # CLEAR 16 Register + vcpys $f31,$f31,t01 + vcpys $f31,$f31,t02 + vcpys $f31,$f31,t03 + + LDDE b0,0(B) # get 1b and 16a + + VLD a0,0(A) + VLD a4,4*SIZE(A) + VLD a8,8*SIZE(A) + VLD a12,12*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 4*SIZE(CO) + fillcs 8*SIZE(CO) + fillcs 12*SIZE(CO) + + beq KC,$Rest_16x1x1 + +#endif + +$Panel_16x1x2: + addl A,16*SIZE,A # 16(m)*1(k) + LDDE b1,1*SIZE(B) # get next 1b + + VMAD a0,b0,t00,t00 + VMAD a4,b0,t01,t01 + + addl B,2*SIZE,B # 1(n)*2(k) + VLD na0,0(A) # get next 16a + VLD na4,4*SIZE(A) + VLD na8,8*SIZE(A) + VLD na12,12*SIZE(A) + + VMAD a8,b0,t02,t02 + VMAD a12,b0,t03,t03 + + subl KC,1,KC + addl A,16*SIZE,A # 16m*1k + LDDE b0,0(B) + + VMAD na0,b1,t00,t00 + VMAD na4,b1,t01,t01 + + VLD a0,0(A) + VLD a4,4*SIZE(A) + VLD a8,8*SIZE(A) + VLD a12,12*SIZE(A) + + VMAD na8,b1,t02,t02 + VMAD na12,b1,t03,t03 + bne KC,$Panel_16x1x2 + + +$Rest_16x1x1: + LDDE ALPHA,192($sp) +#ifndef TRMMKERNEL + blbc KC1,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1 +#else + blbc TEMP,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1 +#endif + + addl A,16*SIZE,A # 16a*1k + addl B,1*SIZE,B # 1b*1k + + VMAD a0,b0,t00,t00 + VMAD a4,b0,t01,t01 + VMAD a8,b0,t02,t02 + VMAD a12,b0,t03,t03 + + +$Write_16x1: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_16x1 + +$Align_CO_Access_16x1: + VLD c00,0(CO) # get 1st colum of 16c + VLD c01,4*SIZE(CO) + VLD c02,8*SIZE(CO) + VLD c03,12*SIZE(CO) + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + VMAD t02,ALPHA,c02,t02 + VMAD t03,ALPHA,c03,t03 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + VST t02,8*SIZE(CO) + VST t03,12*SIZE(CO) + jmp $Kernel_End + +$UnAlign_CO_Access_16x1: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c04, 1*VEC_LEN*SIZE(CO) + + VLD_UL c01, 1*VEC_LEN*SIZE(CO) + VLD_UH c05, 2*VEC_LEN*SIZE(CO) + + VLD_UL c02, 2*VEC_LEN*SIZE(CO) + VLD_UH c06, 3*VEC_LEN*SIZE(CO) + + VLD_UL c03, 3*VEC_LEN*SIZE(CO) + VLD_UH c07, 4*VEC_LEN*SIZE(CO) + + vbisw c00,c04,c00 + vbisw c01,c05,c01 + vbisw c02,c06,c02 + vbisw c03,c07,c03 + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + VMAD t02,ALPHA,c02,t02 + VMAD t03,ALPHA,c03,t03 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + VST_UL t02, 2*VEC_LEN*SIZE(CO) + VST_UH t02, 3*VEC_LEN*SIZE(CO) + + VST_UL t03, 3*VEC_LEN*SIZE(CO) + VST_UH t03, 4*VEC_LEN*SIZE(CO) + jmp $Kernel_End + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_16x1 + +$Align_CO_Access_16x1: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + VMUL t02,ALPHA,t02 + VMUL t03,ALPHA,t03 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + VST t02,8*SIZE(CO) + VST t03,12*SIZE(CO) + jmp $TRMMKERNEL_16x1 + +$UnAlign_CO_Access_16x1: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + VMUL t02,ALPHA,t02 + VMUL t03,ALPHA,t03 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + + VST_UL t02, 2*VEC_LEN*SIZE(CO) + VST_UH t02, 3*VEC_LEN*SIZE(CO) + + 
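# Note (descriptive comment, added for clarity): the $TRMMKERNEL_16x1 block just below performs the TRMM tile epilogue. +# When (LEFT && TRANSA) || (!LEFT && !TRANSA), TEMP = KC1 - KK minus MR=16 (LEFT) or NR=1 (otherwise) is the untouched K span of this panel; +# A is advanced by TEMP*16 elements (sll TEMP, 4+BASE_SHIFT) and B by TEMP*1 element (sll TEMP, BASE_SHIFT), then KK is increased by MR=16 when LEFT is defined. +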
VST_UL t03, 3*VEC_LEN*SIZE(CO) + VST_UH t03, 4*VEC_LEN*SIZE(CO) + +$TRMMKERNEL_16x1: +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 16, TEMP +#else + subl TEMP, 1,TEMP +#endif + + sll TEMP, 4 + BASE_SHIFT,KC + sll TEMP, BASE_SHIFT, TEMP + + addl A,KC,A + addl B,TEMP,B +#endif + +#ifdef LEFT + addl KK, 16, KK + nop +#endif + + jmp $Kernel_End +#endif + + + + .align 5 +.L35: # nr=1,mr=8------------------ + and MC1,8,MC + sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc + nop + beq MC,.L36 # MC1<8 + + addl A1,SPANA,PREA + subl PREA,8*SIZE,PREA # PREA-=MC + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, 3 + BASE_SHIFT,KC # mr=8 + sll KK, BASE_SHIFT,TEMP # nr=1 + + addl A,KC, A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # CLEAR 8Register + vcpys $f31,$f31,t01 + + LDDE b0,0(B) # get 1b + + VLD a0,0(A) # get 8a + VLD a4,4*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 4*SIZE(CO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK,TEMP +#elif defined(LEFT) + addl KK, 8,TEMP +#else + addl KK, 1,TEMP +#endif + sra TEMP,1,KC + nop + beq KC,$Rest_8x1x1 + +#else + + mov B1, B + sra KC1,1,KC + vcpys $f31,$f31,t00 # CLEAR 8Register + vcpys $f31,$f31,t01 + + LDDE b0,0(B) # get 1b + + VLD a0,0(A) # get 8a + VLD a4,4*SIZE(A) + + fillcs 0(CO) # fetch C + fillcs 4*SIZE(CO) + beq KC,$Rest_8x1x1 + +#endif + + +$Panel_8x1x2: + VMAD a0,b0,t00,t00 + VMAD a4,b0,t01,t01 + + LDDE nb0,1*SIZE(B) # get next 1b + + addl B,2*SIZE,B # 1(n)*2k + VLD na8,8*SIZE(A) # get next 8a + VLD na12,12*SIZE(A) + + fillcs 0(PREA) + subl PREA,8*SIZE,PREA + + subl KC,1,KC + VMAD na8,nb0,t00,t00 + VMAD na12,nb0,t01,t01 + + addl A,16*SIZE,A # 8m*2k + LDDE b0,0(B) # get 3rd 1b + + VLD a0,0(A) # get 3rd 8a + VLD a4,4*SIZE(A) + + fillcs 0(PREA) + subl PREA,8*SIZE,PREA + bne KC,$Panel_8x1x2 + + +$Rest_8x1x1: + LDDE ALPHA,192($sp) # Get ALPHA +#ifndef TRMMKERNEL + blbc KC1,$Write_8x1 +#else + blbc TEMP,$Write_8x1 +#endif + + addl A,8*SIZE,A # 8m*1k + addl B,1*SIZE,B # 1n*1k + + VMAD a0,b0,t00,t00 + VMAD a4,b0,t01,t01 + + +$Write_8x1: + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_8x1 + +$Align_CO_Access_8x1: + VLD c00,0(CO) # get 1st colum of 16c + VLD c01,4*SIZE(CO) + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + addl CO,8*SIZE,CO # 8c + jmp .L36 + +$UnAlign_CO_Access_8x1: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c02, 1*VEC_LEN*SIZE(CO) + + VLD_UL c01, 1*VEC_LEN*SIZE(CO) + VLD_UH c03, 2*VEC_LEN*SIZE(CO) + + vbisw c00,c02,c00 + vbisw c01,c03,c01 + + VMAD t00,ALPHA,c00,t00 + VMAD t01,ALPHA,c01,t01 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + addl CO,8*SIZE,CO # 8c + +#else + + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_8x1 + +$Align_CO_Access_8x1: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VST t00,0(CO) + VST t01,4*SIZE(CO) + jmp $TRMMKERNEL_8x1 + +$UnAlign_CO_Access_8x1: + VMUL t00,ALPHA,t00 + VMUL t01,ALPHA,t01 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + + VST_UL t01, 1*VEC_LEN*SIZE(CO) + VST_UH t01, 2*VEC_LEN*SIZE(CO) + +$TRMMKERNEL_8x1: + addl CO,8*SIZE,CO # 8c +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 8, TEMP +#else + subl TEMP, 1, 
TEMP +#endif + + sll TEMP, 3 + BASE_SHIFT, KC + sll TEMP, BASE_SHIFT,TEMP + + addl A,KC, A + addl B,TEMP,B +#endif + +#ifdef LEFT + addl KK,8, KK +#endif +#endif + + + + .align 5 +.L36: # nr=1,mr=4--------------- + and MC1,4,MC # MC1&4 + beq MC,.L37 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, 2 + BASE_SHIFT, KC # mr=4 + sll KK, BASE_SHIFT, TEMP # nr=1 + + addl A,KC,A + addl B1,TEMP,B +#endif + + vcpys $f31,$f31,t00 # CLEAR 4 Register + + LDDE b0,0(B) + VLD a0,0(A) + + fillcs 0(CO) # fetch C + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 4, TEMP # mr=4 +#else + addl KK, 1, TEMP # nr=1 +#endif + sra TEMP,1, KC + beq KC,$Rest_4x1x1 + +#else + + mov B1,B # Reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + vcpys $f31,$f31,t00 # CLEAR 4 Register + + LDDE b0,0(B) + VLD a0,0(A) + + fillcs 0(CO) # fetch C + beq KC,$Rest_4x1x1 +#endif + + +$Panel_4x1x2: + VMAD a0,b0,t00,t00 + + LDDE nb0,1*SIZE(B) + VLD a4,4*SIZE(A) + addl B,2*SIZE,B # 1(n)*2(k)*8Byte + + subl KC,1,KC + VMAD a4,nb0,t00,t00 + + addl A,8*SIZE,A # 4m*2k + LDDE b0,0(B) + VLD a0,0(A) + + bne KC,$Panel_4x1x2 + + +$Rest_4x1x1: + LDDE ALPHA,192($sp) # Get ALPHA +#ifndef TRMMKERNEL + blbc KC1,$Write_4x1 +#else + blbc TEMP,$Write_4x1 +#endif + + addl A,4*SIZE,A # 4m*1k + addl B,1*SIZE,B # 1n*1K + + VMAD a0,b0,t00,t00 + + +$Write_4x1: # Write back 4 results + +#ifndef TRMMKERNEL + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_4x1 + +$Align_CO_Access_4x1: + VLD c00,0(CO) # get 1st colum of 16c + VMAD t00,ALPHA,c00,t00 + VST t00,0(CO) + addl CO,4*SIZE,CO # 4c + jmp .L37 + +$UnAlign_CO_Access_4x1: + VLD_UL c00, 0*VEC_LEN*SIZE(CO) + VLD_UH c01, 1*VEC_LEN*SIZE(CO) + + vbisw c00,c01,c00 + + VMAD t00,ALPHA,c00,t00 + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + addl CO,4*SIZE,CO # 4c + + +#else + and CO, (VEC_LEN*SIZE-1),$6 + bne $6,$UnAlign_CO_Access_4x1 + +$Align_CO_Access_4x1: + VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register + VST t00,0(CO) + jmp $TRMMKERNEL_4x1 + +$UnAlign_CO_Access_4x1: + VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register + + VST_UL t00, 0*VEC_LEN*SIZE(CO) + VST_UH t00, 1*VEC_LEN*SIZE(CO) + +$TRMMKERNEL_4x1: + addl CO,4*SIZE,CO # 4c +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 4, TEMP # mr=4 +#else + subl TEMP, 1, TEMP +#endif + + sll TEMP, 2 + BASE_SHIFT, KC + sll TEMP, BASE_SHIFT, TEMP + + addl A, KC, A + addl B, TEMP,B +#endif + +#ifdef LEFT + addl KK, 4, KK +#endif +#endif + + + + + .align 5 +.L37: # nr=1,mr=2------------------------- + and MC1,2,MC + beq MC,.L38 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, 1 + BASE_SHIFT,KC # mr=2 + sll KK, BASE_SHIFT, TEMP # nr=1 + + addl A,KC, A + addl B1,TEMP,B +#endif + + fclr t00 # CLEAR 2 Register + fclr t01 + + LD b0,0(B) + + LD a0,0(A) + LD a4,1*SIZE(A) + + fillcs 0(CO) # fetch C + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#elif defined(LEFT) + addl KK, 2,TEMP +#else + addl KK, 1,TEMP +#endif + sra TEMP,1,KC + beq KC,.L373 + +#else + + mov B1,B # Reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + fclr t00 # CLEAR 2 Register + fclr t01 + + LD b0,0(B) + + LD a0,0(A) + LD a4,1*SIZE(A) + + fillcs 0(CO) # fetch C + beq 
KC,.L373 + +#endif + +.L371: + MAD a0,b0,t00,t00 + MAD a4,b0,t01,t01 + + LD nb0,1*SIZE(B) + + addl B,2*SIZE,B # 1(n)*2(k) + LD a8,2*SIZE(A) + LD a12,3*SIZE(A) + + subl KC,1,KC + MAD a8,nb0,t00,t00 + MAD a12,nb0,t01,t01 + + addl A,4*SIZE,A # 2m*2k + LD b0,0(B) + + LD a0,0(A) + LD a4,1*SIZE(A) + bne KC,.L371 + +.L373: + LD ALPHA,192($sp) # Get ALPHA +#ifndef TRMMKERNEL + blbc KC1,.L374 +#else + blbc TEMP,.L374 +#endif + + addl A,2*SIZE,A # 2m*1k*8Byte + addl B,1*SIZE,B # 1n*1K*8Byte + + MAD a0,b0,t00,t00 + MAD a4,b0,t01,t01 + +.L374: # Write back 2 results + +#ifndef TRMMKERNEL + LD c00,0(CO) + LD c01,1*SIZE(CO) + + MAD t00,ALPHA,c00,t00 + MAD t01,ALPHA,c01,t01 + + ST t00,0(CO) + ST t01,1*SIZE(CO) + addl CO,2*SIZE,CO # 2c + +#else + + MUL t00,ALPHA,t00 + MUL t01,ALPHA,t01 + + ST t00,0(CO) + ST t01,1*SIZE(CO) + + addl CO,2*SIZE,CO # 2c + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl KC1, KK, TEMP +#ifdef LEFT + subl TEMP, 2, TEMP +#else + subl TEMP, 1, TEMP +#endif + + sll TEMP, 1 + BASE_SHIFT,KC + sll TEMP, BASE_SHIFT,TEMP + + addl A,KC,A + addl B,TEMP,B +#endif + +#ifdef LEFT + addl KK, 2, KK +#endif +#endif + + + + .align 5 +.L38: + and MC1,1,MC + beq MC,$Kernel_End + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B1, B +#else + sll KK, BASE_SHIFT,KC # mr=nr=1 + nop + + addl A,KC,A + addl B1,KC,B +#endif + + fclr t00 # CLEAR Results Register + + LD b0,0(B) + LD a0,0(A) # Get 16 A and 4 B + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl KC1, KK, TEMP +#else + addl KK, 1, TEMP # mr=nr=1 +#endif + sra TEMP,1,KC + nop + beq KC,.L383 + +#else + + mov B1,B # Reset B + sra KC1,1,KC # Unroll KC 2, KC=KC1/2 + fclr t00 # CLEAR Results Register + + LD b0,0(B) + LD a0,0(A) # Get 16 A and 4 B + + beq KC,.L383 +#endif + +.L381: + MAD a0,b0,t00,t00 + LD nb0,1*SIZE(B) + + addl B,2*SIZE,B # 1n*2k + LD a8,1*SIZE(A) + + + subl KC,1,KC + MAD a8,nb0,t00,t00 + + addl A,2*SIZE,A # 1m*2k + LD b0,0(B) + + LD a0,0(A) + bne KC,.L381 + + +.L383: + LD ALPHA,192($sp) # get alpha +#ifndef TRMMKERNEL + blbc KC1,.L384 +#else + blbc TEMP,.L384 +#endif + + addl A,1*SIZE,A # 1m*1k + addl B,1*SIZE,B # 1n*1K + + MAD a0,b0,t00,t00 + + +.L384: # Write back 1 results + +#ifndef TRMMKERNEL + LD c00,0(CO) + MAD t00,ALPHA,c00,t00 + ST t00,0(CO) + +#else + MUL t00,ALPHA,t00 + ST t00,0(CO) +#endif + + + +$Kernel_End: + ldl $9,328($sp) # Integer Saved Register + ldl $10,320($sp) + ldl $11,312($sp) + ldl $12,304($sp) + ldl $13,296($sp) +ldl $14,288($sp) +# Float Saved Register + LD $f2,280($sp) + LD $f3,272($sp) + LD $f4,264($sp) + LD $f5,256($sp) + LD $f6,248($sp) + LD $f7,240($sp) + LD $f8,232($sp) +LD $f9,224($sp) + + ldi $sp,STACKSIZE($sp) # + ret $31,($26),1 # + + EPILOGUE + + diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S new file mode 100644 index 0000000..90284db --- /dev/null +++ b/kernel/sw_64/gemv_n.S @@ -0,0 +1,1647 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 72 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + +#define tmp $f20 + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl X, 0 + STACKSIZE($sp) + ldl INCX, 8 + STACKSIZE($sp) + ldl Y, 16 + STACKSIZE($sp) + ldl INCY, 24 + STACKSIZE($sp) + ldl BUFFER, 32 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + fstd tmp, 64($sp) + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + SXADDQ LDA, 0, LDA + + cmpeq INCY, SIZE, $0 + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 3, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + ldi I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + addl Y1, SIZE, Y1 + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 2, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + addl X, INCX, X + LD alpha2, 0 * SIZE(X) + addl X, INCX, 
X + LD alpha3, 0 * SIZE(X) + addl X, INCX, X + LD alpha4, 0 * SIZE(X) + addl X, INCX, X + + MUL alpha, alpha1, tmp + fmov tmp, alpha1 + MUL alpha, alpha2, tmp + fmov tmp, alpha2 + MUL alpha, alpha3, tmp + fmov tmp, alpha3 + MUL alpha, alpha4, tmp + fmov tmp, alpha4 + + mov A, A1 + addl A, LDA, A2 + addl A2, LDA, A3 + addl A3, LDA, A4 + s4addl LDA, A, A + + mov Y, Y1 + ldw $31, 4 * SIZE(X) + + sra M, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, tmp + fmov tmp, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, tmp + fmov tmp, a4 + unop + + ADD y1, a1, tmp + fmov tmp, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, tmp + fmov tmp, a5 + unop + + ADD y2, a2, tmp + fmov tmp, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, tmp + fmov tmp, a6 + unop + + ADD y3, a3, tmp + fmov tmp, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, tmp + fmov tmp, a7 + unop + + ADD y0, a4, tmp + fmov tmp, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, tmp + fmov tmp, a8 + unop + + ADD y1, a5, tmp + fmov tmp, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, tmp + fmov tmp, a9 + ldi I, -1(I) + + ADD y2, a6, tmp + fmov tmp, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, tmp + fmov tmp, a10 + unop + + ADD y3, a7, tmp + fmov tmp, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, tmp + fmov tmp, a11 + unop + + ADD y0, a8, tmp + fmov tmp, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, tmp + fmov tmp, a12 + ble I, $L13 + .align 4 + +$L12: + ADD y1, a9, tmp + fmov tmp, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, tmp + fmov tmp, a13 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD y2, a10, tmp + fmov tmp, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, tmp + fmov tmp, a14 + unop + + ADD y3, a11, tmp + fmov tmp, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, tmp + fmov tmp, a15 + ldi I, -1(I) + + ADD y0, a12, tmp + fmov tmp, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, tmp + fmov tmp, a0 + flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD y1, a13, tmp + fmov tmp, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, tmp + fmov tmp, a1 + unop + + ADD y2, a14, tmp + fmov tmp, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, tmp + fmov tmp, a2 + unop + + ADD y3, a15, tmp + fmov tmp, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, tmp + fmov tmp, a3 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) + + ADD y4, a0, tmp + fmov tmp, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, tmp + fmov tmp, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, tmp + fmov tmp, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, tmp + fmov tmp, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, tmp + fmov tmp, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, tmp + fmov tmp, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, tmp + fmov tmp, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, tmp + fmov tmp, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, tmp + fmov tmp, y4 + LD a4, 8 * SIZE(A2) + MUL alpha3, a8, tmp + fmov tmp, a8 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, tmp + fmov tmp, y5 + LD a5, 9 * SIZE(A2) + MUL alpha3, 
a9, tmp + fmov tmp, a9 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, tmp + fmov tmp, y6 + LD a6, 10 * SIZE(A2) + MUL alpha3, a10, tmp + fmov tmp, a10 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, tmp + fmov tmp, y7 + LD a7, 11 * SIZE(A2) + MUL alpha3, a11, tmp + fmov tmp, a11 + LD y3, 11 * SIZE(Y1) + + ADD y4, a8, tmp + fmov tmp, y4 + LD a8, 8 * SIZE(A3) + MUL alpha4, a12, tmp + fmov tmp, a12 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A3) + + ADD y5, a9, tmp + fmov tmp, y5 + LD a9, 9 * SIZE(A3) + MUL alpha4, a13, tmp + fmov tmp, a13 + ldi A1, 8 * SIZE(A1) + + ADD y6, a10, tmp + fmov tmp, y6 + LD a10, 10 * SIZE(A3) + MUL alpha4, a14, tmp + fmov tmp, a14 + ldi A2, 8 * SIZE(A2) + + ADD y7, a11, tmp + fmov tmp, y7 + LD a11, 11 * SIZE(A3) + MUL alpha4, a15, tmp + fmov tmp, a15 + ldi Y1, 8 * SIZE(Y1) + + ADD y4, a12, tmp + fmov tmp, y4 + LD a12, 8 * SIZE(A4) + MUL alpha1, a0, tmp + fmov tmp, a0 + unop + + ADD y5, a13, tmp + fmov tmp, y5 + LD a13, 9 * SIZE(A4) + MUL alpha1, a1, tmp + fmov tmp, a1 + ldi A3, 8 * SIZE(A3) + + ADD y6, a14, tmp + fmov tmp, y6 + LD a14, 10 * SIZE(A4) + MUL alpha1, a2, tmp + fmov tmp, a2 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A4) + + ADD y7, a15, tmp + fmov tmp, y7 + LD a15, 11 * SIZE(A4) + MUL alpha1, a3, tmp + fmov tmp, a3 + ldi A4, 8 * SIZE(A4) + + ADD y0, a0, tmp + fmov tmp, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, tmp + fmov tmp, a4 + ST y4, -4 * SIZE(Y1) + + ADD y1, a1, tmp + fmov tmp, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, tmp + fmov tmp, a5 + ST y5, -3 * SIZE(Y1) + + ADD y2, a2, tmp + fmov tmp, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, tmp + fmov tmp, a6 + ST y6, -2 * SIZE(Y1) + + ADD y3, a3, tmp + fmov tmp, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, tmp + fmov tmp, a7 + ST y7, -1 * SIZE(Y1) + + ADD y0, a4, tmp + fmov tmp, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, tmp + fmov tmp, a8 + LD y4, 4 * SIZE(Y1) + + ADD y1, a5, tmp + fmov tmp, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, tmp + fmov tmp, a9 + LD y5, 5 * SIZE(Y1) + + ADD y2, a6, tmp + fmov tmp, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, tmp + fmov tmp, a10 + LD y6, 6 * SIZE(Y1) + + ADD y3, a7, tmp + fmov tmp, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, tmp + fmov tmp, a11 + LD y7, 7 * SIZE(Y1) + + ADD y0, a8, tmp + fmov tmp, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, tmp + fmov tmp, a12 + bgt I, $L12 + .align 4 + +$L13: + ADD y1, a9, tmp + fmov tmp, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, tmp + fmov tmp, a13 + unop + + ADD y2, a10, tmp + fmov tmp, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, tmp + fmov tmp, a14 + unop + + ADD y3, a11, tmp + fmov tmp, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, tmp + fmov tmp, a15 + unop + + ADD y0, a12, tmp + fmov tmp, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, tmp + fmov tmp, a0 + unop + + ADD y1, a13, tmp + fmov tmp, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, tmp + fmov tmp, a1 + unop + + ADD y2, a14, tmp + fmov tmp, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, tmp + fmov tmp, a2 + unop + + ADD y3, a15, tmp + fmov tmp, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, tmp + fmov tmp, a3 + unop + + ST y0, 0 * SIZE(Y1) + ADD y4, a0, tmp + fmov tmp, y4 + unop + MUL alpha2, a4, tmp + fmov tmp, a4 + + ST y1, 1 * SIZE(Y1) + ADD y5, a1, tmp + fmov tmp, y5 + unop + MUL alpha2, a5, tmp + fmov tmp, a5 + + ST y2, 2 * SIZE(Y1) + ADD y6, a2, tmp + fmov tmp, y6 + unop + MUL alpha2, a6, tmp + fmov tmp, a6 + + ST y3, 3 * SIZE(Y1) + ADD y7, a3, tmp + fmov tmp, y7 + ldi Y1, 8 * SIZE(Y1) + MUL alpha2, a7, tmp + fmov tmp, a7 + + ADD y4, a4, tmp + fmov tmp, y4 + MUL alpha3, a8, tmp + fmov tmp, a8 + ADD 
y5, a5, tmp + fmov tmp, y5 + MUL alpha3, a9, tmp + fmov tmp, a9 + ADD y6, a6, tmp + fmov tmp, y6 + MUL alpha3, a10, tmp + fmov tmp, a10 + ADD y7, a7, tmp + fmov tmp, y7 + MUL alpha3, a11, tmp + fmov tmp, a11 + + ADD y4, a8, tmp + fmov tmp, y4 + MUL alpha4, a12, tmp + fmov tmp, a12 + ADD y5, a9, tmp + fmov tmp, y5 + MUL alpha4, a13, tmp + fmov tmp, a13 + ADD y6, a10, tmp + fmov tmp, y6 + MUL alpha4, a14, tmp + fmov tmp, a14 + ADD y7, a11, tmp + fmov tmp, y7 + MUL alpha4, a15, tmp + fmov tmp, a15 + + ADD y4, a12, tmp + fmov tmp, y4 + ADD y5, a13, tmp + fmov tmp, y5 + ADD y6, a14, tmp + fmov tmp, y6 + ADD y7, a15, tmp + fmov tmp, y7 + + ST y4, -4 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y5, -3 * SIZE(Y1) + ldi A2, 8 * SIZE(A2) + ST y6, -2 * SIZE(Y1) + ldi A3, 8 * SIZE(A3) + ST y7, -1 * SIZE(Y1) + ldi A4, 8 * SIZE(A4) + .align 4 + +$L15: + and M, 4, I + ble I, $L16 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + MUL alpha1, a0, tmp + fmov tmp, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, tmp + fmov tmp, y0 + MUL alpha2, a4, tmp + fmov tmp, a4 + ADD y1, a1, tmp + fmov tmp, y1 + MUL alpha2, a5, tmp + fmov tmp, a5 + ADD y2, a2, tmp + fmov tmp, y2 + MUL alpha2, a6, tmp + fmov tmp, a6 + ADD y3, a3, tmp + fmov tmp, y3 + MUL alpha2, a7, tmp + fmov tmp, a7 + + ADD y0, a4, tmp + fmov tmp, y0 + MUL alpha3, a8, tmp + fmov tmp, a8 + ADD y1, a5, tmp + fmov tmp, y1 + MUL alpha3, a9, tmp + fmov tmp, a9 + ADD y2, a6, tmp + fmov tmp, y2 + MUL alpha3, a10, tmp + fmov tmp, a10 + ADD y3, a7, tmp + fmov tmp, y3 + MUL alpha3, a11, tmp + fmov tmp, a11 + + ADD y0, a8, tmp + fmov tmp, y0 + MUL alpha4, a12, tmp + fmov tmp, a12 + ADD y1, a9, tmp + fmov tmp, y1 + MUL alpha4, a13, tmp + fmov tmp, a13 + ADD y2, a10, tmp + fmov tmp, y2 + MUL alpha4, a14, tmp + fmov tmp, a14 + ADD y3, a11, tmp + fmov tmp, y3 + MUL alpha4, a15, tmp + fmov tmp, a15 + + ADD y0, a12, tmp + fmov tmp, y0 + ldi Y1, 4 * SIZE(Y1) + ADD y1, a13, tmp + fmov tmp, y1 + unop + + ADD y2, a14, tmp + fmov tmp, y2 + unop + ADD y3, a15, tmp + fmov tmp, y3 + unop + + ST y0, -4 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + ldi A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + ldi A4, 4 * SIZE(A4) + .align 4 + +$L16: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + LD a4, 0 * SIZE(A3) + MUL alpha1, a0, tmp + fmov tmp, a0 + LD a5, 1 * SIZE(A3) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD a6, 0 * SIZE(A4) + MUL alpha2, a2, tmp + fmov tmp, a2 + LD a7, 1 * SIZE(A4) + MUL alpha2, a3, tmp + fmov tmp, a3 + + ADD y0, a0, tmp + fmov tmp, y0 + MUL alpha3, a4, tmp + fmov tmp, a4 + ADD y1, a1, tmp + fmov tmp, y1 + MUL alpha3, a5, tmp + fmov tmp, a5 + ADD y0, a2, tmp + fmov tmp, y0 + MUL alpha4, a6, tmp + fmov tmp, a6 + ADD y1, a3, tmp + fmov tmp, y1 + MUL alpha4, a7, tmp + fmov tmp, a7 + + ADD y0, a4, tmp + fmov tmp, y0 + ldi A1, 2 * SIZE(A1) + ADD y1, a5, tmp + fmov tmp, y1 + ldi A2, 2 * SIZE(A2) + ADD y0, a6, tmp + fmov tmp, 
y0 + ldi A3, 2 * SIZE(A3) + ADD y1, a7, tmp + fmov tmp, y1 + ldi A4, 2 * SIZE(A4) + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + MUL alpha1, a0, tmp + fmov tmp, a0 + MUL alpha2, a1, tmp + fmov tmp, a1 + MUL alpha3, a2, tmp + fmov tmp, a2 + MUL alpha4, a3, tmp + fmov tmp, a3 + + ADD y0, a0, tmp + fmov tmp, y0 + ADD y0, a1, tmp + fmov tmp, y0 + ADD y0, a2, tmp + fmov tmp, y0 + ADD y0, a3, tmp + fmov tmp, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L18: + ldi J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + + LD alpha1, 0 * SIZE(X) + addl X, INCX, X + LD alpha2, 0 * SIZE(X) + addl X, INCX, X + + mov A, A1 + MUL alpha, alpha1, tmp + fmov tmp, alpha1 + addl A, LDA, A2 + MUL alpha, alpha2, tmp + fmov tmp, alpha2 + + addl A2, LDA, A + mov Y, Y1 + + sra M, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD y7, 7 * SIZE(Y1) + + ADD y0, a0, tmp + fmov tmp, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, tmp + fmov tmp, a4 + + ADD y1, a1, tmp + fmov tmp, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, tmp + fmov tmp, a5 + + ADD y2, a2, tmp + fmov tmp, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, tmp + fmov tmp, a6 + + ADD y3, a3, tmp + fmov tmp, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, tmp + fmov tmp, a7 + + ADD y0, a4, tmp + fmov tmp, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, tmp + fmov tmp, a0 + + ADD y1, a5, tmp + fmov tmp, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, tmp + fmov tmp, a1 + + ADD y2, a6, tmp + fmov tmp, y2 + LD a6, 6 * SIZE(A2) + MUL alpha1, a2, tmp + fmov tmp, a2 + + ADD y3, a7, tmp + fmov tmp, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, tmp + fmov tmp, a3 + + ldi I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) + ldi I, -1(I) + ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) + ldi A2, 8 * SIZE(A2) + + ADD y4, a0, tmp + fmov tmp, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, tmp + fmov tmp, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, tmp + fmov tmp, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, tmp + fmov tmp, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, tmp + fmov tmp, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, tmp + fmov tmp, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, tmp + fmov tmp, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, tmp + fmov tmp, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, tmp + fmov tmp, y4 + LD a4, 0 * SIZE(A2) + MUL alpha1, a0, tmp + fmov tmp, a0 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, tmp + fmov tmp, y5 + LD a5, 1 * SIZE(A2) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, tmp + fmov tmp, y6 + LD a6, 2 * SIZE(A2) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, tmp + fmov tmp, y7 + LD a7, 3 * SIZE(A2) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD y3, 11 * SIZE(Y1) + + ADD y0, a0, tmp + fmov tmp, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a4, tmp + fmov tmp, a4 + LD a0, 12 * SIZE(A1) + + ADD y1, a1, tmp + fmov tmp, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a5, tmp + fmov tmp, 
a5 + LD a1, 13 * SIZE(A1) + + ADD y2, a2, tmp + fmov tmp, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a6, tmp + fmov tmp, a6 + LD a2, 14 * SIZE(A1) + + ADD y3, a3, tmp + fmov tmp, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a7, tmp + fmov tmp, a7 + LD a3, 15 * SIZE(A1) + + ADD y0, a4, tmp + fmov tmp, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, tmp + fmov tmp, a0 + LD y4, 12 * SIZE(Y1) + + ADD y1, a5, tmp + fmov tmp, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD y5, 13 * SIZE(Y1) + + ADD y2, a6, tmp + fmov tmp, y2 + LD a6, 6 * SIZE(A2) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD y6, 14 * SIZE(Y1) + + ADD y3, a7, tmp + fmov tmp, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD y7, 15 * SIZE(Y1) + + flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ldi Y1, 8 * SIZE(Y1) + bgt I, $L22 + .align 4 + +$L23: + ADD y4, a0, tmp + fmov tmp, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, tmp + fmov tmp, a4 + unop + + ADD y5, a1, tmp + fmov tmp, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, tmp + fmov tmp, a5 + unop + + ADD y6, a2, tmp + fmov tmp, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, tmp + fmov tmp, a6 + unop + + ADD y7, a3, tmp + fmov tmp, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, tmp + fmov tmp, a7 + unop + + ADD y4, a4, tmp + fmov tmp, y4 + ADD y5, a5, tmp + fmov tmp, y5 + ADD y6, a6, tmp + fmov tmp, y6 + ADD y7, a7, tmp + fmov tmp, y7 + + ST y4, 4 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + ldi A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 4, I + ble I, $L26 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + LD a4, 0 * SIZE(A2) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD a5, 1 * SIZE(A2) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD a6, 2 * SIZE(A2) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD a7, 3 * SIZE(A2) + + ADD y0, a0, tmp + fmov tmp, y0 + MUL alpha2, a4, tmp + fmov tmp, a4 + ADD y1, a1, tmp + fmov tmp, y1 + MUL alpha2, a5, tmp + fmov tmp, a5 + ADD y2, a2, tmp + fmov tmp, y2 + MUL alpha2, a6, tmp + fmov tmp, a6 + ADD y3, a3, tmp + fmov tmp, y3 + MUL alpha2, a7, tmp + fmov tmp, a7 + + ADD y0, a4, tmp + fmov tmp, y0 + ldi Y1, 4 * SIZE(Y1) + ADD y1, a5, tmp + fmov tmp, y1 + unop + ADD y2, a6, tmp + fmov tmp, y2 + unop + ADD y3, a7, tmp + fmov tmp, y3 + unop + + ST y0, -4 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + ldi A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + ldi A4, 4 * SIZE(A4) + .align 4 + +$L26: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + MUL alpha1, a1, tmp + fmov tmp, a1 + MUL alpha2, a2, tmp + fmov tmp, a2 + MUL alpha2, a3, tmp + fmov tmp, a3 + + ADD y0, a0, tmp + fmov tmp, y0 + ldi A1, 2 * SIZE(A1) + ADD y1, a1, tmp + fmov tmp, y1 + ldi A2, 2 * SIZE(A2) + ADD y0, a2, tmp + fmov tmp, y0 + unop + ADD y1, a3, tmp + fmov tmp, y1 + unop + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L30 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + + MUL alpha1, a0, tmp + fmov tmp, a0 + MUL alpha2, a1, tmp + fmov tmp, a1 + + ADD y0, a0, tmp + fmov tmp, y0 + ADD y0, a1, tmp + fmov tmp, y0 + + ST y0, 0 * SIZE(Y1) + .align 
4 + +$L30: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + mov A, A1 + MUL alpha, alpha1, tmp + fmov tmp, alpha1 + mov Y, Y1 + + sra M, 3, I + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + MUL alpha1, a1, tmp + fmov tmp, a1 + MUL alpha1, a2, tmp + fmov tmp, a2 + MUL alpha1, a3, tmp + fmov tmp, a3 + + ldi I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD y0, a0, tmp + fmov tmp, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, tmp + fmov tmp, a4 + LD a0, 8 * SIZE(A1) + + ADD y1, a1, tmp + fmov tmp, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, tmp + fmov tmp, a5 + LD a1, 9 * SIZE(A1) + + ADD y2, a2, tmp + fmov tmp, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, tmp + fmov tmp, a6 + LD a2, 10 * SIZE(A1) + + ADD y3, a3, tmp + fmov tmp, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, tmp + fmov tmp, a7 + LD a3, 11 * SIZE(A1) + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + ST y2, 2 * SIZE(Y1) + ST y3, 3 * SIZE(Y1) + + ADD y4, a4, tmp + fmov tmp, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, tmp + fmov tmp, a0 + LD a4, 12 * SIZE(A1) + + ADD y5, a5, tmp + fmov tmp, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD a5, 13 * SIZE(A1) + + ADD y6, a6, tmp + fmov tmp, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD a6, 14 * SIZE(A1) + + ADD y7, a7, tmp + fmov tmp, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD a7, 15 * SIZE(A1) + + ST y4, 4 * SIZE(Y1) + ldi I, -1(I) + ST y5, 5 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + + ST y6, 6 * SIZE(Y1) + ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) + ST y7, 7 * SIZE(Y1) + flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + bgt I, $L32 + .align 4 + +$L33: + ADD y0, a0, tmp + fmov tmp, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, tmp + fmov tmp, a4 + unop + + ADD y1, a1, tmp + fmov tmp, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, tmp + fmov tmp, a5 + unop + + ADD y2, a2, tmp + fmov tmp, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, tmp + fmov tmp, a6 + unop + + ADD y3, a3, tmp + fmov tmp, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, tmp + fmov tmp, a7 + unop + + ADD y4, a4, tmp + fmov tmp, y4 + ST y0, 0 * SIZE(Y1) + ADD y5, a5, tmp + fmov tmp, y5 + ST y1, 1 * SIZE(Y1) + ADD y6, a6, tmp + fmov tmp, y6 + ST y2, 2 * SIZE(Y1) + ADD y7, a7, tmp + fmov tmp, y7 + ST y3, 3 * SIZE(Y1) + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L35: + and M, 4, I + ble I, $L36 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, tmp + fmov tmp, a1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, tmp + fmov tmp, a2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, tmp + fmov tmp, a3 + LD y3, 3 * SIZE(Y1) + + ADD y0, a0, tmp + fmov tmp, y0 + ADD y1, a1, tmp + fmov tmp, y1 + ADD y2, a2, tmp + fmov tmp, y2 + ADD y3, a3, tmp + fmov tmp, y3 + + ST y0, 0 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + ldi Y1, 4 * SIZE(Y1) + .align 4 + +$L36: + and M, 2, I + ble I, $L37 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + 
+ LD y0, 0 * SIZE(Y1) + MUL alpha1, a0, tmp + fmov tmp, a0 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a1, tmp + fmov tmp, a1 + + ADD y0, a0, tmp + fmov tmp, y0 + ADD y1, a1, tmp + fmov tmp, y1 + + ST y0, 0 * SIZE(Y1) + ldi A1, 2 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + .align 4 + +$L37: + blbc M, $L990 + + LD y0, 0 * SIZE(Y1) + LD a0, 0 * SIZE(A1) + + MUL alpha1, a0, tmp + fmov tmp, a0 + + ADD y0, a0, tmp + fmov tmp, y0 + ST y0, 0 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 3, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a1, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a3, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a5, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a7, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, tmp + fmov tmp, a0 + ADD a1, y1, tmp + fmov tmp, a1 + ADD a2, y2, tmp + fmov tmp, a2 + ADD a3, y3, tmp + fmov tmp, a3 + ADD a4, y4, tmp + fmov tmp, a4 + ADD a5, y5, tmp + fmov tmp, a5 + ADD a6, y6, tmp + fmov tmp, a6 + ADD a7, y7, tmp + fmov tmp, a7 + + ST a0, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a5, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a7, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + ldi Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 7, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + ldi Y, 1 * SIZE(Y) + + ADD a0, y0, tmp + fmov tmp, a0 + + ST a0, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + bgt I, $L996 + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + fldd $f20, 64($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/gemv_n.S.bak b/kernel/sw_64/gemv_n.S.bak new file mode 100644 index 0000000..f90abdf --- /dev/null +++ b/kernel/sw_64/gemv_n.S.bak @@ -0,0 +1,1307 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl X, 0 + STACKSIZE($sp) + ldl INCX, 8 + STACKSIZE($sp) + ldl Y, 16 + STACKSIZE($sp) + ldl INCY, 24 + STACKSIZE($sp) + ldl BUFFER, 32 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + SXADDQ LDA, 0, LDA + + cmpeq INCY, SIZE, $0 + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 3, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + ldi I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + addl Y1, SIZE, Y1 + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 2, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + addl X, INCX, X + LD alpha2, 0 * SIZE(X) + addl X, INCX, X + LD alpha3, 0 * SIZE(X) + addl X, INCX, X + LD alpha4, 0 * SIZE(X) + addl X, INCX, X + + MUL alpha, alpha1, alpha1 + MUL alpha, alpha2, alpha2 + MUL alpha, alpha3, alpha3 + MUL alpha, alpha4, alpha4 + + mov A, A1 + addl A, LDA, A2 + addl A2, LDA, A3 + addl 
A3, LDA, A4 + s4addl LDA, A, A + + mov Y, Y1 + fillcs 4 * SIZE(X) + + sra M, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + unop + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + unop + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + unop + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + unop + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, a8 + unop + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, a9 + ldi I, -1(I) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, a10 + unop + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, a11 + unop + + ADD y0, a8, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, a12 + ble I, $L13 + .align 4 + +$L12: + ADD y1, a9, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, a13 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + + ADD y2, a10, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, a14 + unop + + ADD y3, a11, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, a15 + ldi I, -1(I) + + ADD y0, a12, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, a0 + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD y1, a13, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, a1 + unop + + ADD y2, a14, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, a2 + unop + + ADD y3, a15, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, a3 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, y4 + LD a4, 8 * SIZE(A2) + MUL alpha3, a8, a8 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, y5 + LD a5, 9 * SIZE(A2) + MUL alpha3, a9, a9 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, y6 + LD a6, 10 * SIZE(A2) + MUL alpha3, a10, a10 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, y7 + LD a7, 11 * SIZE(A2) + MUL alpha3, a11, a11 + LD y3, 11 * SIZE(Y1) + + ADD y4, a8, y4 + LD a8, 8 * SIZE(A3) + MUL alpha4, a12, a12 + fillcs (PREFETCHSIZE + 0) * SIZE(A3) + + ADD y5, a9, y5 + LD a9, 9 * SIZE(A3) + MUL alpha4, a13, a13 + ldi A1, 8 * SIZE(A1) + + ADD y6, a10, y6 + LD a10, 10 * SIZE(A3) + MUL alpha4, a14, a14 + ldi A2, 8 * SIZE(A2) + + ADD y7, a11, y7 + LD a11, 11 * SIZE(A3) + MUL alpha4, a15, a15 + ldi Y1, 8 * SIZE(Y1) + + ADD y4, a12, y4 + LD a12, 8 * SIZE(A4) + MUL alpha1, a0, a0 + unop + + ADD y5, a13, y5 + LD a13, 9 * SIZE(A4) + MUL alpha1, a1, a1 + ldi A3, 8 * SIZE(A3) + + ADD y6, a14, y6 + LD a14, 10 * SIZE(A4) + MUL alpha1, a2, a2 + fillcs (PREFETCHSIZE + 0) * SIZE(A4) + + ADD y7, a15, y7 + LD a15, 11 * SIZE(A4) + MUL alpha1, a3, a3 + ldi A4, 8 * SIZE(A4) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + ST y4, -4 * SIZE(Y1) + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + ST y5, -3 * 
SIZE(Y1) + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + ST y6, -2 * SIZE(Y1) + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + ST y7, -1 * SIZE(Y1) + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, a8 + LD y4, 4 * SIZE(Y1) + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, a9 + LD y5, 5 * SIZE(Y1) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, a10 + LD y6, 6 * SIZE(Y1) + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, a11 + LD y7, 7 * SIZE(Y1) + + ADD y0, a8, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, a12 + bgt I, $L12 + .align 4 + +$L13: + ADD y1, a9, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, a13 + unop + + ADD y2, a10, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, a14 + unop + + ADD y3, a11, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, a15 + unop + + ADD y0, a12, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, a0 + unop + + ADD y1, a13, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, a1 + unop + + ADD y2, a14, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, a2 + unop + + ADD y3, a15, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, a3 + unop + + ST y0, 0 * SIZE(Y1) + ADD y4, a0, y4 + unop + MUL alpha2, a4, a4 + + ST y1, 1 * SIZE(Y1) + ADD y5, a1, y5 + unop + MUL alpha2, a5, a5 + + ST y2, 2 * SIZE(Y1) + ADD y6, a2, y6 + unop + MUL alpha2, a6, a6 + + ST y3, 3 * SIZE(Y1) + ADD y7, a3, y7 + ldi Y1, 8 * SIZE(Y1) + MUL alpha2, a7, a7 + + ADD y4, a4, y4 + MUL alpha3, a8, a8 + ADD y5, a5, y5 + MUL alpha3, a9, a9 + ADD y6, a6, y6 + MUL alpha3, a10, a10 + ADD y7, a7, y7 + MUL alpha3, a11, a11 + + ADD y4, a8, y4 + MUL alpha4, a12, a12 + ADD y5, a9, y5 + MUL alpha4, a13, a13 + ADD y6, a10, y6 + MUL alpha4, a14, a14 + ADD y7, a11, y7 + MUL alpha4, a15, a15 + + ADD y4, a12, y4 + ADD y5, a13, y5 + ADD y6, a14, y6 + ADD y7, a15, y7 + + ST y4, -4 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y5, -3 * SIZE(Y1) + ldi A2, 8 * SIZE(A2) + ST y6, -2 * SIZE(Y1) + ldi A3, 8 * SIZE(A3) + ST y7, -1 * SIZE(Y1) + ldi A4, 8 * SIZE(A4) + .align 4 + +$L15: + and M, 4, I + ble I, $L16 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + MUL alpha1, a0, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, y0 + MUL alpha2, a4, a4 + ADD y1, a1, y1 + MUL alpha2, a5, a5 + ADD y2, a2, y2 + MUL alpha2, a6, a6 + ADD y3, a3, y3 + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + MUL alpha3, a8, a8 + ADD y1, a5, y1 + MUL alpha3, a9, a9 + ADD y2, a6, y2 + MUL alpha3, a10, a10 + ADD y3, a7, y3 + MUL alpha3, a11, a11 + + ADD y0, a8, y0 + MUL alpha4, a12, a12 + ADD y1, a9, y1 + MUL alpha4, a13, a13 + ADD y2, a10, y2 + MUL alpha4, a14, a14 + ADD y3, a11, y3 + MUL alpha4, a15, a15 + + ADD y0, a12, y0 + ldi Y1, 4 * SIZE(Y1) + ADD y1, a13, y1 + unop + + ADD y2, a14, y2 + unop + ADD y3, a15, y3 + unop + + ST y0, -4 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + ldi A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + ldi A4, 4 * SIZE(A4) + .align 4 + +$L16: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + LD a4, 
0 * SIZE(A3) + MUL alpha1, a0, a0 + LD a5, 1 * SIZE(A3) + MUL alpha1, a1, a1 + LD a6, 0 * SIZE(A4) + MUL alpha2, a2, a2 + LD a7, 1 * SIZE(A4) + MUL alpha2, a3, a3 + + ADD y0, a0, y0 + MUL alpha3, a4, a4 + ADD y1, a1, y1 + MUL alpha3, a5, a5 + ADD y0, a2, y0 + MUL alpha4, a6, a6 + ADD y1, a3, y1 + MUL alpha4, a7, a7 + + ADD y0, a4, y0 + ldi A1, 2 * SIZE(A1) + ADD y1, a5, y1 + ldi A2, 2 * SIZE(A2) + ADD y0, a6, y0 + ldi A3, 2 * SIZE(A3) + ADD y1, a7, y1 + ldi A4, 2 * SIZE(A4) + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + MUL alpha1, a0, a0 + MUL alpha2, a1, a1 + MUL alpha3, a2, a2 + MUL alpha4, a3, a3 + + ADD y0, a0, y0 + ADD y0, a1, y0 + ADD y0, a2, y0 + ADD y0, a3, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L18: + ldi J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + + LD alpha1, 0 * SIZE(X) + addl X, INCX, X + LD alpha2, 0 * SIZE(X) + addl X, INCX, X + + mov A, A1 + MUL alpha, alpha1, alpha1 + addl A, LDA, A2 + MUL alpha, alpha2, alpha2 + + addl A2, LDA, A + mov Y, Y1 + + sra M, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, a0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a1, a1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a2, a2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a3, a3 + LD y7, 7 * SIZE(Y1) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, a0 + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, a1 + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha1, a2, a2 + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, a3 + + ldi I, -1(I) + ble I, $L23 + .align 4 + +$L22: + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + ldi I, -1(I) + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + ldi A2, 8 * SIZE(A2) + + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, y4 + LD a4, 0 * SIZE(A2) + MUL alpha1, a0, a0 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, y5 + LD a5, 1 * SIZE(A2) + MUL alpha1, a1, a1 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, y6 + LD a6, 2 * SIZE(A2) + MUL alpha1, a2, a2 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, y7 + LD a7, 3 * SIZE(A2) + MUL alpha1, a3, a3 + LD y3, 11 * SIZE(Y1) + + ADD y0, a0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 12 * SIZE(A1) + + ADD y1, a1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 13 * SIZE(A1) + + ADD y2, a2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 14 * SIZE(A1) + + ADD y3, a3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 15 * SIZE(A1) + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, a0 + LD y4, 12 * SIZE(Y1) + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, a1 + LD y5, 13 * SIZE(Y1) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + 
MUL alpha1, a2, a2 + LD y6, 14 * SIZE(Y1) + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, a3 + LD y7, 15 * SIZE(Y1) + + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ldi Y1, 8 * SIZE(Y1) + bgt I, $L22 + .align 4 + +$L23: + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + unop + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + unop + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + unop + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + unop + + ADD y4, a4, y4 + ADD y5, a5, y5 + ADD y6, a6, y6 + ADD y7, a7, y7 + + ST y4, 4 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + ldi A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 4, I + ble I, $L26 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, a0 + LD a4, 0 * SIZE(A2) + MUL alpha1, a1, a1 + LD a5, 1 * SIZE(A2) + MUL alpha1, a2, a2 + LD a6, 2 * SIZE(A2) + MUL alpha1, a3, a3 + LD a7, 3 * SIZE(A2) + + ADD y0, a0, y0 + MUL alpha2, a4, a4 + ADD y1, a1, y1 + MUL alpha2, a5, a5 + ADD y2, a2, y2 + MUL alpha2, a6, a6 + ADD y3, a3, y3 + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + ldi Y1, 4 * SIZE(Y1) + ADD y1, a5, y1 + unop + ADD y2, a6, y2 + unop + ADD y3, a7, y3 + unop + + ST y0, -4 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + ldi A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + ldi A4, 4 * SIZE(A4) + .align 4 + +$L26: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, a0 + MUL alpha1, a1, a1 + MUL alpha2, a2, a2 + MUL alpha2, a3, a3 + + ADD y0, a0, y0 + ldi A1, 2 * SIZE(A1) + ADD y1, a1, y1 + ldi A2, 2 * SIZE(A2) + ADD y0, a2, y0 + unop + ADD y1, a3, y1 + unop + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L30 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + + MUL alpha1, a0, a0 + MUL alpha2, a1, a1 + + ADD y0, a0, y0 + ADD y0, a1, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L30: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + mov A, A1 + MUL alpha, alpha1, alpha1 + mov Y, Y1 + + sra M, 3, I + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, a0 + MUL alpha1, a1, a1 + MUL alpha1, a2, a2 + MUL alpha1, a3, a3 + + ldi I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD y0, a0, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y1, a1, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y2, a2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y3, a3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, a7 + LD a3, 11 * SIZE(A1) + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + ST y2, 2 * SIZE(Y1) + ST y3, 3 * SIZE(Y1) + + ADD y4, a4, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, a0 + LD a4, 12 * SIZE(A1) + + ADD y5, a5, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, a1 + LD a5, 13 * SIZE(A1) + + ADD y6, a6, y6 
+ LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, a2 + LD a6, 14 * SIZE(A1) + + ADD y7, a7, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, a3 + LD a7, 15 * SIZE(A1) + + ST y4, 4 * SIZE(Y1) + ldi I, -1(I) + ST y5, 5 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + + ST y6, 6 * SIZE(Y1) + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + ST y7, 7 * SIZE(Y1) + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + bgt I, $L32 + .align 4 + +$L33: + ADD y0, a0, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, a4 + unop + + ADD y1, a1, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, a5 + unop + + ADD y2, a2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, a6 + unop + + ADD y3, a3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, a7 + unop + + ADD y4, a4, y4 + ST y0, 0 * SIZE(Y1) + ADD y5, a5, y5 + ST y1, 1 * SIZE(Y1) + ADD y6, a6, y6 + ST y2, 2 * SIZE(Y1) + ADD y7, a7, y7 + ST y3, 3 * SIZE(Y1) + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L35: + and M, 4, I + ble I, $L36 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, a0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, a1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, a2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, a3 + LD y3, 3 * SIZE(Y1) + + ADD y0, a0, y0 + ADD y1, a1, y1 + ADD y2, a2, y2 + ADD y3, a3, y3 + + ST y0, 0 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + ldi Y1, 4 * SIZE(Y1) + .align 4 + +$L36: + and M, 2, I + ble I, $L37 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + MUL alpha1, a0, a0 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a1, a1 + + ADD y0, a0, y0 + ADD y1, a1, y1 + + ST y0, 0 * SIZE(Y1) + ldi A1, 2 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + .align 4 + +$L37: + blbc M, $L990 + + LD y0, 0 * SIZE(Y1) + LD a0, 0 * SIZE(A1) + + MUL alpha1, a0, a0 + + ADD y0, a0, y0 + ST y0, 0 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 3, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a1, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a3, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a5, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a7, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + ADD a2, y2, a2 + ADD a3, y3, a3 + ADD a4, y4, a4 + ADD a5, y5, a5 + ADD a6, y6, a6 + ADD a7, y7, a7 + + ST a0, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a5, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a7, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + ldi Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 7, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + ldi Y, 1 * SIZE(Y) + + ADD a0, y0, a0 + + ST a0, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + bgt I, $L996 + .align 4 + 
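The code above completes the non-transposed GEMV kernel: $L11..$L18 process four columns of A at a time with the row loop unrolled by 8, 4, 2 and 1, and the $L990/$L996 blocks reconcile the result with a strided y through BUFFER. For orientation only, here is a minimal C reference sketch of the operation this kernel implements; the function name, argument list and plain loop nest are illustrative assumptions, not the kernel's unrolled register-level schedule.

/* Reference sketch (illustrative): y := y + alpha * A * x,
 * A column-major with leading dimension lda. The assembly blocks
 * columns by 4, unrolls rows by 8, and handles a strided y by
 * going through a contiguous buffer. */
static void gemv_n_ref(long m, long n, double alpha,
                       const double *a, long lda,
                       const double *x, long incx,
                       double *y, long incy)
{
    for (long j = 0; j < n; j++) {
        double t = alpha * x[j * incx];   /* alpha1..alpha4 in the kernel */
        const double *col = a + j * lda;
        for (long i = 0; i < m; i++)
            y[i * incy] += t * col[i];    /* MUL then ADD per element */
    }
}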
+$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S new file mode 100644 index 0000000..4d8f130 --- /dev/null +++ b/kernel/sw_64/gemv_t.S @@ -0,0 +1,1222 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 72 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 +#define f20 $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl X, 0 + STACKSIZE($sp) + ldl INCX, 8 + STACKSIZE($sp) + ldl Y, 16 + STACKSIZE($sp) + ldl INCY, 24 + STACKSIZE($sp) + ldl BUFFER, 32 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + fstd f20, 64($sp) + + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, SIZE, $0 + mov X, X1 + SXADDQ LDA, 0, LDA + bne $0, $L10 + + sra M, 3, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + ldw $31, (PREFETCHSIZE + 0) * SIZE(X1) + ldi I, -1(I) + + LD a0, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a1, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a2, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a3, 0 * SIZE(X1) + addl X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a5, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a6, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a7, 0 * SIZE(X1) + addl X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * SIZE(X1) + addl X1, INCX, X1 + ST a0, 0 * SIZE(Y1) + addl Y1, SIZE, Y1 + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 2, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addl A, LDA, A2 + fclr s1 + + addl A2, LDA, A3 + fclr s2 + addl A3, LDA, A4 + fclr s3 + + s4addl LDA, A, A + unop + mov X, X1 + flds $f31, 3 * SIZE(Y) + + sra M, 3, I + ble I, $L15 + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + LD a4, 1 * SIZE(A1) + LD a5, 1 * SIZE(A2) + LD a6, 1 * SIZE(A3) + LD a7, 1 * SIZE(A4) + LD a8, 2 * SIZE(A1) + LD a9, 2 * SIZE(A2) + LD a10, 2 * SIZE(A3) + LD a11, 2 * SIZE(A4) + LD a12, 3 * SIZE(A1) + LD a13, 3 * SIZE(A2) + LD a14, 3 * SIZE(A3) + LD a15, 3 * SIZE(A4) + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, f20 + fmov f20,s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, f20 + fmov 
f20, s3 + LD a0, 4 * SIZE(A1) + unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x0, 4 * SIZE(X1) + MUL x1, a4, t0 + + ADD s1, t1, f20 + fmov f20, s1 + LD a4, 5 * SIZE(A1) + ldi A1, 8 * SIZE(A1) + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + #unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + #unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, -2 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + ldi A2, 8 * SIZE(A2) + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + ldi A3, 8 * SIZE(A3) + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, -1 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldi A4, 8 * SIZE(A4) + MUL x3, a13, t1 + LD a13, -1 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + unop + MUL x3, a14, t2 + LD a14, -1 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + unop + MUL x3, a15, t3 + LD a15, -1 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 0 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE - 8) * SIZE(A3) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + unop + MUL x0, a2, t2 + LD a2, 0 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + unop + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x0, 8 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 1 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + unop + MUL x1, a5, t1 + LD a5, 1 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + unop + MUL x1, a6, t2 + LD a6, 1 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + unop + MUL x1, a7, t3 + LD a7, 1 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x1, 9 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 2 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE - 8) * SIZE(A4) + MUL x2, a9, t1 + LD a9, 2 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + ldi X1, 8 * SIZE(X1) + MUL x2, a10, t2 + LD a10, 2 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + ldi I, -1(I) + MUL x2, a11, t3 + LD a11, 2 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x2, 2 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 3 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE - 8) * SIZE(X1) + MUL x3, a13, t1 + LD a13, 3 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + unop + MUL x3, a14, t2 + LD a14, 3 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + MUL x3, a15, t3 + LD a15, 3 * SIZE(A4) + bgt I, $L12 + .align 4 + +$L13: + ADD s0, t0, f20 + fmov f20,s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 4 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + #unop + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + #unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + #unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, x0 + fmov x0,s0 + LD x0, 4 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + #unop + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + #unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + #unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 6 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + #unop + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + 
+ ADD s2, t2, f20 + fmov f20,s2 + #unop + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + #unop + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 7 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldi A1, 8 * SIZE(A1) + MUL x3, a13, t1 + LD a13, 7 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + ldi A2, 8 * SIZE(A2) + MUL x3, a14, t2 + LD a14, 7 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + ldi A3, 8 * SIZE(A3) + MUL x3, a15, t3 + LD a15, 7 * SIZE(A4) + + ADD s0, t0, f20 + fmov f20,s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + unop + + ADD s1, t1, f20 + fmov f20,s1 + ldi X1, 8 * SIZE(X1) + MUL x0, a1, t1 + ldi A4, 8 * SIZE(A4) + + ADD s2, t2, f20 + fmov f20,s2 + MUL x0, a2, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL x0, a3, t3 + + ADD s0, t0, f20 + fmov f20,s0 + MUL x1, a4, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x1, a5, t1 + + ADD s2, t2, f20 + fmov f20,s2 + MUL x1, a6, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL x1, a7, t3 + + ADD s0, t0, f20 + fmov f20,s0 + MUL x2, a8, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x2, a9, t1 + + ADD s2, t2, f20 + fmov f20,s2 + MUL x2, a10, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL x2, a11, t3 + + ADD s0, t0, f20 + fmov f20,s0 + MUL x3, a12, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x3, a13, t1 + + ADD s2, t2, f20 + fmov f20,s2 + MUL x3, a14, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL x3, a15, t3 + .align 4 + +$L15: + and M, 7, I + ble I, $L18 + + LD x0, 0 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + ldi I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD s0, t0,f20 + fmov f20,s0 + ldi A4, 1 * SIZE(A4) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldi A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 1 * SIZE(A2) + + ADD s2, t2, f20 + fmov f20,s2 + ldi A2, 1 * SIZE(A2) + MUL x0, a2, t2 + LD a2, 1 * SIZE(A3) + + ADD s3, t3, f20 + fmov f20,s3 + ldi A3, 1 * SIZE(A3) + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + LD x0, 1 * SIZE(X1) + ldi X1, 1 * SIZE(X1) + ldi I, -1(I) + bgt I, $L16 + .align 4 + +$L17: + ADD s0, t0,f20 + fmov f20,s0 + MUL x0, a0, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x0, a1, t1 + + ADD s2, t2, f20 + fmov f20,s2 + MUL x0, a2, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL x0, a3, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + addl Y, INCY, Y + LD a1, 0 * SIZE(Y) + addl Y, INCY, Y + LD a2, 0 * SIZE(Y) + addl Y, INCY, Y + LD a3, 0 * SIZE(Y) + addl Y, INCY, Y + + ADD s0, t0,f20 + fmov f20,s0 + ADD s1, t1, f20 + fmov f20,s1 + ADD s2, t2, f20 + fmov f20,s2 + ADD s3, t3, f20 + fmov f20,s3 + + MUL alpha, s0,f20 + fmov f20,s0 + MUL alpha, s1, f20 + fmov f20,s1 + MUL alpha, s2, f20 + fmov f20,s2 + MUL alpha, s3, f20 + fmov f20,s3 + + ADD a0, s0,f20 + fmov f20,a0 + fclr t0 + ADD a1, s1, f20 + fmov f20,a1 + fclr t1 + ADD a2, s2, f20 + fmov f20,a2 + fclr t2 + ADD a3, s3, f20 + fmov f20,a3 + fclr t3 + + ST a0, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + mov A, A1 + addl A, LDA, A2 + + addl A2, LDA, A + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 1 * SIZE(A1) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 2 * SIZE(A2) + LD a6, 3 * SIZE(A1) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 4 * SIZE(A2) + LD a10, 
5 * SIZE(A1) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 6 * SIZE(A2) + LD a14, 7 * SIZE(A1) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + ldi I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, x3 + fmov x3,s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 8 * SIZE(A2) + + ADD s0, t2, x0 + fmov x0,s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + LD a2, 9 * SIZE(A1) + + ADD s1, t3, f20 + fmov f20,s1 + #unop + MUL x1, a3, t3 + LD a3, 9 * SIZE(A2) + + ADD s0, t0, f20 + fmov f20,s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + LD a4, 10 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldi I, -1(I) + MUL x2, a5, t1 + LD a5, 10 * SIZE(A2) + + ADD s0, t2, f20 + fmov f20,s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + LD a6, 11 * SIZE(A1) + + ADD s1, t3, f20 + fmov f20,s1 + ldi X1, 8 * SIZE(X1) + MUL x3, a7, t3 + LD a7, 11 * SIZE(A2) + + ADD s0, t0, f20 + fmov f20,s0 + LD x3, -1 * SIZE(X1) + MUL x0, a8, t0 + LD a8, 12 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + LD a9, 12 * SIZE(A2) + + ADD s0, t0, f20 + fmov f20,s0 + LD x0, 0 * SIZE(X1) + MUL x1, a10, t0 + LD a10, 13 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + ldi A1, 8 * SIZE(A1) + MUL x1, a11, t1 + LD a11, 13 * SIZE(A2) + + ADD s0, t0, f20 + fmov f20,s0 + LD x1, 1 * SIZE(X1) + MUL x2, a12, t0 + LD a12, 6 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + MUL x2, a13, t1 + LD a13, 14 * SIZE(A2) + ldi A2, 8 * SIZE(A2) + + ADD s0, t0, f20 + fmov f20,s0 + LD x2, 2 * SIZE(X1) + MUL x3, a14, t0 + LD a14, 7 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + MUL x3, a15, t1 + LD a15, 7 * SIZE(A2) + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, f20 + fmov f20,s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + ldi A1, 8 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + unop + MUL x0, a1, t1 + unop + + ADD s0, t2, f20 + fmov f20,s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + ldi A2, 8 * SIZE(A2) + + ADD s1, t3, f20 + fmov f20,s1 + unop + MUL x1, a3, t3 + unop + + ADD s0, t0, f20 + fmov f20,s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + unop + + ADD s1, t1, f20 + fmov f20,s1 + unop + MUL x2, a5, t1 + unop + + ADD s0, t2, f20 + fmov f20,s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + unop + + ADD s1, t3, f20 + fmov f20,s1 + unop + MUL x3, a7, t3 + unop + + ADD s0, t0, f20 + fmov f20,s0 + LD x3, 7 * SIZE(X1) + MUL x0, a8, t0 + ldi X1, 8 * SIZE(X1) + + ADD s1, t1, f20 + fmov f20,s1 + unop + MUL x0, a9, t1 + unop + + ADD s0, t0, f20 + fmov f20,s0 + MUL x1, a10, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x1, a11, t1 + + ADD s0, t0, f20 + fmov f20,s0 + MUL x2, a12, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x2, a13, t1 + + ADD s0, t0, f20 + fmov f20,s0 + MUL x3, a14, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x3, a15, t1 + .align 4 + +$L25: + and M, 7, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD s0, t0,f20 + fmov f20,s0 + ldi A2, 1 * SIZE(A2) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1,f20 + fmov f20,s1 + ldi A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + LD x0, 1 * SIZE(X1) + ldi X1, 1 * SIZE(X1) + ldi I, -1(I) + bgt I, $L26 + .align 4 + +$L27: + ADD s0, t0, f20 + fmov f20,s0 + MUL x0, a0, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL x0, a1, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + addl Y, INCY, Y + LD a1, 0 * SIZE(Y) + addl Y, INCY, Y + + ADD s0, t0, 
f20 + fmov f20,s0 + ADD s1, t1, f20 + fmov f20,s1 + ADD s2, t2, f20 + fmov f20,s2 + ADD s3, t3, f20 + fmov f20,s3 + + ADD s0, s2, f20 + fmov f20,s0 + ADD s1, s3, f20 + fmov f20,s1 + + MUL alpha, s0, f20 + fmov f20,s0 + MUL alpha, s1,f20 + fmov f20,s1 + + ADD a0, s0, f20 + fmov f20,a0 + ADD a1, s1, f20 + fmov f20,a1 + + ST a0, 0 * SIZE(Y1) + fclr t0 + addl Y1, INCY, Y1 + fclr t1 + + ST a1, 0 * SIZE(Y1) + fclr t2 + addl Y1, INCY, Y1 + fclr t3 + .align 4 + +$L30: + blbc N, $L999 + + mov A, A1 + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a8, 0 * SIZE(X1) + LD a9, 1 * SIZE(X1) + + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a10, 2 * SIZE(X1) + LD a11, 3 * SIZE(X1) + + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a12, 4 * SIZE(X1) + LD a13, 5 * SIZE(X1) + + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + LD a14, 6 * SIZE(X1) + + ldi I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD s0, t0, f20 + fmov f20,s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, f20 + fmov f20,t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + LD a8, 8 * SIZE(X1) + MUL a1, a9, t1 + LD a1, 9 * SIZE(A1) + + ADD s2, t2, f20 + fmov f20,s2 + LD a9, 9 * SIZE(X1) + MUL a2, a10, t2 + LD a2, 10 * SIZE(A1) + + ADD s3, t3, f20 + fmov f20,s3 + LD a10, 10 * SIZE(X1) + MUL a3, a11, t3 + LD a3, 11 * SIZE(A1) + + ADD s0, t0, f20 + fmov f20,s0 + LD a11, 11 * SIZE(X1) + MUL a4, a12, t0 + LD a4, 12 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + LD a12, 12 * SIZE(X1) + MUL a5, a13, t1 + LD a5, 13 * SIZE(A1) + + ADD s2, t2, f20 + fmov f20,s2 + LD a13, 13 * SIZE(X1) + MUL a6, a14, t2 + LD a6, 14 * SIZE(A1) + + ADD s3, t3, f20 + fmov f20,s3 + LD a14, 14 * SIZE(X1) + MUL a7, a15, t3 + LD a7, 15 * SIZE(A1) + + ldi A1, 8 * SIZE(A1) + ldi I, -1(I) + ldi X1, 8 * SIZE(X1) + bgt I, $L32 + .align 4 + +$L33: + ADD s0, t0, f20 + fmov f20,s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + ldi A1, 8 * SIZE(A1) + + ADD s1, t1, f20 + fmov f20,s1 + unop + MUL a1, a9, t1 + ldi X1, 8 * SIZE(X1) + + ADD s2, t2, f20 + fmov f20,s2 + MUL a2, a10, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL a3, a11, t3 + + ADD s0, t0, f20 + fmov f20,s0 + MUL a4, a12, t0 + ADD s1, t1, f20 + fmov f20,s1 + MUL a5, a13, t1 + + ADD s2, t2, f20 + fmov f20,s2 + MUL a6, a14, t2 + ADD s3, t3, f20 + fmov f20,s3 + MUL a7, a15, t3 + .align 4 + +$L35: + and M, 7, I + ble I, $L38 + + LD a0, 0 * SIZE(A1) + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L37 + .align 4 + +$L36: + ADD s0, t0,f20 + fmov f20,s0 + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + LD x0, 1 * SIZE(X1) + + ldi A1, 1 * SIZE(A1) + ldi X1, 1 * SIZE(X1) + ldi I, -1(I) + bgt I, $L36 + .align 4 + +$L37: + ADD s0, t0,f20 + fmov f20,s0 + MUL x0, a0, t0 + .align 4 + +$L38: + LD a0, 0 * SIZE(Y) + + ADD s0, t0,f20 + fmov f20,s0 + ADD s1, t1, f20 + fmov f20,s1 + ADD s2, t2, f20 + fmov f20,s2 + ADD s3, t3, f20 + fmov f20,s3 + + ADD s0, s2, f20 + fmov f20,s0 + ADD s1, s3, f20 + fmov f20,s1 + ADD s0, s1, f20 + fmov f20,s0 + + MUL alpha, s0, f20 + fmov f20,s0 + ADD a0, s0, f20 + fmov f20,a0 + + ST a0, 0 * SIZE(Y1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + fldd f20, 64($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/gemv_t.S.bak b/kernel/sw_64/gemv_t.S.bak new file mode 100644 index 0000000..068e463 --- /dev/null +++ b/kernel/sw_64/gemv_t.S.bak @@ -0,0 +1,1061 @@ 
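The gemv_t.S kernel above computes the transposed product y := y + alpha * A^T * x as one dot product per column of A: x is packed into BUFFER when INCX is not unit stride, columns are processed four, two and one at a time, and each column in a block carries its own running sum (s0..s3) updated through pipelined products (t0..t3). Compared with the gemv_t.S.bak file that follows, the systematic difference appears to be that every accumulator ADD is routed through the scratch register $f20 (ADD ..., f20 followed by fmov f20, ...), with STACKSIZE raised from 64 to 72 to save that register. As orientation only, a minimal C sketch of the operation; the function name and plain loop nest are illustrative assumptions, not the kernel's software-pipelined schedule.

/* Reference sketch (illustrative): y := y + alpha * A^T * x,
 * i.e. one dot product of x against each column of A. The assembly
 * packs x contiguously first and keeps one running sum per column
 * of the 4-column block (s0..s3, updated via t0..t3). */
static void gemv_t_ref(long m, long n, double alpha,
                       const double *a, long lda,
                       const double *x, long incx,
                       double *y, long incy)
{
    for (long j = 0; j < n; j++) {
        const double *col = a + j * lda;
        double s = 0.0;                    /* accumulator, s0 in the kernel */
        for (long i = 0; i < m; i++)
            s += col[i] * x[i * incx];     /* MUL x, a, t ; ADD s, t, s */
        y[j * incy] += alpha * s;          /* MUL alpha, s ; ADD y, s */
    }
}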
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl X, 0 + STACKSIZE($sp) + ldl INCX, 8 + STACKSIZE($sp) + ldl Y, 16 + STACKSIZE($sp) + ldl INCY, 24 + STACKSIZE($sp) + ldl BUFFER, 32 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, SIZE, $0 + mov X, X1 + SXADDQ LDA, 0, LDA + bne $0, $L10 + + sra M, 3, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + fillcs (PREFETCHSIZE + 0) * SIZE(X1) + ldi I, -1(I) + + LD a0, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a1, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a2, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a3, 0 * SIZE(X1) + addl X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a5, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a6, 0 * SIZE(X1) + addl X1, INCX, X1 + LD a7, 0 * SIZE(X1) + addl X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * SIZE(X1) + addl X1, INCX, X1 + ST a0, 0 * SIZE(Y1) + addl Y1, SIZE, Y1 + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 2, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addl A, LDA, A2 + fclr s1 + + addl A2, LDA, A3 + fclr s2 + addl A3, LDA, A4 + fclr s3 + + s4addl LDA, A, A + unop + mov X, X1 + fillcs 3 * SIZE(Y) + + sra M, 3, I + ble I, $L15 + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + LD a4, 1 * SIZE(A1) + LD a5, 1 * SIZE(A2) + LD a6, 1 * SIZE(A3) + LD a7, 1 * SIZE(A4) + LD a8, 2 * SIZE(A1) + LD a9, 2 * SIZE(A2) + LD a10, 2 * SIZE(A3) + LD a11, 2 * SIZE(A4) + LD a12, 3 * SIZE(A1) + LD a13, 3 * SIZE(A2) + LD a14, 3 * SIZE(A3) + LD a15, 3 * SIZE(A4) + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 4 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 4 
* SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, s1 + ldi A1, 8 * SIZE(A1) + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, -2 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, s2 + ldi A2, 8 * SIZE(A2) + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, s3 + ldi A3, 8 * SIZE(A3) + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, -1 * SIZE(A1) + + ADD s1, t1, s1 + ldi A4, 8 * SIZE(A4) + MUL x3, a13, t1 + LD a13, -1 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x3, a14, t2 + LD a14, -1 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x3, a15, t3 + LD a15, -1 * SIZE(A4) + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 0 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE - 8) * SIZE(A3) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 0 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 8 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 1 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x1, a5, t1 + LD a5, 1 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 1 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 1 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 9 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 2 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE - 8) * SIZE(A4) + MUL x2, a9, t1 + LD a9, 2 * SIZE(A2) + + ADD s2, t2, s2 + ldi X1, 8 * SIZE(X1) + MUL x2, a10, t2 + LD a10, 2 * SIZE(A3) + + ADD s3, t3, s3 + ldi I, -1(I) + MUL x2, a11, t3 + LD a11, 2 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 2 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 3 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE - 8) * SIZE(X1) + MUL x3, a13, t1 + LD a13, 3 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x3, a14, t2 + LD a14, 3 * SIZE(A3) + + ADD s3, t3, s3 + MUL x3, a15, t3 + LD a15, 3 * SIZE(A4) + bgt I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 4 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 6 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 7 * SIZE(A1) + + ADD s1, t1, s1 + ldi A1, 8 * SIZE(A1) + MUL x3, a13, t1 + LD a13, 7 * SIZE(A2) + + ADD s2, t2, s2 + ldi A2, 8 * SIZE(A2) + MUL x3, a14, t2 + LD a14, 7 * SIZE(A3) + + ADD s3, t3, s3 + ldi A3, 8 * SIZE(A3) + MUL x3, a15, t3 + LD a15, 7 * SIZE(A4) + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + unop + + ADD s1, t1, s1 + ldi X1, 8 * SIZE(X1) + MUL x0, a1, t1 + ldi A4, 8 * SIZE(A4) + + ADD s2, t2, s2 + MUL x0, a2, t2 + ADD s3, t3, s3 + MUL x0, a3, t3 + + ADD s0, 
t0, s0 + MUL x1, a4, t0 + ADD s1, t1, s1 + MUL x1, a5, t1 + + ADD s2, t2, s2 + MUL x1, a6, t2 + ADD s3, t3, s3 + MUL x1, a7, t3 + + ADD s0, t0, s0 + MUL x2, a8, t0 + ADD s1, t1, s1 + MUL x2, a9, t1 + + ADD s2, t2, s2 + MUL x2, a10, t2 + ADD s3, t3, s3 + MUL x2, a11, t3 + + ADD s0, t0, s0 + MUL x3, a12, t0 + ADD s1, t1, s1 + MUL x3, a13, t1 + + ADD s2, t2, s2 + MUL x3, a14, t2 + ADD s3, t3, s3 + MUL x3, a15, t3 + .align 4 + +$L15: + and M, 7, I + ble I, $L18 + + LD x0, 0 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + ldi I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD s0, t0, s0 + ldi A4, 1 * SIZE(A4) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, s1 + ldi A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 1 * SIZE(A2) + + ADD s2, t2, s2 + ldi A2, 1 * SIZE(A2) + MUL x0, a2, t2 + LD a2, 1 * SIZE(A3) + + ADD s3, t3, s3 + ldi A3, 1 * SIZE(A3) + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + LD x0, 1 * SIZE(X1) + ldi X1, 1 * SIZE(X1) + ldi I, -1(I) + bgt I, $L16 + .align 4 + +$L17: + ADD s0, t0, s0 + MUL x0, a0, t0 + ADD s1, t1, s1 + MUL x0, a1, t1 + + ADD s2, t2, s2 + MUL x0, a2, t2 + ADD s3, t3, s3 + MUL x0, a3, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + addl Y, INCY, Y + LD a1, 0 * SIZE(Y) + addl Y, INCY, Y + LD a2, 0 * SIZE(Y) + addl Y, INCY, Y + LD a3, 0 * SIZE(Y) + addl Y, INCY, Y + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + MUL alpha, s0, s0 + MUL alpha, s1, s1 + MUL alpha, s2, s2 + MUL alpha, s3, s3 + + ADD a0, s0, a0 + fclr t0 + ADD a1, s1, a1 + fclr t1 + ADD a2, s2, a2 + fclr t2 + ADD a3, s3, a3 + fclr t3 + + ST a0, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + mov A, A1 + addl A, LDA, A2 + + addl A2, LDA, A + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 1 * SIZE(A1) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 2 * SIZE(A2) + LD a6, 3 * SIZE(A1) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 4 * SIZE(A2) + LD a10, 5 * SIZE(A1) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 6 * SIZE(A2) + LD a14, 7 * SIZE(A1) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + ldi I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 8 * SIZE(A2) + + ADD s0, t2, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + LD a2, 9 * SIZE(A1) + + ADD s1, t3, s1 + unop + MUL x1, a3, t3 + LD a3, 9 * SIZE(A2) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + LD a4, 10 * SIZE(A1) + + ADD s1, t1, s1 + ldi I, -1(I) + MUL x2, a5, t1 + LD a5, 10 * SIZE(A2) + + ADD s0, t2, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + LD a6, 11 * SIZE(A1) + + ADD s1, t3, s1 + ldi X1, 8 * SIZE(X1) + MUL x3, a7, t3 + LD a7, 11 * SIZE(A2) + + ADD s0, t0, s0 + LD x3, -1 * SIZE(X1) + MUL x0, a8, t0 + LD a8, 12 * SIZE(A1) + + ADD s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + LD a9, 12 * SIZE(A2) + + ADD s0, t0, s0 + LD x0, 0 * SIZE(X1) + MUL x1, a10, t0 + LD a10, 13 * SIZE(A1) + + ADD s1, t1, s1 + ldi A1, 8 * SIZE(A1) + MUL x1, a11, t1 + LD a11, 13 * SIZE(A2) + + ADD s0, t0, s0 + LD x1, 1 * SIZE(X1) + MUL x2, a12, t0 + LD a12, 6 * SIZE(A1) + + 
ADD s1, t1, s1 + MUL x2, a13, t1 + LD a13, 14 * SIZE(A2) + ldi A2, 8 * SIZE(A2) + + ADD s0, t0, s0 + LD x2, 2 * SIZE(X1) + MUL x3, a14, t0 + LD a14, 7 * SIZE(A1) + + ADD s1, t1, s1 + MUL x3, a15, t1 + LD a15, 7 * SIZE(A2) + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + ldi A1, 8 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD s0, t2, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + ldi A2, 8 * SIZE(A2) + + ADD s1, t3, s1 + unop + MUL x1, a3, t3 + unop + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + unop + + ADD s1, t1, s1 + unop + MUL x2, a5, t1 + unop + + ADD s0, t2, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + unop + + ADD s1, t3, s1 + unop + MUL x3, a7, t3 + unop + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a8, t0 + ldi X1, 8 * SIZE(X1) + + ADD s1, t1, s1 + unop + MUL x0, a9, t1 + unop + + ADD s0, t0, s0 + MUL x1, a10, t0 + ADD s1, t1, s1 + MUL x1, a11, t1 + + ADD s0, t0, s0 + MUL x2, a12, t0 + ADD s1, t1, s1 + MUL x2, a13, t1 + + ADD s0, t0, s0 + MUL x3, a14, t0 + ADD s1, t1, s1 + MUL x3, a15, t1 + .align 4 + +$L25: + and M, 7, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD s0, t0, s0 + ldi A2, 1 * SIZE(A2) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, s1 + ldi A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + LD x0, 1 * SIZE(X1) + ldi X1, 1 * SIZE(X1) + ldi I, -1(I) + bgt I, $L26 + .align 4 + +$L27: + ADD s0, t0, s0 + MUL x0, a0, t0 + ADD s1, t1, s1 + MUL x0, a1, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + addl Y, INCY, Y + LD a1, 0 * SIZE(Y) + addl Y, INCY, Y + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + + MUL alpha, s0, s0 + MUL alpha, s1, s1 + + ADD a0, s0, a0 + ADD a1, s1, a1 + + ST a0, 0 * SIZE(Y1) + fclr t0 + addl Y1, INCY, Y1 + fclr t1 + + ST a1, 0 * SIZE(Y1) + fclr t2 + addl Y1, INCY, Y1 + fclr t3 + .align 4 + +$L30: + blbc N, $L999 + + mov A, A1 + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a8, 0 * SIZE(X1) + LD a9, 1 * SIZE(X1) + + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a10, 2 * SIZE(X1) + LD a11, 3 * SIZE(X1) + + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a12, 4 * SIZE(X1) + LD a13, 5 * SIZE(X1) + + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + LD a14, 6 * SIZE(X1) + + ldi I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD s0, t0, s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, s1 + LD a8, 8 * SIZE(X1) + MUL a1, a9, t1 + LD a1, 9 * SIZE(A1) + + ADD s2, t2, s2 + LD a9, 9 * SIZE(X1) + MUL a2, a10, t2 + LD a2, 10 * SIZE(A1) + + ADD s3, t3, s3 + LD a10, 10 * SIZE(X1) + MUL a3, a11, t3 + LD a3, 11 * SIZE(A1) + + ADD s0, t0, s0 + LD a11, 11 * SIZE(X1) + MUL a4, a12, t0 + LD a4, 12 * SIZE(A1) + + ADD s1, t1, s1 + LD a12, 12 * SIZE(X1) + MUL a5, a13, t1 + LD a5, 13 * SIZE(A1) + + ADD s2, t2, s2 + LD a13, 13 * SIZE(X1) + MUL a6, a14, t2 + LD a6, 14 * SIZE(A1) + + ADD s3, t3, s3 + LD a14, 14 * SIZE(X1) + MUL a7, a15, t3 + LD a7, 15 * SIZE(A1) + + ldi A1, 8 * SIZE(A1) + ldi I, -1(I) + ldi X1, 8 * SIZE(X1) + bgt I, $L32 + .align 4 + +$L33: + ADD s0, t0, s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + ldi A1, 8 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL a1, a9, t1 + ldi X1, 8 * SIZE(X1) + + ADD s2, t2, s2 + MUL a2, a10, t2 + ADD s3, t3, s3 + MUL a3, a11, t3 + + ADD s0, t0, s0 + MUL a4, a12, t0 + ADD s1, t1, s1 + 
MUL a5, a13, t1 + + ADD s2, t2, s2 + MUL a6, a14, t2 + ADD s3, t3, s3 + MUL a7, a15, t3 + .align 4 + +$L35: + and M, 7, I + ble I, $L38 + + LD a0, 0 * SIZE(A1) + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L37 + .align 4 + +$L36: + ADD s0, t0, s0 + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + LD x0, 1 * SIZE(X1) + + ldi A1, 1 * SIZE(A1) + ldi X1, 1 * SIZE(X1) + ldi I, -1(I) + bgt I, $L36 + .align 4 + +$L37: + ADD s0, t0, s0 + MUL x0, a0, t0 + .align 4 + +$L38: + LD a0, 0 * SIZE(Y) + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + ADD s0, s1, s0 + + MUL alpha, s0, s0 + ADD a0, s0, a0 + + ST a0, 0 * SIZE(Y1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S new file mode 100644 index 0000000..f3b2909 --- /dev/null +++ b/kernel/sw_64/iamax.S @@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
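The iamax.S kernel introduced here (and the vectorized iamax_simd.S further down) returns the 1-based index of the element with the largest absolute value, or the smallest when USE_MIN is defined (the CMPLT macro swaps its operands). Both versions work in two passes: the first pass reduces unrolled partial extrema with CMPLT/fselne (vfcmplt/vfseleq plus a vextf reduction in the SIMD variant), and the second pass rescans the vector with fcmpeq, counting elements until the extreme value is met again. A minimal C sketch of that contract, for orientation; the function name, include and plain loops are illustrative assumptions.

#include <math.h>

/* Reference sketch (illustrative) of the default IAMAX case: 1-based index
 * of the first element whose absolute value equals the maximum; returns 0
 * for n <= 0 or incx <= 0, matching the early-return checks in the assembly. */
static long iamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    double best = fabs(x[0]);
    for (long i = 1; i < n; i++) {          /* pass 1: find the extreme value */
        double v = fabs(x[i * incx]);
        if (v > best) best = v;             /* CMPLT + fselne in the kernel */
    }
    for (long i = 0; i < n; i++)            /* pass 2: recover its index */
        if (fabs(x[i * incx]) == best)
            return i + 1;
    return 0;
}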
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + ldi $sp, -STACKSIZE($sp) + mov X, XX + .align 4 + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + fstd $f6, 32($sp) + fclr $f0 + sra N, 3, $1 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + unop + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addl X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addl X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addl X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addl X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addl X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addl X, INCX, X + ldi $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fselne $f16, $f12, $f4, $f4 + unop + fabs $f20, $f29 + fillcs 56 * SIZE(X) + + fselne $f17, $f13, $f5, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addl X, INCX, X + + fselne $f18, $f14, $f6, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + fselne $f19, $f15, $f28, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addl X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addl X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addl X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addl X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addl X, INCX, X + + fselne $f16, $f29, $f0, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addl X, INCX, X + + fselne $f17, $f30, $f1, $f1 + unop + CMPLT($f5, $f13), $f17 + ldi $1, -1($1) # i -- + + fselne $f18, $f10, $f2, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fselne $f19, $f11, $f3, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + fselne $f16, $f12, $f4, $f4 + fabs $f20, $f29 + fselne $f17, $f13, $f5, $f5 + fabs $f21, $f30 + + fselne $f18, $f14, $f6, $f6 + fabs $f22, $f10 + fselne $f19, $f15, $f28, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs $f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fselne $f16, $f29, $f0, $f0 + CMPLT($f4, $f12), $f16 + fselne $f17, $f30, $f1, $f1 + CMPLT($f5, $f13), $f17 + + fselne $f18, $f10, $f2, $f2 + CMPLT($f6, $f14), $f18 + fselne $f19, $f11, $f3, $f3 + CMPLT($f28, $f15), $f19 + + fselne $f16, $f12, $f4, $f4 + CMPLT($f0, $f1), $f16 + fselne $f17, $f13, $f5, $f5 + CMPLT($f2, $f3), $f17 + + fselne $f18, $f14, $f6, $f6 + CMPLT($f4, $f5), $f18 + fselne $f19, $f15, $f28, $f28 + CMPLT($f6, $f28), $f19 + + fselne $f16, $f1, $f0, $f0 + fselne $f17, $f3, $f2, $f2 + fselne $f18, $f5, $f4, $f4 + fselne $f19, $f28, $f6, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fselne $f16, 
$f2, $f0, $f0 + fselne $f17, $f6, $f4, $f4 + + CMPLT($f0, $f4), $f16 + fselne $f16, $f4, $f0, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fselne $f16, $f29, $f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + ldi $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + addl XX, INCX, XX + fcmpeq $f0, $f18, $f2 + + LD $f11, 0 * SIZE(XX) + fabs $f15, $f23 + addl XX, INCX, XX + fcmpeq $f0, $f19, $f3 + + LD $f12, 0 * SIZE(XX) + fabs $f16, $f24 + addl XX, INCX, XX + fcmpeq $f0, $f20, $f4 + + LD $f13, 0 * SIZE(XX) + fabs $f17, $f25 + addl XX, INCX, XX + fcmpeq $f0, $f21, $f5 + + LD $f14, 0 * SIZE(XX) + ldi $1, -1($1) # i -- + fcmpeq $f0, $f22, $f26 + addl XX, INCX, XX + + ldi $0, 1($0) + fbne $f2, $End + + LD $f15, 0 * SIZE(XX) + fcmpeq $f0, $f23, $f27 + ldi $0, 1($0) + fbne $f3, $End + + addl XX, INCX, XX + fcmpeq $f0, $f24, $f28 + ldi $0, 1($0) + fbne $f4, $End + + LD $f16, 0 * SIZE(XX) + fcmpeq $f0, $f25, $f29 + ldi $0, 1($0) + fbne $f5, $End + + addl XX, INCX, XX + ldi $0, 1($0) + fabs $f10, $f18 + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + ldi $0, 1($0) + fabs $f11, $f19 + fbne $f27, $End + + addl XX, INCX, XX + ldi $0, 1($0) + fabs $f12, $f20 + fbne $f28, $End + + ldi $0, 1($0) + fabs $f13, $f21 + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fcmpeq $f0, $f18, $f2 + fabs $f15, $f23 + fcmpeq $f0, $f19, $f3 + + fabs $f16, $f24 + fcmpeq $f0, $f20, $f4 + fabs $f17, $f25 + fcmpeq $f0, $f21, $f5 + + fcmpeq $f0, $f22, $f26 + ldi $0, 1($0) + unop + fbne $f2, $End + + fcmpeq $f0, $f23, $f27 + ldi $0, 1($0) + unop + fbne $f3, $End + + fcmpeq $f0, $f24, $f28 + ldi $0, 1($0) + unop + fbne $f4, $End + + fcmpeq $f0, $f25, $f29 + ldi $0, 1($0) + unop + fbne $f5, $End + + ldi $0, 1($0) + fbne $f26, $End + ldi $0, 1($0) + fbne $f27, $End + ldi $0, 1($0) + fbne $f28, $End + ldi $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addl XX, INCX, XX + + fabs $f20, $f25 + fcmpeq $f0, $f25, $f29 + + ldi $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/iamax_simd.S b/kernel/sw_64/iamax_simd.S new file mode 100644 index 0000000..c7c6c27 --- /dev/null +++ b/kernel/sw_64/iamax_simd.S @@ -0,0 +1,732 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 96 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 +#define I $1 +#define NN $22 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#ifndef USE_MIN +#define VCMPLT(a, b) vfcmplt a, b +#else +#define VCMPLT(a, b) vfcmplt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + ldi $sp, -STACKSIZE($sp) + mov X, XX + mov N, NN + .align 4 + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + fstd $f6, 32($sp) + fclr $f0 + unop + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + cmpeq INCX, SIZE, $3 + beq $3, $Sub + .align 4 + + +/** + test the address of Y +**/ + + and X, (VEC_LEN*SIZE-1), $3 + LD $f10, 0*SIZE(X) + fabs $f10, $f0 # init temp max/min result value + beq $3, $Align_Access + .align 4 +/** + process the unalign address of X +**/ + +/*if N is too small(less then unroll size), don't need process unalign X. Just jump to remain section.*/ + sra NN, 4, I + and NN, 15, $3 + ble I, $Remain + nop + + sra $3, BASE_SHIFT, $3 + ldi $2, VEC_LEN + subl $2, $3, $3 + nop +$UnAlign_Y_Loop: + LD $f10, 0*SIZE(X) + addl X, SIZE, X + fabs $f10, $f29 + CMPLT($f0, $f29), $f16 + + fseleq $f16, $f0, $f29, $f0 + subl $3, 1, $3 + subl NN, 1, NN + bgt $3, $UnAlign_Y_Loop + .align 4 + + +$Align_Access: +/*search max or min. 
Unloop 16 */ + sra NN, 4, I + and NN, 15, $3 + ble I, $Remain + nop + + VLD $f10, 0*VEC_LEN*SIZE(X) + VLD $f11, 1*VEC_LEN*SIZE(X) + VLD $f12, 2*VEC_LEN*SIZE(X) + VLD $f13, 3*VEC_LEN*SIZE(X) + + /*vfabs*/ + vcpys $f31, $f10, $f22 + vcpys $f31, $f11, $f23 + vcpys $f31, $f12, $f24 + vcpys $f31, $f13, $f25 + + vcpyf $f0, $f0 + vcpys $f22, $f22, $f1 # copy $f22 -> $f1 + vcpys $f22, $f22, $f2 + vcpys $f22, $f22, $f3 + + subl I, 1, I + addl X, 16*SIZE, X + nop + ble I, $MainLoopEnd + .align 4 +$MainLoop: + + fillcs PREFETCHSIZE * SIZE(X) + VCMPLT($f0, $f22), $f26 + subl I, 1, I + VCMPLT($f1, $f23), $f27 + + VLD $f10, 0*VEC_LEN*SIZE(X) + VLD $f11, 1*VEC_LEN*SIZE(X) + VLD $f12, 2*VEC_LEN*SIZE(X) + VLD $f13, 3*VEC_LEN*SIZE(X) + + VCMPLT($f2, $f24), $f28 + addl X, 16 * SIZE, X + nop + VCMPLT($f3, $f25), $f29 + + vfseleq $f26, $f0, $f22, $f0 + vfseleq $f27, $f1, $f23, $f1 + vfseleq $f28, $f2, $f24, $f2 + vfseleq $f29, $f3, $f25, $f3 + + vcpys $f31, $f10, $f22 + vcpys $f31, $f11, $f23 + vcpys $f31, $f12, $f24 + vcpys $f31, $f13, $f25 + + bne I, $MainLoop + .align 4 + +$MainLoopEnd: + VCMPLT($f0, $f22), $f26 + VCMPLT($f1, $f23), $f27 + VCMPLT($f2, $f24), $f28 + VCMPLT($f3, $f25), $f29 + + vfseleq $f26, $f0, $f22, $f0 + vfseleq $f27, $f1, $f23, $f1 + vfseleq $f28, $f2, $f24, $f2 + vfseleq $f29, $f3, $f25, $f3 + + /*find the max or min among f0, f1 ,f2 and f3*/ + VCMPLT($f0, $f1), $f26 + VCMPLT($f2, $f3), $f27 + vfseleq $f26, $f0, $f1, $f0 + vfseleq $f27, $f2, $f3, $f2 + + VCMPLT($f0, $f2), $f26 + vfseleq $f26, $f0, $f2, $f0 + vextf $f0, 1, $f22 + vextf $f0, 2, $f23 + + vextf $f0, 3, $f24 + CMPLT($f0, $f22), $f16 + CMPLT($f23, $f24), $f17 + fseleq $f16, $f0, $f22, $f0 + + fseleq $f17, $f23, $f24, $f23 + CMPLT($f0, $f23), $f18 + fseleq $f18, $f0, $f23, $f0 + nop +$Remain: + ble $3, $Continuous_FindIndex + .align 4 +$RemainLoop: + LD $f20, 0 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fseleq $f16, $f0, $f29, $f0 + + subl $3, 1, $3 + bgt $3, $RemainLoop + .align 4 + /*find index*/ +$Continuous_FindIndex: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + LD $f12, 2 * SIZE(XX) + LD $f13, 3 * SIZE(XX) + + + LD $f14, 4 * SIZE(XX) + LD $f15, 5 * SIZE(XX) + LD $f16, 6 * SIZE(XX) + LD $f17, 7 * SIZE(XX) + + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + addl XX, 8*SIZE, XX + ldi $1, -1($1) + ble $1, $Continuous_FindIndex_Loop + .align 4 + +$Continuous_FindIndex_Loop: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + LD $f11, 1 * SIZE(XX) + fcmpeq $f0, $f18, $f2 + + LD $f12, 2 * SIZE(XX) + fabs $f15, $f23 + LD $f13, 3 * SIZE(XX) + fcmpeq $f0, $f19, $f3 + + LD $f14, 4 * SIZE(XX) + fabs $f16, $f24 + ldi $1, -1($1) # i -- + fcmpeq $f0, $f20, $f4 + + LD $f15, 5 * SIZE(XX) + fabs $f17, $f25 + fcmpeq $f0, $f21, $f5 + fillcs PREFETCHSIZE * SIZE(X) + + LD $f16, 6 * SIZE(XX) + fcmpeq $f0, $f22, $f26 + ldi $0, 1($0) + fbne $f2, $End + + LD $f17, 7 * SIZE(XX) + fcmpeq $f0, $f23, $f27 + ldi $0, 1($0) + fbne $f3, $End + + addl XX, 8*SIZE, XX + fcmpeq $f0, $f24, $f28 + ldi $0, 1($0) + fbne $f4, $End + + fcmpeq $f0, $f25, $f29 + ldi $0, 1($0) + nop + fbne $f5, $End + + ldi $0, 1($0) + fabs $f10, $f18 + nop + fbne $f26, $End + + ldi $0, 1($0) + fabs $f11, $f19 + nop + fbne $f27, $End + + ldi $0, 1($0) + fabs $f12, $f20 + nop + fbne $f28, $End + + ldi $0, 1($0) + fabs $f13, $f21 + fbne $f29, $End + bgt $1, $Continuous_FindIndex_Loop + .align 4 + +$Continuous_FindIndex_LoopEnd: + fabs $f14, $f22 + fcmpeq $f0, $f18, $f2 + fabs $f15, $f23 + 
fcmpeq $f0, $f19, $f3 + + fabs $f16, $f24 + fcmpeq $f0, $f20, $f4 + fabs $f17, $f25 + fcmpeq $f0, $f21, $f5 + + fcmpeq $f0, $f22, $f26 + ldi $0, 1($0) + unop + fbne $f2, $End + + fcmpeq $f0, $f23, $f27 + ldi $0, 1($0) + unop + fbne $f3, $End + + fcmpeq $f0, $f24, $f28 + ldi $0, 1($0) + unop + fbne $f4, $End + + fcmpeq $f0, $f25, $f29 + ldi $0, 1($0) + unop + fbne $f5, $End + + ldi $0, 1($0) + fbne $f26, $End + ldi $0, 1($0) + fbne $f27, $End + ldi $0, 1($0) + fbne $f28, $End + ldi $0, 1($0) + fbne $f29, $End + .align 4 + + jmp $L40 + .align 4 +$Sub: + sra N, 3, $1 + LD $f20, 0 * SIZE(X) + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addl X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addl X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addl X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addl X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addl X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addl X, INCX, X + ldi $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fselne $f16, $f12, $f4, $f4 + unop + fabs $f20, $f29 + fillcs 56 * SIZE(X) + + fselne $f17, $f13, $f5, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addl X, INCX, X + + fselne $f18, $f14, $f6, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + fselne $f19, $f15, $f28, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addl X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addl X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addl X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addl X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addl X, INCX, X + + fselne $f16, $f29, $f0, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addl X, INCX, X + + fselne $f17, $f30, $f1, $f1 + unop + CMPLT($f5, $f13), $f17 + ldi $1, -1($1) # i -- + + fselne $f18, $f10, $f2, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fselne $f19, $f11, $f3, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + fselne $f16, $f12, $f4, $f4 + fabs $f20, $f29 + fselne $f17, $f13, $f5, $f5 + fabs $f21, $f30 + + fselne $f18, $f14, $f6, $f6 + fabs $f22, $f10 + fselne $f19, $f15, $f28, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs $f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fselne $f16, $f29, $f0, $f0 + CMPLT($f4, $f12), $f16 + fselne $f17, $f30, $f1, $f1 + CMPLT($f5, $f13), $f17 + + fselne $f18, $f10, $f2, $f2 + CMPLT($f6, $f14), $f18 + fselne $f19, $f11, $f3, $f3 + CMPLT($f28, $f15), $f19 + + fselne $f16, $f12, $f4, $f4 + CMPLT($f0, $f1), $f16 + fselne $f17, $f13, $f5, $f5 + CMPLT($f2, $f3), $f17 + + fselne $f18, $f14, $f6, $f6 + CMPLT($f4, $f5), $f18 + fselne $f19, $f15, $f28, $f28 + CMPLT($f6, $f28), $f19 + + fselne $f16, $f1, $f0, $f0 + fselne $f17, $f3, $f2, $f2 + fselne $f18, $f5, $f4, $f4 + fselne $f19, $f28, $f6, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fselne $f16, $f2, $f0, $f0 + fselne $f17, $f6, $f4, $f4 + + CMPLT($f0, $f4), $f16 + fselne $f16, $f4, $f0, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fselne $f16, $f29, $f0, $f0 + + ldi $1, 
-1($1) # i -- + bgt $1, $L16 + .align 4 + +/* + find the index +*/ +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addl XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + ldi $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + addl XX, INCX, XX + fcmpeq $f0, $f18, $f2 + + LD $f11, 0 * SIZE(XX) + fabs $f15, $f23 + addl XX, INCX, XX + fcmpeq $f0, $f19, $f3 + + LD $f12, 0 * SIZE(XX) + fabs $f16, $f24 + addl XX, INCX, XX + fcmpeq $f0, $f20, $f4 + + LD $f13, 0 * SIZE(XX) + fabs $f17, $f25 + addl XX, INCX, XX + fcmpeq $f0, $f21, $f5 + + LD $f14, 0 * SIZE(XX) + ldi $1, -1($1) # i -- + fcmpeq $f0, $f22, $f26 + addl XX, INCX, XX + + ldi $0, 1($0) + fbne $f2, $End + + LD $f15, 0 * SIZE(XX) + fcmpeq $f0, $f23, $f27 + ldi $0, 1($0) + fbne $f3, $End + + addl XX, INCX, XX + fcmpeq $f0, $f24, $f28 + ldi $0, 1($0) + fbne $f4, $End + + LD $f16, 0 * SIZE(XX) + fcmpeq $f0, $f25, $f29 + ldi $0, 1($0) + fbne $f5, $End + + addl XX, INCX, XX + ldi $0, 1($0) + fabs $f10, $f18 + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + ldi $0, 1($0) + fabs $f11, $f19 + fbne $f27, $End + + addl XX, INCX, XX + ldi $0, 1($0) + fabs $f12, $f20 + fbne $f28, $End + + ldi $0, 1($0) + fabs $f13, $f21 + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fcmpeq $f0, $f18, $f2 + fabs $f15, $f23 + fcmpeq $f0, $f19, $f3 + + fabs $f16, $f24 + fcmpeq $f0, $f20, $f4 + fabs $f17, $f25 + fcmpeq $f0, $f21, $f5 + + fcmpeq $f0, $f22, $f26 + ldi $0, 1($0) + unop + fbne $f2, $End + + fcmpeq $f0, $f23, $f27 + ldi $0, 1($0) + unop + fbne $f3, $End + + fcmpeq $f0, $f24, $f28 + ldi $0, 1($0) + unop + fbne $f4, $End + + fcmpeq $f0, $f25, $f29 + ldi $0, 1($0) + unop + fbne $f5, $End + + ldi $0, 1($0) + fbne $f26, $End + ldi $0, 1($0) + fbne $f27, $End + ldi $0, 1($0) + fbne $f28, $End + ldi $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addl XX, INCX, XX + + fabs $f20, $f25 + fcmpeq $f0, $f25, $f29 + + ldi $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S new file mode 100644 index 0000000..b0cf5c8 --- /dev/null +++ b/kernel/sw_64/imax.S @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + clr $0 + mov X, XX + .align 4 + + cmplt $31, N, $2 + cmplt $31, INCX, $3 + SXADDQ INCX, $31, INCX + and $2, $3, $2 + + sra N, 3, $1 + fclr $f0 + unop + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f0, 0 * SIZE(X) + unop + unop + ble $1, $L15 + .align 4 + + fmov $f0, $f1 + addq X, INCX, X + fmov $f0, $f10 + lda $1, -1($1) + + LD $f21, 0 * SIZE(X) + fmov $f0, $f11 + addq X, INCX, X + fmov $f0, $f12 + + LD $f22, 0 * SIZE(X) + fmov $f0, $f13 + addq X, INCX, X + fmov $f0, $f14 + + LD $f23, 0 * SIZE(X) + fmov $f0, $f15 + addq X, INCX, X + fmov $f0, $f20 + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + LD $f25, 0 * SIZE(X) + addq X, INCX, X + LD $f26, 0 * SIZE(X) + addq X, INCX, X + LD $f27, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + CMPLT($f1, $f21), $f17 + CMPLT($f10, $f22), $f18 + CMPLT($f11, $f23), $f19 + + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f20, $f0 + LD $f20, 0 * SIZE(X) + CMPLT($f12, $f24), $f16 + addq X, INCX, X + + fcmovne $f17, $f21, $f1 + LD $f21, 0 * SIZE(X) + CMPLT($f13, $f25), $f17 + addq X, INCX, X + + fcmovne $f18, $f22, $f10 + LD $f22, 0 * SIZE(X) + CMPLT($f14, $f26), $f18 + addq X, INCX, X + + fcmovne $f19, $f23, $f11 + LD $f23, 0 * SIZE(X) + CMPLT($f15, $f27), $f19 + addq X, INCX, X + + fcmovne $f16, $f24, $f12 + LD $f24, 0 * SIZE(X) + CMPLT($f0, $f20), $f16 + addq X, INCX, X + + fcmovne $f17, $f25, $f13 + LD $f25, 0 * SIZE(X) + CMPLT($f1, $f21), $f17 + addq X, INCX, X + + fcmovne $f18, $f26, $f14 + LD $f26, 0 * SIZE(X) + CMPLT($f10, $f22), $f18 + addq X, INCX, X + + fcmovne $f19, $f27, $f15 + LD $f27, 0 * SIZE(X) + CMPLT($f11, $f23), $f19 + lda $1, -1($1) # i -- + + addq X, INCX, X + unop + unop + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f20, $f0 + CMPLT($f12, $f24), $f16 + + fcmovne $f17, $f21, $f1 + CMPLT($f13, $f25), $f17 + + fcmovne $f18, $f22, $f10 + CMPLT($f14, $f26), $f18 + + fcmovne $f19, $f23, $f11 + CMPLT($f15, $f27), $f19 + + fcmovne $f16, $f24, $f12 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f25, $f13 + CMPLT($f10, $f11), $f17 + + fcmovne $f18, 
$f26, $f14 + CMPLT($f12, $f13), $f18 + fcmovne $f19, $f27, $f15 + CMPLT($f14, $f15), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f11, $f10 + fcmovne $f18, $f13, $f12 + fcmovne $f19, $f15, $f14 + + CMPLT($f0, $f10), $f16 + CMPLT($f12, $f14), $f17 + + fcmovne $f16, $f10, $f0 + fcmovne $f17, $f14, $f12 + + CMPLT($f0, $f12), $f16 + fcmovne $f16, $f12, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + fcmovne $f16, $f20, $f0 + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addq XX, INCX, XX + + cmpteq $f0, $f10, $f20 + cmpteq $f0, $f11, $f21 + cmpteq $f0, $f12, $f22 + cmpteq $f0, $f13, $f23 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + cmpteq $f0, $f14, $f24 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f20, $End + + LD $f11, 0 * SIZE(XX) + cmpteq $f0, $f15, $f25 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f21, $End + + LD $f12, 0 * SIZE(XX) + cmpteq $f0, $f16, $f26 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f22, $End + + LD $f13, 0 * SIZE(XX) + cmpteq $f0, $f17, $f27 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f23, $End + + LD $f14, 0 * SIZE(XX) + cmpteq $f0, $f10, $f20 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f24, $End + + LD $f15, 0 * SIZE(XX) + cmpteq $f0, $f11, $f21 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f25, $End + + LD $f16, 0 * SIZE(XX) + lda $1, -1($1) # i -- + cmpteq $f0, $f12, $f22 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + cmpteq $f0, $f13, $f23 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f27, $End + + bgt $1, $L22 + .align 4 + +$L23: + lda $0, 1($0) + cmpteq $f0, $f14, $f24 + unop + fbne $f20, $End + + lda $0, 1($0) + cmpteq $f0, $f15, $f25 + unop + fbne $f21, $End + + lda $0, 1($0) + cmpteq $f0, $f16, $f26 + unop + fbne $f22, $End + + lda $0, 1($0) + cmpteq $f0, $f17, $f27 + unop + fbne $f23, $End + + lda $0, 1($0) + fbne $f24, $End + lda $0, 1($0) + fbne $f25, $End + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addq XX, INCX, XX + + cmpteq $f0, $f20, $f29 + + lda $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + ret + + EPILOGUE diff --git a/kernel/sw_64/imax.c b/kernel/sw_64/imax.c new file mode 100644 index 0000000..5072dd1 --- /dev/null +++ b/kernel/sw_64/imax.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/**************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +****************************************************************************************/ + +#include "common.h" +#include <math.h> + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/sw_64/imin.c b/kernel/sw_64/imin.c new file mode 100644 index 0000000..ffc6522 --- /dev/null +++ b/kernel/sw_64/imin.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/**************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +****************************************************************************************/ + +#include "common.h" +#include <math.h> + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S new file mode 100644 index 0000000..5ccc60e --- /dev/null +++ b/kernel/sw_64/izamax.S @@ -0,0 +1,429 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + ldi $sp, -STACKSIZE($sp) + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + fstd $f6, 32($sp) + mov X, XX + + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + fclr $f0 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addl INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + faddd $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + ldi $1, -1($1) + unop + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + ldi $1, -1($1) + addl X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + faddd $f8, $f9, $f16 + unop + fabs $f20, $f8 + fillcs 64 * SIZE(X) + + faddd $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + faddd $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + faddd $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addl X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addl X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fselne $f4, $f16, $f0, $f0 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ldi $1, -1($1) # i -- + + fselne $f5, $f17, $f1, $f1 + fselne $f6, $f18, $f2, $f2 + fselne $f7, $f19, $f3, $f3 + bgt $1,$L12 + .align 4 + +$L13: + faddd $f8, $f9, $f16 + fabs $f20, $f8 + + faddd $f10, $f11, $f17 + fabs $f21, $f9 + + faddd $f12, $f13, $f18 + fabs $f22, $f10 + + faddd $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fselne $f4, $f16, $f0, $f0 + fselne $f5, $f17, $f1, $f1 + fselne $f6, $f18, $f2, $f2 + fselne $f7, $f19, $f3, $f3 + .align 4 + +$L14: + faddd $f8, $f9, $f16 + faddd $f10, $f11, $f17 + faddd $f12, $f13, $f18 + faddd $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fselne $f4, $f16, $f0, $f0 + fselne $f5, $f17, $f1, $f1 + fselne $f6, $f18, $f2, $f2 + fselne $f7, $f19, $f3, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fselne $f16, $f1, $f0, $f0 + fselne $f17, $f3, $f2, $f2 + + CMPLT($f0, $f2), 
$f16 + fselne $f16, $f2, $f0, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addl X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + faddd $f29, $f30, $f24 + fmov $f24,$f29 + + CMPLT($f0, $f29), $f16 + fselne $f16, $f29, $f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 2, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + LD $f13, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + LD $f15, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + LD $f17, 1 * SIZE(XX) + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + ldi $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + LD $f11, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + fabs $f15, $f23 + LD $f13, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + fabs $f16, $f24 + LD $f15, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + fabs $f17, $f25 + LD $f17, 1 * SIZE(XX) + addl XX, INCX, XX + + faddd $f18, $f19, $f4 + faddd $f20, $f21, $f5 + faddd $f22, $f23, $f6 + faddd $f24, $f25, $f7 + + fcmpeq $f0, $f4, $f26 + fcmpeq $f0, $f5, $f27 + fcmpeq $f0, $f6, $f28 + fcmpeq $f0, $f7, $f29 + + fabs $f10, $f18 + ldi $0, 1($0) + ldi $1, -1($1) # i -- + fbne $f26, $End + + fabs $f11, $f19 + ldi $0, 1($0) + unop + fbne $f27, $End + + fabs $f12, $f20 + ldi $0, 1($0) + unop + fbne $f28, $End + + fabs $f13, $f21 + ldi $0, 1($0) + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fabs $f15, $f23 + fabs $f16, $f24 + fabs $f17, $f25 + + faddd $f18, $f19, $f4 + faddd $f20, $f21, $f5 + faddd $f22, $f23, $f6 + faddd $f24, $f25, $f7 + + fcmpeq $f0, $f4, $f26 + fcmpeq $f0, $f5, $f27 + fcmpeq $f0, $f6, $f28 + fcmpeq $f0, $f7, $f29 + + ldi $0, 1($0) + fbne $f26, $End + ldi $0, 1($0) + fbne $f27, $End + ldi $0, 1($0) + fbne $f28, $End + ldi $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + + faddd $f18, $f19, $f2 + fmov $f2,$f18 + fcmpeq $f0, $f18, $f2 + + ldi $0, 1($0) + fbne $f2, $End + br $31, $L40 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/izamax.S.bak b/kernel/sw_64/izamax.S.bak new file mode 100644 index 0000000..34e4c88 --- /dev/null +++ b/kernel/sw_64/izamax.S.bak @@ -0,0 +1,427 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + ldi $sp, -STACKSIZE($sp) + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + fstd $f6, 32($sp) + mov X, XX + + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + fclr $f0 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addl INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + faddd $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + ldi $1, -1($1) + unop + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + ldi $1, -1($1) + addl X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + faddd $f8, $f9, $f16 + unop + fabs $f20, $f8 + fillcs 64 * SIZE(X) + + faddd $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + faddd $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + faddd $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addl X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addl X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + +fselne 
$f4,$f16,$f0, $f0 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ldi $1, -1($1) # i -- + +fselne $f5,$f17,$f1, $f1 +fselne $f6,$f18,$f2, $f2 +fselne $f7,$f19,$f3, $f3 + bgt $1,$L12 + .align 4 + +$L13: + faddd $f8, $f9, $f16 + fabs $f20, $f8 + + faddd $f10, $f11, $f17 + fabs $f21, $f9 + + faddd $f12, $f13, $f18 + fabs $f22, $f10 + + faddd $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + +fselne $f4,$f16,$f0, $f0 +fselne $f5,$f17,$f1, $f1 +fselne $f6,$f18,$f2, $f2 +fselne $f7,$f19,$f3, $f3 + .align 4 + +$L14: + faddd $f8, $f9, $f16 + faddd $f10, $f11, $f17 + faddd $f12, $f13, $f18 + faddd $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + +fselne $f4,$f16,$f0, $f0 +fselne $f5,$f17,$f1, $f1 +fselne $f6,$f18,$f2, $f2 +fselne $f7,$f19,$f3, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + +fselne $f16,$f1,$f0, $f0 +fselne $f17,$f3,$f2, $f2 + + CMPLT($f0, $f2), $f16 +fselne $f16,$f2,$f0, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addl X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + faddd $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 +fselne $f16,$f29,$f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 2, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + LD $f13, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + LD $f15, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + LD $f17, 1 * SIZE(XX) + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + ldi $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + LD $f11, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + fabs $f15, $f23 + LD $f13, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + fabs $f16, $f24 + LD $f15, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + fabs $f17, $f25 + LD $f17, 1 * SIZE(XX) + addl XX, INCX, XX + + faddd $f18, $f19, $f4 + faddd $f20, $f21, $f5 + faddd $f22, $f23, $f6 + faddd $f24, $f25, $f7 + + fcmpeq $f0, $f4, $f26 + fcmpeq $f0, $f5, $f27 + fcmpeq $f0, $f6, $f28 + fcmpeq $f0, $f7, $f29 + + fabs $f10, $f18 + ldi $0, 1($0) + ldi $1, -1($1) # i -- + fbne $f26, $End + + fabs $f11, $f19 + ldi $0, 1($0) + unop + fbne $f27, $End + + fabs $f12, $f20 + ldi $0, 1($0) + unop + fbne $f28, $End + + fabs $f13, $f21 + ldi $0, 1($0) + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fabs $f15, $f23 + fabs $f16, $f24 + fabs $f17, $f25 + + faddd $f18, $f19, $f4 + faddd $f20, $f21, $f5 + faddd $f22, $f23, $f6 + faddd $f24, $f25, $f7 + + fcmpeq $f0, $f4, $f26 + fcmpeq $f0, $f5, $f27 + fcmpeq $f0, $f6, $f28 + fcmpeq $f0, $f7, $f29 + + ldi $0, 1($0) + fbne $f26, $End + ldi $0, 1($0) + fbne $f27, $End + ldi $0, 1($0) + fbne $f28, $End + ldi $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + + faddd $f18, $f19, $f18 + fcmpeq $f0, $f18, $f2 + + ldi $0, 1($0) + fbne $f2, $End + br $31, $L40 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 
56($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/izamax_simd.S b/kernel/sw_64/izamax_simd.S new file mode 100644 index 0000000..8b00f60 --- /dev/null +++ b/kernel/sw_64/izamax_simd.S @@ -0,0 +1,609 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 96 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $2 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#define VCMPLT(a, b) vfcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#define VCMPLT(a, b) vfcmplt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + fstd $f6, 32($sp) + mov X, XX + + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + fclr $f0 + cmpeq INCX, SIZE, $3 + and X, (VEC_LEN*SIZE-1), $4 # test the address of X (aligment) + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + bic $3, $4, $3 + nop + nop + beq $3, $Sub + .align 4 + +$Align_Access: +/* + Unloop 8*2=16 reals +*/ +#ifdef USE_MIN + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + fabs $f20, $f20 + fabs $f21, $f21 + ADD $f20, $f21, $f0 # init temp min result value +#endif + sra N, 3, I + and N, 7, $3 + addl INCX, INCX, INCX + ble I, $Remain + .align 4 +/* + Init max or min value +*/ + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + fabs $f20, $f20 + fabs $f21, $f21 + + ADD $f20, $f21, $f4 + nop + vcpyf $f4, $f0 + vcpyf $f4, $f1 + + + VLD $f22, 0*VEC_LEN*SIZE(X) + VLD $f23, 1*VEC_LEN*SIZE(X) + VLD $f24, 2*VEC_LEN*SIZE(X) + VLD $f25, 3*VEC_LEN*SIZE(X) + + /*vfabs*/ + vcpys $f31, $f22, $f10 + subl I, 1, I + vcpys $f31, $f23, $f11 + addl X, 16*SIZE, X + + vcpys $f31, $f24, $f12 + nop + vcpys $f31, $f25, $f13 + ble I, $MainLoopEnd + .align 4 + +$MainLoop: + vextf $f10, 1, $f4 + VLD $f22, 0*VEC_LEN*SIZE(X) + vextf $f10, 3, $f5 + VLD $f23, 1*VEC_LEN*SIZE(X) + + vextf $f11, 0, $f6 + VLD $f24, 2*VEC_LEN*SIZE(X) + vextf $f11, 2, $f7 + VLD $f25, 3*VEC_LEN*SIZE(X) + + vextf $f12, 1, $f14 + vextf $f12, 3, $f15 + vextf $f13, 0, $f16 + vextf $f13, 2, $f17 + + vinsf $f4, $f11, 0, $f11 + vinsf $f6, $f10, 1, $f10 + vinsf $f14, $f13, 0, $f13 + vinsf $f16, $f12, 1, $f12 + + vinsf $f5, $f11, 2, $f11 + vinsf $f7, $f10, 3, $f10 + vinsf $f15, $f13, 2, $f13 + vinsf $f17, $f12, 3, $f12 + + VADD $f10, $f11, $f2 + addl X, 16*SIZE, X + VADD $f12, $f13, $f3 + subl I, 1, I + + vcpys $f31, $f22, $f10 + vcpys $f31, $f23, $f11 + VCMPLT($f0, $f2), $f18 + VCMPLT($f1, $f3), $f19 + + vcpys $f31, $f24, $f12 + fillcs PREFETCHSIZE * SIZE(X) + vcpys $f31, $f25, $f13 + nop + + vfseleq $f18, $f0, $f2, $f0 + vfseleq $f19, $f1, $f3, $f1 + nop + bgt I, $MainLoop + .align 4 + +$MainLoopEnd: +/*spilt the complex vector to real vector($f10,$f12) and image vector ($f11,$f13)*/ + vextf $f10, 1, $f4 + vextf $f10, 3, $f5 + vextf $f11, 0, $f6 + vextf $f11, 2, $f7 + + vextf $f12, 1, $f14 + vextf $f12, 3, $f15 + vextf $f13, 0, $f16 + vextf $f13, 2, $f17 + + vinsf $f4, $f11, 0, $f11 + vinsf $f6, $f10, 1, $f10 + vinsf $f14, $f13, 0, $f13 + vinsf $f16, $f12, 1, $f12 + + vinsf $f5, $f11, 2, $f11 + vinsf $f7, $f10, 3, $f10 + vinsf $f15, $f13, 2, $f13 + vinsf $f17, $f12, 3, $f12 + + VADD $f10, $f11, $f2 + VADD $f12, $f13, $f3 + VCMPLT($f0, $f2), $f18 + VCMPLT($f1, $f3), $f19 + + vfseleq $f18, $f0, $f2, $f0 + vfseleq $f19, $f1, $f3, $f1 +/*find the max or min between f0 and f1*/ + VCMPLT($f0, $f1), $f18 + vfseleq $f18, $f0, $f1, $f0 + + + vextf 
$f0, 1, $f22 + vextf $f0, 2, $f23 + vextf $f0, 3, $f24 + CMPLT($f0, $f22), $f16 + + CMPLT($f23, $f24), $f17 + fseleq $f16, $f0, $f22, $f0 + fseleq $f17, $f23, $f24, $f23 + CMPLT($f0, $f23), $f18 + + fseleq $f18, $f0, $f23, $f0 + nop + .align 4 +$Remain: + ble $3, $Continuous_FindIndex + .align 4 +$RemainLoop: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addl X, 2*SIZE, X + + fabs $f20, $f29 + fabs $f21, $f30 + ADD $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 + fselne $f16,$f29,$f0, $f0 + + subl $3, 1, $3 + bgt $3, $RemainLoop + .align 4 + + /*find index*/ +$Continuous_FindIndex: + + jmp $L20 + +$Sub: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addl INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + ADD $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + ldi $1, -1($1) + unop + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + ldi $1, -1($1) + addl X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + ADD $f8, $f9, $f16 + fillcs PREFETCHSIZE * SIZE(X) + fabs $f20, $f8 + fillcs 64 * SIZE(X) + + ADD $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + ADD $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + ADD $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addl X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addl X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fselne $f4,$f16,$f0, $f0 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ldi $1, -1($1) # i -- + + fselne $f5,$f17,$f1, $f1 + fselne $f6,$f18,$f2, $f2 + fselne $f7,$f19,$f3, $f3 + bgt $1,$L12 + .align 4 + +$L13: + ADD $f8, $f9, $f16 + fabs $f20, $f8 + + ADD $f10, $f11, $f17 + fabs $f21, $f9 + + ADD $f12, $f13, $f18 + fabs $f22, $f10 + + ADD $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fselne $f4,$f16,$f0, $f0 + fselne $f5,$f17,$f1, $f1 + fselne $f6,$f18,$f2, $f2 + fselne $f7,$f19,$f3, $f3 + .align 4 + +$L14: + ADD $f8, $f9, $f16 + ADD $f10, $f11, $f17 + ADD $f12, $f13, $f18 + ADD $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fselne $f4,$f16,$f0, $f0 + fselne $f5,$f17,$f1, $f1 + fselne $f6,$f18,$f2, $f2 + fselne $f7,$f19,$f3, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fselne $f16,$f1,$f0, $f0 + fselne $f17,$f3,$f2, $f2 + + CMPLT($f0, $f2), $f16 + fselne $f16,$f2,$f0, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addl X, INCX, X + + fabs $f20, $f29 + fabs $f21, 
$f30 + ADD $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 + fselne $f16,$f29,$f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 2, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + LD $f13, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + LD $f15, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + LD $f17, 1 * SIZE(XX) + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + ldi $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + LD $f11, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + fabs $f15, $f23 + LD $f13, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + fabs $f16, $f24 + LD $f15, 1 * SIZE(XX) + addl XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + fabs $f17, $f25 + LD $f17, 1 * SIZE(XX) + addl XX, INCX, XX + + ADD $f18, $f19, $f4 + ADD $f20, $f21, $f5 + ADD $f22, $f23, $f6 + ADD $f24, $f25, $f7 + + fcmpeq $f0, $f4, $f26 + fcmpeq $f0, $f5, $f27 + fcmpeq $f0, $f6, $f28 + fcmpeq $f0, $f7, $f29 + + fabs $f10, $f18 + ldi $0, 1($0) + ldi $1, -1($1) # i -- + fbne $f26, $End + + fabs $f11, $f19 + ldi $0, 1($0) + unop + fbne $f27, $End + + fabs $f12, $f20 + ldi $0, 1($0) + fillcs PREFETCHSIZE * SIZE(X) + fbne $f28, $End + + fabs $f13, $f21 + ldi $0, 1($0) + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fabs $f15, $f23 + fabs $f16, $f24 + fabs $f17, $f25 + + ADD $f18, $f19, $f4 + ADD $f20, $f21, $f5 + ADD $f22, $f23, $f6 + ADD $f24, $f25, $f7 + + fcmpeq $f0, $f4, $f26 + fcmpeq $f0, $f5, $f27 + fcmpeq $f0, $f6, $f28 + fcmpeq $f0, $f7, $f29 + + ldi $0, 1($0) + fbne $f26, $End + ldi $0, 1($0) + fbne $f27, $End + ldi $0, 1($0) + fbne $f28, $End + ldi $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + + addl XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + + ADD $f18, $f19, $f18 + fcmpeq $f0, $f18, $f2 + + ldi $0, 1($0) + fbne $f2, $End + br $31, $L40 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S new file mode 100644 index 0000000..c2c0863 --- /dev/null +++ b/kernel/sw_64/lsame.S @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl lsame_ + .ent lsame_ +lsame_: + .frame $sp,0,$26,0 +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + ldbu $5, 0($16) + ldbu $6, 0($17) +// extb $2, $5 +// extbl $3, $6 + + subl $5, 96, $1 + subl $6, 96, $2 + subl $5, 32, $3 + subl $6, 32, $4 + + + selgt $1, $3, $5, $5 + selgt $2, $4, $6, $6 + cmpeq $5, $6, $0 + .align 4 + +$End: + ret + .end lsame_ + .ident VERSION diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S new file mode 100644 index 0000000..07925d1 --- /dev/null +++ b/kernel/sw_64/max.S @@ -0,0 +1,227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + ldi $sp, -STACKSIZE($sp) + nop + .align 4 + + cmplt $31, N, $2 + cmplt $31, INCX, $3 + SXADDQ INCX, $31, INCX + and $2, $3, $0 + + sra N, 3, $1 + fclr $f0 + unop + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f0, 0 * SIZE(X) + unop + unop + ble $1, $L15 + .align 4 + + fmov $f0, $f1 + addl X, INCX, X + fmov $f0, $f10 + ldi $1, -1($1) + + LD $f21, 0 * SIZE(X) + fmov $f0, $f11 + addl X, INCX, X + fmov $f0, $f12 + + LD $f22, 0 * SIZE(X) + fmov $f0, $f13 + addl X, INCX, X + fmov $f0, $f14 + + LD $f23, 0 * SIZE(X) + fmov $f0, $f15 + addl X, INCX, X + fmov $f0, $f20 + + LD $f24, 0 * SIZE(X) + addl X, INCX, X + LD $f25, 0 * SIZE(X) + addl X, INCX, X + LD $f26, 0 * SIZE(X) + addl X, INCX, X + LD $f27, 0 * SIZE(X) + addl X, INCX, X + + CMPLT($f0, $f20), $f16 + CMPLT($f1, $f21), $f17 + CMPLT($f10, $f22), $f18 + CMPLT($f11, $f23), $f19 + + ble $1, $L13 + .align 4 + +$L12: + fselne $f16, $f20, $f0, $f0 + LD $f20, 0 * SIZE(X) + CMPLT($f12, $f24), $f16 + addl X, INCX, X + + fselne $f17, $f21, $f1, $f1 + LD $f21, 0 * SIZE(X) + CMPLT($f13, $f25), $f17 + addl X, INCX, X + + fselne $f18, $f22, $f10, $f10 + LD $f22, 0 * SIZE(X) + CMPLT($f14, $f26), $f18 + addl X, INCX, X + + fselne $f19, $f23, $f11, $f11 + LD $f23, 0 * SIZE(X) + CMPLT($f15, $f27), $f19 + addl X, INCX, X + + fselne $f16, $f24, $f12, $f12 + LD $f24, 0 * SIZE(X) + CMPLT($f0, $f20), $f16 + addl X, INCX, X + + fselne $f17, $f25, $f13, $f13 + LD $f25, 0 * SIZE(X) + CMPLT($f1, $f21), $f17 + addl X, INCX, X + + fselne $f18, $f26, $f14, $f14 + LD $f26, 0 * SIZE(X) + CMPLT($f10, $f22), $f18 + addl X, INCX, X + + fselne $f19, $f27, $f15, $f15 + LD $f27, 0 * SIZE(X) + CMPLT($f11, $f23), $f19 + ldi $1, -1($1) # i -- + + addl X, INCX, X + unop + unop + bgt $1,$L12 + .align 4 + +$L13: + fselne $f16, $f20, $f0, $f0 + CMPLT($f12, $f24), $f16 + + fselne $f17, $f21, $f1, $f1 + CMPLT($f13, $f25), $f17 + + fselne $f18, $f22, $f10, $f10 + CMPLT($f14, $f26), $f18 + + fselne $f19, $f23, $f11, $f11 + CMPLT($f15, $f27), $f19 + + fselne $f16, $f24, $f12, $f12 + CMPLT($f0, $f1), $f16 + fselne $f17, $f25, $f13, $f13 + CMPLT($f10, $f11), $f17 + + fselne $f18, $f26, $f14, $f14 + CMPLT($f12, $f13), $f18 + fselne $f19, $f27, $f15, $f15 + CMPLT($f14, $f15), $f19 + + fselne $f16, $f1, $f0, $f0 + fselne $f17, $f11, $f10, $f10 + fselne $f18, $f13, $f12, $f12 + fselne $f19, $f15, $f14, $f14 + + CMPLT($f0, $f10), $f16 + CMPLT($f12, $f14), $f17 + + fselne $f16, $f10, $f0, $f0 + fselne $f17, $f14, $f12, $f12 + + CMPLT($f0, $f12), $f16 + fselne $f16, $f12, $f0, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addl X, INCX, X + + CMPLT($f0, $f20), $f16 + fselne $f16, $f20, $f0, $f0 + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/nrm2_simd.S b/kernel/sw_64/nrm2_simd.S new file mode 100644 index 0000000..0888454 --- /dev/null +++ b/kernel/sw_64/nrm2_simd.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The 
University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + + + PROFCODE + + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 #stride access + +/* test the address of X */ + and X, (VEC_LEN*SIZE-1), $3 + fclr t0 + nop + bne $3, $UnAlign_ACCESS +/*Align access. 
Use simd instructions.*/ + sra N, 4, I + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t0 #clear s0 vector + VLD a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t1 + + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t2 + VLD a3, 3*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t3 + + addl X, 16 * SIZE, X + subl I, 1, I + nop + ble I, $MainLoopEnd +$MainLoop: + fillcs PREFETCHSIZE * SIZE(X) + VMAD a0, a0, t0, t0 + subl I, 1, I + VMAD a1, a1, t1, t1 + + addl X, 16 * SIZE, X + VMAD a2, a2, t2, t2 + nop + VMAD a3, a3, t3, t3 + + VLD a0, -4*VEC_LEN*SIZE(X) + VLD a1, -3*VEC_LEN*SIZE(X) + VLD a2, -2*VEC_LEN*SIZE(X) + VLD a3, -1*VEC_LEN*SIZE(X) + + bgt I, $MainLoop + .align 4 +$MainLoopEnd: + VMAD a0, a0, t0, t0 + VMAD a1, a1, t1, t1 + VMAD a2, a2, t2, t2 + VMAD a3, a3, t3, t3 + + VADD t0, t1, a0 + VADD t2, t3, a1 + nop + VADD a0, a1, t0 + + vextf t0, 1, t1 + vextf t0, 2, t2 + vextf t0, 3, t3 + nop + + ADD t0, t1, a2 + ADD t2, t3, a3 + nop + ADD a2, a3, t0 + + .align 4 +$Remain: + and N, 15, I + ble I, $End + .align 4 +$RemainLoop: + LD a0, 0 * SIZE(X) + addl X, SIZE, X + MAD a0, a0, t0, t0 + subl I, 1, I + + bgt I, $RemainLoop + .align 4 +$End: + SQRT t0, a0 + ret + .align 4 + +/*Don't use simd*/ + +$UnAlign_ACCESS: + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + ADD a0, t0, a0 + fillcs (PREFETCHSIZE) * SIZE(X) + MUL x0, x0, t0 + LD x0, 8 * SIZE(X) + + ADD a1, t1, a1 + mov X, XX + MUL x1, x1, t1 + LD x1, 9 * SIZE(X) + + ADD a2, t2, a2 + unop + MUL x2, x2, t2 + LD x2, 10 * SIZE(X) + + ADD a3, t3, a3 + unop + MUL x3, x3, t3 + LD x3, 11 * SIZE(X) + + ADD a0, t0, a0 + unop + MUL x4, x4, t0 + LD x4, 12 * SIZE(X) + + ADD a1, t1, a1 + unop + MUL x5, x5, t1 + LD x5, 13 * SIZE(X) + + ADD a2, t2, a2 + unop + MUL x6, x6, t2 + LD x6, 14 * SIZE(X) + + ADD a3, t3, a3 + unop + MUL x7, x7, t3 + LD x7, 15 * SIZE(X) + + ADD a0, t0, a0 + unop + MUL x0, x0, t0 + LD x0, 16 * SIZE(X) + + ADD a1, t1, a1 + ldi X, 16 * SIZE(X) + MUL x1, x1, t1 + LD x1, 17 * SIZE(XX) + + ADD a2, t2, a2 + unop + MUL x2, x2, t2 + LD x2, 18 * SIZE(XX) + + ADD a3, t3, a3 + unop + MUL x3, x3, t3 + LD x3, 19 * SIZE(XX) + + ADD a0, t0, a0 + unop + MUL x4, x4, t0 + LD x4, 20 * SIZE(XX) + + ADD a1, t1, a1 + ldi I, -1(I) + MUL x5, x5, t1 + LD x5, 21 * SIZE(XX) + + ADD a2, t2, a2 + unop + MUL x6, x6, t2 + LD x6, 22 * SIZE(XX) + + ADD a3, t3, a3 + MUL x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + ADD a0, t0, a0 + mov X, XX + MUL x0, x0, t0 + LD x0, 8 * SIZE(X) + + ADD a1, t1, a1 + unop + MUL x1, x1, t1 + LD x1, 9 * SIZE(X) + + ADD a2, t2, a2 + unop + MUL x2, x2, t2 + LD x2, 10 * SIZE(X) + + ADD a3, t3, a3 + unop + MUL x3, x3, t3 + LD x3, 11 * SIZE(X) + + ADD a0, t0, a0 + unop + MUL x4, x4, t0 + LD x4, 12 * SIZE(XX) + + ADD a1, t1, a1 + unop + MUL x5, x5, t1 + LD x5, 13 * SIZE(XX) + + ADD a2, t2, a2 + unop + MUL x6, x6, t2 + LD x6, 14 * SIZE(XX) + + ADD a3, t3, a3 + ldi X, 16 * SIZE(X) + MUL x7, x7, t3 + LD x7, 15 * SIZE(XX) + + ADD a0, t0, a0 + MUL x0, x0, t0 + ADD a1, t1, a1 + MUL x1, x1, t1 + + ADD a2, t2, a2 + MUL x2, x2, t2 + ADD a3, t3, a3 + MUL x3, x3, t3 + + ADD a0, t0, a0 + MUL x4, x4, t0 + ADD a1, t1, a1 + MUL x5, x5, t1 + + ADD a2, t2, a2 + MUL x6, x6, t2 + ADD a3, t3, a3 + MUL x7, x7, t3 + + ADD a1, t1, a1 + ADD a2, t2, a2 + ADD a3, t3, a3 + .align 4 + +$L15: + and N, 15, I 
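+	/* scalar tail of the unaligned unit-stride path: square and accumulate the remaining N % 16 elements; the last pending product is folded in at $L998 */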
+ ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + ldi X, 1 * SIZE(X) + + ADD a0, t0, a0 + MUL x0, x0, t0 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addl X, INCX, X + LD x1, 0 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + addl X, INCX, X + LD x3, 0 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + addl X, INCX, X + LD x5, 0 * SIZE(X) + addl X, INCX, X + LD x6, 0 * SIZE(X) + addl X, INCX, X + + ldi I, -1(I) + ble I, $L22 + .align 4 + +$L21: + ADD a0, t0, a0 + LD x7, 0 * SIZE(X) + MUL x0, x0, t0 + addl X, INCX, X + + ADD a1, t1, a1 + LD x0, 0 * SIZE(X) + MUL x1, x1, t1 + addl X, INCX, X + + ADD a2, t2, a2 + LD x1, 0 * SIZE(X) + MUL x2, x2, t2 + addl X, INCX, X + + ADD a3, t3, a3 + LD x2, 0 * SIZE(X) + MUL x3, x3, t3 + addl X, INCX, X + + ADD a0, t0, a0 + LD x3, 0 * SIZE(X) + MUL x4, x4, t0 + addl X, INCX, X + + ADD a1, t1, a1 + LD x4, 0 * SIZE(X) + MUL x5, x5, t1 + addl X, INCX, X + + ADD a2, t2, a2 + LD x5, 0 * SIZE(X) + MUL x6, x6, t2 + addl X, INCX, X + + ADD a3, t3, a3 + LD x6, 0 * SIZE(X) + MUL x7, x7, t3 + addl X, INCX, X + + ldi I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + ADD a0, t0, a0 + LD x7, 0 * SIZE(X) + MUL x0, x0, t0 + addl X, INCX, X + + ADD a1, t1, a1 + unop + MUL x1, x1, t1 + unop + + ADD a2, t2, a2 + MUL x2, x2, t2 + ADD a3, t3, a3 + MUL x3, x3, t3 + + ADD a0, t0, a0 + MUL x4, x4, t0 + ADD a1, t1, a1 + MUL x5, x5, t1 + + ADD a2, t2, a2 + MUL x6, x6, t2 + ADD a3, t3, a3 + MUL x7, x7, t3 + + ADD a1, t1, a1 + ADD a2, t2, a2 + ADD a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addl X, INCX, X + + ADD a0, t0, a0 + MUL x0, x0, t0 + + ldi I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + ADD a0, t0, a0 + + ADD a0, a1, a0 + ADD a2, a3, a2 + + + ADD a0, a2, a0 + SQRT a0, a0 + + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S new file mode 100644 index 0000000..3c8624e --- /dev/null +++ b/kernel/sw_64/rot.S @@ -0,0 +1,680 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + ldi $sp, -16($sp) + fstd $f20, 8($sp) + + fmov $f21, C + LD S, 16($sp) + cmpeq INCX, 1, $23 + cmpeq INCY, 1, $24 + ble N, $L998 + + + and $23, $24, $23 + beq $23, $L50 + + sra N, 3, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + ldi I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f20 + fmov $f20,$f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f20 + fmov $f20,$f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + flds $f31, (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f17, $f23 + flds $f31, (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f20 + 
fmov $f20,$f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + ldi I, -1(I) + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + ldi X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + ldi Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f20 + fmov $f20,$f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + SUB $f23, $f24, $f18 + fmov $f18,$f24 + LD $f18, 7*SIZE(X) + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f20 + fmov $f20,$f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f20 + fmov $f20,$f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f20 + fmov $f20,$f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + ST $f26, 7*SIZE(X) + ldi X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + ldi Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + ldi I, -1(I) + + ST $f25, 0*SIZE(X) + ldi X, 1 * SIZE(X) + ST $f26, 0*SIZE(Y) + ldi Y, 1 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + fldd $f20, 8($sp) + ldi $sp, 16($sp) + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 3, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + 
SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f20 + fmov $f20,$f22 + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f20 + fmov $f20,$f26 + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f20 + fmov $f20,$f22 + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f20 + fmov $f20,$f26 + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f20 + fmov $f20,$f22 + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f20 + fmov $f20,$f26 + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f20 + fmov $f20,$f22 + SUB $f23, $f24, $f20 + fmov $f20,$f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f20 + fmov $f20,$f26 + SUB $f27, $f28, $f20 + fmov $f20,$f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + ldi I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 7, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + ldi I, -1(I) + + ST $f25, 0*SIZE(X) + SXADDQ INCX, X, X + ST $f26, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + fldd $f20, 8($sp) + ldi $sp, 16($sp) + + clr $0 +# fldd $f20, 8($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/rot.S.bak b/kernel/sw_64/rot.S.bak new file mode 100644 index 0000000..62e9ff9 --- /dev/null +++ b/kernel/sw_64/rot.S.bak @@ -0,0 +1,624 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + cmpeq INCX, 1, $23 + cmpeq INCY, 1, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 3, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + ldi I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + fillcs (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + fillcs (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + 
unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + ldi I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + ldi X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + ldi Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + ldi X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + ldi Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + ldi I, -1(I) + + ST $f25, 0*SIZE(X) + ldi X, 1 * SIZE(X) + ST $f26, 0*SIZE(Y) + ldi Y, 1 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 3, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + 
+ LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + ldi I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 7, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + ldi I, -1(I) + + ST $f25, 0*SIZE(X) + SXADDQ INCX, X, X + ST $f26, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/rot_simd.S b/kernel/sw_64/rot_simd.S new file mode 100644 index 0000000..99f3e05 --- /dev/null +++ b/kernel/sw_64/rot_simd.S @@ -0,0 +1,783 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define x0 $f12 +#define x1 $f14 +#define x2 $f16 +#define x3 $f18 + +#define y0 $f13 +#define y1 $f15 +#define y2 $f17 +#define y3 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 +#define t4 $f24 +#define t5 $f25 +#define t6 $f26 +#define t7 $f27 + +#define PREFETCHSIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + cmpeq INCX, 1, $23 + cmpeq INCY, 1, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 #incx!=1 or incy !=1 + +/* test the address of X */ + and X, (VEC_LEN*SIZE-1), $3 + and Y, (VEC_LEN*SIZE-1), $4 + or $3, $4, $4 + bne $4, $UnAlign_ACCESS + +/*Align Accessing*/ + sra N, 4, I + ble I, $Remain + + vcpyf C, C + vcpyf S, S + + VLD x0, 0*VEC_LEN*SIZE(X) + VLD x1, 1*VEC_LEN*SIZE(X) + VLD x2, 2*VEC_LEN*SIZE(X) + VLD x3, 3*VEC_LEN*SIZE(X) + + VLD y0, 0*VEC_LEN*SIZE(Y) + VLD y1, 1*VEC_LEN*SIZE(Y) + VLD y2, 2*VEC_LEN*SIZE(Y) + VLD y3, 3*VEC_LEN*SIZE(Y) + + addl X, 16 * SIZE, X + addl Y, 16 * SIZE, Y + subl I, 1, I + ble I, $MainLoopEnd + .align 4 +$MainLoop: + VMUL C, x0, t0 + fillcs (PREFETCHSIZE) * SIZE(X) + VMUL C, x1, t1 + fillcs (PREFETCHSIZE) * SIZE(Y) + + VMUL C, x2, t2 + subl I, 1, I + VMUL C, x3, t3 + nop + + VMUL S, x0, t4 + VLD x0, 0*VEC_LEN*SIZE(X) + VMUL S, x1, t5 + VLD x1, 1*VEC_LEN*SIZE(X) + + VMUL S, x2, t6 + VLD x2, 2*VEC_LEN*SIZE(X) + VMUL S, x3, t7 + VLD x3, 3*VEC_LEN*SIZE(X) + + VMAD S, y0, t0, t0 + VMAD S, y1, t1, t1 + VMAD S, y2, t2, t2 + VMAD S, y3, t3, t3 + + VMSUB C, y0, t4, t4 + VLD y0, 0*VEC_LEN*SIZE(Y) + VMSUB C, y1, t5, t5 + VLD y1, 1*VEC_LEN*SIZE(Y) + + VMSUB C, y2, t6, t6 + VLD y2, 2*VEC_LEN*SIZE(Y) + VMSUB C, y3, t7, t7 + VLD y3, 3*VEC_LEN*SIZE(Y) + + VST t0, -4*VEC_LEN*SIZE(X) + VST t1, -3*VEC_LEN*SIZE(X) + VST t2, -2*VEC_LEN*SIZE(X) + VST t3, -1*VEC_LEN*SIZE(X) + + VST t4, 
-4*VEC_LEN*SIZE(Y) + VST t5, -3*VEC_LEN*SIZE(Y) + VST t6, -2*VEC_LEN*SIZE(Y) + VST t7, -1*VEC_LEN*SIZE(Y) + + addl X, 16 * SIZE, X + addl Y, 16 * SIZE, Y + nop + bgt I, $MainLoop + .align 4 +$MainLoopEnd: + VMUL C, x0, t0 + VMUL C, x1, t1 + VMUL C, x2, t2 + VMUL C, x3, t3 + + VMUL S, x0, t4 + VMUL S, x1, t5 + VMUL S, x2, t6 + VMUL S, x3, t7 + + VMAD S, y0, t0, t0 + VMAD S, y1, t1, t1 + VMAD S, y2, t2, t2 + VMAD S, y3, t3, t3 + + VMSUB C, y0, t4, t4 + VMSUB C, y1, t5, t5 + VMSUB C, y2, t6, t6 + VMSUB C, y3, t7, t7 + + VST t0, -4*VEC_LEN*SIZE(X) + VST t1, -3*VEC_LEN*SIZE(X) + VST t2, -2*VEC_LEN*SIZE(X) + VST t3, -1*VEC_LEN*SIZE(X) + + VST t4, -4*VEC_LEN*SIZE(Y) + VST t5, -3*VEC_LEN*SIZE(Y) + VST t6, -2*VEC_LEN*SIZE(Y) + VST t7, -1*VEC_LEN*SIZE(Y) + + .align 4 +$Remain: + and N, 15, I + ble I, $End +$RemainLoop: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f12, $f24 + MAD S, $f13, $f21, $f25 + MSUB C, $f13, $f24, $f26 + + + ldi I, -1(I) + ST $f25, 0*SIZE(X) + ldi X, 1 * SIZE(X) + ST $f26, 0*SIZE(Y) + + ldi Y, 1 * SIZE(Y) + bgt I, $RemainLoop + + .align 4 +$End: + clr $0 + ret + .align 4 + +$UnAlign_ACCESS: + + sra N, 3, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + ldi I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + fillcs (PREFETCHSIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + fillcs (PREFETCHSIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + ldi I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + ldi X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + 
unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + ldi Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + ldi X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + ldi Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + ldi I, -1(I) + + ST $f25, 0*SIZE(X) + ldi X, 1 * SIZE(X) + ST $f26, 0*SIZE(Y) + ldi Y, 1 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 3, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + 
SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + ldi I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 7, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + ldi I, -1(I) + + ST $f25, 0*SIZE(X) + SXADDQ INCX, X, X + ST $f26, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/scal-sw.S.bak b/kernel/sw_64/scal-sw.S.bak new file mode 100644 index 0000000..f8da324 --- /dev/null +++ b/kernel/sw_64/scal-sw.S.bak @@ -0,0 +1,480 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $20 +#define INCX $21 + +#define XX $18 +#define I $19 + +#define ALPHA $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + + PROLOGUE + PROFCODE + + mov X, XX + ble N, $L999 + + cmpeq INCX, 1, $0 + beq $0, $L20 + +#ifndef DOUBLE + sra N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + LD a1, 9 * SIZE(X) + LD a2, 10 * SIZE(X) + LD a3, 11 * SIZE(X) + + ST t0, 4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 6 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 12 * SIZE(X) + LD a5, 13 * SIZE(X) + LD a6, 14 * SIZE(X) + LD a7, 15 * SIZE(X) + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 16 * SIZE(X) + LD a1, 17 * SIZE(X) + LD a2, 18 * SIZE(X) + LD a3, 19 * SIZE(X) + + ST t0, 12 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 13 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 14 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 15 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 20 * SIZE(X) + LD a5, 21 * SIZE(X) + LD a6, 22 * SIZE(X) + LD a7, 23 * SIZE(X) + + ST t0, 16 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 17 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 18 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 19 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 24 * SIZE(X) + LD a1, 25 * SIZE(X) + LD a2, 26 * SIZE(X) + LD a3, 27 * SIZE(X) + + ST t0, 20 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 21 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 22 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 23 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 28 * SIZE(X) + LD a5, 29 * SIZE(X) + LD a6, 30 * SIZE(X) + LD a7, 31 * SIZE(X) + + fillcs PREFETCHSIZE * SIZE(X) + ldi I, -1(I) + addl X, 16 * SIZE, X + bne I, $L12 + .align 4 + +$L13: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 12 * SIZE(X) + ST t1, 13 * SIZE(X) + ST t2, 14 * SIZE(X) + ST t3, 15 * SIZE(X) + addl X, 16 * SIZE, X + .align 4 + +$L15: + and N, 15, I + +#else + + sra N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + ldi I, -1(I) + ble I, $L13 + 
.align 4 + +$L12: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + ldi I, -1(I) + LD a1, 9 * SIZE(X) + addl X, 8 * SIZE, X + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + ST t0, -4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, -3 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, -2 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, -1 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + fillcs PREFETCHSIZE * SIZE(X) + bne I, $L12 + .align 4 + +$L13: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 4 * SIZE(X) + ST t1, 5 * SIZE(X) + ST t2, 6 * SIZE(X) + ST t3, 7 * SIZE(X) + addl X, 8 * SIZE, X + .align 4 + +$L15: + and N, 7, I + +#endif + + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(X) + + addl X, SIZE, X + + ldi I, -1(I) + bne I, $L17 + ret + .align 4 + +$L20: + sra N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + MUL a0, ALPHA, t0 + ldi I, -1(I) + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + MUL a1, ALPHA, t1 + SXADDQ INCX, X, X + unop + + LD a6, 0 * SIZE(X) + MUL a2, ALPHA, t2 + SXADDQ INCX, X, X + unop + + LD a7, 0 * SIZE(X) + MUL a3, ALPHA, t3 + SXADDQ INCX, X, X + ble I, $L23 + .align 4 + +$L22: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + fillcs PREFETCHSIZE * SIZE(X) + SXADDQ INCX, XX, XX + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + unop + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t0, 0 * SIZE(XX) + MUL a0, ALPHA, t0 + SXADDQ INCX, XX, XX + unop + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t1, 0 * SIZE(XX) + MUL a1, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a2, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a6, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a3, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a7, 0 * SIZE(X) + SXADDQ INCX, X, X + unop + bne I, $L22 + .align 4 + +$L23: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + SXADDQ INCX, XX, XX + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + + ST t0, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t1, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t2, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t3, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L999 + .align 4 + +$L27: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(XX) + + SXADDQ INCX, X, X + SXADDQ INCX, XX, XX + + ldi I, -1(I) + bne I, $L27 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S new file mode 100644 index 0000000..87b89c9 --- /dev/null +++ b/kernel/sw_64/scal.S @@ -0,0 +1,480 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $20 +#define INCX $21 + +#define XX $18 +#define I $19 + +#define ALPHA $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + + PROLOGUE + PROFCODE + + mov X, XX + ble N, $L999 + + cmpeq INCX, 1, $0 + beq $0, $L20 + +#ifndef DOUBLE + sra N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + LD a1, 9 * SIZE(X) + LD a2, 10 * SIZE(X) + LD a3, 11 * SIZE(X) + + ST t0, 4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 6 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 12 * SIZE(X) + LD a5, 13 * SIZE(X) + LD a6, 14 * SIZE(X) + LD a7, 15 * SIZE(X) + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 16 * SIZE(X) + LD a1, 17 * SIZE(X) + LD a2, 18 * SIZE(X) + LD a3, 19 * SIZE(X) + + ST t0, 12 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 13 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 14 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 15 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 20 * SIZE(X) + LD a5, 21 * SIZE(X) + LD a6, 22 * SIZE(X) + LD a7, 23 * SIZE(X) + + ST t0, 16 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 17 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 18 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 19 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 24 * SIZE(X) + LD a1, 25 * SIZE(X) + LD a2, 26 * SIZE(X) + LD a3, 27 * SIZE(X) + + ST t0, 20 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 21 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 22 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 23 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 28 * SIZE(X) + LD a5, 29 * SIZE(X) + LD a6, 30 * SIZE(X) + LD a7, 31 * SIZE(X) + + fillcs PREFETCHSIZE * SIZE(X) + ldi I, -1(I) + addl X, 16 * SIZE, X + bne I, $L12 + .align 4 + +$L13: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 12 * SIZE(X) + ST t1, 13 * SIZE(X) + ST t2, 14 * SIZE(X) + ST t3, 15 * SIZE(X) + addl X, 16 * SIZE, X + .align 4 + +$L15: + and N, 15, I + +#else + + sra N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + ldi I, -1(I) + LD a1, 9 * SIZE(X) + addl X, 8 * 
SIZE, X + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + ST t0, -4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, -3 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, -2 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, -1 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + fillcs PREFETCHSIZE * SIZE(X) + bne I, $L12 + .align 4 + +$L13: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 4 * SIZE(X) + ST t1, 5 * SIZE(X) + ST t2, 6 * SIZE(X) + ST t3, 7 * SIZE(X) + addl X, 8 * SIZE, X + .align 4 + +$L15: + and N, 7, I + +#endif + + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(X) + + addl X, SIZE, X + + ldi I, -1(I) + bne I, $L17 + ret + .align 4 + +$L20: + sra N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + MUL a0, ALPHA, t0 + ldi I, -1(I) + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + MUL a1, ALPHA, t1 + SXADDQ INCX, X, X + unop + + LD a6, 0 * SIZE(X) + MUL a2, ALPHA, t2 + SXADDQ INCX, X, X + unop + + LD a7, 0 * SIZE(X) + MUL a3, ALPHA, t3 + SXADDQ INCX, X, X + ble I, $L23 + .align 4 + +$L22: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + fillcs PREFETCHSIZE * SIZE(X) + SXADDQ INCX, XX, XX + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + unop + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t0, 0 * SIZE(XX) + MUL a0, ALPHA, t0 + SXADDQ INCX, XX, XX + unop + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t1, 0 * SIZE(XX) + MUL a1, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a2, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a6, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a3, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a7, 0 * SIZE(X) + SXADDQ INCX, X, X + unop + bne I, $L22 + .align 4 + +$L23: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + SXADDQ INCX, XX, XX + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + + ST t0, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t1, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t2, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t3, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L999 + .align 4 + +$L27: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(XX) + + SXADDQ INCX, X, X + SXADDQ INCX, XX, XX + + ldi I, -1(I) + bne I, $L27 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/scal_simd.S b/kernel/sw_64/scal_simd.S new file mode 100644 index 0000000..7462e99 --- /dev/null +++ b/kernel/sw_64/scal_simd.S @@ -0,0 +1,344 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 144 + +#define N $16 +#define X $20 +#define INCX $21 + +#define XX $18 +#define I $19 + +#define ALPHA $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov X, XX + ble N, $L999 + + cmpeq INCX, 1, $0 + beq $0, $L20 + +/** + test the address of X +**/ + and X, (VEC_LEN*SIZE-1), $4 + beq $4, $Align_X_Access + + .align 5 +/** + process the unalign address of X +**/ + sra N, 4, I + ble I, $Remain /*if N is too small(less then unroll size), don't need process unalign X. 
Just jump to remain section.*/ + + sra $4, BASE_SHIFT, $4 + ldi $3, VEC_LEN + subl $3, $4, $4 + subl N, $4, N + +$UnAlign_X_Loop: + LD a0, 0*SIZE(X) + MUL a0, ALPHA, t0 + ST t0, 0*SIZE(X) + addl X, SIZE, X + + + + subl $4, 1, $4 + bgt $4, $UnAlign_X_Loop + .align 5 + +$Align_X_Access: + +/* + Unloop 16 +*/ + sra N, 4, I + vcpyf ALPHA, ALPHA + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + VLD a1, 1*VEC_LEN*SIZE(X) + VLD a2, 2*VEC_LEN*SIZE(X) + VLD a3, 3*VEC_LEN*SIZE(X) + + ldi I, -1(I) + ble I, $MainLoop_End + .align 5 +$MainLoop: + VMUL a0, ALPHA, t0 + VLD a0, 4*VEC_LEN*SIZE(X) + VMUL a1, ALPHA, t1 + VLD a1, 5*VEC_LEN*SIZE(X) + + VMUL a2, ALPHA, t2 + VLD a2, 6*VEC_LEN*SIZE(X) + VMUL a3, ALPHA, t3 + VLD a3, 7*VEC_LEN*SIZE(X) + + VST t0, 0*VEC_LEN*SIZE(X) + VST t1, 1*VEC_LEN*SIZE(X) + VST t2, 2*VEC_LEN*SIZE(X) + VST t3, 3*VEC_LEN*SIZE(X) + + fillcs PREFETCHSIZE * SIZE(X) + ldi I, -1(I) + addl X, 16 * SIZE, X + bne I, $MainLoop + .align 5 + +$MainLoop_End: + VMUL a0, ALPHA, t0 + VST t0, 0*VEC_LEN*SIZE(X) + VMUL a1, ALPHA, t1 + VST t1, 1*VEC_LEN*SIZE(X) + + VMUL a2, ALPHA, t2 + VST t2, 2*VEC_LEN*SIZE(X) + VMUL a3, ALPHA, t3 + VST t3, 3*VEC_LEN*SIZE(X) + + addl X, 16 * SIZE, X + .align 5 + +$Remain: + and N, 15, I + unop + unop + ble I, $L999 + .align 5 + +$L17: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(X) + + addl X, SIZE, X + + ldi I, -1(I) + bne I, $L17 + ret + .align 5 + +$L20: + sra N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + MUL a0, ALPHA, t0 + ldi I, -1(I) + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + MUL a1, ALPHA, t1 + SXADDQ INCX, X, X + unop + + LD a6, 0 * SIZE(X) + MUL a2, ALPHA, t2 + SXADDQ INCX, X, X + unop + + LD a7, 0 * SIZE(X) + MUL a3, ALPHA, t3 + SXADDQ INCX, X, X + ble I, $L23 + .align 5 + +$L22: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 +/* + fillcs PREFETCHSIZE * SIZE(X) +*/ + fillcs PREFETCHSIZE * SIZE(X) + SXADDQ INCX, XX, XX + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + unop + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t0, 0 * SIZE(XX) + MUL a0, ALPHA, t0 + SXADDQ INCX, XX, XX + unop + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t1, 0 * SIZE(XX) + MUL a1, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a2, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a6, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a3, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a7, 0 * SIZE(X) + SXADDQ INCX, X, X + unop + bne I, $L22 + .align 5 + +$L23: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + SXADDQ INCX, XX, XX + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + + ST t0, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t1, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t2, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t3, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 5 + +$L25: + and N, 7, I + unop + unop + ble I, $L999 + .align 5 + +$L27: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + 
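+ /* strided remainder of the scal kernel: store the scaled element, then step both the read pointer X and the write pointer XX forward by INCX */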
ST t0, 0 * SIZE(XX) + + SXADDQ INCX, X, X + SXADDQ INCX, XX, XX + + ldi I, -1(I) + bne I, $L27 + .align 5 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/snrm2.S b/kernel/sw_64/snrm2.S new file mode 100644 index 0000000..ff1ec57 --- /dev/null +++ b/kernel/sw_64/snrm2.S @@ -0,0 +1,491 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 +#define x8 $f24 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stl $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, x8 + fmov x8,a0 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1,x8 + fmov x8,a1 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2,x8 + fmov x8,a2 + #unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3,x8 + fmov x8,a3 + #unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, x8 + fmov x8,a0 + #unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd a1, t1, x8 + fmov x8,a1 + #unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd a2, t2, x8 + fmov x8,a2 + #unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd a3, t3, x8 + fmov x8,a3 + #unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, x8 + fmov x8,a0 + #unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1,x8 + fmov x8,a1 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, x8 + fmov x8,a2 + #unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3,x8 + fmov x8,a3 + #unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd a0, t0, x8 + fmov x8,a0 + #unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd a1, t1,x8 + fmov x8,a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd a2, t2, x8 + fmov x8,a2 + #unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd a3, t3,x8 + fmov x8,a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0,x8 + fmov x8,a0 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1,x8 + fmov x8,a1 + #unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2,x8 + fmov x8,a2 + #unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, x8 + fmov x8,a3 + #unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, x8 + fmov x8,a0 + #unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd a1, t1, x8 + fmov x8,a1 + #unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd a2, t2, x8 + fmov x8,a2 + #unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd a3, t3,x8 + fmov x8,a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0,x8 + fmov x8,a0 + fmuld x0, x0, t0 + faddd a1, t1, x8 + fmov x8,a1 + fmuld x1, x1, t1 + + faddd a2, t2, x8 + fmov x8,a2 + fmuld x2, x2, t2 + faddd a3, t3, x8 + fmov x8,a3 + fmuld x3, x3, t3 + + faddd a0, t0, x8 + fmov x8,a0 + 
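+ /* wind-down of the unrolled nrm2 loop: the remaining squared elements are folded into the partial sums a0..a3; each faddd writes the scratch register x8 and is copied back with fmov */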
fmuld x4, x4, t0 + faddd a1, t1, x8 + fmov x8,a1 + fmuld x5, x5, t1 + + faddd a2, t2, x8 + fmov x8,a2 + fmuld x6, x6, t2 + faddd a3, t3, x8 + fmov x8,a3 + fmuld x7, x7, t3 + + faddd a1, t1, x8 + fmov x8,a1 + faddd a2, t2, x8 + fmov x8,a2 + faddd a3, t3, x8 + fmov x8,a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + ldi X, 1 * SIZE(X) + + faddd a0, t0,x8 + fmov x8,a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addl X, INCX, X + LD x1, 0 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + addl X, INCX, X + LD x3, 0 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + addl X, INCX, X + LD x5, 0 * SIZE(X) + addl X, INCX, X + LD x6, 0 * SIZE(X) + addl X, INCX, X + + ldi I, -1(I) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, x8 + fmov x8,a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1,x8 + fmov x8,a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + addl X, INCX, X + + faddd a2, t2,x8 + fmov x8,a2 + LD x1, 0 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3,x8 + fmov x8,a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + addl X, INCX, X + + faddd a0, t0,x8 + fmov x8,a0 + LD x3, 0 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1,x8 + fmov x8,a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + addl X, INCX, X + + faddd a2, t2,x8 + fmov x8,a2 + LD x5, 0 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, x8 + fmov x8,a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + addl X, INCX, X + + ldi I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0,x8 + fmov x8,a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, x8 + fmov x8,a1 + unop + fmuld x1, x1, t1 + unop + + faddd a2, t2,x8 + fmov x8,a2 + fmuld x2, x2, t2 + faddd a3, t3, x8 + fmov x8,a3 + fmuld x3, x3, t3 + + faddd a0, t0, x8 + fmov x8,a0 + fmuld x4, x4, t0 + faddd a1, t1, x8 + fmov x8,a1 + fmuld x5, x5, t1 + + faddd a2, t2, x8 + fmov x8,a2 + fmuld x6, x6, t2 + faddd a3, t3, x8 + fmov x8,a3 + fmuld x7, x7, t3 + + faddd a1, t1, x8 + fmov x8,a1 + faddd a2, t2, x8 + fmov x8,a2 + faddd a3, t3, x8 + fmov x8,a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addl X, INCX, X + + faddd a0, t0,x8 + fmov x8,a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0,x8 + fmov x8,a0 + + faddd a0, a1, x8 + fmov x8,a1 + faddd a2, a3, x8 + fmov x8,a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2,x8 + fsqrtd x8, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/snrm2.S.bak b/kernel/sw_64/snrm2.S.bak new file mode 100644 index 0000000..753c90b --- /dev/null +++ b/kernel/sw_64/snrm2.S.bak @@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, a0 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1, a1 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd a1, t1, a1 + ldi I, -1(I) + 
fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd a3, t3, a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0, a0 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd a3, t3, a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + faddd a2, t2, a2 + fmuld x2, x2, t2 + faddd a3, t3, a3 + fmuld x3, x3, t3 + + faddd a0, t0, a0 + fmuld x4, x4, t0 + faddd a1, t1, a1 + fmuld x5, x5, t1 + + faddd a2, t2, a2 + fmuld x6, x6, t2 + faddd a3, t3, a3 + fmuld x7, x7, t3 + + faddd a1, t1, a1 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + ldi X, 1 * SIZE(X) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addl X, INCX, X + LD x1, 0 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + addl X, INCX, X + LD x3, 0 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + addl X, INCX, X + LD x5, 0 * SIZE(X) + addl X, INCX, X + LD x6, 0 * SIZE(X) + addl X, INCX, X + + ldi I, -1(I) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + addl X, INCX, X + + faddd a2, t2, a2 + LD x1, 0 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + addl X, INCX, X + + faddd a0, t0, a0 + LD x3, 0 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + addl X, INCX, X + + faddd a2, t2, a2 + LD x5, 0 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + addl X, INCX, X + + ldi I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, a0 + LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + unop + fmuld x1, x1, t1 + unop + + faddd a2, t2, a2 + fmuld x2, x2, t2 + faddd a3, t3, a3 + fmuld x3, x3, t3 + + faddd a0, t0, a0 + fmuld x4, x4, t0 + faddd a1, t1, a1 + fmuld x5, x5, t1 + + faddd a2, t2, a2 + fmuld x6, x6, t2 + faddd a3, t3, a3 + fmuld x7, x7, t3 + + faddd a1, t1, a1 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, a0 + fmuld x0, x0, t0 + + ldi I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, a0 + + faddd a0, a1, a0 + faddd a2, a3, a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2, a0 + fsqrtd a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S new file mode 100644 index 
0000000..7bbd23d --- /dev/null +++ b/kernel/sw_64/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S new file mode 100644 index 0000000..0be6d53 --- /dev/null +++ b/kernel/sw_64/sum.S @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, $f24 + fmov $f24,s0 + ldw $31, PREFETCHSIZE * 2 * SIZE(X) + fmov a0, t0 + ldi I, -1(I) + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, $f24 + fmov $f24,s2 + LD a7, 0 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, $f24 + fmov $f24,s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, $f24 + fmov $f24,s0 + LD a1, 0 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, $f24 + fmov $f24,s2 + LD a3, 0 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, $f24 + fmov $f24,s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, $f24 + fmov $f24,s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a7, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, $f24 + fmov $f24,s2 + fmov a2, t2 + ADD s3, t3, $f24 + fmov $f24,s3 + fmov a3, t3 + + ADD s0, t0, $f24 + fmov $f24,s0 + fmov a4, t0 + ADD s1, t1, $f24 + fmov $f24,s1 + fmov a5, t1 + ADD s2, t2, $f24 + fmov $f24,s2 + fmov a6, t2 + ADD s3, t3, $f24 + fmov $f24,s3 + fmov a7, t3 + + ADD s1, t1, $f24 + fmov $f24,s1 + ADD s2, t2, $f24 + fmov $f24,s2 + ADD s3, t3, $f24 + fmov $f24,s3 + + ADD s0, s1, $f24 + fmov $f24,s0 + ADD s2, s3, $f24 + fmov $f24,s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, $f24 + fmov $f24,s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, $f24 + fmov $f24,s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fmov a0, t0 + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, 
t0, $f24 + fmov $f24,s0 + ret + EPILOGUE diff --git a/kernel/sw_64/sw_fpcr.S b/kernel/sw_64/sw_fpcr.S new file mode 100644 index 0000000..5dee238 --- /dev/null +++ b/kernel/sw_64/sw_fpcr.S @@ -0,0 +1,39 @@ +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .arch sw2b + .set noat + .set noreorder +.text + .align 5 + .globl read_fpcr + .ent read_fpcr +read_fpcr: + .frame $sp, 0, $26, 0 + RFPCR $f10 + fstd $f10, 0($16) + ret + .end read_fpcr + + .globl write_fpcr + .ent write_fpcr +write_fpcr: + .frame $sp, 0, $26, 0 + fldd $f10, 0($16) + WFPCR $f10 + ret + .end write_fpcr +/** + .globl fadd_test + .ent fadd_test + +fadd_test: + .frame $sp, 0, $26, 0 + faddd $f16, $f17, $f16 + fmov $f16, $f0 + ret + .end fadd_test +**/ + .ident VERSION + diff --git a/kernel/sw_64/sw_fpcr_inline.c b/kernel/sw_64/sw_fpcr_inline.c new file mode 100644 index 0000000..1943e3e --- /dev/null +++ b/kernel/sw_64/sw_fpcr_inline.c @@ -0,0 +1,13 @@ +#include "common.h" + +void read_fpcr(long * test){ + + __asm__("rfpcr $f10 \n fstd $f10, %0":"=m"(*test):); + return; +} + +void write_fpcr(long * test){ + + __asm__("fldd $f10, %0\nwfpcr $f10"::"m"(*test)); + return; +} diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S new file mode 100644 index 0000000..5c8b679 --- /dev/null +++ b/kernel/sw_64/swap.S @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $20, $17 + mov $21, $18 + ldl $19, 0($sp) + ldl $20, 8($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + subl $18, 1, $1 + subl $20, 1, $2 + ble $16, $SubEnd # if n <= 0 goto $End + or $1, $2, $1 + + sra $16, 3, $21 + + and $16, 7, $22 + bne $1, $Sub + ble $21, $MainRemain + .align 4 + +$MainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f12, 2*SIZE($19) + LD $f13, 3*SIZE($19) + LD $f14, 4*SIZE($19) + LD $f15, 5*SIZE($19) + LD $f16, 6*SIZE($19) + LD $f17, 7*SIZE($19) + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + LD $f22, 2*SIZE($17) + LD $f23, 3*SIZE($17) + LD $f24, 4*SIZE($17) + LD $f25, 5*SIZE($17) + LD $f26, 6*SIZE($17) + LD $f27, 7*SIZE($17) + + fillcs 32*SIZE($17) + unop + fillcs 32*SIZE($19) + subl $21, 1, $21 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f12, 2*SIZE($17) + ST $f13, 3*SIZE($17) + ST $f14, 4*SIZE($17) + ST $f15, 5*SIZE($17) + ST $f16, 6*SIZE($17) + ST $f17, 7*SIZE($17) + + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + ST $f22, 2*SIZE($19) + ST $f23, 3*SIZE($19) + ST $f24, 4*SIZE($19) + ST $f25, 5*SIZE($19) + ST $f26, 6*SIZE($19) + ST $f27, 7*SIZE($19) + + ldi $17, 8*SIZE($17) + ldi $19, 8*SIZE($19) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + ldi $17, 1*SIZE($17) + ldi $19, 1*SIZE($19) + subl $22, 1, $22 + ST $f10, -1*SIZE($17) + ST $f20, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + mov $17, $23 + mov $19, $24 + + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f11, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f13, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f15, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f17, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f21, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f23, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f25, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f27, 0*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f11, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f13, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f15, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f17, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f21, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f23, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f25, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f27, 0*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f20, 0*SIZE($19) + + SXADDQ $18, $17, 
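+ /* strided swap: eight elements have been gathered from one vector; the matching eight are loaded from the other next, then each set is written back to the opposite vector */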
$17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/swap_simd.S b/kernel/sw_64/swap_simd.S new file mode 100644 index 0000000..8a6141d --- /dev/null +++ b/kernel/sw_64/swap_simd.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 64 +#define X $17 +#define Y $19 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $20, $17 + mov $21, $18 + ldl $19, 0($sp) + ldl $20, 8($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + subl $18, 1, $1 + subl $20, 1, $2 + ble $16, $SubEnd # if n <= 0 goto $End + or $1, $2, $1 + +/* + Unloop 16 +*/ + sra $16, 4, $21 + and $16, 15, $22 + bne $1, $Sub + ble $21, $MainRemain + .align 4 + +/* + test the address of Y & X +*/ + and Y, (VEC_LEN*SIZE-1), $4 + and X, (VEC_LEN*SIZE-1), $3 + or $3, $4, $4 + bne $4, $UnAlign_ACCESS + +/* align access*/ + +$MainLoop: + VLD $f10, 0*VEC_LEN*SIZE(Y) + VLD $f11, 1*VEC_LEN*SIZE(Y) + VLD $f12, 2*VEC_LEN*SIZE(Y) + VLD $f13, 3*VEC_LEN*SIZE(Y) + + + VLD $f20, 0*VEC_LEN*SIZE(X) + VLD $f21, 1*VEC_LEN*SIZE(X) + VLD $f22, 2*VEC_LEN*SIZE(X) + VLD $f23, 3*VEC_LEN*SIZE(X) + + + fillcs PREFETCHSIZE * SIZE(X) + unop + fillcs PREFETCHSIZE * SIZE(Y) + subl $21, 1, $21 + + VST $f10, 0*VEC_LEN*SIZE(X) + VST $f11, 1*VEC_LEN*SIZE(X) + VST $f12, 2*VEC_LEN*SIZE(X) + VST $f13, 3*VEC_LEN*SIZE(X) + + VST $f20, 0*VEC_LEN*SIZE(Y) + VST $f21, 1*VEC_LEN*SIZE(Y) + VST $f22, 2*VEC_LEN*SIZE(Y) + VST $f23, 3*VEC_LEN*SIZE(Y) + + ldi $17, 16*SIZE(X) + ldi $19, 16*SIZE(Y) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + ldi $17, 1*SIZE($17) + ldi $19, 1*SIZE($19) + subl $22, 1, $22 + ST $f10, -1*SIZE($17) + ST $f20, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$UnAlign_ACCESS: + sra $16, 3, $21 + and $16, 7, $22 + nop + ble $21, $UnAlign_ACCESS_MainRemain + .align 4 +$UnAlign_ACCESS_MainLoop: + LD $f10, 0*SIZE(Y) + LD $f11, 1*SIZE(Y) + LD $f12, 2*SIZE(Y) + LD $f13, 3*SIZE(Y) + LD $f14, 4*SIZE(Y) + LD $f15, 5*SIZE(Y) + LD $f16, 6*SIZE(Y) + LD $f17, 7*SIZE(Y) + + LD $f20, 0*SIZE(X) + LD $f21, 1*SIZE(X) + LD $f22, 2*SIZE(X) + LD $f23, 3*SIZE(X) + LD $f24, 4*SIZE(X) + LD $f25, 5*SIZE(X) + LD $f26, 6*SIZE(X) + LD $f27, 7*SIZE(X) + + + fillcs PREFETCHSIZE * SIZE(X) + unop + fillcs PREFETCHSIZE * SIZE(Y) + subl $21, 1, $21 + + ST $f10, 0*SIZE(X) + ST $f11, 1*SIZE(X) + ST $f12, 2*SIZE(X) + ST $f13, 3*SIZE(X) + ST $f14, 4*SIZE(X) + ST $f15, 5*SIZE(X) + ST $f16, 6*SIZE(X) + ST $f17, 7*SIZE(X) + + ST $f20, 0*SIZE(Y) + ST $f21, 1*SIZE(Y) + ST $f22, 2*SIZE(Y) + ST $f23, 3*SIZE(Y) + ST $f24, 4*SIZE(Y) + ST $f25, 5*SIZE(Y) + ST $f26, 6*SIZE(Y) + ST $f27, 7*SIZE(Y) + + ldi X, 8*SIZE(X) + ldi Y, 8*SIZE(Y) + bgt $21, $UnAlign_ACCESS_MainLoop + .align 4 + +$UnAlign_ACCESS_MainRemain: + ble $22, $UnAlign_ACCESS_MainEnd + .align 4 + +$UnAlign_ACCESS_MainRemainLoop: + LD $f10, 0*SIZE(Y) + LD $f20, 0*SIZE(X) + ldi X, 1*SIZE(X) + ldi Y, 1*SIZE(Y) + subl $22, 1, $22 + ST $f10, -1*SIZE(X) + ST $f20, -1*SIZE(Y) + bgt $22, $UnAlign_ACCESS_MainRemainLoop + .align 4 + +$UnAlign_ACCESS_MainEnd: + clr $0 + ret + .align 4 + +$Sub: + sra $16, 3, $21 + and $16, 7, $22 + mov $17, $23 + mov $19, $24 + + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f11, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f13, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f15, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f17, 0*SIZE($19) + 
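+ /* non-unit INCX/INCY: the aligned SIMD path is not used and the swap falls back to this strided scalar loop */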
SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f21, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f23, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f25, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f27, 0*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f11, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f13, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f15, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f17, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f21, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f23, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f25, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f27, 0*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f20, 0*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S new file mode 100644 index 0000000..109c471 --- /dev/null +++ b/kernel/sw_64/trsm_kernel_4x4_LN.S @@ -0,0 +1,5144 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(SW6) +#error "Architecture is not specified." +#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 +#define tmp $9 + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + ldl C, 0 + STACKSIZE($sp) + ldl LDC, 8 + STACKSIZE($sp) + ldl OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + stl tmp, 64($sp) + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mull M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + s4addl LDC, 0, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C3 +#ifndef RT + s4addl LDC, C, C +#endif + + fclr t1 + addl C3, LDC, C4 + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * 
SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b5, 3 * SIZE(BO) + FIMOVD b5, tmp + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + IFMOVD tmp, b5 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, b5 + fmov b5, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, b5 + fmov b5, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, b5 + fmov b5, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, b5 + fmov b5, t3 + + ADD c13, t4, b5 + fmov b5, c13 + ldi AO, 1 * SIZE(AO) + MUL a1, b4, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + ADD c09, t3, b5 + fmov b5, c09 + ADD c13, t4, b5 + fmov b5, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a3, c01, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL a4, c01, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b2, c05, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL b3, c05, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a2, c09, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + MUL a3, 
c13, b5 + fmov b5, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a2, c13, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a4, c13, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b2, c09, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL b3, c09, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a2, c05, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) + ldi C3, -1 * SIZE(C3) + ldi C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 
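+ /* software-pipelined 2x4 block update: A*B products accumulate into c01..c14 while the next A and B operands are reloaded from AO/BO */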
+ LD b2, -3 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, b5 + fmov b5, c09 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, b5 + fmov b5, t3 + unop + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, b5 + fmov b5, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, b5 + fmov b5, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, b5 + fmov b5, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, b5 + fmov b5, c10 + MUL a2, b1, b5 + fmov b5, t2 + ADD c13, t3, b5 + fmov b5, c13 + MUL a1, b2, b5 + fmov b5, t3 + + ADD c14, t4, b5 + fmov b5, c14 + MUL a2, b2, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, b5 + fmov b5, t2 + ADD c05, t3, b5 + fmov b5, c05 + MUL a1, b4, b5 + fmov b5, t3 + + ADD c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + ADD c10, t2, b5 + fmov b5, c10 + ADD c13, t3, b5 + fmov b5, c13 + ADD c14, t4, b5 + fmov b5, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 + + SUB b1, c02, b5 + fmov b5, c02 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c10, b5 + fmov b5, c10 + SUB b4, c14, b5 + fmov b5, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c05, b5 + fmov b5, c05 + SUB a4, c06, b5 + fmov b5, c06 + + SUB b1, c09, b5 + fmov b5, c09 + SUB b2, c10, b5 + fmov b5, c10 + SUB b3, c13, b5 + fmov b5, c13 + SUB b4, c14, b5 + fmov b5, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c10, b5 + fmov b5, t3 + MUL a2, c14, b5 + fmov b5, t4 
+ + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c09, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + MUL a2, c09, b5 + fmov b5, t3 + MUL a2, c13, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c06, b5 + fmov b5, c06 + MUL a3, c10, b5 + fmov b5, c10 + MUL a3, c14, b5 + fmov b5, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c02, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c02, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b1, c06, b5 + fmov b5, c06 + + MUL b2, c05, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL b3, c05, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + MUL a2, c09, b5 + fmov b5, t1 + MUL a2, c10, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + MUL a3, c13, b5 + fmov b5, c13 + MUL a3, c14, b5 + fmov b5, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c13, b5 + fmov b5, t1 + MUL a2, c14, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a3, c13, b5 + fmov b5, t1 + MUL a3, c14, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a4, c13, b5 + fmov b5, t1 + MUL a4, c14, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b1, c10, b5 + fmov b5, c10 + + MUL b2, c09, b5 + fmov b5, t1 + MUL b2, c10, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL b3, c09, b5 + fmov b5, t1 + MUL b3, c10, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if 
defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) + ldi C3, -2 * SIZE(C3) + ldi C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) + ldi C3, 2 * SIZE(C3) + ldi C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L30: + sra M, 2, I + ble I, $L39 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(KK) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp + +/* 2 */ + ADD c01, t1, b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 
+ unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, b5 + fmov b5, c11 + unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, b5 + fmov b5, c01 + unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, b5 + fmov b5, c11 + MUL b1, a1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, b5 + fmov b5, c12 + MUL b1, a2, b5 + fmov b5, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, b5 + fmov b5, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, b5 + fmov b5, t3 + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, b5 + fmov b5, t4 + + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, b5 + fmov b5, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, b5 + fmov b5, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, b5 + fmov b5, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, b5 + fmov b5, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, b5 + fmov b5, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, b5 + fmov b5, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, b5 + fmov b5, c12 + MUL b1, a2, b5 + fmov b5, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, b5 + fmov b5, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL b1, 
a4, b5 + fmov b5, t2 + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, b5 + fmov b5, t3 + + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, b5 + fmov b5, t4 + ADD c03, t1, b5 + fmov b5, c03 + MUL b3, a1, b5 + fmov b5, t1 + + ADD c04, t2, b5 + fmov b5, c04 + MUL b3, a2, b5 + fmov b5, t2 + ADD c08, t3, b5 + fmov b5, c08 + MUL b4, a2, b5 + fmov b5, t3 + + ADD c13, t4, b5 + fmov b5, c13 + MUL b2, a3, b5 + fmov b5, t4 + ADD c09, t1, b5 + fmov b5, c09 + MUL b3, a3, b5 + fmov b5, t1 + + ADD c10, t2, b5 + fmov b5, c10 + MUL b3, a4, b5 + fmov b5, t2 + ADD c14, t3, b5 + fmov b5, c14 + MUL b4, a4, b5 + fmov b5, t3 + + ADD c07, t4, b5 + fmov b5, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c11, t1, b5 + fmov b5, c11 + ADD c12, t2, b5 + fmov b5, c12 + ADD c16, t3, b5 + fmov b5, c16 + ADD c15, t4, b5 + fmov b5, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 + + SUB b1, c02, b5 + fmov b5, c02 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c10, b5 + fmov b5, c10 + SUB b4, c14, b5 + fmov b5, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, b5 + fmov b5, c03 + SUB a2, c07, b5 + fmov b5, c07 + SUB a3, c11, b5 + fmov b5, c11 + SUB a4, c15, b5 + fmov b5, c15 + + SUB b1, c04, b5 + fmov b5, c04 + SUB b2, c08, b5 + fmov b5, c08 + SUB b3, c12, b5 + fmov b5, c12 + SUB b4, c16, b5 + fmov b5, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c05, b5 + fmov b5, c05 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c07, b5 + fmov b5, c07 + SUB b4, c08, b5 + fmov b5, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, b5 + fmov b5, c09 + SUB a2, c10, b5 + fmov b5, c10 + SUB a3, c11, b5 + fmov b5, c11 + SUB a4, c12, b5 + fmov b5, c12 + + SUB b1, c13, b5 + fmov b5, c13 + SUB b2, c14, b5 + fmov b5, c14 + SUB b3, c15, b5 + fmov b5, c15 + SUB b4, c16, b5 + fmov b5, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c08, b5 + fmov b5, c08 + MUL a1, c12, b5 + fmov b5, c12 + MUL a1, c16, b5 + fmov b5, c16 + + MUL a2, c04, b5 + fmov b5, t1 + MUL a2, c08, b5 + fmov b5, t2 + MUL a2, c12, b5 + fmov b5, t3 + MUL a2, c16, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL a3, c04, b5 + fmov b5, t1 + MUL a3, c08, b5 + fmov b5, 
t2 + MUL a3, c12, b5 + fmov b5, t3 + MUL a3, c16, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a4, c04, b5 + fmov b5, t1 + MUL a4, c08, b5 + fmov b5, t2 + MUL a4, c12, b5 + fmov b5, t3 + MUL a4, c16, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b1, c07, b5 + fmov b5, c07 + MUL b1, c11, b5 + fmov b5, c11 + MUL b1, c15, b5 + fmov b5, c15 + + MUL b2, c03, b5 + fmov b5, t1 + MUL b2, c07, b5 + fmov b5, t2 + MUL b2, c11, b5 + fmov b5, t3 + MUL b2, c15, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL b3, c03, b5 + fmov b5, t1 + MUL b3, c07, b5 + fmov b5, t2 + MUL b3, c11, b5 + fmov b5, t3 + MUL b3, c15, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c10, b5 + fmov b5, t3 + MUL a2, c14, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c09, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + MUL a2, c09, b5 + fmov b5, t3 + MUL a2, c13, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c05, b5 + fmov b5, t2 + MUL a3, c09, b5 + fmov b5, t3 + MUL a3, c13, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c05, b5 + fmov b5, t2 + MUL a4, c09, b5 + fmov b5, t3 + MUL a4, c13, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b1, c06, b5 + fmov b5, c06 + MUL b1, c10, b5 + fmov b5, c10 + MUL b1, c14, b5 + fmov b5, c14 + + MUL b2, c02, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + MUL b2, c10, b5 + fmov b5, t3 + MUL b2, c14, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL b3, c02, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + MUL b3, c10, b5 + fmov b5, t3 + MUL b3, c14, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov 
b5, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c15, b5 + fmov b5, c15 + + MUL a2, c03, b5 + fmov b5, t1 + MUL a2, c07, b5 + fmov b5, t2 + MUL a2, c11, b5 + fmov b5, t3 + MUL a2, c15, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + MUL a3, c04, b5 + fmov b5, c04 + MUL a3, c08, b5 + fmov b5, c08 + MUL a3, c12, b5 + fmov b5, c12 + MUL a3, c16, b5 + fmov b5, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + MUL a2, c03, b5 + fmov b5, t3 + MUL a2, c04, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c02, b5 + fmov b5, t2 + MUL a3, c03, b5 + fmov b5, t3 + MUL a3, c04, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c02, b5 + fmov b5, t2 + MUL a4, c03, b5 + fmov b5, t3 + MUL a4, c04, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b1, c06, b5 + fmov b5, c06 + MUL b1, c07, b5 + fmov b5, c07 + MUL b1, c08, b5 + fmov b5, c08 + + MUL b2, c05, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + MUL b2, c07, b5 + fmov b5, t3 + MUL b2, c08, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL b3, c05, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + MUL b3, c07, b5 + fmov b5, t3 + MUL b3, c08, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + MUL a2, c09, b5 + fmov b5, t1 + MUL a2, c10, b5 + fmov b5, t2 + MUL a2, c11, b5 + fmov b5, t3 + MUL a2, c12, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + MUL a3, c13, b5 + fmov b5, c13 + MUL a3, c14, b5 + fmov b5, c14 + MUL a3, c15, b5 + fmov b5, c15 + MUL a3, c16, b5 + fmov b5, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a1, c14, b5 + fmov b5, c14 + MUL a1, c15, b5 + fmov b5, c15 + MUL a1, c16, b5 + fmov b5, c16 + + MUL a2, c13, b5 + fmov b5, t1 + MUL a2, c14, b5 + fmov b5, t2 + MUL a2, c15, b5 + fmov b5, t3 + MUL a2, c16, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a3, c13, b5 + fmov b5, t1 + MUL a3, c14, b5 + fmov b5, 
t2 + MUL a3, c15, b5 + fmov b5, t3 + MUL a3, c16, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a4, c13, b5 + fmov b5, t1 + MUL a4, c14, b5 + fmov b5, t2 + MUL a4, c15, b5 + fmov b5, t3 + MUL a4, c16, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b1, c10, b5 + fmov b5, c10 + MUL b1, c11, b5 + fmov b5, c11 + MUL b1, c12, b5 + fmov b5, c12 + + MUL b2, c09, b5 + fmov b5, t1 + MUL b2, c10, b5 + fmov b5, t2 + MUL b2, c11, b5 + fmov b5, t3 + MUL b2, c12, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL b3, c09, b5 + fmov b5, t1 + MUL b3, c10, b5 + fmov b5, t2 + MUL b3, c11, b5 + fmov b5, t3 + MUL b3, c12, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c07, b5 + fmov b5, t3 + MUL a2, c08, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c03, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) + ldi C3, -4 * SIZE(C3) + ldi C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + ldi C3, 4 * SIZE(C3) + ldi C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + 
ldi I, -1(I) + + bgt I, $L11 + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 4, KK +#endif + +#ifdef RT + subl KK, 4, KK +#endif + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + addl LDC, LDC, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + fclr t1 +#ifndef RT + addl C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L60 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, b5 + fmov b5, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, b5 + fmov b5, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, b5 + fmov b5, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, b5 + fmov b5, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + ADD c02, t3, b5 + fmov b5, c02 + ADD c06, t4, b5 + fmov b5, c06 + + ADD c01, c02, b5 + fmov b5, c01 + ldi AO, 1 * SIZE(AO) + ADD c05, c06, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + 
LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a2, c05, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, b5 + fmov b5, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, b5 + fmov b5, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, b5 + fmov b5, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a4, b4, b5 + fmov b5, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, b5 + fmov b5, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + ADD c05, t3, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t3 + + ADD c06, t4, b5 
+ fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, b5 + fmov b5, t4 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c05, t3, b5 + fmov b5, c05 + ADD c06, t4, b5 + fmov b5, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c02, b5 + fmov b5, c02 + SUB a4, c06, b5 + fmov b5, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c05, b5 + fmov b5, c05 + SUB a4, c06, b5 + fmov b5, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c06, b5 + fmov b5, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c06, b5 + fmov b5, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L70: + sra M, 2, I + ble I, $L79 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + 
fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b1, b5 + fmov b5, t1 + unop + + ADD c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, b5 + fmov b5, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, b5 + fmov b5, t3 + unop + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, b5 + fmov b5, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, b5 + fmov b5, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b3, b5 + fmov b5, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b3, b5 + fmov b5, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b3, b5 + fmov b5, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a5, b3, b5 + fmov b5, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b4, b5 + fmov b5, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b4, b5 + fmov b5, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, b5 + fmov b5, c05 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, b5 + fmov b5, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, b5 + fmov b5, t3 + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, b5 + fmov b5, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, b5 + fmov b5, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, b5 + fmov b5, t3 + + ADD c08, t4, b5 + fmov b5, c08 + MUL a4, b1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b2, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, b5 + fmov b5, t2 + ADD c03, t3, 
b5 + fmov b5, c03 + MUL a3, b2, b5 + fmov b5, t3 + + ADD c04, t4, b5 + fmov b5, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, b5 + fmov b5, t4 + ldi BO, 2 * SIZE(BO) + + ADD c05, t1, b5 + fmov b5, c05 + ADD c06, t2, b5 + fmov b5, c06 + ADD c07, t3, b5 + fmov b5, c07 + ADD c08, t4, b5 + fmov b5, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c02, b5 + fmov b5, c02 + SUB a4, c06, b5 + fmov b5, c06 + + SUB b1, c03, b5 + fmov b5, c03 + SUB b2, c07, b5 + fmov b5, c07 + SUB b3, c04, b5 + fmov b5, c04 + SUB b4, c08, b5 + fmov b5, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c05, b5 + fmov b5, c05 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c07, b5 + fmov b5, c07 + SUB b4, c08, b5 + fmov b5, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c04, b5 + fmov b5, t1 + MUL a2, c08, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL a3, c04, b5 + fmov b5, t1 + MUL a3, c08, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a4, c04, b5 + fmov b5, t1 + MUL a4, c08, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b1, c07, b5 + fmov b5, c07 + + MUL b2, c03, b5 + fmov b5, t1 + MUL b2, c07, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL b3, c03, b5 + fmov b5, t1 + MUL b3, c07, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c05, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c05, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b1, c06, b5 + fmov b5, 
c06 + + MUL b2, c02, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL b3, c02, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c07, b5 + fmov b5, c07 + + MUL a2, c03, b5 + fmov b5, t1 + MUL a2, c07, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + MUL a3, c04, b5 + fmov b5, c04 + MUL a3, c08, b5 + fmov b5, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + MUL a2, c03, b5 + fmov b5, t3 + MUL a2, c04, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c06, b5 + fmov b5, c06 + MUL a3, c07, b5 + fmov b5, c07 + MUL a3, c08, b5 + fmov b5, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c07, b5 + fmov b5, t3 + MUL a2, c08, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c03, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L51 + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO 
+#endif + + and M, 1, I + ble I, $L100 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + MUL a3, b3, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b4, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c02, b5 + fmov b5, c01 + ADD c03, c04, b5 + fmov b5, c03 + ADD c01, c03, b5 + fmov b5, c01 + +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + ldi C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, 
TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, b5 + fmov b5, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c03, b5 + fmov b5, c01 + ADD c02, c04, b5 + fmov b5, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a2, c02, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK 
+#endif + .align 4 + +$L110: + sra M, 2, I + ble I, $L119 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi L, -1(L) + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, b5 + fmov b5, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, b5 + fmov b5, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b3, b5 + fmov b5, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b3, b5 + fmov b5, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b4, b5 + fmov b5, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + 
+#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a2, c04, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a4, c04, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b2, c03, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL b3, c03, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a2, c02, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a3, c01, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL a4, c01, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b2, c02, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL b3, c02, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a2, c03, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl tmp, 64($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S.bak 
b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak
new file mode 100644
index 0000000..8405570
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak
@@ -0,0 +1,4073 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP nop +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + ldl C, 0 + STACKSIZE($sp) + ldl LDC, 8 + STACKSIZE($sp) + ldl OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mull M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + s4addl LDC, 0, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C3 +#ifndef RT + s4addl LDC, C, C +#endif + + fclr t1 + addl C3, LDC, C4 + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + 
ldi BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + ldi AO, 1 * SIZE(AO) + MUL a1, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) + ldi C3, -1 * SIZE(C3) + ldi C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef 
LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) 
|| defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * 
SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) + ldi C3, -2 * SIZE(C3) + ldi C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) + ldi C3, 2 * SIZE(C3) + ldi C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L30: + sra M, 2, I + ble I, $L39 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(KK) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, 
t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, 
c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 
+ SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * 
SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) + ldi C3, -4 * SIZE(C3) + ldi C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + ldi C3, 4 * SIZE(C3) + ldi C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L11 + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 4, KK +#endif + +#ifdef RT + subl KK, 4, KK +#endif + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + addl LDC, LDC, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + fclr t1 +#ifndef RT + addl C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L60 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, t1 + ldi BO, 2 * 
SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + ldi AO, 1 * SIZE(AO) + ADD c05, c06, c05 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 
+ LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L70: + sra M, 2, I + ble I, $L79 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 
2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, 
c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT 
+ 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L51 + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L100 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + ldi C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr 
t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L110: + sra M, 2, I + ble I, $L119 + .align 4 + +$L91: +#if defined(LT) || 
defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, 
c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S new file mode 100644 index 0000000..54f8a51 --- /dev/null +++ b/kernel/sw_64/trsm_kernel_4x4_LT.S @@ -0,0 +1,5145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(SW6) +#error "Architecture is not specified." +#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 88 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define tmp $9 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + ldl C, 0 + STACKSIZE($sp) + ldl LDC, 8 + STACKSIZE($sp) + ldl OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + stl $9, 64($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + s4addl LDC, 0, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C3 +#ifndef RT + s4addl LDC, C, C +#endif + + fclr t1 + addl C3, LDC, C4 + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD 
a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(KK) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + flds $f31, 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp +/* 2 */ + ADD c01, t1, b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, b5 + fmov b5, c11 + unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, b5 + fmov b5, c01 + unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a6, t4 + 
LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, b5 + fmov b5, c11 + MUL b1, a1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, b5 + fmov b5, c12 + MUL b1, a2, b5 + fmov b5, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, b5 + fmov b5, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, b5 + fmov b5, t3 + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, b5 + fmov b5, t4 + + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, b5 + fmov b5, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, b5 + fmov b5, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, b5 + fmov b5, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, b5 + fmov b5, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, b5 + fmov b5, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, b5 + fmov b5, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, b5 + fmov b5, c12 + MUL b1, a2, b5 + fmov b5, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, b5 + fmov b5, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL b1, a4, b5 + fmov b5, t2 + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, b5 + fmov b5, t3 + + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, b5 + fmov b5, t4 + ADD c03, t1, b5 + fmov b5, c03 + MUL b3, a1, b5 + fmov b5, t1 + + ADD c04, t2, b5 + fmov b5, c04 + MUL b3, a2, b5 + fmov b5, t2 + ADD c08, t3, b5 + fmov b5, c08 + MUL b4, a2, b5 + fmov b5, t3 + + ADD c13, t4, b5 + fmov b5, c13 + MUL b2, a3, b5 + fmov b5, t4 + ADD c09, t1, b5 + fmov b5, c09 + MUL b3, a3, b5 + fmov b5, t1 + + ADD c10, t2, b5 + fmov b5, c10 + MUL b3, a4, b5 + fmov b5, t2 + ADD c14, t3, b5 + fmov b5, c14 + MUL b4, a4, b5 + fmov b5, t3 + + ADD c07, t4, b5 + fmov b5, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c11, t1, b5 + fmov b5, c11 + ADD c12, t2, b5 + fmov b5, c12 + ADD c16, t3, b5 + fmov b5, c16 + ADD c15, t4, b5 + fmov b5, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * 
SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 + + SUB b1, c02, b5 + fmov b5, c02 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c10, b5 + fmov b5, c10 + SUB b4, c14, b5 + fmov b5, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, b5 + fmov b5, c03 + SUB a2, c07, b5 + fmov b5, c07 + SUB a3, c11, b5 + fmov b5, c11 + SUB a4, c15, b5 + fmov b5, c15 + + SUB b1, c04, b5 + fmov b5, c04 + SUB b2, c08, b5 + fmov b5, c08 + SUB b3, c12, b5 + fmov b5, c12 + SUB b4, c16, b5 + fmov b5, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c05, b5 + fmov b5, c05 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c07, b5 + fmov b5, c07 + SUB b4, c08, b5 + fmov b5, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, b5 + fmov b5, c09 + SUB a2, c10, b5 + fmov b5, c10 + SUB a3, c11, b5 + fmov b5, c11 + SUB a4, c12, b5 + fmov b5, c12 + + SUB b1, c13, b5 + fmov b5, c13 + SUB b2, c14, b5 + fmov b5, c14 + SUB b3, c15, b5 + fmov b5, c15 + SUB b4, c16, b5 + fmov b5, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c08, b5 + fmov b5, c08 + MUL a1, c12, b5 + fmov b5, c12 + MUL a1, c16, b5 + fmov b5, c16 + + MUL a2, c04, b5 + fmov b5, t1 + MUL a2, c08, b5 + fmov b5, t2 + MUL a2, c12, b5 + fmov b5, t3 + MUL a2, c16, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL a3, c04, b5 + fmov b5, t1 + MUL a3, c08, b5 + fmov b5, t2 + MUL a3, c12, b5 + fmov b5, t3 + MUL a3, c16, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a4, c04, b5 + fmov b5, t1 + MUL a4, c08, b5 + fmov b5, t2 + MUL a4, c12, b5 + fmov b5, t3 + MUL a4, c16, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b1, c07, b5 + fmov b5, c07 + MUL b1, c11, b5 + fmov b5, c11 + MUL b1, c15, b5 + fmov b5, c15 + + MUL b2, c03, b5 + fmov b5, t1 + MUL b2, c07, b5 + fmov b5, t2 + MUL b2, c11, b5 + fmov b5, t3 + MUL b2, c15, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL b3, c03, b5 + fmov b5, t1 + MUL b3, c07, b5 + fmov b5, t2 + MUL b3, c11, b5 + fmov b5, t3 + MUL b3, c15, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, 
c10, b5 + fmov b5, c10 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c10, b5 + fmov b5, t3 + MUL a2, c14, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c09, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + MUL a2, c09, b5 + fmov b5, t3 + MUL a2, c13, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c05, b5 + fmov b5, t2 + MUL a3, c09, b5 + fmov b5, t3 + MUL a3, c13, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c05, b5 + fmov b5, t2 + MUL a4, c09, b5 + fmov b5, t3 + MUL a4, c13, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b1, c06, b5 + fmov b5, c06 + MUL b1, c10, b5 + fmov b5, c10 + MUL b1, c14, b5 + fmov b5, c14 + + MUL b2, c02, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + MUL b2, c10, b5 + fmov b5, t3 + MUL b2, c14, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL b3, c02, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + MUL b3, c10, b5 + fmov b5, t3 + MUL b3, c14, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c15, b5 + fmov b5, c15 + + MUL a2, c03, b5 + fmov b5, t1 + MUL a2, c07, b5 + fmov b5, t2 + MUL a2, c11, b5 + fmov b5, t3 + MUL a2, c15, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + MUL a3, c04, b5 + fmov b5, c04 + MUL a3, c08, b5 + fmov b5, c08 + MUL a3, c12, b5 + fmov b5, c12 + MUL a3, c16, b5 + fmov b5, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + MUL a2, c03, b5 + fmov b5, t3 + MUL a2, c04, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c02, b5 + fmov b5, t2 + MUL a3, c03, b5 + fmov b5, t3 + MUL a3, c04, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov 
b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c02, b5 + fmov b5, t2 + MUL a4, c03, b5 + fmov b5, t3 + MUL a4, c04, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b1, c06, b5 + fmov b5, c06 + MUL b1, c07, b5 + fmov b5, c07 + MUL b1, c08, b5 + fmov b5, c08 + + MUL b2, c05, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + MUL b2, c07, b5 + fmov b5, t3 + MUL b2, c08, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL b3, c05, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + MUL b3, c07, b5 + fmov b5, t3 + MUL b3, c08, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + MUL a2, c09, b5 + fmov b5, t1 + MUL a2, c10, b5 + fmov b5, t2 + MUL a2, c11, b5 + fmov b5, t3 + MUL a2, c12, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + MUL a3, c13, b5 + fmov b5, c13 + MUL a3, c14, b5 + fmov b5, c14 + MUL a3, c15, b5 + fmov b5, c15 + MUL a3, c16, b5 + fmov b5, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a1, c14, b5 + fmov b5, c14 + MUL a1, c15, b5 + fmov b5, c15 + MUL a1, c16, b5 + fmov b5, c16 + + MUL a2, c13, b5 + fmov b5, t1 + MUL a2, c14, b5 + fmov b5, t2 + MUL a2, c15, b5 + fmov b5, t3 + MUL a2, c16, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a3, c13, b5 + fmov b5, t1 + MUL a3, c14, b5 + fmov b5, t2 + MUL a3, c15, b5 + fmov b5, t3 + MUL a3, c16, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a4, c13, b5 + fmov b5, t1 + MUL a4, c14, b5 + fmov b5, t2 + MUL a4, c15, b5 + fmov b5, t3 + MUL a4, c16, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b1, c10, b5 + fmov b5, c10 + MUL b1, c11, b5 + fmov b5, c11 + MUL b1, c12, b5 + fmov b5, c12 + + MUL b2, c09, b5 + fmov b5, t1 + MUL b2, c10, b5 + fmov b5, t2 + MUL b2, c11, b5 + fmov b5, t3 + MUL b2, c12, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL b3, c09, b5 + fmov b5, t1 + MUL b3, c10, b5 + fmov b5, t2 + MUL b3, c11, b5 + fmov b5, t3 + MUL b3, c12, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, 
c07, b5 + fmov b5, c07 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c07, b5 + fmov b5, t3 + MUL a2, c08, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c03, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) + ldi C3, -4 * SIZE(C3) + ldi C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + ldi C3, 4 * SIZE(C3) + ldi C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, b5 + fmov b5, 
c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, b5 + fmov b5, c09 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, b5 + fmov b5, t3 + unop + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, b5 + fmov b5, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, b5 + fmov b5, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, b5 + fmov b5, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, b5 + fmov b5, c10 + MUL a2, b1, b5 + fmov b5, t2 + ADD c13, t3, b5 + fmov b5, c13 + MUL a1, b2, b5 + fmov b5, t3 + + ADD c14, t4, b5 + fmov b5, c14 + MUL a2, b2, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, b5 + fmov b5, t2 + ADD c05, t3, b5 + fmov b5, c05 + MUL a1, b4, b5 + fmov b5, t3 + + ADD c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + ADD c10, t2, b5 + fmov b5, c10 + ADD c13, t3, b5 + fmov b5, c13 + ADD c14, t4, b5 + fmov b5, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 + + SUB b1, c02, b5 + fmov b5, c02 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c10, b5 + fmov b5, c10 + SUB b4, c14, b5 + fmov b5, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * 
SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c05, b5 + fmov b5, c05 + SUB a4, c06, b5 + fmov b5, c06 + + SUB b1, c09, b5 + fmov b5, c09 + SUB b2, c10, b5 + fmov b5, c10 + SUB b3, c13, b5 + fmov b5, c13 + SUB b4, c14, b5 + fmov b5, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c10, b5 + fmov b5, t3 + MUL a2, c14, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c09, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + MUL a2, c09, b5 + fmov b5, t3 + MUL a2, c13, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c06, b5 + fmov b5, c06 + MUL a3, c10, b5 + fmov b5, c10 + MUL a3, c14, b5 + fmov b5, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c02, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c02, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b1, c06, b5 + fmov b5, c06 + + MUL b2, c05, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL b3, c05, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + MUL a2, c09, b5 + fmov b5, t1 + MUL a2, c10, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + MUL a3, c13, b5 + fmov b5, c13 + MUL a3, c14, b5 + fmov b5, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c13, b5 + fmov b5, t1 + MUL a2, c14, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a3, c13, b5 + fmov b5, t1 + MUL a3, c14, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a4, c13, b5 + fmov b5, t1 + MUL a4, c14, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 
+ fmov b5, c09 + MUL b1, c10, b5 + fmov b5, c10 + + MUL b2, c09, b5 + fmov b5, t1 + MUL b2, c10, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL b3, c09, b5 + fmov b5, t1 + MUL b3, c10, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) + ldi C3, -2 * SIZE(C3) + ldi C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) + ldi C3, 2 * SIZE(C3) + ldi C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, b5 + fmov b5,c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b5, 3 * SIZE(BO) + FIMOVD b5, tmp + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + IFMOVD tmp, b5 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + 
fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, b5 + fmov b5, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, b5 + fmov b5, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, b5 + fmov b5, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, b5 + fmov b5, t3 + + ADD c13, t4, b5 + fmov b5, c13 + ldi AO, 1 * SIZE(AO) + MUL a1, b4, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + ADD c09, t3, b5 + fmov b5, c09 + ADD c13, t4, b5 + fmov b5, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a3, c01, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL a4, c01, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b2, c05, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL b3, c05, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a2, c09, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a2, c13, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a4, c13, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b2, c09, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL b3, c09, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a2, c05, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 
* SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) + ldi C3, -1 * SIZE(C3) + ldi C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 4, KK +#endif + +#ifdef RT + subl KK, 4, KK +#endif + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + addl LDC, LDC, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + fclr t1 +#ifndef RT + addl C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b1, b5 + fmov b5, t1 + unop + + ADD c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, b5 + fmov b5, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, b5 + fmov b5, t3 + unop + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, b5 + fmov b5, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, b5 + fmov b5, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b3, b5 + fmov b5, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b3, b5 + fmov b5, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b3, b5 + fmov b5, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a5, b3, b5 + fmov b5, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b4, b5 + fmov b5, t1 + LD 
a1, -4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b4, b5 + fmov b5, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, b5 + fmov b5, c05 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, b5 + fmov b5, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, b5 + fmov b5, t3 + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, b5 + fmov b5, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, b5 + fmov b5, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, b5 + fmov b5, t3 + + ADD c08, t4, b5 + fmov b5, c08 + MUL a4, b1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b2, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, b5 + fmov b5, t2 + ADD c03, t3, b5 + fmov b5, c03 + MUL a3, b2, b5 + fmov b5, t3 + + ADD c04, t4, b5 + fmov b5, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, b5 + fmov b5, t4 + ldi BO, 2 * SIZE(BO) + + ADD c05, t1, b5 + fmov b5, c05 + ADD c06, t2, b5 + fmov b5, c06 + ADD c07, t3, b5 + fmov b5, c07 + ADD c08, t4, b5 + fmov b5, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c02, b5 + fmov b5, c02 + SUB a4, c06, b5 + fmov b5, c06 + + SUB b1, c03, b5 + fmov b5, c03 + SUB b2, c07, b5 + fmov b5, c07 + SUB b3, c04, b5 + fmov b5, c04 + SUB b4, c08, b5 + fmov b5, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c05, b5 + fmov b5, c05 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c07, b5 + fmov b5, c07 + SUB b4, c08, b5 + fmov b5, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c04, b5 + fmov b5, t1 + MUL a2, c08, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL a3, c04, b5 + fmov b5, t1 + MUL a3, c08, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, 
c06 + + MUL a4, c04, b5 + fmov b5, t1 + MUL a4, c08, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b1, c07, b5 + fmov b5, c07 + + MUL b2, c03, b5 + fmov b5, t1 + MUL b2, c07, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL b3, c03, b5 + fmov b5, t1 + MUL b3, c07, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c05, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c05, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b1, c06, b5 + fmov b5, c06 + + MUL b2, c02, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL b3, c02, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c07, b5 + fmov b5, c07 + + MUL a2, c03, b5 + fmov b5, t1 + MUL a2, c07, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + MUL a3, c04, b5 + fmov b5, c04 + MUL a3, c08, b5 + fmov b5, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + MUL a2, c03, b5 + fmov b5, t3 + MUL a2, c04, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c06, b5 + fmov b5, c06 + MUL a3, c07, b5 + fmov b5, c07 + MUL a3, c08, b5 + fmov b5, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c07, b5 + fmov b5, t3 + MUL a2, c08, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c03, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) 
+ ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, b5 + fmov b5, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, b5 + fmov b5, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, b5 + fmov b5, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a4, b4, b5 + fmov b5, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, b5 + fmov b5, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + ADD c05, t3, b5 + fmov b5, c05 + MUL a1, b2, 
b5 + fmov b5, t3 + + ADD c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, b5 + fmov b5, t4 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c05, t3, b5 + fmov b5, c05 + ADD c06, t4, b5 + fmov b5, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c02, b5 + fmov b5, c02 + SUB a4, c06, b5 + fmov b5, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c05, b5 + fmov b5, c05 + SUB a4, c06, b5 + fmov b5, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c06, b5 + fmov b5, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c06, b5 + fmov b5, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + LD b3, 2 * 
SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, b5 + fmov b5, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, b5 + fmov b5, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, b5 + fmov b5, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, b5 + fmov b5, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + ADD c02, t3, b5 + fmov b5, c02 + ADD c06, t4, b5 + fmov b5, c06 + + ADD c01, c02, b5 + fmov b5, c01 + ldi AO, 1 * SIZE(AO) + ADD c05, c06, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a2, c05, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef 
LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi L, -1(L) + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, b5 + fmov b5, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, b5 + fmov b5, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b3, b5 + fmov b5, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b3, b5 + fmov b5, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b4, b5 + fmov b5, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, 
b5 + fmov b5, c03 + unop + MUL a3, b1, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a2, c04, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a4, c04, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b2, c03, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL b3, c03, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a2, c02, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a3, c01, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL a4, c01, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b2, c02, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL b3, c02, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a2, c03, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT 
+ addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, b5 + fmov b5, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c03, b5 + fmov b5, c01 + ADD c02, c04, b5 + fmov b5, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a2, c02, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, 
c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + MUL a3, b3, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b4, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c02, b5 + fmov b5, c01 + ADD c03, c04, b5 + fmov b5, c03 + ADD c01, c03, b5 + fmov b5, c01 + +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + ldi C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, 
TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 64($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S.bak b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak
new file mode 100644
index 0000000..86136ae
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak
@@ -0,0 +1,4072 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP nop +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + ldl C, 0 + STACKSIZE($sp) + ldl LDC, 8 + STACKSIZE($sp) + ldl OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mull M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + s4addl LDC, 0, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C3 +#ifndef RT + s4addl LDC, C, C +#endif + + fclr t1 + addl C3, LDC, C4 + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(KK) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * 
SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, 
t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 
+ SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, 
t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) + ldi C3, -4 * SIZE(C3) + ldi C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + ldi C3, 4 * SIZE(C3) + ldi C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * 
SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, 
c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) + ldi C3, -2 * SIZE(C3) + ldi C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) + ldi C3, 2 * SIZE(C3) + ldi C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + 
subl KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + ldi AO, 1 * SIZE(AO) + MUL a1, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 
+#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) + ldi C3, -1 * SIZE(C3) + ldi C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 4, KK +#endif + +#ifdef RT + subl KK, 4, KK +#endif + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + addl LDC, LDC, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + fclr t1 +#ifndef RT + addl C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * 
SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) 
+ LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) 
+ ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + 
ldi C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + ldi AO, 1 * SIZE(AO) + ADD c05, c06, c05 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + 
subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 1, TMP1 
+#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * 
SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, 
t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, c03
+ MUL a3, b3, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, c04
+ MUL a4, b4, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c02, c01
+ ADD c03, c04, c03
+ ADD c01, c03, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S
new file mode 100644
index 0000000..b9a1975
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S
@@ -0,0 +1,5148 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define tmp $9
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl $9, 64($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mulq M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negq OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ and N, 1, J
+ ble J, $L40
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+
fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi L, -1(L) + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, b5 + fmov b5, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, b5 + fmov b5, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b3, b5 + fmov b5, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b3, b5 + fmov b5, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b4, b5 + fmov b5, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b1, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b1, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + 
LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a2, c04, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a4, c04, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b2, c03, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL b3, c03, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a2, c02, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a3, c01, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL a4, c01, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b2, c02, b5 + fmov b5, t1 + SUB c03, t1, b5 + fmov b5, c03 + MUL b3, c02, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a2, c03, b5 + fmov b5, t1 + SUB c04, t1, b5 + fmov b5, c04 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + 
LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, b5 + fmov b5, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -1(L) + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c03, b5 + fmov b5, c01 + ADD c02, c04, b5 + fmov b5, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a2, c02, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c02, t1, b5 + fmov b5, c02 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD 
b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, b5 + fmov b5, c03 + MUL a3, b3, b5 + fmov b5, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b4, b5 + fmov b5, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c03, t3, b5 + fmov b5, c03 + ADD c04, t4, b5 + fmov b5, c04 + + ADD c01, c02, b5 + fmov b5, c01 + ADD c03, c04, b5 + fmov b5, c03 + ADD c01, c03, b5 + fmov b5, c01 + +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + ldi C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + addl LDC, LDC, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + fclr t1 +#ifndef RT + addl C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * 
SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b1, b5 + fmov b5, t1 + unop + + ADD c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, b5 + fmov b5, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, b5 + fmov b5, t3 + unop + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, b5 + fmov b5, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, b5 + fmov b5, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b3, b5 + fmov b5, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b3, b5 + fmov b5, t2 + unop + + ADD c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b3, b5 + fmov b5, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a5, b3, b5 + fmov b5, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b4, b5 + fmov b5, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b4, b5 + fmov b5, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, b5 + fmov b5, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a5, b4, b5 + fmov b5, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, b5 + fmov b5, c05 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, b5 + fmov b5, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, b5 + fmov b5, t3 + + ADD c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, b5 + fmov b5, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, b5 + fmov b5, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, b5 + fmov b5, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, b5 + fmov b5, c04 + MUL a4, b2, b5 + fmov b5, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, b5 + fmov b5, c06 + MUL a2, b1, b5 + fmov b5, t2 + ADD c07, t3, b5 + fmov b5, c07 + MUL a3, b1, b5 + fmov b5, t3 + + ADD c08, t4, b5 + fmov b5, c08 + MUL a4, b1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b2, b5 + fmov b5, t1 + + 
ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b2, b5 + fmov b5, t2 + ADD c03, t3, b5 + fmov b5, c03 + MUL a3, b2, b5 + fmov b5, t3 + + ADD c04, t4, b5 + fmov b5, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, b5 + fmov b5, t4 + ldi BO, 2 * SIZE(BO) + + ADD c05, t1, b5 + fmov b5, c05 + ADD c06, t2, b5 + fmov b5, c06 + ADD c07, t3, b5 + fmov b5, c07 + ADD c08, t4, b5 + fmov b5, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c02, b5 + fmov b5, c02 + SUB a4, c06, b5 + fmov b5, c06 + + SUB b1, c03, b5 + fmov b5, c03 + SUB b2, c07, b5 + fmov b5, c07 + SUB b3, c04, b5 + fmov b5, c04 + SUB b4, c08, b5 + fmov b5, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c05, b5 + fmov b5, c05 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c07, b5 + fmov b5, c07 + SUB b4, c08, b5 + fmov b5, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c04, b5 + fmov b5, t1 + MUL a2, c08, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL a3, c04, b5 + fmov b5, t1 + MUL a3, c08, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a4, c04, b5 + fmov b5, t1 + MUL a4, c08, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b1, c07, b5 + fmov b5, c07 + + MUL b2, c03, b5 + fmov b5, t1 + MUL b2, c07, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL b3, c03, b5 + fmov b5, t1 + MUL b3, c07, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c05, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c05, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 
* SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b1, c06, b5 + fmov b5, c06 + + MUL b2, c02, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + + MUL b3, c02, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c07, b5 + fmov b5, c07 + + MUL a2, c03, b5 + fmov b5, t1 + MUL a2, c07, b5 + fmov b5, t2 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + + MUL a3, c04, b5 + fmov b5, c04 + MUL a3, c08, b5 + fmov b5, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + MUL a2, c03, b5 + fmov b5, t3 + MUL a2, c04, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c06, b5 + fmov b5, c06 + MUL a3, c07, b5 + fmov b5, c07 + MUL a3, c08, b5 + fmov b5, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c07, b5 + fmov b5, t3 + MUL a2, c08, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c03, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif 
+ + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, b5 + fmov b5, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, b5 + fmov b5, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, b5 + fmov b5, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a3, b4, b5 + fmov b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a4, b4, b5 + fmov b5, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, b5 + fmov b5, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b1, b5 + fmov b5, t2 + ADD c05, t3, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t3 + + ADD c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, b5 + fmov b5, t4 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c02, t2, b5 + fmov b5, c02 + ADD c05, t3, b5 + fmov b5, c05 + ADD c06, t4, b5 + fmov b5, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c02, b5 + fmov b5, c02 + SUB a4, c06, b5 + fmov b5, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c05, b5 + fmov b5, c05 + SUB a4, c06, b5 + fmov b5, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + 
MUL a1, c05, b5 + fmov b5, c05 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c06, b5 + fmov b5, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c06, b5 + fmov b5, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, b5 + fmov b5, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, b5 + fmov b5, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, b5 + fmov b5, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, b5 + fmov b5, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 2 * 
SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, b2, b5 + fmov b5, t2 + ADD c02, t3, b5 + fmov b5, c02 + ADD c06, t4, b5 + fmov b5, c06 + + ADD c01, c02, b5 + fmov b5, c01 + ldi AO, 1 * SIZE(AO) + ADD c05, c06, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a3, c05, b5 + fmov b5, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a2, c05, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + .align 4 + +$L80: + sra N, 2, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + s4addl LDC, 0, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C3 +#ifndef RT + s4addl LDC, C, C +#endif + + fclr t1 + addl C3, LDC, C4 + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(KK) + fclr c04 + + fillcs 7 * SIZE(C2) + flds $f31, 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + 
addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, b5 + fmov b5, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp + +/* 2 */ + ADD c01, t1, b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, b5 + fmov b5, c11 + unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, b5 + fmov b5, c01 + unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, b5 + fmov b5, c11 + MUL b1, a1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, b5 + fmov b5, c12 + MUL 
b1, a2, b5 + fmov b5, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, b5 + fmov b5, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, b5 + fmov b5, t3 + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, b5 + fmov b5, t4 + + ADD c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, b5 + fmov b5, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, b5 + fmov b5, t2 + unop + + ADD c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, b5 + fmov b5, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, b5 + fmov b5, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, b5 + fmov b5, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, b5 + fmov b5, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, b5 + fmov b5, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, b5 + fmov b5, c12 + MUL b1, a2, b5 + fmov b5, t2 + ADD c16, t3, b5 + fmov b5, c16 + MUL b2, a2, b5 + fmov b5, t3 + + ADD c15, t4, b5 + fmov b5, c15 + MUL b2, a1, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL b1, a3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL b1, a4, b5 + fmov b5, t2 + ADD c06, t3, b5 + fmov b5, c06 + MUL b2, a4, b5 + fmov b5, t3 + + ADD c05, t4, b5 + fmov b5, c05 + MUL b4, a1, b5 + fmov b5, t4 + ADD c03, t1, b5 + fmov b5, c03 + MUL b3, a1, b5 + fmov b5, t1 + + ADD c04, t2, b5 + fmov b5, c04 + MUL b3, a2, b5 + fmov b5, t2 + ADD c08, t3, b5 + fmov b5, c08 + MUL b4, a2, b5 + fmov b5, t3 + + ADD c13, t4, b5 + fmov b5, c13 + MUL b2, a3, b5 + fmov b5, t4 + ADD c09, t1, b5 + fmov b5, c09 + MUL b3, a3, b5 + fmov b5, t1 + + ADD c10, t2, b5 + fmov b5, c10 + MUL b3, a4, b5 + fmov b5, t2 + ADD c14, t3, b5 + fmov b5, c14 + MUL b4, a4, b5 + fmov b5, t3 + + ADD c07, t4, b5 + fmov b5, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c11, t1, b5 + fmov b5, c11 + ADD c12, t2, b5 + fmov b5, c12 + ADD c16, t3, b5 + fmov b5, c16 + ADD c15, t4, b5 + fmov b5, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 + + SUB b1, c02, b5 + fmov b5, c02 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c10, b5 + fmov b5, c10 + SUB b4, c14, b5 + fmov b5, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, b5 + fmov b5, c03 + SUB a2, c07, b5 + fmov b5, c07 + SUB a3, c11, b5 + fmov b5, c11 + SUB a4, c15, b5 + 
fmov b5, c15 + + SUB b1, c04, b5 + fmov b5, c04 + SUB b2, c08, b5 + fmov b5, c08 + SUB b3, c12, b5 + fmov b5, c12 + SUB b4, c16, b5 + fmov b5, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c05, b5 + fmov b5, c05 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c07, b5 + fmov b5, c07 + SUB b4, c08, b5 + fmov b5, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, b5 + fmov b5, c09 + SUB a2, c10, b5 + fmov b5, c10 + SUB a3, c11, b5 + fmov b5, c11 + SUB a4, c12, b5 + fmov b5, c12 + + SUB b1, c13, b5 + fmov b5, c13 + SUB b2, c14, b5 + fmov b5, c14 + SUB b3, c15, b5 + fmov b5, c15 + SUB b4, c16, b5 + fmov b5, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c08, b5 + fmov b5, c08 + MUL a1, c12, b5 + fmov b5, c12 + MUL a1, c16, b5 + fmov b5, c16 + + MUL a2, c04, b5 + fmov b5, t1 + MUL a2, c08, b5 + fmov b5, t2 + MUL a2, c12, b5 + fmov b5, t3 + MUL a2, c16, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL a3, c04, b5 + fmov b5, t1 + MUL a3, c08, b5 + fmov b5, t2 + MUL a3, c12, b5 + fmov b5, t3 + MUL a3, c16, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a4, c04, b5 + fmov b5, t1 + MUL a4, c08, b5 + fmov b5, t2 + MUL a4, c12, b5 + fmov b5, t3 + MUL a4, c16, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, b5 + fmov b5, c03 + MUL b1, c07, b5 + fmov b5, c07 + MUL b1, c11, b5 + fmov b5, c11 + MUL b1, c15, b5 + fmov b5, c15 + + MUL b2, c03, b5 + fmov b5, t1 + MUL b2, c07, b5 + fmov b5, t2 + MUL b2, c11, b5 + fmov b5, t3 + MUL b2, c15, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL b3, c03, b5 + fmov b5, t1 + MUL b3, c07, b5 + fmov b5, t2 + MUL b3, c11, b5 + fmov b5, t3 + MUL b3, c15, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c10, b5 + fmov b5, t3 + MUL a2, c14, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c09, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, b5 + 
fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + MUL a2, c09, b5 + fmov b5, t3 + MUL a2, c13, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c05, b5 + fmov b5, t2 + MUL a3, c09, b5 + fmov b5, t3 + MUL a3, c13, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c05, b5 + fmov b5, t2 + MUL a4, c09, b5 + fmov b5, t3 + MUL a4, c13, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, b5 + fmov b5, c02 + MUL b1, c06, b5 + fmov b5, c06 + MUL b1, c10, b5 + fmov b5, c10 + MUL b1, c14, b5 + fmov b5, c14 + + MUL b2, c02, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + MUL b2, c10, b5 + fmov b5, t3 + MUL b2, c14, b5 + fmov b5, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c07, t2, b5 + fmov b5, c07 + SUB c11, t3, b5 + fmov b5, c11 + SUB c15, t4, b5 + fmov b5, c15 + + MUL b3, c02, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + MUL b3, c10, b5 + fmov b5, t3 + MUL b3, c14, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c15, b5 + fmov b5, c15 + + MUL a2, c03, b5 + fmov b5, t1 + MUL a2, c07, b5 + fmov b5, t2 + MUL a2, c11, b5 + fmov b5, t3 + MUL a2, c15, b5 + fmov b5, t4 + + SUB c04, t1, b5 + fmov b5, c04 + SUB c08, t2, b5 + fmov b5, c08 + SUB c12, t3, b5 + fmov b5, c12 + SUB c16, t4, b5 + fmov b5, c16 + + MUL a3, c04, b5 + fmov b5, c04 + MUL a3, c08, b5 + fmov b5, c08 + MUL a3, c12, b5 + fmov b5, c12 + MUL a3, c16, b5 + fmov b5, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + MUL a2, c03, b5 + fmov b5, t3 + MUL a2, c04, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c02, b5 + fmov b5, t2 + MUL a3, c03, b5 + fmov b5, t3 + MUL a3, c04, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c02, b5 + fmov b5, t2 + MUL a4, c03, b5 + fmov b5, t3 + MUL a4, c04, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b1, c06, b5 + fmov b5, c06 + MUL b1, c07, b5 + fmov b5, c07 + MUL b1, c08, b5 + fmov b5, c08 + + MUL b2, c05, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + MUL b2, c07, 
b5 + fmov b5, t3 + MUL b2, c08, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL b3, c05, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + MUL b3, c07, b5 + fmov b5, t3 + MUL b3, c08, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + MUL a2, c09, b5 + fmov b5, t1 + MUL a2, c10, b5 + fmov b5, t2 + MUL a2, c11, b5 + fmov b5, t3 + MUL a2, c12, b5 + fmov b5, t4 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + SUB c15, t3, b5 + fmov b5, c15 + SUB c16, t4, b5 + fmov b5, c16 + + MUL a3, c13, b5 + fmov b5, c13 + MUL a3, c14, b5 + fmov b5, c14 + MUL a3, c15, b5 + fmov b5, c15 + MUL a3, c16, b5 + fmov b5, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a1, c14, b5 + fmov b5, c14 + MUL a1, c15, b5 + fmov b5, c15 + MUL a1, c16, b5 + fmov b5, c16 + + MUL a2, c13, b5 + fmov b5, t1 + MUL a2, c14, b5 + fmov b5, t2 + MUL a2, c15, b5 + fmov b5, t3 + MUL a2, c16, b5 + fmov b5, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a3, c13, b5 + fmov b5, t1 + MUL a3, c14, b5 + fmov b5, t2 + MUL a3, c15, b5 + fmov b5, t3 + MUL a3, c16, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL a4, c13, b5 + fmov b5, t1 + MUL a4, c14, b5 + fmov b5, t2 + MUL a4, c15, b5 + fmov b5, t3 + MUL a4, c16, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b1, c10, b5 + fmov b5, c10 + MUL b1, c11, b5 + fmov b5, c11 + MUL b1, c12, b5 + fmov b5, c12 + + MUL b2, c09, b5 + fmov b5, t1 + MUL b2, c10, b5 + fmov b5, t2 + MUL b2, c11, b5 + fmov b5, t3 + MUL b2, c12, b5 + fmov b5, t4 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + SUB c07, t3, b5 + fmov b5, c07 + SUB c08, t4, b5 + fmov b5, c08 + + MUL b3, c09, b5 + fmov b5, t1 + MUL b3, c10, b5 + fmov b5, t2 + MUL b3, c11, b5 + fmov b5, t3 + MUL b3, c12, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c07, b5 + fmov b5, c07 + MUL a1, c08, b5 + fmov b5, c08 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + MUL a2, c07, b5 + fmov b5, t3 + MUL a2, c08, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c03, b5 + fmov b5, c03 + MUL a3, c04, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) 
+ + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) + ldi C3, -4 * SIZE(C3) + ldi C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + ldi C3, 4 * SIZE(C3) + ldi C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD c09, t1, b5 + fmov b5, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a3, b2, 
t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + + +$L25: + ADD c09, t1, b5 + fmov b5, c09 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, b5 + fmov b5, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, b5 + fmov b5, t3 + unop + + ADD c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, b5 + fmov b5, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, b5 + fmov b5, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, b5 + fmov b5, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, b5 + fmov b5, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, b5 + fmov b5, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, b5 + fmov b5, c10 + MUL a2, b1, b5 + fmov b5, t2 + ADD c13, t3, b5 + fmov b5, c13 + MUL a1, b2, b5 + fmov b5, t3 + + ADD c14, t4, b5 + fmov b5, c14 + MUL a2, b2, b5 + fmov b5, t4 + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b3, b5 + fmov b5, t1 + + ADD c02, t2, b5 + fmov b5, c02 + MUL a2, b3, b5 + fmov b5, t2 + ADD c05, t3, b5 + fmov b5, c05 + MUL a1, b4, b5 + fmov b5, t3 + + ADD c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c09, t1, b5 + fmov b5, c09 + ADD c10, t2, b5 + fmov b5, c10 + ADD c13, t3, b5 + fmov b5, c13 + ADD c14, t4, b5 + fmov b5, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 + + SUB b1, c02, b5 + fmov b5, c02 + SUB b2, c06, b5 + fmov b5, c06 + SUB b3, c10, b5 + fmov b5, c10 + SUB b4, c14, b5 + fmov b5, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c05, b5 + fmov b5, c05 + SUB a4, c06, b5 + fmov b5, c06 + + SUB b1, c09, b5 + fmov b5, c09 + SUB b2, c10, b5 + fmov b5, c10 + SUB b3, c13, b5 + fmov b5, c13 + SUB b4, c14, b5 + fmov b5, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c06, b5 + fmov b5, c06 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c02, b5 + fmov b5, t1 + MUL a2, c06, 
b5 + fmov b5, t2 + MUL a2, c10, b5 + fmov b5, t3 + MUL a2, c14, b5 + fmov b5, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c05, t2, b5 + fmov b5, c05 + SUB c09, t3, b5 + fmov b5, c09 + SUB c13, t4, b5 + fmov b5, c13 + + MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c05, b5 + fmov b5, c05 + MUL a3, c09, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c05, b5 + fmov b5, t2 + MUL a2, c09, b5 + fmov b5, t3 + MUL a2, c13, b5 + fmov b5, t4 + + SUB c02, t1, b5 + fmov b5, c02 + SUB c06, t2, b5 + fmov b5, c06 + SUB c10, t3, b5 + fmov b5, c10 + SUB c14, t4, b5 + fmov b5, c14 + + MUL a3, c02, b5 + fmov b5, c02 + MUL a3, c06, b5 + fmov b5, c06 + MUL a3, c10, b5 + fmov b5, c10 + MUL a3, c14, b5 + fmov b5, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + MUL a2, c01, b5 + fmov b5, t1 + MUL a2, c02, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a3, c01, b5 + fmov b5, t1 + MUL a3, c02, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a4, c01, b5 + fmov b5, t1 + MUL a4, c02, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b1, c06, b5 + fmov b5, c06 + + MUL b2, c05, b5 + fmov b5, t1 + MUL b2, c06, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL b3, c05, b5 + fmov b5, t1 + MUL b3, c06, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + MUL a2, c09, b5 + fmov b5, t1 + MUL a2, c10, b5 + fmov b5, t2 + + SUB c13, t1, b5 + fmov b5, c13 + SUB c14, t2, b5 + fmov b5, c14 + + MUL a3, c13, b5 + fmov b5, c13 + MUL a3, c14, b5 + fmov b5, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a1, c14, b5 + fmov b5, c14 + + MUL a2, c13, b5 + fmov b5, t1 + MUL a2, c14, b5 + fmov b5, t2 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a3, c13, b5 + fmov b5, t1 + MUL a3, c14, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL a4, c13, b5 + fmov b5, t1 + MUL a4, c14, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b1, c10, b5 + fmov b5, c10 + + MUL b2, c09, b5 + fmov b5, t1 + MUL b2, c10, b5 + fmov b5, t2 + + SUB c05, t1, b5 + fmov b5, c05 + SUB c06, t2, b5 + fmov b5, c06 + + MUL b3, c09, b5 + fmov b5, t1 + MUL b3, c10, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c06, b5 + fmov b5, c06 + + MUL a2, c05, b5 + fmov b5, t1 + MUL a2, c06, b5 + fmov b5, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + 
MUL a3, c01, b5 + fmov b5, c01 + MUL a3, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) + ldi C3, -2 * SIZE(C3) + ldi C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) + ldi C3, 2 * SIZE(C3) + ldi C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b5, 3 * SIZE(BO) + FIMOVD b5, tmp + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, b5 + fmov b5, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + IFMOVD tmp, b5 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, b5 + fmov b5, c01 + MUL a1, b1, b5 + fmov b5, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, b5 + fmov b5, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, b5 + fmov b5, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, b5 + fmov b5, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, b5 + fmov b5, c13 + MUL a1, b4, b5 + fmov b5, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, b5 + fmov b5, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, b5 + fmov b5, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, b5 + fmov b5, c05 + MUL a1, 
b2, b5 + fmov b5, t2 + ADD c09, t3, b5 + fmov b5, c09 + MUL a1, b3, b5 + fmov b5, t3 + + ADD c13, t4, b5 + fmov b5, c13 + ldi AO, 1 * SIZE(AO) + MUL a1, b4, b5 + fmov b5, t4 + ldi BO, 4 * SIZE(BO) + + ADD c01, t1, b5 + fmov b5, c01 + ADD c05, t2, b5 + fmov b5, c05 + ADD c09, t3, b5 + fmov b5, c09 + ADD c13, t4, b5 + fmov b5, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c05, b5 + fmov b5, c05 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c13, b5 + fmov b5, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c05, b5 + fmov b5, c05 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c13, b5 + fmov b5, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, b5 + fmov b5, c01 + MUL a2, c01, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a3, c01, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL a4, c01, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, b5 + fmov b5, c05 + MUL b2, c05, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL b3, c05, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, b5 + fmov b5, c09 + MUL a2, c09, b5 + fmov b5, t1 + SUB c13, t1, b5 + fmov b5, c13 + MUL a3, c13, b5 + fmov b5, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, b5 + fmov b5, c13 + MUL a2, c13, b5 + fmov b5, t1 + SUB c09, t1, b5 + fmov b5, c09 + MUL a3, c13, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL a4, c13, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, b5 + fmov b5, c09 + MUL b2, c09, b5 + fmov b5, t1 + SUB c05, t1, b5 + fmov b5, c05 + MUL b3, c09, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, b5 + fmov b5, c05 + MUL a2, c05, b5 + fmov b5, t1 + SUB c01, t1, b5 + fmov b5, c01 + MUL a3, c01, b5 + fmov b5, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) + ldi C3, -1 * SIZE(C3) + ldi C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + 
	addl	KK, 1, KK
+#endif
+
+#ifdef LN
+	subl	KK, 1, KK
+#endif
+	.align 4
+
+$L39:
+#ifdef LN
+	sll	K, 2 + BASE_SHIFT, TMP1
+	addl	B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+	mov	BO, B
+#endif
+
+#ifdef RN
+	addl	KK, 4, KK
+#endif
+
+#ifdef RT
+	subl	KK, 4, KK
+#endif
+	ldi	J, -1(J)
+	bgt	J, $L01
+	.align 4
+
+$L999:
+	fldd	$f2, 0($sp)
+	fldd	$f3, 8($sp)
+	fldd	$f4, 16($sp)
+	fldd	$f5, 24($sp)
+	fldd	$f6, 32($sp)
+	fldd	$f7, 40($sp)
+	fldd	$f8, 48($sp)
+	fldd	$f9, 56($sp)
+	ldl	$9, 64($sp)
+	clr	$0
+	ldi	$sp, STACKSIZE($sp)
+	ret
+	EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S.bak b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak
new file mode 100644
index 0000000..af57279
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak
@@ -0,0 +1,4072 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or       */
+/* without modification, are permitted provided that the following  */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above        */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above     */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer in the documentation and/or other materials      */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
+/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
+/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
+/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
+/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
+/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
+/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
+/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
+/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
+/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and          */
+/* documentation are those of the authors and should not be         */
+/* interpreted as representing official policies, either expressed  */
+/* or implied, of The University of Texas at Austin.                */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+#ifdef EV6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 80
+
+#define M	$16
+#define N	$17
+#define K	$18
+#define A	$20
+#define B	$21
+#define C	$22
+#define LDC	$23
+
+#define C1	$19
+#define C2	$24
+#define C3	$25
+#define C4	$27
+
+#define AO	$at
+#define BO	$5
+#define I	$6
+#define J	$7
+#define L	$8
+
+#define a1	$f16
+#define a2	$f17
+#define a3	$f18
+#define a4	$f19
+
+#define b1	$f20
+#define b2	$f21
+#define b3	$f22
+#define b4	$f23
+
+#define t1	$f24
+#define t2	$f25
+#define t3	$f26
+#define t4	$f27
+
+#define a5	$f28
+#define a6	$f30
+#define b5	$f29
+
+#define alpha	$f30
+
+#define c01	$f0
+#define c02	$f1
+#define c03	$f2
+#define c04	$f3
+
+#define c05	$f4
+#define c06	$f5
+#define c07	$f6
+#define c08	$f7
+
+#define c09	$f8
+#define c10	$f9
+#define c11	$f10
+#define c12	$f11
+
+#define c13	$f12
+#define c14	$f13
+#define c15	$f14
+#define c16	$f15
+
+#define TMP1	$0
+#define TMP2	$1
+#define KK	$2
+#define AORIG	$3
+#define OFFSET	$4
+
+	PROLOGUE
+	PROFCODE
+	.frame	$sp, STACKSIZE, $26, 0
+
+	ldi	$sp, -STACKSIZE($sp)
+
+	ldl	C,      0 + STACKSIZE($sp)
+	ldl	LDC,    8 + STACKSIZE($sp)
+	ldl	OFFSET, 16 + STACKSIZE($sp)
+
+	SXADDQ	LDC, 0, LDC
+
+	fstd	$f2,  0($sp)
+	fstd	$f3,  8($sp)
+	fstd	$f4, 16($sp)
+	fstd	$f5, 24($sp)
+	fstd	$f6, 32($sp)
+	fstd	$f7, 40($sp)
+	fstd	$f8, 48($sp)
+	fstd	$f9, 56($sp)
+
+	cmple	M, 0, $0
+	cmple	N, 0, $1
+	cmple	K, 0, $2
+
+	or	$0, $1, $0
+	or	$0, $2, $0
+	bne	$0, $L999
+
+#ifdef LN
+	mull	M, K, TMP1
+	SXADDQ	TMP1, A, A
+	SXADDQ	M, C, C
+#endif
+
+#ifdef RN
+	negq	OFFSET, KK
+#endif
+
+#ifdef RT
+	mull	N, K, TMP1
+	SXADDQ	TMP1, B, B
+
+	mull	N, LDC, TMP1
+	addl	TMP1, C, C
+
+	subl	N, OFFSET, KK
+#endif
+
+	and	N, 1, J
+	ble	J, $L40
+
+#ifdef RT
+	sll	K, BASE_SHIFT, TMP1
+	subl	B, TMP1, B
+
+	subl	C, LDC, C
+#endif
+
+	mov	C, C1
+#ifndef RT
+	addl	C, LDC, C
+#endif
+
+#ifdef LN
+	addl	M, OFFSET, KK
+#endif
+
+#ifdef LT
+	mov	OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+	mov	A, AORIG
+#else
+	mov	A, AO
+#endif
+
+	sra	M, 2, I
+	ble	I, $L100
+	.align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+	LD	a1, 0 * SIZE(AO)
+	fclr	t1
+	LD	a2, 1 * SIZE(AO)
+	fclr	t2
+	LD	a3, 2 * SIZE(AO)
+	fclr	t3
+	LD	a4, 3 * SIZE(AO)
+	fclr	t4
+
+	LD	b1, 0 * SIZE(B)
+	fclr	c01
+	LD	b2, 1 * SIZE(B)
+	fclr	c02
+	LD	b3, 2 * SIZE(B)
+	fclr	c03
+	LD	b4, 3 * SIZE(B)
+	fclr	c04
+
+	sra	KK, 2, L
+	mov	B, BO
+	ble	L, $L95
+
+#else
+#ifdef LN
+	sll	K, BASE_SHIFT + 2, TMP1
+	subl	AORIG, TMP1, AORIG
+#endif
+
+	sll	KK, BASE_SHIFT + 2, TMP1
+	addl	AORIG, TMP1, AO
+	sll	KK, BASE_SHIFT + 0, TMP1
+	addl	B, TMP1, BO
+
+	subl	K, KK, TMP1
+
+	LD	a1, 0 * SIZE(AO)
+	fclr	t1
+	LD	a2, 1 * SIZE(AO)
+	fclr	t2
+	LD	a3, 2 * SIZE(AO)
+	fclr	t3
+	LD	a4, 3 * SIZE(AO)
+	fclr	t4
+
+	LD	b1, 0 * SIZE(BO)
+	fclr	c01
+	LD	b2, 1 * SIZE(BO)
+	fclr	c02
+	LD	b3, 2 * SIZE(BO)
+	fclr	c03
+	LD	b4, 3 * SIZE(BO)
+	fclr	c04
+
+	sra	TMP1, 2, L
+	unop
+	ble	L, $L95
+#endif
+	.align 5
+
+$L92:
+	ADD	c01, t1, c01
+	unop
+	MUL	a1, b1, t1
+	LD	a1, 4 * SIZE(AO)
+
+	ADD	c02, t2, c02
+	ldi	L, -1(L)
+	MUL	a2, b1, t2
+	LD	a2, 5 * SIZE(AO)
+
+	ADD	c03, t3, c03
+	unop
+	MUL	a3, b1, t3
+	LD	a3, 6 * SIZE(AO)
+
+	ADD	c04, t4, c04
+	MUL	a4, b1, t4
+	LD	a4, 7 * SIZE(AO)
+	LD	b1, 4 * SIZE(BO)
+
+	ADD	c01, t1, c01
+	unop
+	MUL	a1, b2, t1
+	LD	a1, 8 * SIZE(AO)
+
+	ADD	c02, t2, c02
+	unop
+	MUL	a2, b2, t2
+	LD	a2, 9 * SIZE(AO)
+
+	ADD
c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + ldi AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ldi AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + 
+#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + ldi BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + ldi AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + ldi L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi AO, 2 * SIZE(AO) + unop + ldi BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + 
ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + ldi L, -1(L) + ldi AO, 1 * SIZE(AO) + ldi BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + addl B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + ldi C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + addl LDC, LDC, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + fclr t1 +#ifndef RT + 
addl C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + 
sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST 
c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || 
defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + ldi L, -2(KK) + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + ldi L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + ldi AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + ldi BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + 
.align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + ldi AO, 1 * SIZE(AO) + ADD c05, c06, c05 + ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + .align 4 + +$L80: + sra N, 2, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subl B, TMP1, B + + s4addl LDC, 0, TMP1 + subl C, TMP1, C +#endif + + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C3 +#ifndef RT + s4addl LDC, C, C +#endif + + fclr t1 + addl C3, LDC, C4 + fclr t2 + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(KK) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(B) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * 
SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + fillcs 4 * SIZE(C1) + fclr c03 + ldi L, -2(TMP1) + fclr c04 + + fillcs 7 * SIZE(C2) + fclr c08 + ldi BO, 4 * SIZE(BO) + fclr c13 + + fillcs 4 * SIZE(C3) + fclr c09 + ldi AO, 4 * SIZE(AO) + fclr c10 + + fillcs 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, 
t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 4, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, 
c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + 
MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) + ldi C3, -4 * SIZE(C3) + ldi C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) + ldi C3, 4 * SIZE(C3) + ldi C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 4, KK +#endif + +#ifdef LN + subl KK, 4, KK +#endif + + ldi I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + ldi BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + 
sll K, BASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + ldi BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * 
SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) + ldi C3, -2 * SIZE(C3) + ldi C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) + ldi C3, 2 * SIZE(C3) + ldi C4, 2 * SIZE(C4) 
+#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + ldi L, -2(KK) + LD b2, 1 * SIZE(B) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + ldi BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + ldi BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + ldi L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + ldi AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + ldi BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + ldi AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + ldi AO, 1 * SIZE(AO) + MUL a1, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl B, TMP2, BO +#else + ldi AO, -1 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL 
a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -1 * SIZE(C1) + ldi C2, -1 * SIZE(C2) + ldi C3, -1 * SIZE(C3) + ldi C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 4, KK +#endif + +#ifdef RT + subl KK, 4, KK +#endif + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S new file mode 100644 index 0000000..c453e9d --- /dev/null +++ b/kernel/sw_64/zamax.S @@ -0,0 +1,302 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + fstd $f6, 32($sp) + unop + + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + fclr $f0 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addl INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + faddd $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + ldi $1, -1($1) + unop + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + ldi $1, -1($1) + addl X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + faddd $f8, $f9, $f16 + unop + fabs $f20, $f8 + fillcs 64 * SIZE(X) + + faddd $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + faddd $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + faddd $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addl X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addl X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fselne $f4, $f16, $f0, $f0 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ldi $1, -1($1) # i -- + + fselne $f5, $f17, $f1, $f1 + fselne $f6, $f18, $f2, $f2 + fselne $f7, $f19, $f3, $f3 + bgt $1,$L12 + .align 4 + +$L13: + faddd $f8, $f9, $f16 + fabs $f20, $f8 + + faddd $f10, $f11, $f17 + fabs 
$f21, $f9 + + faddd $f12, $f13, $f18 + fabs $f22, $f10 + + faddd $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fselne $f4, $f16, $f0, $f0 + fselne $f5, $f17, $f1, $f1 + fselne $f6, $f18, $f2, $f2 + fselne $f7, $f19, $f3, $f3 + .align 4 + +$L14: + faddd $f8, $f9, $f16 + faddd $f10, $f11, $f17 + faddd $f12, $f13, $f18 + faddd $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fselne $f4, $f16, $f0, $f0 + fselne $f5, $f17, $f1, $f1 + fselne $f6, $f18, $f2, $f2 + fselne $f7, $f19, $f3, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fselne $f16, $f1, $f0, $f0 + fselne $f17, $f3, $f2, $f2 + + CMPLT($f0, $f2), $f16 + fselne $f16, $f2, $f0, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addl X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + faddd $f29, $f30, $f20 + fmov $f20,$f29 + + CMPLT($f0, $f29), $f16 + fselne $f16, $f29, $f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/zamax.S.bak b/kernel/sw_64/zamax.S.bak new file mode 100644 index 0000000..74b9331 --- /dev/null +++ b/kernel/sw_64/zamax.S.bak @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) fcmplt a, b +#else +#define CMPLT(a, b) fcmplt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + ldi $sp, -STACKSIZE($sp) + + fstd $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + + fstd $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + fstd $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + fstd $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + fstd $f6, 32($sp) + unop + + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + fclr $f0 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addl INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + faddd $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + ldi $1, -1($1) + unop + addl X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + ldi $1, -1($1) + addl X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addl X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + faddd $f8, $f9, $f16 + unop + fabs $f20, $f8 + fillcs 64 * SIZE(X) + + faddd $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + faddd $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addl X, INCX, X + + faddd $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addl X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addl X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + +fselne $f4,$f16,$f0, $f0 + LD $f27, 1 * SIZE(X) + addl X, INCX, X + ldi $1, -1($1) # i -- + +fselne $f5,$f17,$f1, $f1 +fselne $f6,$f18,$f2, $f2 +fselne $f7,$f19,$f3, $f3 + bgt $1,$L12 + .align 4 + +$L13: + faddd $f8, $f9, $f16 + fabs $f20, $f8 + + faddd $f10, $f11, $f17 + fabs $f21, $f9 + + faddd $f12, $f13, $f18 + fabs $f22, $f10 + + faddd $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + +fselne $f4,$f16,$f0, $f0 +fselne $f5,$f17,$f1, $f1 +fselne $f6,$f18,$f2, $f2 +fselne $f7,$f19,$f3, $f3 + .align 4 + +$L14: + faddd $f8, $f9, $f16 + faddd $f10, $f11, $f17 + faddd $f12, $f13, $f18 + faddd $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + +fselne $f4,$f16,$f0, $f0 +fselne $f5,$f17,$f1, $f1 +fselne $f6,$f18,$f2, $f2 +fselne $f7,$f19,$f3, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + +fselne $f16,$f1,$f0, $f0 +fselne $f17,$f3,$f2, $f2 + + CMPLT($f0, $f2), $f16 +fselne $f16,$f2,$f0, $f0 + .align 4 
+ +$L15: + and N, 3, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addl X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + faddd $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 +fselne $f16,$f29,$f0, $f0 + + ldi $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldi $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S new file mode 100644 index 0000000..72e120c --- /dev/null +++ b/kernel/sw_64/zasum.S @@ -0,0 +1,231 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 +#define t4 $f24 +#define s4 $f27 + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addl INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s4 + fmov s4,s0 + fillcs PREFETCHSIZE * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s4 + fmov s4,s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + unop + + ADD s2, t2, s4 + fmov s4,s2 + LD a7, 1 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s4 + fmov s4,s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + unop + + ADD s0, t0, s4 + fmov s4,s0 + LD a1, 1 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s4 + fmov s4,s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + unop + + fadds s2, t2, s4 + fmov s4,s2 + LD a3, 1 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s4 + fmov s4,s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s4 + fmov s4,s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + + ADD s1, t1, s4 + fmov s4,s1 + LD a7, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s4 + fmov s4,s2 + fabs a2, t2 + ADD s3, t3, s4 + fmov s4,s3 + fabs a3, t3 + + ADD s0, t0, s4 + fmov s4,s0 + fabs a4, t0 + ADD s1, t1, s4 + fmov s4,s1 + fabs a5, t1 + ADD s2, t2, s4 + fmov s4,s2 + fabs a6, t2 + ADD s3, t3, s4 + fmov s4,s3 + fabs a7, t3 + + ADD s2, t2, s4 + fmov s4,s2 + ADD s3, t3, s4 + fmov s4,s3 + + .align 4 + +$L15: + ADD s0, s2, $f25 + fmov $f25, s0 + and N, 3, I + ADD s1, s3, $f25 + fmov $f25, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, $f25 + fmov $f25, s0 + LD a0, 0 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, $f25 + fmov $f25, s1 + LD a1, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, $f25 + ADD s1, t1, $f26 + + ADD $f25, $f26, s0 + ret + EPILOGUE diff --git a/kernel/sw_64/zasum.S.bak b/kernel/sw_64/zasum.S.bak new file mode 100644 index 0000000..db79771 --- /dev/null +++ b/kernel/sw_64/zasum.S.bak @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addl INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + fillcs PREFETCHSIZE * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE diff --git a/kernel/sw_64/zasum_simd.S b/kernel/sw_64/zasum_simd.S new file mode 100644 index 0000000..5606fdf --- /dev/null +++ 
b/kernel/sw_64/zasum_simd.S @@ -0,0 +1,385 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 96 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + +#define t4 $f24 +#define t5 $f25 +#define t6 $f26 +#define t7 $f27 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + fclr s0 + unop + fclr t0 + addl INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + cmpeq INCX, 2, $3 + beq $3, $Sub + .align 4 + + and X, (VEC_LEN*SIZE-1), $6 + bgt $6, $UnAlign_X_ACCESS + .align 4 +$Align_Access: + +/* + Unloop 8*2= 16 reals +*/ + sra N, 3, I + fclr s2 + fclr s3 + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t0 + VLD a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t1 + + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t2 + VLD a3, 3*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t3 + + subl I, 1, I + addl X, 16*SIZE, X + unop + ble I, $MainLoopEnd + +$MainLoop: + vcpys $f31, a0, a4 + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, a1, a5 + VLD a1, 1*VEC_LEN*SIZE(X) + + vcpys $f31, a2, a6 + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, a3, a7 + VLD a3, 3*VEC_LEN*SIZE(X) + + VADD t0, a4, t0 + subl I, 1, I + VADD t1, a5, t1 + fillcs PREFETCHSIZE * SIZE(X) + + VADD t2, a6, t2 + addl X, 16*SIZE, X + VADD t3, a7, t3 + bgt I, $MainLoop + +$MainLoopEnd: + /*fabs*/ + + vcpys $f31, a0, a4 + vcpys $f31, a1, a5 + vcpys $f31, a2, a6 + vcpys $f31, a3, a7 + + VADD t0, a4, t0 + VADD t1, a5, t1 + VADD t2, a6, t2 + VADD t3, a7, t3 + + VADD t0, t1, t0 + VADD t2, t3, t2 + VADD t0, t2, t0 + nop + + vextf t0, 0, s0 + vextf t0, 1, s1 + vextf t0, 2, s2 + vextf t0, 3, s3 + +$Remain: + and N, 7, I + ADD s0, s2, s0 + ADD s1, s3, s1 + ble I, $End + .align 4 + +$RemainLoop: + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + fabs a0, t0 + addl X, 2*SIZE, X + + fabs a1, t1 + ldi I, -1(I) + ADD s0, t0, s0 + ADD s1, t1, s1 + + bne I, $RemainLoop + .align 4 +$End: + ADD s0, s1, s0 + ret + .align 4 + +$UnAlign_X_ACCESS: + sra N, 3, I + fclr s2 + fclr s3 + ble I, $Remain + + VLD_UL a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t0 + VLD_UH t4, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t1 + + VLD_UL a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t2 + VLD_UH t5, 2*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t3 + + VLD_UL a2, 2*VEC_LEN*SIZE(X) + VLD_UH t6, 3*VEC_LEN*SIZE(X) + VLD_UL a3, 3*VEC_LEN*SIZE(X) + VLD_UH t7, 4*VEC_LEN*SIZE(X) + + vbisw a0, t4, a0 + subl I, 1, I + vbisw a1, t5, a1 + addl X, 16*SIZE, X + + vbisw a2, t6, a2 + unop + vbisw a3, t7, a3 + ble I, $MainLoopEnd + +$UnAlign_X_ACCESS_MainLoop: +/*fabs*/ + vcpys $f31, a0, a4 + VLD_UL a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, a1, a5 + VLD_UH t4, 1*VEC_LEN*SIZE(X) + + vcpys $f31, a2, a6 + VLD_UL a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, a3, a7 + VLD_UH t5, 2*VEC_LEN*SIZE(X) + + VADD t0, a4, t0 + VLD_UL a2, 2*VEC_LEN*SIZE(X) + VADD t1, a5, t1 + VLD_UH t6, 3*VEC_LEN*SIZE(X) + + VADD t2, a6, t2 + VLD_UL a3, 3*VEC_LEN*SIZE(X) + VADD t3, a7, t3 + VLD_UH t7, 4*VEC_LEN*SIZE(X) + + + vbisw a0, t4, a0 + subl I, 1, I + vbisw a1, t5, a1 + fillcs PREFETCHSIZE * SIZE(X) + + vbisw a2, t6, a2 + addl X, 16*SIZE, X + vbisw a3, t7, a3 + bgt I, $UnAlign_X_ACCESS_MainLoop + + jmp $MainLoopEnd + .align 4 + + +$Sub: + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + 
fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + fillcs PREFETCHSIZE * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S new file mode 100644 index 0000000..19b6398 --- /dev/null +++ b/kernel/sw_64/zaxpy.S @@ -0,0 +1,654 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + +#ifndef CONJ +#define ADD1 SUB +#define ADD2 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#endif + +#define tmp $f9 + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldw $19, 0($sp) + fmov $f19, $f29 + ldl $20, 8($sp) + fmov $f20, $f30 + + mov $21, $18 + ldw $21, 16($sp) + ldi $sp, -64($sp) + nop + + fstd $f2, 0($sp) + cmpeq $19, 1, $1 + fstd $f3, 8($sp) + cmpeq $21, 1, $2 + + fstd $f4, 16($sp) + and $16, 3, $5 + fstd $f5, 24($sp) + fstd $f6, 32($sp) + + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd tmp, 56($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + and $1, $2, $1 + ble $16, $End + sra $16, 2, $4 + beq $1, $Sub + + ble $4, $Remain + subl $4, 1, $4 + + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + LD $f2, 2*SIZE($18) + LD $f3, 3*SIZE($18) + LD $f4, 4*SIZE($18) + LD $f5, 5*SIZE($18) + LD $f6, 6*SIZE($18) + LD $f7, 7*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + LD $f10, 2*SIZE($20) + LD $f11, 3*SIZE($20) + LD $f12, 4*SIZE($20) + LD $f13, 5*SIZE($20) + LD $f14, 6*SIZE($20) + LD $f15, 7*SIZE($20) + + addl $18, 8*SIZE, $18 + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) + + MUL $f29, $f0, $f20 + fillcs 9*SIZE($18) + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + unop + MUL $f30, $f3, $f25 + nop + + MUL $f30, $f2, $f26 + LD $f2, 2*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 3*SIZE($18) + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 4*SIZE($18) + + ADD2 $f26, $f27, $f19 + addl $20, 8*SIZE, $20 + MUL $f29, $f5, $f23 + LD $f5, 5*SIZE($18) + + ADD $f16, $f8, tmp + fmov tmp, $f16 + LD $f8, 0*SIZE($20) + MUL $f29, $f6, $f24 + unop + + ADD $f17, $f28, tmp + fmov tmp, $f17 + LD $f28, 1*SIZE($20) + MUL $f30, $f7, $f25 + unop + + ADD $f18, $f10, tmp + fmov tmp, $f18 + LD $f10, 2*SIZE($20) + MUL $f30, $f6, $f26 + LD $f6, 6*SIZE($18) + + ADD $f19, $f11, tmp + fmov tmp, $f19 + LD $f11, 3*SIZE($20) + MUL $f29, $f7, $f27 + LD $f7, 7*SIZE($18) + + ST $f16,-8*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17,-7*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18,-6*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19,-5*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, tmp + fmov tmp, $f16 + LD $f12, 4*SIZE($20) + ADD $f17, $f13, tmp + fmov tmp, $f17 + LD $f13, 5*SIZE($20) + ADD $f18, $f14, tmp + fmov tmp, $f18 + LD $f14, 6*SIZE($20) + ADD $f19, $f15, tmp + fmov tmp, $f19 + LD $f15, 7*SIZE($20) + + ST $f16,-4*SIZE($20) + addl $18, 8*SIZE, $18 + ST $f17,-3*SIZE($20) + subl $4, 1, $4 + + ST $f18,-2*SIZE($20) + nop + ST $f19,-1*SIZE($20) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + 
ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, tmp + fmov tmp, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, tmp + fmov tmp, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, tmp + fmov tmp, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, tmp + fmov tmp, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18, 2*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19, 3*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, tmp + fmov tmp, $f16 + ADD $f17, $f13, tmp + fmov tmp, $f17 + ADD $f18, $f14, tmp + fmov tmp, $f18 + ADD $f19, $f15, tmp + fmov tmp, $f19 + + ST $f16, 4*SIZE($20) + ST $f17, 5*SIZE($20) + ST $f18, 6*SIZE($20) + ST $f19, 7*SIZE($20) + + unop + addl $20, 8*SIZE, $20 + unop + ble $5, $End + .align 4 + +$Remain: + subl $5, 1, $6 + ble $5, $End + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + addl $18, 2*SIZE, $18 + ble $6, $RemainLoopEnd + .align 4 + +$RemainLoop: + MUL $f29, $f0, $f20 + subl $6, 1, $6 + MUL $f30, $f1, $f21 + addl $20, 2*SIZE, $20 + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, tmp + fmov tmp, $f16 + LD $f8, 0*SIZE($20) + ADD $f17, $f28, tmp + fmov tmp, $f17 + LD $f28, 1*SIZE($20) + + ST $f16,-2*SIZE($20) + addl $18, 2*SIZE, $18 + ST $f17,-1*SIZE($20) + bgt $6, $RemainLoop + .align 4 + +$RemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, tmp + fmov tmp, $f16 + ADD $f17, $f28, tmp + fmov tmp, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd tmp, 56($sp) + ldi $sp, 64($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + addl $22, $22, $22 # Complex + .align 4 + + addl $19, $19, $19 # Complex + addl $21, $21, $21 # Complex + + ble $4, $SubRemain + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f2, 0*SIZE($18) + LD $f3, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f4, 0*SIZE($18) + LD $f5, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f6, 0*SIZE($18) + LD $f7, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $21, $20, $24 + + LD $f10, 0*SIZE($24) + LD $f11, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f12, 0*SIZE($24) + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f14, 0*SIZE($24) + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + subl $4, 1, $4 + ble $4, $SubMainLoopEnd + .align 4 + +$SubMainLoop: + MUL $f29, $f0, $f20 + unop + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + SXADDQ $19, $18, $18 + MUL $f30, $f3, $f25 + unop + + MUL $f30, $f2, $f26 + LD $f2, 0*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + MUL $f29, $f4, $f20 + unop + + ADD2 $f22, $f23, $f17 + unop + MUL $f30, $f5, $f21 + unop + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 0*SIZE($18) + + ADD2 $f26, $f27, $f19 + unop + MUL $f29, $f5, $f23 + LD $f5, 1*SIZE($18) + + ADD $f16, $f8, tmp + fmov tmp, $f16 + LD $f8, 0*SIZE($24) + MUL $f29, $f6, $f24 + SXADDQ $19, $18, $18 + + ADD $f17, $f28, tmp + fmov tmp, $f17 + LD $f28, 1*SIZE($24) + MUL $f30, 
$f7, $f25 + SXADDQ $21, $24, $24 + + ADD $f18, $f10, tmp + fmov tmp, $f18 + LD $f10, 0*SIZE($24) + MUL $f30, $f6, $f26 + LD $f6, 0*SIZE($18) + + ADD $f19, $f11, tmp + fmov tmp, $f19 + LD $f11, 1*SIZE($24) + MUL $f29, $f7, $f27 + LD $f7, 1*SIZE($18) + + ST $f16, 0*SIZE($20) + SXADDQ $19, $18, $18 + ADD1 $f20, $f21, $f16 + unop + + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + ADD2 $f22, $f23, $f17 + unop + + ST $f18, 0*SIZE($20) + SXADDQ $21, $24, $24 + ADD1 $f24, $f25, $f18 + unop + + ST $f19, 1*SIZE($20) + unop + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + + ADD $f16, $f12, tmp + fmov tmp, $f16 + unop + LD $f12, 0*SIZE($24) + unop + + ADD $f17, $f13, tmp + fmov tmp, $f17 + unop + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ADD $f18, $f14, tmp + fmov tmp, $f18 + subl $4, 1, $4 + LD $f14, 0*SIZE($24) + unop + + ADD $f19, $f15, tmp + fmov tmp, $f19 + unop + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + unop + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $4, $SubMainLoop + .align 4 + +$SubMainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, tmp + fmov tmp, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, tmp + fmov tmp, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, tmp + fmov tmp, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, tmp + fmov tmp, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + SXADDQ $21, $20, $20 + nop + ST $f18, 0*SIZE($20) + ADD1 $f24, $f25, $f18 + + ST $f19, 1*SIZE($20) + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + ADD $f16, $f12, tmp + fmov tmp, $f16 + + ADD $f17, $f13, tmp + fmov tmp, $f17 + ADD $f18, $f14, tmp + fmov tmp, $f18 + ADD $f19, $f15, tmp + fmov tmp, $f19 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + ble $5, $SubEnd + .align 4 + +$SubRemain: + subl $5, 1, $6 + ble $5, $SubEnd + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $19, $18, $18 + SXADDQ $21, $20, $24 + ble $6, $SubRemainLoopEnd + .align 4 + +$SubRemainLoop: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + + ADD2 $f22, $f23, $f17 + nop + ADD $f16, $f8, tmp + fmov tmp, $f16 + LD $f8, 0*SIZE($24) + + ADD $f17, $f28, tmp + fmov tmp, $f17 + LD $f28, 1*SIZE($24) + SXADDQ $21, $24, $24 + subl $6, 1, $6 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $6, $SubRemainLoop + .align 4 + +$SubRemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, tmp + fmov tmp, $f16 + ADD $f17, $f28, tmp + fmov tmp, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$SubEnd: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd tmp, 56($sp) + ldi $sp, 64($sp) + ret + EPILOGUE 
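
A note on the complex arithmetic used by zaxpy.S above (and by the .bak and SIMD variants that follow): the kernel moves alpha into $f29/$f30 (real and imaginary parts) and combines the four partial products with the ADD1/ADD2 macros, which are SUB/ADD normally and ADD/SUB when CONJ is defined. The sketch below is a minimal C reference of the per-element update under the usual BLAS zaxpy semantics; the function name zaxpy_ref and the plain-C form are illustrative only and are not part of the patch.

#include <stddef.h>

/* Reference sketch of the per-element update that kernel/sw_64/zaxpy.S unrolls.
 * Illustrative only: the name zaxpy_ref and this plain-C layout are not part of
 * the patch. alpha_r/alpha_i correspond to $f29/$f30 in the assembly; conj_x
 * selects the sign pattern chosen by the ADD1/ADD2 macros when CONJ is defined. */
void zaxpy_ref(size_t n,
               double alpha_r, double alpha_i,
               const double *x, ptrdiff_t incx,
               double *y, ptrdiff_t incy,
               int conj_x)
{
    for (size_t i = 0; i < n; i++) {
        double xr = x[0], xi = x[1];
        double re, im;
        if (!conj_x) {          /* ADD1 = SUB, ADD2 = ADD: y += alpha * x        */
            re = alpha_r * xr - alpha_i * xi;
            im = alpha_i * xr + alpha_r * xi;
        } else {                /* ADD1 = ADD, ADD2 = SUB: y += alpha * conj(x)  */
            re = alpha_r * xr + alpha_i * xi;
            im = alpha_i * xr - alpha_r * xi;
        }
        y[0] += re;
        y[1] += im;
        x += 2 * incx;          /* complex stride, as in the strided $Sub path */
        y += 2 * incy;
    }
}

The unit-increment path of the assembly performs this update four complex elements per iteration; the $Sub path handles arbitrary INCX/INCY by stepping the pointers with SXADDQ after doubling the increments for complex data.
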
diff --git a/kernel/sw_64/zaxpy.S.bak b/kernel/sw_64/zaxpy.S.bak new file mode 100644 index 0000000..c6cd44b --- /dev/null +++ b/kernel/sw_64/zaxpy.S.bak @@ -0,0 +1,611 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + +#ifndef CONJ +#define ADD1 SUB +#define ADD2 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#endif + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldl $19, 0($sp) + fmov $f19, $f29 + ldl $20, 8($sp) + fmov $f20, $f30 + + mov $21, $18 + ldl $21, 16($sp) + ldi $sp, -64($sp) + nop + + fstd $f2, 0($sp) + cmpeq $19, 1, $1 + fstd $f3, 8($sp) + cmpeq $21, 1, $2 + + fstd $f4, 16($sp) + and $16, 3, $5 + fstd $f5, 24($sp) + fstd $f6, 32($sp) + + fstd $f7, 40($sp) + fstd $f8, 48($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + and $1, $2, $1 + ble $16, $End + sra $16, 2, $4 + beq $1, $Sub + + ble $4, $Remain + subl $4, 1, $4 + + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + LD $f2, 2*SIZE($18) + LD $f3, 3*SIZE($18) + LD $f4, 4*SIZE($18) + LD $f5, 5*SIZE($18) + LD $f6, 6*SIZE($18) + LD $f7, 7*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + LD $f10, 2*SIZE($20) + LD $f11, 3*SIZE($20) + LD $f12, 4*SIZE($20) + LD $f13, 5*SIZE($20) + LD $f14, 6*SIZE($20) + LD $f15, 7*SIZE($20) + + addl $18, 8*SIZE, $18 + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) + + MUL $f29, $f0, $f20 + fillcs 9*SIZE($18) + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + unop + MUL $f30, $f3, $f25 + nop + + MUL $f30, $f2, $f26 + LD $f2, 2*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 3*SIZE($18) + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 4*SIZE($18) + + ADD2 $f26, $f27, $f19 + addl $20, 8*SIZE, $20 + MUL $f29, $f5, $f23 + LD $f5, 5*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + MUL $f29, $f6, $f24 + unop + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + MUL $f30, $f7, $f25 + unop + + ADD $f18, $f10, $f18 + LD $f10, 2*SIZE($20) + MUL $f30, $f6, $f26 + LD $f6, 6*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 3*SIZE($20) + MUL $f29, $f7, $f27 + LD $f7, 7*SIZE($18) + + ST $f16,-8*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17,-7*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18,-6*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19,-5*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + LD $f12, 4*SIZE($20) + ADD $f17, $f13, $f17 + LD $f13, 5*SIZE($20) + ADD $f18, $f14, $f18 + LD $f14, 6*SIZE($20) + ADD $f19, $f15, $f19 + LD $f15, 7*SIZE($20) + + ST $f16,-4*SIZE($20) + addl $18, 8*SIZE, $18 + ST $f17,-3*SIZE($20) + subl $4, 1, $4 + + ST $f18,-2*SIZE($20) + nop + ST $f19,-1*SIZE($20) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18, 2*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19, 3*SIZE($20) + ADD2 $f26, $f27, $f19 + + 
ADD $f16, $f12, $f16 + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 4*SIZE($20) + ST $f17, 5*SIZE($20) + ST $f18, 6*SIZE($20) + ST $f19, 7*SIZE($20) + + unop + addl $20, 8*SIZE, $20 + unop + ble $5, $End + .align 4 + +$Remain: + subl $5, 1, $6 + ble $5, $End + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + addl $18, 2*SIZE, $18 + ble $6, $RemainLoopEnd + .align 4 + +$RemainLoop: + MUL $f29, $f0, $f20 + subl $6, 1, $6 + MUL $f30, $f1, $f21 + addl $20, 2*SIZE, $20 + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + + ST $f16,-2*SIZE($20) + addl $18, 2*SIZE, $18 + ST $f17,-1*SIZE($20) + bgt $6, $RemainLoop + .align 4 + +$RemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + ldi $sp, 64($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + addl $22, $22, $22 # Complex + .align 4 + + addl $19, $19, $19 # Complex + addl $21, $21, $21 # Complex + + ble $4, $SubRemain + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f2, 0*SIZE($18) + LD $f3, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f4, 0*SIZE($18) + LD $f5, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f6, 0*SIZE($18) + LD $f7, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $21, $20, $24 + + LD $f10, 0*SIZE($24) + LD $f11, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f12, 0*SIZE($24) + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f14, 0*SIZE($24) + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + subl $4, 1, $4 + ble $4, $SubMainLoopEnd + .align 4 + +$SubMainLoop: + MUL $f29, $f0, $f20 + unop + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + SXADDQ $19, $18, $18 + MUL $f30, $f3, $f25 + unop + + MUL $f30, $f2, $f26 + LD $f2, 0*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + MUL $f29, $f4, $f20 + unop + + ADD2 $f22, $f23, $f17 + unop + MUL $f30, $f5, $f21 + unop + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 0*SIZE($18) + + ADD2 $f26, $f27, $f19 + unop + MUL $f29, $f5, $f23 + LD $f5, 1*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + MUL $f29, $f6, $f24 + SXADDQ $19, $18, $18 + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + MUL $f30, $f7, $f25 + SXADDQ $21, $24, $24 + + ADD $f18, $f10, $f18 + LD $f10, 0*SIZE($24) + MUL $f30, $f6, $f26 + LD $f6, 0*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 1*SIZE($24) + MUL $f29, $f7, $f27 + LD $f7, 1*SIZE($18) + + ST $f16, 0*SIZE($20) + SXADDQ $19, $18, $18 + ADD1 $f20, $f21, $f16 + unop + + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + ADD2 $f22, $f23, $f17 + unop + + ST $f18, 0*SIZE($20) + SXADDQ $21, $24, $24 + ADD1 $f24, $f25, $f18 + unop + + ST $f19, 1*SIZE($20) + unop + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + + ADD $f16, $f12, $f16 + unop + LD $f12, 0*SIZE($24) + unop + + ADD $f17, $f13, $f17 + unop + LD $f13, 1*SIZE($24) + SXADDQ $21, 
$24, $24 + + ADD $f18, $f14, $f18 + subl $4, 1, $4 + LD $f14, 0*SIZE($24) + unop + + ADD $f19, $f15, $f19 + unop + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + unop + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $4, $SubMainLoop + .align 4 + +$SubMainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + SXADDQ $21, $20, $20 + nop + ST $f18, 0*SIZE($20) + ADD1 $f24, $f25, $f18 + + ST $f19, 1*SIZE($20) + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + ADD $f16, $f12, $f16 + + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + ble $5, $SubEnd + .align 4 + +$SubRemain: + subl $5, 1, $6 + ble $5, $SubEnd + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $19, $18, $18 + SXADDQ $21, $20, $24 + ble $6, $SubRemainLoopEnd + .align 4 + +$SubRemainLoop: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + + ADD2 $f22, $f23, $f17 + nop + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + SXADDQ $21, $24, $24 + subl $6, 1, $6 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $6, $SubRemainLoop + .align 4 + +$SubRemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$SubEnd: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + ldi $sp, 64($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/zaxpy_simd.S b/kernel/sw_64/zaxpy_simd.S new file mode 100644 index 0000000..a823ebf --- /dev/null +++ b/kernel/sw_64/zaxpy_simd.S @@ -0,0 +1,1479 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 128 + +#ifndef CONJ +#define ADD1 SUB +#define ADD2 ADD + +#define VADD1 VSUB +#define VADD2 VADD +#define VMAD1 VNMAD +#define VMAD2 VMAD + +#else +#define ADD1 ADD +#define ADD2 SUB + +#define VADD1 VADD +#define VADD2 VSUB +#define VMAD1 VMAD +#define VMAD2 VNMAD + +#endif + + + PROLOGUE + PROFCODE + .frame $sp, 64, $26, 0 + + ldl $19, 0($sp) + fmov $f19, $f29 + ldl $20, 8($sp) + fmov $f20, $f30 + + mov $21, $18 + ldl $21, 16($sp) + ldi $sp, -64($sp) + nop + + fstd $f2, 0($sp) + cmpeq $19, 1, $1 + fstd $f3, 8($sp) + cmpeq $21, 1, $2 + + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + nop + + fstd $f7, 40($sp) + fstd $f8, 48($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif +/* + unloop 8: process 8 complex=16 float/double +*/ + and $1, $2, $1 + ble $16, $End + sra $16, 3, $4 + and $16, 7, $5 + + beq $1, $Sub + ble $4, $Remain + subl $4, 1, $4 + nop +/*extern alpha_r alpha_i to vector*/ + + vcpyf $f29, $f29 + vcpyf $f30, $f30 + +/** + align ? 
+ test the address of Y & X +**/ + and $20, (VEC_LEN*SIZE-1), $6 + bgt $6, $UnAlign_Y_ACCESS + + and $18, (VEC_LEN*SIZE-1), $7 + nop + nop + bgt $7, $UnAlign_X_ACCESS + + .align 4 + + VLD $f0, 0*VEC_LEN*SIZE($18) + VLD $f1, 1*VEC_LEN*SIZE($18) + VLD $f2, 2*VEC_LEN*SIZE($18) + VLD $f3, 3*VEC_LEN*SIZE($18) + +/* + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + LD $f2, 2*SIZE($18) + LD $f3, 3*SIZE($18) + + LD $f4, 4*SIZE($18) + LD $f5, 5*SIZE($18) + LD $f6, 6*SIZE($18) + LD $f7, 7*SIZE($18) +*/ + + VLD $f8, 0*VEC_LEN*SIZE($20) + VLD $f28, 1*VEC_LEN*SIZE($20) + VLD $f10, 2*VEC_LEN*SIZE($20) + VLD $f11, 3*VEC_LEN*SIZE($20) + +/* + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + LD $f10, 2*SIZE($20) + LD $f11, 3*SIZE($20) + LD $f12, 4*SIZE($20) + LD $f13, 5*SIZE($20) + LD $f14, 6*SIZE($20) + LD $f15, 7*SIZE($20) +*/ + addl $18, 16*SIZE, $18 + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: +/* + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) +*/ + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) + +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + +/*Compute*/ + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 + + VLD $f0, 0*VEC_LEN*SIZE($18) + VLD $f1, 1*VEC_LEN*SIZE($18) + VLD $f2, 2*VEC_LEN*SIZE($18) + VLD $f3, 3*VEC_LEN*SIZE($18) + +/*combine the real & image vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vinsf $f24, $f17, 0, $f17 + addl $20, 16*SIZE, $20 + vinsf $f25, $f17, 2, $f17 + addl $18, 16*SIZE, $18 + + vinsf $f26, $f16, 1, $f16 + subl $4, 1, $4 + vinsf $f27, $f16, 3, $f16 + nop + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VLD $f8, 0*VEC_LEN*SIZE($20) + VADD $f17, $f28, $f17 + VLD $f28, 1*VEC_LEN*SIZE($20) + + VADD $f18, $f10, $f18 + VLD $f10, 2*VEC_LEN*SIZE($20) + VADD $f19, $f11, $f19 + VLD $f11, 3*VEC_LEN*SIZE($20) + + VST $f16, -4*VEC_LEN*SIZE($20) + VST $f17, -3*VEC_LEN*SIZE($20) + VST $f18, -2*VEC_LEN*SIZE($20) + VST $f19, -1*VEC_LEN*SIZE($20) + +/* + MUL $f29, $f0, $f20 + fillcs 9*SIZE($18) + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + unop + MUL $f30, $f3, $f25 + nop + + MUL $f30, $f2, $f26 + LD $f2, 2*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 3*SIZE($18) + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 4*SIZE($18) + + ADD2 $f26, $f27, $f19 + addl $20, 8*SIZE, $20 + MUL $f29, $f5, $f23 + LD $f5, 5*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + MUL $f29, $f6, $f24 + unop + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + MUL $f30, $f7, $f25 + unop + + ADD $f18, $f10, $f18 + LD $f10, 2*SIZE($20) + MUL $f30, $f6, $f26 + LD $f6, 
6*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 3*SIZE($20) + MUL $f29, $f7, $f27 + LD $f7, 7*SIZE($18) + + ST $f16,-8*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17,-7*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18,-6*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19,-5*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + LD $f12, 4*SIZE($20) + ADD $f17, $f13, $f17 + LD $f13, 5*SIZE($20) + ADD $f18, $f14, $f18 + LD $f14, 6*SIZE($20) + ADD $f19, $f15, $f19 + LD $f15, 7*SIZE($20) + + ST $f16,-4*SIZE($20) + + ST $f17,-3*SIZE($20) + + + ST $f18,-2*SIZE($20) + nop + ST $f19,-1*SIZE($20) +*/ + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 + +/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vinsf $f24, $f17, 0, $f17 + vinsf $f25, $f17, 2, $f17 + vinsf $f26, $f16, 1, $f16 + vinsf $f27, $f16, 3, $f16 + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VADD $f17, $f28, $f17 + VADD $f18, $f10, $f18 + VADD $f19, $f11, $f19 + + VST $f16, 0*VEC_LEN*SIZE($20) + VST $f17, 1*VEC_LEN*SIZE($20) + VST $f18, 2*VEC_LEN*SIZE($20) + VST $f19, 3*VEC_LEN*SIZE($20) + + addl $20, 16*SIZE, $20 + ble $5, $End + +/* MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18, 2*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19, 3*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 4*SIZE($20) + ST $f17, 5*SIZE($20) + ST $f18, 6*SIZE($20) + ST $f19, 7*SIZE($20) + + unop + unop +*/ + .align 4 + +$Remain: + subl $5, 1, $6 + ble $5, $End + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + addl $18, 2*SIZE, $18 + ble $6, $RemainLoopEnd + .align 4 + +$RemainLoop: + MUL $f29, $f0, $f20 + subl $6, 1, $6 + MUL $f30, $f1, $f21 + addl $20, 2*SIZE, $20 + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + + ST 
$f16,-2*SIZE($20) + addl $18, 2*SIZE, $18 + ST $f17,-1*SIZE($20) + bgt $6, $RemainLoop + .align 4 + +$RemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + ldi $sp, 64($sp) + ret + .align 4 + +$UnAlign_Y_ACCESS: + and $18, (VEC_LEN*SIZE-1), $7 + nop + nop + bgt $7, $UnAlign_XY_ACCESS + .align 4 +/* + Unalign access Y, Align access X +*/ + + VLD_UL $f8, 0*VEC_LEN*SIZE($20) + VLD_UH $f12, 1*VEC_LEN*SIZE($20) + + VLD_UL $f28, 1*VEC_LEN*SIZE($20) + VLD_UH $f13, 2*VEC_LEN*SIZE($20) + + VLD_UL $f10, 2*VEC_LEN*SIZE($20) + VLD_UH $f14, 3*VEC_LEN*SIZE($20) + + VLD_UL $f11, 3*VEC_LEN*SIZE($20) + VLD_UH $f15, 4*VEC_LEN*SIZE($20) + + VLD $f0, 0*VEC_LEN*SIZE($18) + VLD $f1, 1*VEC_LEN*SIZE($18) + VLD $f2, 2*VEC_LEN*SIZE($18) + VLD $f3, 3*VEC_LEN*SIZE($18) + + vbisw $f8, $f12, $f8 + vbisw $f28, $f13, $f28 + vbisw $f10, $f14, $f10 + vbisw $f11, $f15, $f11 + + addl $18, 16*SIZE, $18 + ble $4, $UnAlign_Y_MainLoopEnd + .align 4 +$UnAlign_Y_MainLoop: + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) + +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + +/*Compute*/ + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 + + VLD $f0, 0*VEC_LEN*SIZE($18) + VLD $f1, 1*VEC_LEN*SIZE($18) + VLD $f2, 2*VEC_LEN*SIZE($18) + VLD $f3, 3*VEC_LEN*SIZE($18) + + +/*combine the real & image vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vinsf $f24, $f17, 0, $f17 + addl $20, 16*SIZE, $20 + vinsf $f25, $f17, 2, $f17 + addl $18, 16*SIZE, $18 + + vinsf $f26, $f16, 1, $f16 + subl $4, 1, $4 + vinsf $f27, $f16, 3, $f16 + nop + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VLD_UL $f8, 0*VEC_LEN*SIZE($20) + VLD_UH $f12, 1*VEC_LEN*SIZE($20) + + VADD $f17, $f28, $f17 + VLD_UL $f28, 1*VEC_LEN*SIZE($20) + VLD_UH $f13, 2*VEC_LEN*SIZE($20) + + + VADD $f18, $f10, $f18 + VLD_UL $f10, 2*VEC_LEN*SIZE($20) + VLD_UH $f14, 3*VEC_LEN*SIZE($20) + + VADD $f19, $f11, $f19 + VLD_UL $f11, 3*VEC_LEN*SIZE($20) + VLD_UH $f15, 4*VEC_LEN*SIZE($20) + + + vbisw $f8, $f12, $f8 + VST_UL $f16, -4*VEC_LEN*SIZE($20) + VST_UH $f16, -3*VEC_LEN*SIZE($20) + + vbisw $f28, $f13, $f28 + VST_UL $f17, -3*VEC_LEN*SIZE($20) + VST_UH $f17, -2*VEC_LEN*SIZE($20) + + vbisw $f10, $f14, $f10 + VST_UL $f18, -2*VEC_LEN*SIZE($20) + VST_UH $f18, -1*VEC_LEN*SIZE($20) + + vbisw $f11, $f15, $f11 + VST_UL $f19, -1*VEC_LEN*SIZE($20) + VST_UH $f19, 0*VEC_LEN*SIZE($20) + + bgt $4, $UnAlign_Y_MainLoop + 
+$UnAlign_Y_MainLoopEnd: +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 + +/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vinsf $f24, $f17, 0, $f17 + vinsf $f25, $f17, 2, $f17 + vinsf $f26, $f16, 1, $f16 + vinsf $f27, $f16, 3, $f16 + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VADD $f17, $f28, $f17 + VADD $f18, $f10, $f18 + VADD $f19, $f11, $f19 + + VST_UL $f16, 0*VEC_LEN*SIZE($20) + VST_UH $f16, 1*VEC_LEN*SIZE($20) + VST_UL $f17, 1*VEC_LEN*SIZE($20) + VST_UH $f17, 2*VEC_LEN*SIZE($20) + + VST_UL $f18, 2*VEC_LEN*SIZE($20) + VST_UH $f18, 3*VEC_LEN*SIZE($20) + VST_UL $f19, 3*VEC_LEN*SIZE($20) + VST_UH $f19, 4*VEC_LEN*SIZE($20) + + addl $20, 16*SIZE, $20 + ble $5, $End + + jmp $Remain + + .align 4 + + +$UnAlign_X_ACCESS: + and $20, (VEC_LEN*SIZE-1), $6 + nop + nop + bgt $6, $UnAlign_XY_ACCESS + + .align 4 +/* + Unalign access X, Align access Y +*/ + VLD_UL $f0, 0*VEC_LEN*SIZE($18) + VLD_UH $f4, 1*VEC_LEN*SIZE($18) + + VLD_UL $f1, 1*VEC_LEN*SIZE($18) + VLD_UH $f5, 2*VEC_LEN*SIZE($18) + + VLD_UL $f2, 2*VEC_LEN*SIZE($18) + VLD_UH $f6, 3*VEC_LEN*SIZE($18) + + VLD_UL $f3, 3*VEC_LEN*SIZE($18) + VLD_UH $f7, 4*VEC_LEN*SIZE($18) + + VLD $f8, 0*VEC_LEN*SIZE($20) + VLD $f28, 1*VEC_LEN*SIZE($20) + VLD $f10, 2*VEC_LEN*SIZE($20) + VLD $f11, 3*VEC_LEN*SIZE($20) + + vbisw $f0, $f4, $f0 + vbisw $f1, $f5, $f1 + vbisw $f2, $f6, $f2 + vbisw $f3, $f7, $f3 + + addl $18, 16*SIZE, $18 + ble $4, $UnAlign_X_MainLoopEnd + .align 4 +$UnAlign_X_MainLoop: + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) + +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + +/*Compute*/ + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 +/* + VLD $f0, 0*VEC_LEN*SIZE($18) + VLD $f1, 1*VEC_LEN*SIZE($18) + VLD $f2, 2*VEC_LEN*SIZE($18) + VLD $f3, 3*VEC_LEN*SIZE($18) +*/ + VLD_UL $f0, 0*VEC_LEN*SIZE($18) + VLD_UH $f4, 1*VEC_LEN*SIZE($18) + + VLD_UL $f1, 1*VEC_LEN*SIZE($18) + VLD_UH $f5, 2*VEC_LEN*SIZE($18) + + VLD_UL $f2, 2*VEC_LEN*SIZE($18) + VLD_UH $f6, 3*VEC_LEN*SIZE($18) + + VLD_UL $f3, 3*VEC_LEN*SIZE($18) + VLD_UH $f7, 4*VEC_LEN*SIZE($18) + +/*combine the real & image 
vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vbisw $f0, $f4, $f0 + vbisw $f1, $f5, $f1 + vbisw $f2, $f6, $f2 + vbisw $f3, $f7, $f3 + + vinsf $f24, $f17, 0, $f17 + addl $20, 16*SIZE, $20 + vinsf $f25, $f17, 2, $f17 + addl $18, 16*SIZE, $18 + + vinsf $f26, $f16, 1, $f16 + subl $4, 1, $4 + vinsf $f27, $f16, 3, $f16 + nop + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VLD $f8, 0*VEC_LEN*SIZE($20) + VADD $f17, $f28, $f17 + VLD $f28, 1*VEC_LEN*SIZE($20) + + VADD $f18, $f10, $f18 + VLD $f10, 2*VEC_LEN*SIZE($20) + VADD $f19, $f11, $f19 + VLD $f11, 3*VEC_LEN*SIZE($20) + + VST $f16, -4*VEC_LEN*SIZE($20) + VST $f17, -3*VEC_LEN*SIZE($20) + VST $f18, -2*VEC_LEN*SIZE($20) + VST $f19, -1*VEC_LEN*SIZE($20) + + bgt $4, $UnAlign_X_MainLoop + .align 4 + +$UnAlign_X_MainLoopEnd: +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 + +/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vinsf $f24, $f17, 0, $f17 + vinsf $f25, $f17, 2, $f17 + vinsf $f26, $f16, 1, $f16 + vinsf $f27, $f16, 3, $f16 + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VADD $f17, $f28, $f17 + VADD $f18, $f10, $f18 + VADD $f19, $f11, $f19 + + VST $f16, 0*VEC_LEN*SIZE($20) + VST $f17, 1*VEC_LEN*SIZE($20) + VST $f18, 2*VEC_LEN*SIZE($20) + VST $f19, 3*VEC_LEN*SIZE($20) + + addl $20, 16*SIZE, $20 + ble $5, $End + + jmp $Remain + .align 4 + +$UnAlign_XY_ACCESS: +/* + Unalign access X & Y +*/ + VLD_UL $f0, 0*VEC_LEN*SIZE($18) + VLD_UH $f4, 1*VEC_LEN*SIZE($18) + + VLD_UL $f1, 1*VEC_LEN*SIZE($18) + VLD_UH $f5, 2*VEC_LEN*SIZE($18) + + VLD_UL $f2, 2*VEC_LEN*SIZE($18) + VLD_UH $f6, 3*VEC_LEN*SIZE($18) + + VLD_UL $f3, 3*VEC_LEN*SIZE($18) + VLD_UH $f7, 4*VEC_LEN*SIZE($18) + + VLD_UL $f8, 0*VEC_LEN*SIZE($20) + VLD_UH $f12, 1*VEC_LEN*SIZE($20) + + VLD_UL $f28, 1*VEC_LEN*SIZE($20) + VLD_UH $f13, 2*VEC_LEN*SIZE($20) + + VLD_UL $f10, 2*VEC_LEN*SIZE($20) + VLD_UH $f14, 3*VEC_LEN*SIZE($20) + + VLD_UL $f11, 3*VEC_LEN*SIZE($20) + VLD_UH $f15, 4*VEC_LEN*SIZE($20) + + vbisw $f0, $f4, $f0 + vbisw $f1, $f5, $f1 + vbisw $f2, $f6, $f2 + vbisw $f3, $f7, $f3 + + vbisw $f8, $f12, $f8 + vbisw $f28, $f13, $f28 + vbisw $f10, $f14, $f10 + vbisw $f11, $f15, $f11 + + addl $18, 16*SIZE, $18 + ble $4, $UnAlign_MainLoopEnd + .align 4 + +$UnAlign_MainLoop: + fillcs PREFETCHSIZE * SIZE($20) + fillcs PREFETCHSIZE * SIZE($18) + +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + 
vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + +/*Compute*/ + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 +/* + VLD $f0, 0*VEC_LEN*SIZE($18) + VLD $f1, 1*VEC_LEN*SIZE($18) + VLD $f2, 2*VEC_LEN*SIZE($18) + VLD $f3, 3*VEC_LEN*SIZE($18) +*/ + VLD_UL $f0, 0*VEC_LEN*SIZE($18) + VLD_UH $f4, 1*VEC_LEN*SIZE($18) + + VLD_UL $f1, 1*VEC_LEN*SIZE($18) + VLD_UH $f5, 2*VEC_LEN*SIZE($18) + + VLD_UL $f2, 2*VEC_LEN*SIZE($18) + VLD_UH $f6, 3*VEC_LEN*SIZE($18) + + VLD_UL $f3, 3*VEC_LEN*SIZE($18) + VLD_UH $f7, 4*VEC_LEN*SIZE($18) + +/*combine the real & image vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vbisw $f0, $f4, $f0 + vbisw $f1, $f5, $f1 + vbisw $f2, $f6, $f2 + vbisw $f3, $f7, $f3 + + vinsf $f24, $f17, 0, $f17 + addl $20, 16*SIZE, $20 + vinsf $f25, $f17, 2, $f17 + addl $18, 16*SIZE, $18 + + vinsf $f26, $f16, 1, $f16 + subl $4, 1, $4 + vinsf $f27, $f16, 3, $f16 + nop + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VLD_UL $f8, 0*VEC_LEN*SIZE($20) + VLD_UH $f12, 1*VEC_LEN*SIZE($20) + + VADD $f17, $f28, $f17 + VLD_UL $f28, 1*VEC_LEN*SIZE($20) + VLD_UH $f13, 2*VEC_LEN*SIZE($20) + + + VADD $f18, $f10, $f18 + VLD_UL $f10, 2*VEC_LEN*SIZE($20) + VLD_UH $f14, 3*VEC_LEN*SIZE($20) + + VADD $f19, $f11, $f19 + VLD_UL $f11, 3*VEC_LEN*SIZE($20) + VLD_UH $f15, 4*VEC_LEN*SIZE($20) + +/* + VST $f16, -4*VEC_LEN*SIZE($20) + VST $f17, -3*VEC_LEN*SIZE($20) + VST $f18, -2*VEC_LEN*SIZE($20) + VST $f19, -1*VEC_LEN*SIZE($20) +*/ + + vbisw $f8, $f12, $f8 + VST_UL $f16, -4*VEC_LEN*SIZE($20) + VST_UH $f16, -3*VEC_LEN*SIZE($20) + + vbisw $f28, $f13, $f28 + VST_UL $f17, -3*VEC_LEN*SIZE($20) + VST_UH $f17, -2*VEC_LEN*SIZE($20) + + vbisw $f10, $f14, $f10 + VST_UL $f18, -2*VEC_LEN*SIZE($20) + VST_UH $f18, -1*VEC_LEN*SIZE($20) + + vbisw $f11, $f15, $f11 + VST_UL $f19, -1*VEC_LEN*SIZE($20) + VST_UH $f19, 0*VEC_LEN*SIZE($20) + + bgt $4, $UnAlign_MainLoop + .align 4 + +$UnAlign_MainLoopEnd: + +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf $f0, 1, $f4 + vextf $f0, 3, $f5 + vextf $f1, 0, $f6 + vextf $f1, 2, $f7 + + vextf $f2, 1, $f12 + vextf $f2, 3, $f13 + vextf $f3, 0, $f14 + vextf $f3, 2, $f15 + + vinsf $f4, $f1, 0, $f1 + vinsf $f5, $f1, 2, $f1 + vinsf $f6, $f0, 1, $f0 + vinsf $f7, $f0, 3, $f0 + + vinsf $f12, $f3, 0, $f3 + vinsf $f13, $f3, 2, $f3 + vinsf $f14, $f2, 1, $f2 + vinsf $f15, $f2, 3, $f2 + + VMUL $f29, $f0, $f20 + VMUL $f30, $f0, $f21 + VMUL $f29, $f2, $f22 + VMUL $f30, $f2, $f23 + + VMAD1 $f30, $f1, $f20, $f16 + VMAD2 $f29, $f1, $f21, $f17 + VMAD1 $f30, $f3, $f22, $f18 + VMAD2 $f29, $f3, $f23, $f19 + +/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ + vextf $f16, 1, $f24 + vextf $f16, 3, $f25 + vextf $f17, 0, $f26 + vextf $f17, 2, $f27 + + vextf $f18, 1, $f12 + vextf $f18, 3, $f13 + vextf $f19, 0, $f14 + vextf $f19, 2, $f15 + + vinsf $f24, $f17, 0, $f17 + vinsf $f25, $f17, 
2, $f17 + vinsf $f26, $f16, 1, $f16 + vinsf $f27, $f16, 3, $f16 + + vinsf $f12, $f19, 0, $f19 + vinsf $f13, $f19, 2, $f19 + vinsf $f14, $f18, 1, $f18 + vinsf $f15, $f18, 3, $f18 + + VADD $f16, $f8, $f16 + VADD $f17, $f28, $f17 + VADD $f18, $f10, $f18 + VADD $f19, $f11, $f19 + + VST_UL $f16, 0*VEC_LEN*SIZE($20) + VST_UH $f16, 1*VEC_LEN*SIZE($20) + VST_UL $f17, 1*VEC_LEN*SIZE($20) + VST_UH $f17, 2*VEC_LEN*SIZE($20) + + VST_UL $f18, 2*VEC_LEN*SIZE($20) + VST_UH $f18, 3*VEC_LEN*SIZE($20) + VST_UL $f19, 3*VEC_LEN*SIZE($20) + VST_UH $f19, 4*VEC_LEN*SIZE($20) + + addl $20, 16*SIZE, $20 + ble $5, $End + + jmp $Remain + .align 4 +/*Unloop 4 complex = 8 float/double*/ +$Sub: + sra $16, 2, $4 + and $16, 3, $5 + SXSUBL $16, SIZE, $22 + addl $22, $22, $22 # Complex + .align 4 + + addl $19, $19, $19 # Complex + addl $21, $21, $21 # Complex + + ble $4, $SubRemain + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f2, 0*SIZE($18) + LD $f3, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f4, 0*SIZE($18) + LD $f5, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f6, 0*SIZE($18) + LD $f7, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $21, $20, $24 + + LD $f10, 0*SIZE($24) + LD $f11, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f12, 0*SIZE($24) + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f14, 0*SIZE($24) + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + subl $4, 1, $4 + ble $4, $SubMainLoopEnd + .align 4 + +$SubMainLoop: + MUL $f29, $f0, $f20 + unop + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + SXADDQ $19, $18, $18 + MUL $f30, $f3, $f25 + unop + + MUL $f30, $f2, $f26 + LD $f2, 0*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + MUL $f29, $f4, $f20 + unop + + ADD2 $f22, $f23, $f17 + unop + MUL $f30, $f5, $f21 + unop + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 0*SIZE($18) + + ADD2 $f26, $f27, $f19 + unop + MUL $f29, $f5, $f23 + LD $f5, 1*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + MUL $f29, $f6, $f24 + SXADDQ $19, $18, $18 + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + MUL $f30, $f7, $f25 + SXADDQ $21, $24, $24 + + ADD $f18, $f10, $f18 + LD $f10, 0*SIZE($24) + MUL $f30, $f6, $f26 + LD $f6, 0*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 1*SIZE($24) + MUL $f29, $f7, $f27 + LD $f7, 1*SIZE($18) + + ST $f16, 0*SIZE($20) + SXADDQ $19, $18, $18 + ADD1 $f20, $f21, $f16 + unop + + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + ADD2 $f22, $f23, $f17 + unop + + ST $f18, 0*SIZE($20) + SXADDQ $21, $24, $24 + ADD1 $f24, $f25, $f18 + unop + + ST $f19, 1*SIZE($20) + unop + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + + ADD $f16, $f12, $f16 + unop + LD $f12, 0*SIZE($24) + unop + + ADD $f17, $f13, $f17 + unop + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ADD $f18, $f14, $f18 + subl $4, 1, $4 + LD $f14, 0*SIZE($24) + unop + + ADD $f19, $f15, $f19 + unop + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + unop + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $4, $SubMainLoop + .align 4 + +$SubMainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + 
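For reference, the loop above appears to implement the complex AXPY update y := alpha*x + y, with $f29/$f30 holding the real and imaginary parts of alpha. A minimal C sketch of the per-element arithmetic follows; it is illustration only and not part of the patch. The name zaxpy_ref and the FLOAT typedef are hypothetical, strides are taken in complex elements (the kernel doubles INCX/INCY itself), and ADD1/ADD2 in the assembly expand to ADD or SUB depending on the conjugation variant, so the signs shown are the non-conjugated case.

#include <stddef.h>

typedef double FLOAT;                      /* SIZE-dependent in the kernel */

/* Minimal C sketch (illustration only): y[i] += alpha * x[i] for
 * interleaved (real, imag) complex data, non-conjugated signs. */
static void zaxpy_ref(size_t n, FLOAT alpha_r, FLOAT alpha_i,
                      const FLOAT *x, size_t incx,
                      FLOAT *y, size_t incy)
{
    for (size_t i = 0; i < n; i++) {
        FLOAT xr = x[0], xi = x[1];
        y[0] += alpha_r * xr - alpha_i * xi;   /* real part  (ADD1 result) */
        y[1] += alpha_i * xr + alpha_r * xi;   /* imag part  (ADD2 result) */
        x += 2 * incx;
        y += 2 * incy;
    }
}

The assembly reaches the same result by unrolling four complex elements per iteration and interleaving the loads, multiplies, and stores; the remainder paths ($Remain/$SubRemain) fall back to one element at a time.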
ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + SXADDQ $21, $20, $20 + nop + ST $f18, 0*SIZE($20) + ADD1 $f24, $f25, $f18 + + ST $f19, 1*SIZE($20) + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + ADD $f16, $f12, $f16 + + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + ble $5, $SubEnd + .align 4 + +$SubRemain: + subl $5, 1, $6 + ble $5, $SubEnd + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $19, $18, $18 + SXADDQ $21, $20, $24 + ble $6, $SubRemainLoopEnd + .align 4 + +$SubRemainLoop: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + + ADD2 $f22, $f23, $f17 + nop + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + SXADDQ $21, $24, $24 + subl $6, 1, $6 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $6, $SubRemainLoop + .align 4 + +$SubRemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$SubEnd: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + ldi $sp, 64($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S new file mode 100644 index 0000000..114a7a3 --- /dev/null +++ b/kernel/sw_64/zdot.S @@ -0,0 +1,583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define XX $21 +#define YY $23 + +#define I $5 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f2 +#define s3 $f30 +#define s4 $f3 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + PROLOGUE + PROFCODE + .frame $sp, 24, $26, 0 + + ldi $sp, -24($sp) + fclr s0 + fstd $f2, 0($sp) + fstd $f3, 16($sp) + fclr s1 + + fclr s2 + addl INCX, INCX, INCX + fclr s3 + ble N, $L999 + + addl INCY, INCY, INCY + fclr t0 + fclr t1 + fclr t2 + fclr t3 + + srl N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + LD b2, 0 * SIZE(Y) + LD b3, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + LD b4, 0 * SIZE(Y) + LD b5, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + + subl I, 1, I + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s4 + fmov s4,s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + fillcs PREFETCHSIZE * SIZE(X) + MUL a0, b1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s4 + fmov s4,s2 + fillcs PREFETCHSIZE * SIZE(Y) + MUL a1, b0, t2 + SXADDQ INCY, Y, Y + + ADD s3, t3, s4 + fmov s4,s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + #unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a6, b7, 
t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a5, b4, t2 + subl I, 1, I + + ADD s3, t3, s4 + fmov s4,s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + LD a6, 0 * SIZE(X) + MUL a7, b6, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD b6, 0 * SIZE(Y) + MUL a7, b7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s4 + fmov s4,s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s4 + fmov s4,s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s4 + fmov s4,s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + MUL a1, b0, t2 + ADD s3, t3, s4 + fmov s4,s3 + MUL a1, b1, t3 + + ADD s0, t0, s4 + fmov s4,s0 + MUL a2, b2, t0 + ADD s1, t1, s4 + fmov s4,s1 + MUL a2, b3, t1 + + ADD s2, t2, s4 + fmov s4,s2 + MUL a3, b2, t2 + ADD s3, t3, s4 + fmov s4,s3 + MUL a3, b3, t3 + + ADD s0, t0, s4 + fmov s4,s0 + MUL a4, b4, t0 + ADD s1, t1, s4 + fmov s4,s1 + MUL a4, b5, t1 + + ADD s2, t2, s4 + fmov s4,s2 + MUL a5, b4, t2 + ADD s3, t3, s4 + fmov s4,s3 + MUL a5, b5, t3 + + ADD s0, 
t0, s4 + fmov s4,s0 + MUL a6, b6, t0 + ADD s1, t1, s4 + fmov s4,s1 + MUL a6, b7, t1 + + ADD s2, t2, s4 + fmov s4,s2 + MUL a7, b6, t2 + ADD s3, t3, s4 + fmov s4,s3 + MUL a7, b7, t3 + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L998 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + subl I, 1, I + SXADDQ INCY, Y, Y + ble I, $L28 + .align 4 + +$L26: + ADD s0, t0, s4 + fmov s4,s0 + mov X, XX + MUL a0, b0, t0 + mov Y, YY + + ADD s1, t1, s4 + fmov s4,s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s4 + fmov s4,s2 + LD a0, 0 * SIZE(XX) + MUL a1, b0, t2 + LD b0, 0 * SIZE(YY) + + ADD s3, t3, s4 + fmov s4,s3 + subl I, 1, I + MUL a1, b1, t3 + LD a1, 1 * SIZE(XX) + + LD b1, 1 * SIZE(YY) + bgt I, $L26 + .align 4 + +$L28: + ADD s0, t0, s4 + fmov s4,s0 + MUL a0, b0, t0 + ADD s1, t1, s4 + fmov s4,s1 + MUL a0, b1, t1 + + ADD s2, t2, s4 + fmov s4,s2 + MUL a1, b0, t2 + ADD s3, t3, s4 + fmov s4,s3 + MUL a1, b1, t3 + .align 4 + +$L998: + ADD s0, t0, s4 + fmov s4,s0 + ADD s1, t1, s4 + fmov s4,s1 + ADD s2, t2, s4 + fmov s4,s2 + ADD s3, t3, s4 + fmov s4,s3 + +#ifndef CONJ + SUB s0, s3, s4 + fmov s4,s0 + ADD s1, s2, s4 + fmov s4,s1 +#else + ADD s0, s3, s4 + fmov s4,s0 + SUB s1, s2, s4 + fmov s4,s1 +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 16($sp) + ldi $sp, 24($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/zdot.S.bak b/kernel/sw_64/zdot.S.bak new file mode 100644 index 0000000..d10673c --- /dev/null +++ b/kernel/sw_64/zdot.S.bak @@ -0,0 +1,500 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define XX $21 +#define YY $23 + +#define I $5 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f2 +#define s3 $f30 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldi $sp, -16($sp) + fclr s0 + fstd $f2, 0($sp) + fclr s1 + + fclr s2 + addl INCX, INCX, INCX + fclr s3 + ble N, $L999 + + addl INCY, INCY, INCY + fclr t0 + fclr t1 + fclr t2 + fclr t3 + + srl N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + LD b2, 0 * SIZE(Y) + LD b3, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + LD b4, 0 * SIZE(Y) + LD b5, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + + subl I, 1, I + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + fillcs PREFETCHSIZE * SIZE(X) + MUL a0, b1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fillcs PREFETCHSIZE * SIZE(Y) + MUL a1, b0, t2 + SXADDQ INCY, Y, Y + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + subl I, 1, I + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 
* SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a6, 0 * SIZE(X) + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD b6, 0 * SIZE(Y) + MUL a7, b7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + + ADD s0, t0, s0 + MUL a2, b2, t0 + ADD s1, t1, s1 + MUL a2, b3, t1 + + ADD s2, t2, s2 + MUL a3, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a4, b5, t1 + + ADD s2, t2, s2 + MUL a5, b4, t2 + ADD s3, t3, s3 + MUL a5, b5, t3 + + ADD s0, t0, s0 + MUL a6, b6, t0 + ADD s1, t1, s1 + MUL a6, b7, t1 + + ADD s2, t2, s2 + MUL a7, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L998 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + subl I, 1, I + SXADDQ INCY, Y, Y + ble I, $L28 + .align 4 + +$L26: + ADD s0, t0, s0 + mov X, XX + MUL a0, b0, t0 + mov Y, YY + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a0, 0 * SIZE(XX) + MUL a1, b0, t2 + LD b0, 0 * SIZE(YY) + + ADD s3, t3, s3 + subl I, 1, I + MUL a1, b1, t3 + LD a1, 1 * SIZE(XX) + + LD b1, 1 * SIZE(YY) + bgt I, $L26 + .align 4 + +$L28: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a0, b1, t1 + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + .align 4 + +$L998: + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + +#ifndef CONJ + SUB s0, s3, s0 + ADD s1, s2, s1 +#else + ADD s0, s3, s0 + SUB s1, s2, s1 +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + ldi $sp, 16($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/zdot_simd.S b/kernel/sw_64/zdot_simd.S new file mode 100644 index 0000000..ed775e6 --- /dev/null +++ b/kernel/sw_64/zdot_simd.S @@ -0,0 +1,699 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define XX $21 +#define YY $23 + +#define I $5 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f2 +#define s3 $f30 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 + + PROLOGUE + PROFCODE + .frame $sp, 40, $26, 0 + + ldi $sp, -40($sp) + fclr s0 + fstd $f2, 0($sp) + fclr s1 + + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + + fclr s2 + addl INCX, INCX, INCX + fclr s3 + ble N, $L999 + + addl INCY, INCY, INCY + fclr t0 + fclr t1 + fclr t2 + fclr t3 + + cmpeq INCX, 2, $21 + cmpeq INCY, 2, $22 + and $21, $22, $22 + beq $22, $Sub + +/* + test the address of Y & X +*/ + and Y, (VEC_LEN*SIZE-1), $4 + and X, (VEC_LEN*SIZE-1), $3 + or $3, $4, $4 + bne $4, $UnAlign_ACCESS + +/*Align access*/ +/*UnLoop 8*/ + srl N, 3, I + ble I, $Remain + .align 4 + vcpys $f31, $f31, s0 #clear s0 vector + vcpys $f31, $f31, s1 #clear s0 vector + vcpys $f31, $f31, s2 #clear s0 vector + vcpys $f31, $f31, s3 #clear s0 vector + + vcpys $f31, $f31, t0 + vcpys $f31, $f31, t1 + vcpys $f31, $f31, t2 + vcpys $f31, $f31, t3 + +$MainLoop: + VLD a0, 0*VEC_LEN*SIZE(X) + VLD a1, 1*VEC_LEN*SIZE(X) + VLD a2, 2*VEC_LEN*SIZE(X) + VLD a3, 3*VEC_LEN*SIZE(X) + + VLD b0, 0*VEC_LEN*SIZE(Y) + VADD s0, t0, s0 + VLD b1, 1*VEC_LEN*SIZE(Y) + VADD s1, t1, s1 + + VLD b2, 2*VEC_LEN*SIZE(Y) + VADD s2, t2, s2 + VLD b3, 3*VEC_LEN*SIZE(Y) + VADD s3, t3, s3 + +/*spilt the X complex vector to real vector(a0, a2) and image vector (a1, a3) + 
Y complex vectory to real vector(b0, b2) and image vector (b1, b3) +*/ + vextf a0, 1, a4 + vextf a0, 3, a5 + vextf a1, 0, a6 + vextf a1, 2, a7 + + vextf a2, 1, t0 + vextf a2, 3, t1 + vextf a3, 0, t2 + vextf a3, 2, t3 + + vextf b0, 1, b4 + vextf b0, 3, b5 + vextf b1, 0, b6 + vextf b1, 2, b7 + + vextf b2, 1, t4 + vextf b2, 3, t5 + vextf b3, 0, t6 + vextf b3, 2, t7 + + vinsf a4, a1, 0, a1 + vinsf a6, a0, 1, a0 + vinsf t0, a3, 0, a3 + vinsf t2, a2, 1, a2 + + vinsf b4, b1, 0, b1 + addl X, 16 * SIZE, X + vinsf b6, b0, 1, b0 + addl Y, 16 * SIZE, Y + + vinsf t4, b3, 0, b3 + subl I, 1, I + vinsf t6, b2, 1, b2 + nop + + vinsf a5, a1, 2, a1 + vinsf a7, a0, 3, a0 + vinsf t1, a3, 2, a3 + vinsf t3, a2, 3, a2 + + vinsf b5, b1, 2, b1 + vinsf b7, b0, 3, b0 + vinsf t5, b3, 2, b3 + vinsf t7, b2, 3, b2 + + /*Computing*/ + + + fillcs PREFETCHSIZE * SIZE(X) + VMAD a0, b0, s0, s0 + fillcs PREFETCHSIZE * SIZE(Y) + VMAD a0, b1, s1, s1 + + VMAD a1, b0, s2, s2 + VMAD a1, b1, s3, s3 + VMUL a2, b2, t0 /*Just multiply. Add it in next loop.*/ + VMUL a2, b3, t1 + + VMUL a3, b2, t2 + VMUL a3, b3, t3 + nop + bgt I, $MainLoop + .align 4 +$MainLoopEnd: + VADD s0, t0, s0 + VADD s1, t1, s1 + VADD s2, t2, s2 + VADD s3, t3, s3 + +#ifndef CONJ + VSUB s0, s3, s0 + VADD s1, s2, s1 +#else + VADD s0, s3, s0 + VSUB s1, s2, s1 +#endif + vcpys $f31, $f31, s2 #clear s0 vector + vcpys $f31, $f31, s3 #clear s0 vector + + vextf s0, 1, t1 + vextf s0, 2, t2 + vextf s0, 3, t3 + vextf s1, 1, t5 + + vextf s1, 2, t6 + vextf s1, 3, t7 + ADD s0, t1, s0 + ADD t2, t3, t0 + + ADD s1, t5, s1 + ADD t6, t7, t4 + ADD s0, t0, s0 + ADD s1, t4, s1 +$Remain: + and N, 7, I + ble I, $End + .align 4 +$RemainLoop: + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + subl I, 1, I + SXADDQ INCY, Y, Y + MAD a0, b0, s0, s0 + + MAD a0, b1, s1, s1 + MAD a1, b0, s2, s2 + MAD a1, b1, s3, s3 + bgt I, $RemainLoop + .align 4 + +#ifndef CONJ + SUB s0, s3, s0 + ADD s1, s2, s1 +#else + ADD s0, s3, s0 + SUB s1, s2, s1 +#endif + +$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + ldi $sp, 40($sp) + ret + + .align 4 + +$UnAlign_ACCESS: +$Sub: + srl N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + LD b2, 0 * SIZE(Y) + LD b3, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + LD b4, 0 * SIZE(Y) + LD b5, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + + subl I, 1, I + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + fillcs PREFETCHSIZE * SIZE(X) + MUL a0, b1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fillcs PREFETCHSIZE * SIZE(Y) + MUL a1, b0, t2 + SXADDQ INCY, Y, Y + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 
0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + subl I, 1, I + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a6, 0 * SIZE(X) + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD b6, 0 * SIZE(Y) + MUL a7, b7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + + ADD s0, t0, s0 + MUL a2, b2, t0 + ADD s1, t1, s1 + MUL a2, b3, t1 + + ADD s2, t2, s2 + MUL a3, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a4, b5, t1 + + ADD s2, t2, s2 + MUL a5, b4, t2 + ADD s3, t3, s3 + MUL a5, b5, t3 + + ADD s0, t0, s0 + MUL a6, b6, t0 + ADD s1, t1, s1 + MUL a6, b7, t1 + + ADD s2, t2, s2 + MUL a7, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L998 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + subl I, 1, I + SXADDQ INCY, Y, Y + ble I, $L28 + .align 4 + +$L26: + ADD s0, t0, s0 + mov X, XX + MUL a0, b0, t0 + 
mov Y, YY + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a0, 0 * SIZE(XX) + MUL a1, b0, t2 + LD b0, 0 * SIZE(YY) + + ADD s3, t3, s3 + subl I, 1, I + MUL a1, b1, t3 + LD a1, 1 * SIZE(XX) + + LD b1, 1 * SIZE(YY) + bgt I, $L26 + .align 4 + +$L28: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a0, b1, t1 + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + .align 4 + +$L998: + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + +#ifndef CONJ + SUB s0, s3, s0 + ADD s1, s2, s1 +#else + ADD s0, s3, s0 + SUB s1, s2, s1 +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + + fldd $f6, 32($sp) + ldi $sp, 40($sp) + ret + + EPILOGUE diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S new file mode 100644 index 0000000..18f845c --- /dev/null +++ b/kernel/sw_64/zgemm_beta.S @@ -0,0 +1,192 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
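For reference, a minimal C sketch (illustration only, not part of the patch) of the accumulation scheme shared by the ZDOT kernels above: four partial sums are carried in s0..s3 and combined at label $L998, with the CONJ build flag choosing between the unconjugated and conjugated product. The names zdot_ref and zcomplex are hypothetical; strides are in complex elements. The unrolled $L22 loop only interleaves loads and multiplies to hide latency; the sketch keeps the arithmetic.

#include <stddef.h>

typedef struct { double r, i; } zcomplex;   /* hypothetical helper type */

/* Minimal C sketch of the ZDOT accumulation above (illustration only).
 * s0..s3 mirror the s0..s3 registers; the final combine matches the
 * #ifndef CONJ / #else block at label $L998. */
static zcomplex zdot_ref(size_t n,
                         const double *x, size_t incx,
                         const double *y, size_t incy,
                         int conj)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;

    for (size_t k = 0; k < n; k++) {
        double xr = x[0], xi = x[1], yr = y[0], yi = y[1];
        s0 += xr * yr;                       /* accumulated through t0 */
        s1 += xr * yi;                       /* t1 */
        s2 += xi * yr;                       /* t2 */
        s3 += xi * yi;                       /* t3 */
        x += 2 * incx;
        y += 2 * incy;
    }

    zcomplex dot;
    if (!conj) { dot.r = s0 - s3; dot.i = s1 + s2; }   /* dotu: x^T y        */
    else       { dot.r = s0 + s3; dot.i = s1 - s2; }   /* dotc: conj(x)^T y  */
    return dot;
}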
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl CNAME + .ent CNAME +CNAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $28, _mcount + jsr $28, ($28), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + ldl $18, 24($sp) + ble $16, $End + ldl $19, 32($sp) + ble $17, $End + + addl $19, $19, $19 + fbne $f19,$Main + fbne $f20,$Main + .align 4 + +$L13: + mov $18, $1 + ldi $17, -1($17) + SXADDQ $19, $18, $18 + mov $16, $2 + .align 4 + +$L12: + ST $f31, 0*SIZE($1) + ST $f31, 1*SIZE($1) + ldi $2, -1($2) + ldi $1, 2*SIZE($1) + bgt $2, $L12 + bgt $17,$L13 + clr $0 + ret + .align 4 + +/* Main Routine */ +$Main: + sra $16, 1, $2 # $2 = (m >> 1) + mov $18, $1 # c_offset = c + ldi $17, -1($17) # n -- + SXADDQ $19, $18, $18 # c += ldc + beq $2, $L18 + + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + LD $f24, 2*SIZE($1) + LD $f25, 3*SIZE($1) + ldi $2, -1($2) # $2 -- + ble $2, $L19 + .align 4 + + +$L23: + MUL $f19, $f14, $f10 + fillcs 9*SIZE($1) + MUL $f20, $f15, $f11 + ldi $2, -1($2) + + MUL $f19, $f15, $f12 + LD $f15, 5*SIZE($1) + MUL $f20, $f14, $f13 + LD $f14, 4*SIZE($1) + + MUL $f19, $f24, $f16 + unop + MUL $f20, $f25, $f17 + unop + + MUL $f19, $f25, $f18 + LD $f25, 7*SIZE($1) + SUB $f10, $f11, $f22 + unop + + MUL $f20, $f24, $f21 + LD $f24, 6*SIZE($1) + ADD $f12, $f13, $f23 + ldi $1, 4*SIZE($1) + + SUB $f16, $f17, $f26 + ADD $f18, $f21, $f27 + ST $f22,-4*SIZE($1) + ST $f23,-3*SIZE($1) + + ST $f26,-2*SIZE($1) + ST $f27,-1*SIZE($1) + unop + bgt $2,$L23 + .align 4 + +$L19: + MUL $f19, $f14, $f10 + MUL $f20, $f15, $f11 + MUL $f19, $f15, $f12 + MUL $f20, $f14, $f13 + + MUL $f19, $f24, $f16 + MUL $f20, $f25, $f17 + MUL $f19, $f25, $f18 + MUL $f20, $f24, $f21 + + SUB $f10, $f11, $f22 + ADD $f12, $f13, $f23 + SUB $f16, $f17, $f26 + ADD $f18, $f21, $f27 + ldi $1, 4*SIZE($1) + + ST $f22, -4*SIZE($1) + ST $f23, -3*SIZE($1) + ST $f26, -2*SIZE($1) + ST $f27, -1*SIZE($1) + + blbs $16, $L18 + bgt $17, $Main + clr $0 + ret + .align 4 + +$L18: + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + MUL $f19, $f15, $f13 + MUL $f20, $f14, $f10 + + MUL $f19, $f14, $f12 + MUL $f20, $f15, $f11 + ADD $f13, $f10, $f26 + SUB $f12, $f11, $f27 + + ST $f26, 1*SIZE($1) + ST $f27, 0*SIZE($1) + ldi $1, 2*SIZE($1) + bgt $17, $Main + .align 4 + +$End: + clr $0 + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S new file mode 100644 index 0000000..6cf954b --- /dev/null +++ b/kernel/sw_64/zgemm_kernel_2x2.S @@ -0,0 +1,1949 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
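For reference, zgemm_beta.S in the hunk just above pre-scales the output matrix before the micro-kernels run: C := beta*C with complex beta, plus a plain zero-fill fast path when both parts of beta are zero ($L12/$L13). A minimal C sketch follows, illustration only and not part of the patch; the name zgemm_beta_ref is hypothetical and ldc is taken in complex elements.

/* Minimal C sketch (illustration only) of the beta pre-scaling done by
 * the zgemm_beta kernel above, column by column over an m x n complex C. */
static void zgemm_beta_ref(long m, long n,
                           double beta_r, double beta_i,
                           double *c, long ldc)        /* ldc in complex elements */
{
    int beta_zero = (beta_r == 0.0) && (beta_i == 0.0);

    for (long j = 0; j < n; j++) {
        double *p = c + 2 * j * ldc;
        if (beta_zero) {
            for (long i = 0; i < 2 * m; i++)           /* $L12: plain zero fill */
                p[i] = 0.0;
        } else {
            for (long i = 0; i < m; i++) {             /* $Main/$L18: complex scale */
                double re = p[2 * i], im = p[2 * i + 1];
                p[2 * i]     = beta_r * re - beta_i * im;
                p[2 * i + 1] = beta_r * im + beta_i * re;
            }
        }
    }
}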
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(SW6) +#error "Architecture is not specified." +#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch sw6a + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 88 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define tmp $9 + +#define ALPHA_R 64($sp) +#define ALPHA_I 72($sp) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldl OFFSET, 24 + STACKSIZE($sp) +#endif + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 
8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + fstd $f19, ALPHA_R + fstd $f20, ALPHA_I + stl tmp, 80($sp) + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subl $31, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: + mov C, C1 + addl C, LDC, C2 + mov A, AO + s4addl K, 0, BB + + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + SXADDQ BB, B, BB + addl C2, LDC, C + unop + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#ifndef EV4 + fillcs 0 * SIZE(BB) + fillcs 8 * SIZE(BB) + unop + ldi BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, a6 + fmov a6, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, a6 + fmov a6, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, a6 + fmov a6, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, a6 + fmov a6, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, a6 + fmov a6, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, a6 + fmov a6, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, a6 + fmov a6, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, a6 + fmov a6, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, a6 + fmov a6, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, a6 + fmov a6, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, a6 + fmov a6, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, a6 + fmov a6, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, a6 + fmov a6, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + FIMOVD a6, tmp + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, a6 + fmov a6, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, a6 + fmov a6, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, 
a6 + fmov a6, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, a6 + fmov a6, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, a6 + fmov a6, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, a6 + fmov a6, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, a6 + fmov a6, c01 + unop + IFMOVD tmp, a6 + MUL b5, a6, t1 + unop + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, a6 + fmov a6, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, a6 + fmov a6, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, a6 + fmov a6, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, a6 + fmov a6, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, a6 + fmov a6, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, a6 + fmov a6, c13 + unop + IFMOVD tmp, a6 + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, a6 + fmov a6, c09 + unop + IFMOVD tmp, a6 + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, a6 + fmov a6, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, a6 + fmov a6, c07 + IFMOVD tmp, a6 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, a6 + fmov a6, c11 + fldd alpha_r, ALPHA_R + FIMOVD alpha_r, tmp + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD3 c12, t2, a6 + fmov a6, c12 + MUL b1, a2, t2 + ADD2 c16, t3, a6 + fmov a6, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, a6 + fmov a6, c15 + MUL b2, a1, t4 + ADD1 c01, t1, a6 + fmov a6, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, a6 + fmov a6, c06 + MUL b2, a4, t3 + ADD4 c05, t4, a6 + fmov a6, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, a6 + fmov a6, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, a6 + fmov a6, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, a6 + fmov a6, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, a6 + fmov a6, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, a6 + fmov a6, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, a6 + fmov a6, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, a6 + fmov a6, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, a6 + fmov a6, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD3 c12, t2, a6 + fmov a6, c12 + unop + MUL b1, a2, t2 + fldd alpha_i, ALPHA_I + + ADD2 c16, t3, a6 + fmov a6, c16 + unop + MUL b2, a2, t3 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD4 c15, t4, a6 + fmov a6, c15 + MUL b2, a1, t4 + ADD1 c01, t1, a6 + fmov a6, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 1 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t3, a6 + fmov a6, c06 + MUL b2, a4, t3 + ADD4 c05, t4, a6 + fmov a6, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, a6 + fmov a6, c03 + unop + MUL b3, a1, t1 +#ifndef TRMMKERNEL + LD a1, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c04, t2, a6 + fmov a6, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, a6 + fmov a6, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 3 * SIZE(C1) +#else + unop +#endif + + ADD4 c13, t4, a6 + fmov 
a6, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 0 * SIZE(C2) +#else + unop +#endif + + ADD1 c09, t1, a6 + fmov a6, c09 + ldi I, -1(I) + MUL b3, a3, t1 + unop + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 1 * SIZE(C2) +#else + unop +#endif + + ADD2 c14, t3, a6 + fmov a6, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 2 * SIZE(C2) +#else + unop +#endif + + ADD4 c07, t4, a6 + fmov a6, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 3 * SIZE(C2) +#else + unop +#endif + + ADD1 c11, t1, a6 + fmov a6, c11 + ADD3 c12, t2, a6 + fmov a6, c12 + ADD2 c16, t3, a6 + fmov a6, c16 + ADD4 c15, t4, a6 + fmov a6, c15 + + ADD c01, c06, a6 + fmov a6, c01 + ADD c02, c05, a6 + fmov a6, c02 + ADD c03, c08, a6 + fmov a6, c03 + ADD c04, c07, a6 + fmov a6, c04 + + ADD c09, c14, a6 + fmov a6, c09 + IFMOVD tmp, alpha_r + MUL alpha_r, c01, t1 + ADD c10, c13, a6 + fmov a6, c10 + IFMOVD tmp, alpha_r + MUL alpha_r, c02, t2 + + ADD c11, c16, a6 + fmov a6, c11 + IFMOVD tmp, alpha_r + MUL alpha_r, c03, t3 + ADD c12, c15, a6 + fmov a6, c12 + IFMOVD tmp, alpha_r + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD a5, t1, a6 + fmov a6, a5 + MUL alpha_i, c02, t1 + ADD b1, t2, a6 + fmov a6, b1 + MUL alpha_i, c01, t2 + + ADD a1, t3, a6 + fmov a6, a1 + MUL alpha_i, c04, t3 + ADD a2, t4, a6 + fmov a6, a2 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, a5 + MUL alpha_i, c02, t1 + ADD $f31, t2, b1 + MUL alpha_i, c01, t2 + + ADD $f31, t3, a1 + MUL alpha_i, c04, t3 + ADD $f31, t4, a2 + MUL alpha_i, c03, t4 +#endif + + SUB a5, t1, a6 + fmov a6, a5 + IFMOVD tmp, alpha_r + MUL alpha_r, c09, t1 + ADD b1, t2, a6 + fmov a6, b1 + IFMOVD tmp, alpha_r + MUL alpha_r, c10, t2 + + SUB a1, t3, a6 + fmov a6, a1 + IFMOVD tmp, alpha_r + MUL alpha_r, c11, t3 + ADD a2, t4, a6 + fmov a6, a2 + IFMOVD tmp, alpha_r + MUL alpha_r, c12, t4 + +#ifndef TRMMKERNEL + ADD b2, t1, a6 + fmov a6, b2 + MUL alpha_i, c10, t1 + ADD b3, t2, a6 + fmov a6, b3 + MUL alpha_i, c09, t2 + + ADD a4, t3, a6 + fmov a6, a4 + MUL alpha_i, c12, t3 + ADD a3, t4, a6 + fmov a6, a3 + MUL alpha_i, c11, t4 +#else + ADD $f31, t1, b2 + MUL alpha_i, c10, t1 + ADD $f31, t2, b3 + MUL alpha_i, c09, t2 + + ADD $f31, t3, a4 + MUL alpha_i, c12, t3 + ADD $f31, t4, a3 + MUL alpha_i, c11, t4 +#endif + + SUB b2, t1, a6 + fmov a6, b2 + ST a5, 0 * SIZE(C1) + fclr t1 + unop + + ADD b3, t2, a6 + fmov a6, b3 + ST b1, 1 * SIZE(C1) + fclr t2 + unop + + SUB a4, t3, a6 + fmov a6, a4 + ST a1, 2 * SIZE(C1) + fclr t3 + unop + + ADD a3, t4, a6 + fmov a6, a3 + ST a2, 3 * SIZE(C1) + fclr t4 + unop + + ST b2, 0 * SIZE(C2) + fclr c01 + ST b3, 1 * SIZE(C2) + fclr c05 + + ST a4, 2 * SIZE(C2) + ldi C1, 4 * SIZE(C1) + ST a3, 3 * SIZE(C2) + ldi C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * 
SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + ble L, $L25 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, a6 + fmov a6, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, a6 + fmov a6, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, a6 + fmov a6, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, a6 + fmov a6, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, a6 + fmov a6, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, a6 + fmov a6, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, a6 + fmov a6, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, a6 + fmov a6, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, a6 + fmov a6, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, a6 + fmov a6, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, a6 + fmov a6, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, a6 + fmov a6, c09 + fldd alpha_r, ALPHA_R + FIMOVD alpha_r, tmp + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + .align 4 + + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, a6 + fmov a6, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, a6 + fmov a6, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, a6 + fmov a6, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, a6 + fmov a6, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, a6 + fmov a6, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD3 c10, t2, a6 + fmov a6, c10 + unop + MUL a2, b1, t2 + fldd alpha_i, ALPHA_I + + ADD4 c13, t3, a6 + fmov a6, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c14, t4, a6 + fmov a6, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD4 c05, t3, a6 + fmov a6, c05 + MUL a1, b4, t3 + ADD2 
c06, t4, a6 + fmov a6, c06 + MUL a2, b4, t4 + + ADD1 c09, t1, a6 + fmov a6, c09 + ADD3 c10, t2, a6 + fmov a6, c10 + ADD4 c13, t3, a6 + fmov a6, c13 + ADD2 c14, t4, a6 + fmov a6, c14 + + ADD c01, c06, a6 + fmov a6, c01 + ADD c02, c05, a6 + fmov a6, c02 + ADD c09, c14, a6 + fmov a6, c09 + ADD c10, c13, a6 + fmov a6, c10 + + IFMOVD tmp, alpha_r + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c09, t3 + MUL alpha_r, c10, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, a6 + fmov a6, c03 + MUL alpha_i, c02, t1 + ADD c04, t2, a6 + fmov a6, c04 + MUL alpha_i, c01, t2 + + ADD c11, t3, a6 + fmov a6, c11 + MUL alpha_i, c10, t3 + ADD c12, t4, a6 + fmov a6, c12 + MUL alpha_i, c09, t4 +#else + ADD $f31, t1, c03 + MUL alpha_i, c02, t1 + ADD $f31, t2, c04 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c10, t3 + ADD $f31, t4, c12 + MUL alpha_i, c09, t4 +#endif + + SUB c03, t1, a6 + fmov a6, c03 + ADD c04, t2, a6 + fmov a6, c04 + SUB c11, t3, a6 + fmov a6, c11 + ADD c12, t4, a6 + fmov a6, c12 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + ST c11, 0 * SIZE(C2) + ST c12, 1 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 1, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif + .align 4 + +$L29: + mov BO, B + ldi J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + fclr c04 + fclr c08 + ble L, $L45 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, ZBASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, a6 + fmov a6, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, a6 + fmov a6, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, a6 + fmov a6, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, a6 + fmov a6, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, a6 + fmov a6, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + 
ADD1 c03, t3, a6 + fmov a6, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, a6 + fmov a6, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, a6 + fmov a6, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, a6 + fmov a6, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, a6 + fmov a6, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, a6 + fmov a6, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, a6 + fmov a6, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, a6 + fmov a6, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, a6 + fmov a6, c05 + fldd alpha_r, ALPHA_R + FIMOVD alpha_r, tmp + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L48 +#else + blbs TMP1, $L48 +#endif + .align 4 + + ADD2 c06, t2, a6 + fmov a6, c06 + MUL a2, b1, t2 + ADD4 c07, t3, a6 + fmov a6, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, a6 + fmov a6, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, a6 + fmov a6, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, a6 + fmov a6, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, a6 + fmov a6, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L48: + ADD2 c06, t2, a6 + fmov a6, c06 + unop + MUL a2, b1, t2 + fldd alpha_i, ALPHA_I + + ADD4 c07, t3, a6 + fmov a6, c07 + ldi I, -1(I) + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c08, t4, a6 + fmov a6, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD1 c03, t3, a6 + fmov a6, c03 + MUL a3, b2, t3 + ADD3 c04, t4, a6 + fmov a6, c04 + MUL a4, b2, t4 + + ADD4 c05, t1, a6 + fmov a6, c05 + ADD2 c06, t2, a6 + fmov a6, c06 + ADD4 c07, t3, a6 + fmov a6, c07 + ADD2 c08, t4, a6 + fmov a6, c08 + + ADD c01, c06, a6 + fmov a6, c01 + ADD c02, c05, a6 + fmov a6, c02 + ADD c03, c08, a6 + fmov a6, c03 + ADD c04, c07, a6 + fmov a6, c04 + + IFMOVD tmp, alpha_r + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c03, t3 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD c09, t1, a6 + fmov a6, c09 + MUL alpha_i, c02, t1 + ADD c10, t2, a6 + fmov a6, c10 + MUL alpha_i, c01, t2 + + ADD c11, t3, a6 + fmov a6, c11 + MUL alpha_i, c04, t3 + ADD c12, t4, a6 + fmov a6, c12 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, c09 + MUL alpha_i, c02, t1 + ADD $f31, t2, c10 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c04, t3 + ADD $f31, t4, c12 + MUL alpha_i, c03, t4 +#endif + + SUB c09, t1, a6 + fmov a6, c09 + ADD c10, t2, a6 + fmov a6, c10 + SUB c11, t3, a6 + fmov a6, c11 + ADD c12, t4, a6 + fmov a6, c12 + + ST c09, 0 * SIZE(C1) + ST c10, 1 * SIZE(C1) + ST c11, 2 * SIZE(C1) + ST c12, 3 * SIZE(C1) + + ldi C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + ble L, $L55 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, a6 + fmov a6, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, a6 + fmov a6, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, a6 + fmov a6, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, a6 + fmov a6, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, a6 + fmov a6, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, a6 + fmov a6, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, a6 + fmov a6, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, a6 + fmov a6, c01 + fldd alpha_r, ALPHA_R + FIMOVD alpha_r, tmp + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, a6 + fmov a6, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, a6 + fmov a6, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, a6 + fmov a6, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L58: + ADD3 c02, t2, a6 + fmov a6, c02 + unop + MUL a2, b1, t2 + fldd alpha_i, ALPHA_I + + ADD4 c05, t3, a6 + fmov a6, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t4, a6 + fmov a6, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, a6 + fmov a6, c01 + ADD3 c02, t2, a6 + fmov a6, c02 + ADD4 c05, t3, a6 + fmov a6, c05 + ADD2 c06, t4, a6 + fmov a6, c06 + + ADD c01, c06, a6 + fmov a6, c01 + ADD c02, c05, a6 + fmov a6, c02 + + IFMOVD tmp, alpha_r + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_i, c02, t3 + MUL alpha_i, c01, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, a6 + fmov a6, c03 + ADD c04, t2, a6 + fmov a6, c04 
+#else + ADD $f31, t1, c03 + ADD $f31, t2, c04 +#endif + + SUB c03, t3, a6 + fmov a6, c03 + ADD c04, t4, a6 + fmov a6, c04 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl $9, 80($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/zgemm_kernel_2x2.S.bak b/kernel/sw_64/zgemm_kernel_2x2.S.bak new file mode 100644 index 0000000..2133673 --- /dev/null +++ b/kernel/sw_64/zgemm_kernel_2x2.S.bak @@ -0,0 +1,1704 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(SW2B) +#error "Architecture is not specified." 
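Note on the write-back sequences in the scalar kernel above: every block ends with the same complex alpha update. The accumulator pair (for example c01/c02) is scaled by alpha_r/alpha_i with MUL, combined with SUB/ADD, and either added to the values loaded from C or, for TRMMKERNEL, stored directly (ADD $f31, ... uses the zero register). A minimal C sketch of that arithmetic, under the plain (non-conjugated) sign convention; the function and variable names here are illustrative only and are not part of the patch:

/* Illustrative only: the complex alpha update performed by the
 * MUL alpha_r/alpha_i ... SUB/ADD ... ST groups in the write-back above.
 * acc_re/acc_im stand for an accumulator pair such as c01/c02,
 * c_re/c_im for the values at C1/C2 (not loaded when TRMMKERNEL). */
static void complex_axpy_writeback(double alpha_r, double alpha_i,
                                   double acc_re, double acc_im,
                                   double *c_re, double *c_im, int trmm)
{
    double base_re = trmm ? 0.0 : *c_re;   /* TRMM kernels overwrite C */
    double base_im = trmm ? 0.0 : *c_im;

    *c_re = base_re + alpha_r * acc_re - alpha_i * acc_im;
    *c_im = base_im + alpha_r * acc_im + alpha_i * acc_re;
}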
+#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA_R 64($sp) +#define ALPHA_I 72($sp) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldl OFFSET, 24 + STACKSIZE($sp) +#endif + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + fstd $f19, ALPHA_R + fstd $f20, ALPHA_I + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subl $31, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: + mov C, C1 + addl C, LDC, C2 + mov A, AO + s4addl K, 0, BB + + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + SXADDQ BB, B, BB + addl C2, LDC, C + unop + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#ifndef EV4 + fillcs 0 * SIZE(BB) + fillcs 8 * SIZE(BB) + unop + ldi BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif 
+ fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + fldd alpha_r, ALPHA_R + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * 
SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + fldd alpha_i, ALPHA_I + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 1 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 +#ifndef TRMMKERNEL + LD a1, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 3 * SIZE(C1) +#else + unop +#endif + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 0 * SIZE(C2) +#else + unop +#endif + + ADD1 c09, t1, c09 + ldi I, -1(I) + MUL b3, a3, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 1 * SIZE(C2) +#else + unop +#endif + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 2 * SIZE(C2) +#else + unop +#endif + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 3 * SIZE(C2) +#else + unop +#endif + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + MUL alpha_r, c01, t1 + ADD c10, c13, c10 + MUL alpha_r, c02, t2 + + ADD c11, c16, c11 + MUL alpha_r, c03, t3 + ADD c12, c15, c12 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD a5, t1, a5 + MUL alpha_i, c02, t1 + ADD b1, t2, b1 + MUL alpha_i, c01, t2 + + ADD a1, t3, a1 + MUL alpha_i, c04, t3 + ADD a2, t4, a2 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, a5 + MUL alpha_i, c02, t1 + ADD $f31, t2, b1 + MUL alpha_i, c01, t2 + + ADD $f31, t3, a1 + MUL alpha_i, c04, t3 + ADD $f31, t4, a2 + MUL alpha_i, c03, t4 +#endif + + SUB a5, t1, a5 + MUL alpha_r, c09, t1 + ADD b1, t2, b1 + MUL alpha_r, c10, t2 + + SUB a1, t3, a1 + MUL alpha_r, c11, t3 + ADD a2, t4, a2 + MUL alpha_r, c12, t4 + +#ifndef TRMMKERNEL + ADD b2, t1, b2 + MUL alpha_i, c10, t1 + ADD b3, t2, b3 + MUL alpha_i, c09, t2 + + ADD a4, t3, a4 + MUL alpha_i, c12, t3 + ADD a3, t4, a3 + MUL alpha_i, c11, t4 +#else + ADD $f31, t1, b2 + MUL alpha_i, c10, t1 + ADD $f31, t2, b3 + MUL alpha_i, c09, t2 + + ADD $f31, t3, a4 + MUL alpha_i, c12, t3 + ADD $f31, t4, a3 + MUL alpha_i, c11, t4 +#endif + + SUB b2, t1, b2 + ST a5, 0 * SIZE(C1) + fclr t1 + unop + + ADD b3, t2, b3 + ST b1, 1 * SIZE(C1) + fclr t2 + unop + + SUB a4, t3, a4 + ST a1, 2 * SIZE(C1) + fclr t3 + unop + + ADD a3, t4, a3 + ST a2, 3 * SIZE(C1) + fclr t4 + unop + + ST b2, 0 * SIZE(C2) + fclr c01 + ST b3, 1 * SIZE(C2) + fclr c05 + + ST a4, 2 * SIZE(C2) + ldi C1, 4 * SIZE(C1) + ST a3, 3 * SIZE(C2) + ldi C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + ble L, $L25 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + fldd alpha_r, ALPHA_R + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + fldd alpha_i, ALPHA_I + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + 
ADD4 c05, t3, c05 + MUL a1, b4, t3 + ADD2 c06, t4, c06 + MUL a2, b4, t4 + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c09, t3 + MUL alpha_r, c10, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, c03 + MUL alpha_i, c02, t1 + ADD c04, t2, c04 + MUL alpha_i, c01, t2 + + ADD c11, t3, c11 + MUL alpha_i, c10, t3 + ADD c12, t4, c12 + MUL alpha_i, c09, t4 +#else + ADD $f31, t1, c03 + MUL alpha_i, c02, t1 + ADD $f31, t2, c04 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c10, t3 + ADD $f31, t4, c12 + MUL alpha_i, c09, t4 +#endif + + SUB c03, t1, c03 + ADD c04, t2, c04 + SUB c11, t3, c11 + ADD c12, t4, c12 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + ST c11, 0 * SIZE(C2) + ST c12, 1 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 1, TMP1 +#else + subl TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif + .align 4 + +$L29: + mov BO, B + ldi J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 2, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + +#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + fclr c04 + fclr c08 + ble L, $L45 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + sll KK, ZBASE_SHIFT + 0, TMP1 + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + 
ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + fldd alpha_r, ALPHA_R + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L48 +#else + blbs TMP1, $L48 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L48: + ADD2 c06, t2, c06 + unop + MUL a2, b1, t2 + fldd alpha_i, ALPHA_I + + ADD4 c07, t3, c07 + ldi I, -1(I) + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD1 c03, t3, c03 + MUL a3, b2, t3 + ADD3 c04, t4, c04 + MUL a4, b2, t4 + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c03, t3 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD c09, t1, c09 + MUL alpha_i, c02, t1 + ADD c10, t2, c10 + MUL alpha_i, c01, t2 + + ADD c11, t3, c11 + MUL alpha_i, c04, t3 + ADD c12, t4, c12 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, c09 + MUL alpha_i, c02, t1 + ADD $f31, t2, c10 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c04, t3 + ADD $f31, t4, c12 + MUL alpha_i, c03, t4 +#endif + + SUB c09, t1, c09 + ADD c10, t2, c10 + SUB c11, t3, c11 + ADD c12, t4, c12 + + ST c09, 0 * SIZE(C1) + ST c10, 1 * SIZE(C1) + ST c11, 2 * SIZE(C1) + ST c12, 3 * SIZE(C1) + + ldi C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 +#ifdef LEFT + subl TMP1, 2, TMP1 +#else + subl TMP1, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif + + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addl KK, 1, TMP1 +#else + addl KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + 
+#ifndef TRMMKERNEL + ldi L, -2(K) +#else + ldi L, -2(TMP1) +#endif + ble L, $L55 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AO, TMP1, AO + addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + fldd alpha_r, ALPHA_R + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L58: + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + fldd alpha_i, ALPHA_I + + ADD4 c05, t3, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_i, c02, t3 + MUL alpha_i, c01, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, c03 + ADD c04, t2, c04 +#else + ADD $f31, t1, c03 + ADD $f31, t2, c04 +#endif + + SUB c03, t3, c03 + ADD c04, t4, c04 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/zgemm_kernel_simd_8x2.S b/kernel/sw_64/zgemm_kernel_simd_8x2.S new file mode 100644 index 0000000..f6a36fb --- /dev/null +++ b/kernel/sw_64/zgemm_kernel_simd_8x2.S @@ -0,0 +1,3189 @@ +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(SW2B) +#error "Architecture is not specified." 
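The SIMD kernel that follows repeats the TRMMKERNEL block setup already seen in the scalar kernels: depending on LEFT/TRANSA, a block either starts at the head of both panels and runs KK plus the register block size iterations, or skips the first KK iterations of A and B (the sll KK, ZBASE_SHIFT + ..., TMP1 / addl ... / subl K, KK, TMP1 sequences) and runs K - KK. A hedged C sketch of that trip-count and offset computation; trmm_trip_setup, skip_head, mr and nr are illustrative names, not code from the patch, and offsets are counted in double elements (two per complex entry, which is what ZBASE_SHIFT encodes):

/* Illustrative only: the TRMMKERNEL trip-count / panel-offset logic that
 * each block repeats ("addl KK, mr, TMP1" vs. "sll KK, ... / subl K, KK").
 * skip_head corresponds to the (LEFT/TRANSA) #if condition in the kernels. */
static long trmm_trip_setup(long k, long kk, int mr, int nr, int left,
                            int skip_head,
                            long *a_skip_elems, long *b_skip_elems)
{
    if (!skip_head) {
        /* start at the head of both panels; trip count grows with KK */
        *a_skip_elems = 0;
        *b_skip_elems = 0;
        return kk + (left ? mr : nr);   /* addl KK, mr/nr, TMP1 */
    }
    /* otherwise skip the first KK iterations of both panels */
    *a_skip_elems = 2 * kk * mr;        /* sll KK, ZBASE_SHIFT + log2(mr) */
    *b_skip_elems = 2 * kk * nr;
    return k - kk;                      /* subl K, KK, TMP1 */
}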
+#endif + +#define STACKSIZE 128 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define PREA $10 +#define PREB $11 + +#define AO $9 +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define a5 $f16 +#define a6 $f24 +#define a7 $f25 +#define a8 $f26 + +#define b5 $f27 +#define b6 $f28 +#define b7 $f29 +#define b8 $f30 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TEMP $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA_R 64($sp) +#define ALPHA_I 72($sp) + +/* + *=================== + * (a+bi)*(c+di) + * ADD1 ac '+' bd + * ADD2 ad '+' bc + * FMAD5 a*alpha_r + real part + * FMAD6 a*alpha_i + image part + * FMAD7 b*alpha_r + image part + * FMAD8 b*alpha_i + real part + + *=================== + */ + +/* + *=================== + * (a+bi) * (c+di) + * (a+bi) * (alpha_r+alpha_i) + *=================== + */ +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 SUB +#define ADD2 ADD +#define FMAD5 MAD +#define FMAD6 MAD +#define FMAD7 MAD +#define FMAD8 NMAD +#endif + +/* + *=================== + * (a-bi) * (c+di) + * (a+bi) * (alpha_r+alpha_i) + *=================== + */ + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 ADD +#define ADD2 SUB +#define FMAD5 MAD +#define FMAD6 MAD +#define FMAD7 MAD +#define FMAD8 NMAD +#endif + +/* + *=================== + * (a+bi) * (c-di) + * (a-bi) * (alpha_r+alpha_i) + *=================== + */ + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 ADD +#define ADD2 SUB +#define FMAD5 MAD +#define FMAD6 MAD +#define FMAD7 NMAD +#define FMAD8 MAD +#endif + +/* + *=================== + * (a-bi) * (c-di) + * (a-bi) * (alpha_r+alpha_i) + *=================== + */ +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define ADD1 SUB +#define ADD2 ADD +#define FMAD5 MAD +#define FMAD6 MAD +#define FMAD7 NMAD +#define FMAD8 MAD +#endif + + + + PROLOGUE + PROFCODE + + .frame $30, STACKSIZE, $26, 0 + ldi $sp, -STACKSIZE($sp) + + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldl OFFSET, 24 + STACKSIZE($sp) +#endif + + sll LDC, ZBASE_SHIFT, LDC # LDC*sizebyte + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + ST $f19, ALPHA_R + ST $f20, ALPHA_I + + stl $9, 80($sp) # Integer Saved Register + stl $10,88($sp) + stl $11,96($sp) + stl $12,104($sp) + stl $13,112($sp) + stl $14,120($sp) + + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subl $31, OFFSET, KK +#endif + + sra N, 1, J # J=N/2 + ble J, $L50 + .align 4 + +$L01: +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 3, I # I=M/8 + sll K, ZBASE_SHIFT, PREB + + sll K, 2+ZBASE_SHIFT, PREA + mov C, C1 + + addl C, LDC, C2 + mov A, AO # Reset A + + addl PREB, B, PREB + addl C2, LDC, C 
# Change C to next panel + + addl PREA, A, PREA + beq I, $L20 # GEMM_MR=8 + +$L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO # LL && RU reset B + nop +#else + sll KK, 3 + ZBASE_SHIFT, L # KK*8mr + sll KK, 1 + ZBASE_SHIFT, TEMP # KK*2nr + + addl AO, L, AO # mov AO point to the data part + addl B,TEMP,BO # mov BO point to the data part +#endif + + vcpys $f31,$f31,c01 # Clear result regs + fillcs 0(C1) + fillcs 4*SIZE(C1) + + vcpys $f31,$f31,c02 + fillcs 8*SIZE(C1) + fillcs 12*SIZE(C1) + + vcpys $f31,$f31,c03 + fillcs 0(C2) + fillcs 4*SIZE(C2) + + vcpys $f31,$f31,c04 + fillcs 8*SIZE(C2) + fillcs 12*SIZE(C2) + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + vcpys $f31,$f31,c07 + vcpys $f31,$f31,c08 + + vcpys $f31,$f31,c09 + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + + vcpys $f31,$f31,c10 + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + + vcpys $f31,$f31,c11 + LDDE b3, 2 * SIZE(BO) # B2R + LDDE b4, 3 * SIZE(BO) # B2I + + vcpys $f31,$f31,c12 + VLD a3, 8 * SIZE(AO) # A5, A6 + VLD a4,12 * SIZE(AO) # A7, A8 + + vcpys $f31,$f31,c13 + vcpys $f31,$f31,c14 + vcpys $f31,$f31,c15 + vcpys $f31,$f31,c16 + + + + +#if (defined(LEFT) && !defined(TRANSA)) \ + ||(!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP # temp is the length of data part +#elif defined(LEFT) + addl KK, 8, TEMP # mr=8, careful about complex +#else + addl KK, 2, TEMP # nr=2 +#endif + sra TEMP, 1, L # L=TEMP/2 + ble L, $L15 + +#else + vcpys $f31,$f31,c01 # Clear result regs + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + vcpys $f31,$f31,c02 + fillcs 0(C1) + fillcs 4*SIZE(C1) + + vcpys $f31,$f31,c03 + fillcs 8*SIZE(C1) + fillcs 12*SIZE(C1) + + vcpys $f31,$f31,c04 + fillcs 0(C2) + fillcs 4*SIZE(C2) + + vcpys $f31,$f31,c05 + fillcs 8*SIZE(C2) + fillcs 12*SIZE(C2) + + vcpys $f31,$f31,c06 + vcpys $f31,$f31,c07 + vcpys $f31,$f31,c08 + vcpys $f31,$f31,c09 + + vcpys $f31,$f31,c10 + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + + vcpys $f31,$f31,c11 + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + + vcpys $f31,$f31,c12 + LDDE b3, 2 * SIZE(BO) # B2R + LDDE b4, 3 * SIZE(BO) # B2I + + vcpys $f31,$f31,c13 + VLD a3, 8 * SIZE(AO) # A5, A6 + VLD a4,12 * SIZE(AO) # A7, A8 + + vcpys $f31,$f31,c14 + vcpys $f31,$f31,c15 + + vcpys $f31,$f31,c16 + ble L, $L15 +#endif + + .align 4 +$L12: + addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE + VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) + LDDE b5, 4 * SIZE(BO) # next B1R + + VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) + LDDE b6, 5 * SIZE(BO) # next B1I + + VMAD a2,b1,c05,c05 # C31, C41 + VLD a8,12 * SIZE(AO) # next A7, A8 + + VMAD a2,b2,c06,c06 # C31, C41 + VLD a7, 8 * SIZE(AO) # next A5, A6 + + VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc) + VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd) + VMAD a3,b1,c09,c09 # C51, C61 + VMAD a3,b2,c10,c10 # C51, C61 + + + VMAD a2,b3,c07,c07 # C32, C42 + LDDE b7, 6 * SIZE(BO) # next B2R + + VMAD a2,b4,c08,c08 # C32, C42 + LDDE b8, 7 * SIZE(BO) # next B2I + + VMAD a4,b1,c13,c13 # C71, C81 + VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0 + + VMAD a4,b2,c14,c14 # C71, C81 + VLD a6, 4 * SIZE(AO) # next A3, A4 + addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE + + + VMAD a3,b3,c11,c11 # C52, C62 + fillcs 0(PREB) + + VMAD a3,b4,c12,c12 # C52, C62 + fillcs 0(PREA) + + VMAD a4,b3,c15,c15 # C72, C82 + fillcs 8*SIZE(PREA) + + VMAD a4,b4,c16,c16 # C72, C82 + subl L, 1, L # + addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE + + VMAD a8,b5,c13,c13 + 
LDDE b1, 0 * SIZE(BO) + + VMAD a8,b6,c14,c14 + LDDE b2, 1 * SIZE(BO) + + VMAD a7,b5,c09,c09 + addl PREA, 16*SIZE, PREA + VLD a4,12 * SIZE(AO) + + VMAD a7,b6,c10,c10 + VLD a3, 8 * SIZE(AO) + + VMAD a5,b5,c01,c01 + VMAD a5,b6,c02,c02 + VMAD a5,b7,c03,c03 + VMAD a5,b8,c04,c04 + + VMAD a8,b7,c15,c15 + LDDE b3, 2 * SIZE(BO) + + VMAD a8,b8,c16,c16 + LDDE b4, 3 * SIZE(BO) + + VMAD a6,b5,c05,c05 + VLD a1, 0 * SIZE(AO) + + VMAD a6,b6,c06,c06 + VLD a2, 4 * SIZE(AO) + + + VMAD a7,b7,c11,c11 + fillcs 4*SIZE(PREB) + + VMAD a7,b8,c12,c12 + fillcs 0(PREA) + + VMAD a6,b7,c07,c07 + addl PREB, 8*SIZE, PREB + fillcs 8*SIZE(PREA) + + VMAD a6,b8,c08,c08 + addl PREA, 16*SIZE, PREA + bne L, $L12 # continue K + +$L15: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L18 # if(K&1) +#else + blbc TEMP, $L18 +#endif + +$L16: + VMAD a1,b1,c01,c01 # C11R C21R + addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE + + VMAD a1,b2,c02,c02 # C11I C21I + addl BO, 4*SIZE, BO + + VMAD a1,b3,c03,c03 # C12R c22R + VMAD a1,b4,c04,c04 # C12I C22I + + VMAD a2,b1,c05,c05 # C31R C41R + VMAD a2,b2,c06,c06 # C31I C41I + VMAD a2,b3,c07,c07 # C32R C42R + VMAD a2,b4,c08,c08 # C32I C42I + + VMAD a3,b1,c09,c09 # C51R C61R + VMAD a3,b2,c10,c10 # C51I C61I + VMAD a3,b3,c11,c11 # C52R C62R + VMAD a3,b4,c12,c12 # C52I C62I + + VMAD a4,b1,c13,c13 # C71R C81R + VMAD a4,b2,c14,c14 # C71I C81I + VMAD a4,b3,c15,c15 # C72R C82R + VMAD a4,b4,c16,c16 # C72I C82I + +$L18: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 0 * SIZE(C1) + LD a2, 1 * SIZE(C1) + LD a3, 2 * SIZE(C1) + LD a4, 3 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 4 * SIZE(C1) + LD a2, 5 * SIZE(C1) + LD a3, 6 * SIZE(C1) + LD a4, 7 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + vextf c09, 0, a1 # a1=C11R_ac + vextf c09, 1, a2 # a2=C11I_bc + vextf c09, 2, a3 # a3=C21R_ac + vextf c09, 3, a4 # a4=C21I_bc + + vextf c10, 0, b1 # b1=C11I_ad + vextf c10, 1, b2 # b2=C11R_bd + vextf c10, 2, b3 # b3=C21I_ad + vextf c10, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 8 * SIZE(C1) + LD a2, 9 * SIZE(C1) + LD a3, 10 * SIZE(C1) + LD a4, 11 * SIZE(C1) + + FMAD5 b5, alpha_r, 
a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 8 * SIZE(C1) + ST c01, 9 * SIZE(C1) + ST b6, 10 * SIZE(C1) + ST c02, 11 * SIZE(C1) + + vextf c13, 0, a1 # a1=C11R_ac + vextf c13, 1, a2 # a2=C11I_bc + vextf c13, 2, a3 # a3=C21R_ac + vextf c13, 3, a4 # a4=C21I_bc + + vextf c14, 0, b1 # b1=C11I_ad + vextf c14, 1, b2 # b2=C11R_bd + vextf c14, 2, b3 # b3=C21I_ad + vextf c14, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 12 * SIZE(C1) + LD a2, 13 * SIZE(C1) + LD a3, 14 * SIZE(C1) + LD a4, 15 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 12 * SIZE(C1) + ST c01, 13 * SIZE(C1) + ST b6, 14 * SIZE(C1) + ST c02, 15 * SIZE(C1) + + + vextf c03, 0, a1 # a1=C11R_ac + vextf c03, 1, a2 # a2=C11I_bc + vextf c03, 2, a3 # a3=C21R_ac + vextf c03, 3, a4 # a4=C21I_bc + + vextf c04, 0, b1 # b1=C11I_ad + vextf c04, 1, b2 # b2=C11R_bd + vextf c04, 2, b3 # b3=C21I_ad + vextf c04, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 0 * SIZE(C2) + LD a2, 1 * SIZE(C2) + LD a3, 2 * SIZE(C2) + LD a4, 3 * SIZE(C2) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 0 * SIZE(C2) + ST c02, 1 * SIZE(C2) + ST c05, 2 * SIZE(C2) + ST c06, 3 * SIZE(C2) + + vextf c07, 0, a1 # a1=C11R_ac + vextf c07, 1, a2 # a2=C11I_bc + vextf c07, 2, a3 # a3=C21R_ac + vextf c07, 3, a4 # a4=C21I_bc + + vextf c08, 0, b1 # b1=C11I_ad + vextf c08, 1, b2 # b2=C11R_bd + vextf c08, 2, b3 # b3=C21I_ad + vextf c08, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 4 * SIZE(C2) + LD a2, 5 * SIZE(C2) + LD a3, 6 * SIZE(C2) + LD a4, 7 * SIZE(C2) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 4 * SIZE(C2) + ST c02, 5 * SIZE(C2) + ST c05, 6 * SIZE(C2) + ST c06, 7 * SIZE(C2) + + vextf c11, 0, a1 # a1=C11R_ac + vextf c11, 1, a2 # a2=C11I_bc + vextf c11, 2, a3 # a3=C21R_ac + vextf c11, 3, a4 # a4=C21I_bc + + vextf c12, 0, b1 # b1=C11I_ad + vextf c12, 1, b2 # b2=C11R_bd + vextf c12, 2, b3 # b3=C21I_ad + vextf c12, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 8 * SIZE(C2) + LD a2, 9 * SIZE(C2) + LD a3, 10 * SIZE(C2) + LD a4, 11 * SIZE(C2) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 8 * SIZE(C2) + ST c02, 9 * SIZE(C2) + ST c05, 10 * SIZE(C2) + ST c06, 11 * SIZE(C2) + + vextf c15, 0, a1 # a1=C11R_ac + vextf c15, 1, a2 
# a2=C11I_bc + vextf c15, 2, a3 # a3=C21R_ac + vextf c15, 3, a4 # a4=C21I_bc + + vextf c16, 0, b1 # b1=C11I_ad + vextf c16, 1, b2 # b2=C11R_bd + vextf c16, 2, b3 # b3=C21I_ad + vextf c16, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 12 * SIZE(C2) + LD a2, 13 * SIZE(C2) + LD a3, 14 * SIZE(C2) + LD a4, 15 * SIZE(C2) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 12 * SIZE(C2) + ST c02, 13 * SIZE(C2) + ST c05, 14 * SIZE(C2) + ST c06, 15 * SIZE(C2) + +#else + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + vextf c09, 0, a1 # a1=C11R_ac + vextf c09, 1, a2 # a2=C11I_bc + vextf c09, 2, a3 # a3=C21R_ac + vextf c09, 3, a4 # a4=C21I_bc + + vextf c10, 0, b1 # b1=C11I_ad + vextf c10, 1, b2 # b2=C11R_bd + vextf c10, 2, b3 # b3=C21I_ad + vextf c10, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 8 * SIZE(C1) + ST c01, 9 * SIZE(C1) + ST b6, 10 * SIZE(C1) + ST c02, 11 * SIZE(C1) + + vextf c13, 0, a1 # a1=C11R_ac + vextf c13, 1, a2 # a2=C11I_bc + vextf c13, 2, a3 # a3=C21R_ac + vextf c13, 3, a4 # a4=C21I_bc + + vextf c14, 0, b1 # b1=C11I_ad + vextf c14, 1, b2 # b2=C11R_bd + vextf c14, 2, b3 # b3=C21I_ad + vextf c14, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 12 * SIZE(C1) + ST c01, 13 * SIZE(C1) + ST b6, 14 
* SIZE(C1) + ST c02, 15 * SIZE(C1) + + + vextf c03, 0, a1 # a1=C11R_ac + vextf c03, 1, a2 # a2=C11I_bc + vextf c03, 2, a3 # a3=C21R_ac + vextf c03, 3, a4 # a4=C21I_bc + + vextf c04, 0, b1 # b1=C11I_ad + vextf c04, 1, b2 # b2=C11R_bd + vextf c04, 2, b3 # b3=C21I_ad + vextf c04, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 0 * SIZE(C2) + ST c02, 1 * SIZE(C2) + ST c05, 2 * SIZE(C2) + ST c06, 3 * SIZE(C2) + + vextf c07, 0, a1 # a1=C11R_ac + vextf c07, 1, a2 # a2=C11I_bc + vextf c07, 2, a3 # a3=C21R_ac + vextf c07, 3, a4 # a4=C21I_bc + + vextf c08, 0, b1 # b1=C11I_ad + vextf c08, 1, b2 # b2=C11R_bd + vextf c08, 2, b3 # b3=C21I_ad + vextf c08, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 4 * SIZE(C2) + ST c02, 5 * SIZE(C2) + ST c05, 6 * SIZE(C2) + ST c06, 7 * SIZE(C2) + + vextf c11, 0, a1 # a1=C11R_ac + vextf c11, 1, a2 # a2=C11I_bc + vextf c11, 2, a3 # a3=C21R_ac + vextf c11, 3, a4 # a4=C21I_bc + + vextf c12, 0, b1 # b1=C11I_ad + vextf c12, 1, b2 # b2=C11R_bd + vextf c12, 2, b3 # b3=C21I_ad + vextf c12, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 8 * SIZE(C2) + ST c02, 9 * SIZE(C2) + ST c05, 10 * SIZE(C2) + ST c06, 11 * SIZE(C2) + + vextf c15, 0, a1 # a1=C11R_ac + vextf c15, 1, a2 # a2=C11I_bc + vextf c15, 2, a3 # a3=C21R_ac + vextf c15, 3, a4 # a4=C21I_bc + + vextf c16, 0, b1 # b1=C11I_ad + vextf c16, 1, b2 # b2=C11R_bd + vextf c16, 2, b3 # b3=C21I_ad + vextf c16, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 12 * SIZE(C2) + ST c02, 13 * SIZE(C2) + ST c05, 14 * SIZE(C2) + ST c06, 15 * SIZE(C2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 8, TEMP +#else + subl TEMP, 2, TEMP +#endif + + sll TEMP, 3 + ZBASE_SHIFT,L # mr=8 + sll TEMP, 1 + ZBASE_SHIFT,TEMP # nr=2 + + addl AO, L, AO + addl BO, TEMP, BO +#endif + +#ifdef LEFT + addl KK,8,KK +#endif +#endif + + jmp $L09 + + + .align 4 + +$L20: # N=2, M=4 + and M, 4, I # I=M&4 + ble I, $L30 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO + nop +#else + sll KK, 2 + ZBASE_SHIFT, L # mr=4 + sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 + + addl AO, L, AO + addl B, 
TEMP, BO +#endif + fillcs 0(C1) + fillcs 4*SIZE(C1) + fillcs 8*SIZE(C1) + + vcpys $f31,$f31,c01 # Clear result regs + vcpys $f31,$f31,c02 + vcpys $f31,$f31,c03 + vcpys $f31,$f31,c04 + + fillcs 0(C2) + fillcs 4*SIZE(C2) + fillcs 8*SIZE(C2) + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + vcpys $f31,$f31,c07 + vcpys $f31,$f31,c08 + + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + LDDE b3, 2 * SIZE(BO) # B2R + LDDE b4, 3 * SIZE(BO) # B2I + + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 4, TEMP # mr=4 +#else + addl KK, 2,TEMP # nr=2 +#endif + sra TEMP, 1, L + ble L, $L25 + +#else + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fillcs 0(C1) + fillcs 4*SIZE(C1) + fillcs 8*SIZE(C1) + + vcpys $f31,$f31,c01 # Clear result regs + vcpys $f31,$f31,c02 + vcpys $f31,$f31,c03 + vcpys $f31,$f31,c04 + + fillcs 0(C2) + fillcs 4*SIZE(C2) + fillcs 8*SIZE(C2) + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + vcpys $f31,$f31,c07 + vcpys $f31,$f31,c08 + + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + LDDE b3, 2 * SIZE(BO) # B2R + LDDE b4, 3 * SIZE(BO) # B2I + + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + + ble L, $L25 +#endif + + .align 4 +$L22: + VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) + VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) + VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc) + VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd) + + LDDE b5, 4 * SIZE(BO) # next B1R + LDDE b6, 5 * SIZE(BO) # next B1I + LDDE b7, 6 * SIZE(BO) # next B2R + LDDE b8, 7 * SIZE(BO) # next B2I + + fillcs 0(PREB) + addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE + VMAD a2,b1,c05,c05 # C31, C41 + VMAD a2,b2,c06,c06 # C31, C41 + + fillcs 0(PREA) + VMAD a2,b3,c07,c07 # C32, C42 + VMAD a2,b4,c08,c08 # C32, C42 + + VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0 + VLD a6, 12 * SIZE(AO) # next A3, A4 + + subl L, 1, L # + + addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE + VMAD a5,b5,c01,c01 + VMAD a5,b6,c02,c02 + + addl PREA, 16*SIZE, PREA + VMAD a5,b7,c03,c03 + VMAD a5,b8,c04,c04 + + LDDE b1, 0 * SIZE(BO) + LDDE b2, 1 * SIZE(BO) + LDDE b3, 2 * SIZE(BO) + LDDE b4, 3 * SIZE(BO) + + fillcs 4*SIZE(PREB) + VMAD a6,b5,c05,c05 + VMAD a6,b6,c06,c06 + + fillcs 0(PREA) + VMAD a6,b7,c07,c07 + VMAD a6,b8,c08,c08 + + VLD a1, 0 * SIZE(AO) + VLD a2, 4 * SIZE(AO) + + addl PREB, 8*SIZE, PREB + addl PREA, 16*SIZE, PREA + bne L, $L22 # continue K + +$L25: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L28 # if(K&1) +#else + blbc TEMP, $L28 +#endif + +$L26: + addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE + VMAD a1,b1,c01,c01 # C11R C21R + VMAD a1,b2,c02,c02 # C11I C21I + VMAD a1,b3,c03,c03 # C12R c22R + VMAD a1,b4,c04,c04 # C12I C22I + + addl BO, 4*SIZE, BO + VMAD a2,b1,c05,c05 # C31R C41R + VMAD a2,b2,c06,c06 # C31I C41I + VMAD a2,b3,c07,c07 # C32R C42R + VMAD a2,b4,c08,c08 # C32I C42I + +$L28: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 0 * SIZE(C1) + LD a2, 1 * SIZE(C1) + LD a3, 2 * SIZE(C1) + LD a4, 3 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + 
FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 4 * SIZE(C1) + LD a2, 5 * SIZE(C1) + LD a3, 6 * SIZE(C1) + LD a4, 7 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + + vextf c03, 0, a1 # a1=C11R_ac + vextf c03, 1, a2 # a2=C11I_bc + vextf c03, 2, a3 # a3=C21R_ac + vextf c03, 3, a4 # a4=C21I_bc + + vextf c04, 0, b1 # b1=C11I_ad + vextf c04, 1, b2 # b2=C11R_bd + vextf c04, 2, b3 # b3=C21I_ad + vextf c04, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 0 * SIZE(C2) + LD a2, 1 * SIZE(C2) + LD a3, 2 * SIZE(C2) + LD a4, 3 * SIZE(C2) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 0 * SIZE(C2) + ST c02, 1 * SIZE(C2) + ST c05, 2 * SIZE(C2) + ST c06, 3 * SIZE(C2) + + vextf c07, 0, a1 # a1=C11R_ac + vextf c07, 1, a2 # a2=C11I_bc + vextf c07, 2, a3 # a3=C21R_ac + vextf c07, 3, a4 # a4=C21I_bc + + vextf c08, 0, b1 # b1=C11I_ad + vextf c08, 1, b2 # b2=C11R_bd + vextf c08, 2, b3 # b3=C21I_ad + vextf c08, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 4 * SIZE(C2) + LD a2, 5 * SIZE(C2) + LD a3, 6 * SIZE(C2) + LD a4, 7 * SIZE(C2) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 4 * SIZE(C2) + ST c02, 5 * SIZE(C2) + ST c05, 6 * SIZE(C2) + ST c06, 7 * SIZE(C2) + +#else + + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf 
c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + + vextf c03, 0, a1 # a1=C11R_ac + vextf c03, 1, a2 # a2=C11I_bc + vextf c03, 2, a3 # a3=C21R_ac + vextf c03, 3, a4 # a4=C21I_bc + + vextf c04, 0, b1 # b1=C11I_ad + vextf c04, 1, b2 # b2=C11R_bd + vextf c04, 2, b3 # b3=C21I_ad + vextf c04, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 0 * SIZE(C2) + ST c02, 1 * SIZE(C2) + ST c05, 2 * SIZE(C2) + ST c06, 3 * SIZE(C2) + + vextf c07, 0, a1 # a1=C11R_ac + vextf c07, 1, a2 # a2=C11I_bc + vextf c07, 2, a3 # a3=C21R_ac + vextf c07, 3, a4 # a4=C21I_bc + + vextf c08, 0, b1 # b1=C11I_ad + vextf c08, 1, b2 # b2=C11R_bd + vextf c08, 2, b3 # b3=C21I_ad + vextf c08, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, c01 + FMAD8 a8, alpha_i, a3, c05 + FMAD6 b5, alpha_i, a2, c02 + FMAD6 a6, alpha_i, a4, c06 + + ST c01, 4 * SIZE(C2) + ST c02, 5 * SIZE(C2) + ST c05, 6 * SIZE(C2) + ST c06, 7 * SIZE(C2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 4, TEMP +#else + subl TEMP, 2, TEMP +#endif + + sll TEMP, 2 + ZBASE_SHIFT, L + sll TEMP, 1 + ZBASE_SHIFT, TEMP + + addl AO, L, AO + addl BO, TEMP,BO +#endif + +#ifdef LEFT + addl KK, 4,KK +#endif +#endif + + addl C1, 8*SIZE, C1 + addl C2, 8*SIZE, C2 + + + .align 4 +$L30: + and M, 2, I # I=M&2 + ble I, $L40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO + nop +#else + sll KK, 1 + ZBASE_SHIFT, L # mr=2 + sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 + + addl AO, L, AO + addl B, TEMP, BO +#endif + + fclr c01 + fclr c02 + fclr c03 + fclr c04 + fclr c05 + fclr c06 + fclr c07 + fclr c08 # CLEAR 8 register + fclr c09 + fclr c10 + fclr c11 + fclr c12 + fclr c13 + fclr c14 + fclr c15 + fclr c16 + + fillcs 0*SIZE(C1) + fillcs 4*SIZE(C1) + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + LD b3, 2*SIZE(BO) # b2 real part + LD b4, 3*SIZE(BO) # b2 image part + + fillcs 0*SIZE(C2) + fillcs 4*SIZE(C2) + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + LD a3, 2*SIZE(AO) # a2 real part + LD a4, 3*SIZE(AO) # a2 image part + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 2, TEMP # mr=2 +#else + addl KK, 2, TEMP # nr=2 +#endif + sra TEMP, 1, L + ble L, $L35 + +#else + + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fclr c01 + fclr c02 + 
fclr c03 + fclr c04 + fclr c05 + fclr c06 + fclr c07 + fclr c08 # CLEAR 8 register + fclr c09 + fclr c10 + fclr c11 + fclr c12 + fclr c13 + fclr c14 + fclr c15 + fclr c16 + + fillcs 0*SIZE(C1) + fillcs 4*SIZE(C1) + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + LD b3, 2*SIZE(BO) # b2 real part + LD b4, 3*SIZE(BO) # b2 image part + + fillcs 0*SIZE(C2) + fillcs 4*SIZE(C2) + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + LD a3, 2*SIZE(AO) # a2 real part + LD a4, 3*SIZE(AO) # a2 image part + + ble L, $L35 +#endif + + .align 4 +$L32: + MAD a1,b1,c01,c01 # a1*c1 + MAD a1,b2,c02,c02 # a1*d1 + MAD a1,b3,c03,c03 # a1*c2 + MAD a1,b4,c04,c04 # a1*d2 + + LD b5, 4 * SIZE(BO) # next B1R + LD b6, 5 * SIZE(BO) # next B1I + LD b7, 6 * SIZE(BO) # next B2R + LD b8, 7 * SIZE(BO) # next B2I + + LD a5, 4 * SIZE(AO) # next A1-A4 real part + LD a6, 5 * SIZE(AO) # next A1-A4 image part + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE + MAD a2,b1,c05,c05 # b1*c1 + MAD a2,b2,c06,c06 # b1*d1 + MAD a2,b3,c07,c07 # b1*c2 + MAD a2,b4,c08,c08 # b1*d2 + + MAD a3,b1,c09,c09 # a2*c1 + MAD a3,b2,c10,c10 # a2*d1 + MAD a3,b3,c11,c11 # a2*c2 + MAD a3,b4,c12,c12 # a2*d2 + + MAD a4,b1,c13,c13 # b2*c1 + MAD a4,b2,c14,c14 # b2*d1 + MAD a4,b3,c15,c15 # b2*c2 + MAD a4,b4,c16,c16 # b2*d2 + + subl L, 1, L # + + addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE + MAD a5,b5,c01,c01 + MAD a5,b6,c02,c02 + MAD a5,b7,c03,c03 + MAD a5,b8,c04,c04 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MAD a6,b5,c05,c05 + MAD a6,b6,c06,c06 + MAD a6,b7,c07,c07 + MAD a6,b8,c08,c08 + + MAD a7,b5,c09,c09 + MAD a7,b6,c10,c10 + MAD a7,b7,c11,c11 + MAD a7,b8,c12,c12 + + MAD a8,b5,c13,c13 + MAD a8,b6,c14,c14 + MAD a8,b7,c15,c15 + MAD a8,b8,c16,c16 + + bne L, $L32 # continue K + +$L35: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L38 # if(K&1) +#else + blbc TEMP, $L38 +#endif + +$L36: + addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE + addl BO, 4*SIZE, BO + + MAD a1,b1,c01,c01 # a1*c1 + MAD a1,b2,c02,c02 # a1*d1 + MAD a1,b3,c03,c03 # a1*c2 + MAD a1,b4,c04,c04 # a1*d2 + + MAD a2,b1,c05,c05 # b1*c1 + MAD a2,b2,c06,c06 # b1*d1 + MAD a2,b3,c07,c07 # b1*c2 + MAD a2,b4,c08,c08 # b1*d2 + + MAD a3,b1,c09,c09 # a2*c1 + MAD a3,b2,c10,c10 # a2*d1 + MAD a3,b3,c11,c11 # a2*c2 + MAD a3,b4,c12,c12 # a2*d2 + + MAD a4,b1,c13,c13 # b2*c1 + MAD a4,b2,c14,c14 # b2*d1 + MAD a4,b3,c15,c15 # b2*c2 + MAD a4,b4,c16,c16 # b2*d2 + + + +$L38: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + ADD1 c01, c06, c01 # ac '+' bd + ADD1 c09, c14, c09 + ADD1 c03, c08, c03 # + ADD1 c11, c16, c11 + + ADD2 c05, c02, c02 # bc '+' ad + ADD2 c13, c10, c10 + ADD2 c07, c04, c04 + ADD2 c15, c12, c12 + + LD b1, 0 * SIZE(C1) + LD b2, 1 * SIZE(C1) + LD b3, 2 * SIZE(C1) + LD b4, 3 * SIZE(C1) + + LD a5, 0 * SIZE(C2) + LD a6, 1 * SIZE(C2) + LD a7, 2 * SIZE(C2) + LD a8, 3 * SIZE(C2) + + FMAD5 c01, alpha_r, b1, b1 + FMAD5 c09, alpha_r, b3, b3 + FMAD5 c03, alpha_r, a5, a5 + FMAD5 c11, alpha_r, a7, a7 + + FMAD7 c02, alpha_r, b2, b2 + FMAD7 c10, alpha_r, b4, b4 + FMAD7 c04, alpha_r, a6, a6 + FMAD7 c12, alpha_r, a8, a8 + + FMAD8 c02, alpha_i, b1, b1 + FMAD8 c10, alpha_i, b3, b3 + FMAD8 c04, alpha_i, a5, a5 + FMAD8 c12, alpha_i, a7, a7 + + FMAD6 c01, alpha_i, b2, b2 + FMAD6 c09, alpha_i, b4, b4 + FMAD6 c03, alpha_i, a6, a6 + FMAD6 c11, alpha_i, a8, a8 + + ST b1, 0 * 
SIZE(C1) + ST b2, 1 * SIZE(C1) + ST b3, 2 * SIZE(C1) + ST b4, 3 * SIZE(C1) + + ST a5, 0 * SIZE(C2) + ST a6, 1 * SIZE(C2) + ST a7, 2 * SIZE(C2) + ST a8, 3 * SIZE(C2) + +#else + + ADD1 c01, c06, c01 # ac '+' bd + ADD1 c09, c14, c09 + ADD1 c03, c08, c03 # + ADD1 c11, c16, c11 + + ADD2 c05, c02, c02 # bc '+' ad + ADD2 c13, c10, c10 + ADD2 c07, c04, c04 + ADD2 c15, c12, c12 + + FMAD5 c01, alpha_r, $f31, b1 + FMAD5 c09, alpha_r, $f31, b3 + FMAD5 c03, alpha_r, $f31, a5 + FMAD5 c11, alpha_r, $f31, a7 + + FMAD7 c02, alpha_r, $f31, b2 + FMAD7 c10, alpha_r, $f31, b4 + FMAD7 c04, alpha_r, $f31, a6 + FMAD7 c12, alpha_r, $f31, a8 + + FMAD8 c02, alpha_i, b1, b1 + FMAD8 c10, alpha_i, b3, b3 + FMAD8 c04, alpha_i, a5, a5 + FMAD8 c12, alpha_i, a7, a7 + + FMAD6 c01, alpha_i, b2, b2 + FMAD6 c09, alpha_i, b4, b4 + FMAD6 c03, alpha_i, a6, a6 + FMAD6 c11, alpha_i, a8, a8 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + ST b3, 2 * SIZE(C1) + ST b4, 3 * SIZE(C1) + + ST a5, 0 * SIZE(C2) + ST a6, 1 * SIZE(C2) + ST a7, 2 * SIZE(C2) + ST a8, 3 * SIZE(C2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 2, TEMP +#else + subl TEMP, 2, TEMP +#endif + + sll TEMP, 1 + ZBASE_SHIFT, L + sll TEMP, 1 + ZBASE_SHIFT, TEMP + + addl AO, L, AO + addl BO, TEMP, BO +#endif + +#ifdef LEFT + addl KK, 2, KK +#endif +#endif + + addl C1, 4*SIZE, C1 + addl C2, 4*SIZE, C2 + + + .align 4 +$L40: + and M, 1, I # I=M&1 + ble I, $L09 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO + nop +#else + sll KK, ZBASE_SHIFT, L # mr=1 + sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 + + addl AO, L, AO + addl B, TEMP, BO +#endif + + fillcs 0*SIZE(C1) + fillcs 0*SIZE(C2) + + fclr c01 + fclr c02 + fclr c03 + fclr c04 + fclr c05 + fclr c06 + fclr c07 + fclr c08 + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + LD b3, 2*SIZE(BO) # b2 real part + LD b4, 3*SIZE(BO) # b2 image part + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 1, TEMP # mr=1 +#else + addl KK, 2, TEMP # nr=2 +#endif + sra TEMP, 1, L + + ble L, $L45 + +#else + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fillcs 0*SIZE(C1) + fillcs 0*SIZE(C2) + + fclr c01 + fclr c02 + fclr c03 + fclr c04 + fclr c05 + fclr c06 + fclr c07 + fclr c08 + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + LD b3, 2*SIZE(BO) # b2 real part + LD b4, 3*SIZE(BO) # b2 image part + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + + ble L, $L45 +#endif + + .align 4 +$L42: + MAD a1,b1,c01,c01 # C11 real part + MAD a1,b2,c02,c02 # C11 imag part + MAD a1,b3,c03,c03 # C21 real part + MAD a1,b4,c04,c04 # C21 imag part + + LD b5, 4 * SIZE(BO) # next B1R + LD b6, 5 * SIZE(BO) # next B1I + LD b7, 6 * SIZE(BO) # next B2R + LD b8, 7 * SIZE(BO) # next B2I + + LD a5, 2 * SIZE(AO) # next A1-A4 real part + LD a6, 3 * SIZE(AO) # next A1-A4 image part + + addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE + MAD a2,b1,c05,c05 # C11 image part + MAD a2,b2,c06,c06 # C11 real part + MAD a2,b3,c07,c07 # C21 image part + MAD a2,b4,c08,c08 # C21 real part + + subl L, 1, L # + + addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE + MAD a5,b5,c01,c01 + MAD a5,b6,c02,c02 + MAD a5,b7,c03,c03 + MAD a5,b8,c04,c04 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) 
+ LD b4, 3 * SIZE(BO) + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MAD a6,b5,c05,c05 + MAD a6,b6,c06,c06 + MAD a6,b7,c07,c07 + MAD a6,b8,c08,c08 + + bne L, $L42 # continue K + +$L45: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L48 # if(K&1) +#else + blbc TEMP, $L48 +#endif + +$L46: + addl AO, 2*SIZE, AO # AO+=8mr*1kr*2px*SIZE + MAD a1,b1,c01,c01 # C11 real part + MAD a1,b2,c02,c02 # C11 imag part + MAD a1,b3,c03,c03 # C21 real part + MAD a1,b4,c04,c04 # C21 imag part + + addl BO, 4*SIZE, BO + MAD a2,b1,c05,c05 # C11 image part + MAD a2,b2,c06,c06 # C11 real part + MAD a2,b3,c07,c07 # C21 image part + MAD a2,b4,c08,c08 # C21 real part + + +$L48: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + ADD1 c01, c06, c01 + ADD1 c03, c08, c03 + ADD2 c05, c02, c02 + ADD2 c07, c04, c04 + + LD b1, 0 * SIZE(C1) + LD b2, 1 * SIZE(C1) + + LD a5, 0 * SIZE(C2) + LD a6, 1 * SIZE(C2) + + FMAD5 c01, alpha_r, b1, b1 + FMAD5 c03, alpha_r, a5, a5 + + FMAD7 c02, alpha_r, b2, b2 + FMAD7 c04, alpha_r, a6, a6 + + FMAD8 c02, alpha_i, b1, b1 + FMAD8 c04, alpha_i, a5, a5 + + FMAD6 c01, alpha_i, b2, b2 + FMAD6 c03, alpha_i, a6, a6 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + + ST a5, 0 * SIZE(C2) + ST a6, 1 * SIZE(C2) + +#else + + ADD1 c01, c06, c01 + ADD1 c03, c08, c03 + ADD2 c05, c02, c02 + ADD2 c07, c04, c04 + + FMAD5 c01, alpha_r, $f31, b1 + FMAD5 c03, alpha_r, $f31, a5 + + FMAD7 c02, alpha_r, $f31, b2 + FMAD7 c04, alpha_r, $f31, a6 + + FMAD8 c02, alpha_i, b1, b1 + FMAD8 c04, alpha_i, a5, a5 + + FMAD6 c01, alpha_i, b2, b2 + FMAD6 c03, alpha_i, a6, a6 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + + ST a5, 0 * SIZE(C2) + ST a6, 1 * SIZE(C2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 1, TEMP +#else + subl TEMP, 2, TEMP +#endif + + sll TEMP, ZBASE_SHIFT, L + sll TEMP, 1 + ZBASE_SHIFT, TEMP + + addl AO, L, AO + addl BO, TEMP,BO +#endif + +#ifdef LEFT + addl KK, 1, KK +#endif +#endif + + addl C1, 2*SIZE, C1 + addl C2, 2*SIZE, C2 + + + .align 4 + +$L09: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2, KK # nr=2 + nop +#endif + mov BO, B # Change B to next panel + subl J, 1, J # J-- + bgt J, $L01 + + + .align 4 +$L50: + and N, 1, J + ble J, $L999 # Finish! 
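Reference note on the write-back blocks above: the ADD1/ADD2 and FMAD5-FMAD8 macro pairs appear to implement the usual complex update C += alpha*(A*B), with the real/imaginary cross terms kept separate so that only the signs inside the macros change between the NN/NC/CN/CC conjugation variants. A minimal scalar C sketch of the non-conjugated case follows; the function and variable names are illustrative only and do not come from the kernel.

/* Sketch (assumed non-conjugated signs): acc_r/acc_i are the K-loop sums
 * labelled "ac '+' bd" and "bc '+' ad" by the ADD1/ADD2 comments; the
 * FMAD5..FMAD8 steps then fold in alpha before the ST instructions.     */
static void zscale_update(double *c_re, double *c_im,
                          double acc_r, double acc_i,
                          double alpha_r, double alpha_i)
{
    double old_re = *c_re, old_im = *c_im;              /* the LDs from C1/C2 */
    *c_re = old_re + alpha_r * acc_r - alpha_i * acc_i; /* FMAD5 then FMAD8   */
    *c_im = old_im + alpha_r * acc_i + alpha_i * acc_r; /* FMAD7 then FMAD6   */
}

In the TRMM branches the old C values are not loaded; the same scaling is applied with $f31 (used as zero throughout this file, see the "Clear result regs" vcpys) as the addend, so the result overwrites C instead of accumulating into it.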
+ +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK # reset KK +#endif + + sra M, 3, I # I=M/8 + sll K, 1 + ZBASE_SHIFT, PREA + + mov C, C1 + mov A, AO # Reset A + + addl A, PREA, PREA + beq I, $L60 # GEMM_MR=8 + + +$L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 3 + ZBASE_SHIFT,L # mr=8 + sll KK, ZBASE_SHIFT,TEMP # nr=1 + + addl AO, L, AO + addl B, TEMP, BO +#endif + + fillcs 0(C1) + fillcs 4*SIZE(C1) + fillcs 8*SIZE(C1) + fillcs 12*SIZE(C1) + fillcs 16*SIZE(C1) + + vcpys $f31,$f31,c01 # Clear result regs + vcpys $f31,$f31,c02 + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + + vcpys $f31,$f31,c09 + vcpys $f31,$f31,c10 + + vcpys $f31,$f31,c13 + vcpys $f31,$f31,c14 + + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + VLD a3, 8 * SIZE(AO) # A5, A6 + VLD a4,12 * SIZE(AO) # A7, A8 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 8, TEMP # mr=8 +#else + addl KK, 1, TEMP # nr=1 +#endif + sra TEMP, 1, L + ble L, $L55 + +#else + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fillcs 0(C1) + fillcs 4*SIZE(C1) + fillcs 8*SIZE(C1) + fillcs 12*SIZE(C1) + fillcs 16*SIZE(C1) + + vcpys $f31,$f31,c01 # Clear result regs + vcpys $f31,$f31,c02 + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + + vcpys $f31,$f31,c09 + vcpys $f31,$f31,c10 + + vcpys $f31,$f31,c13 + vcpys $f31,$f31,c14 + + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + VLD a3, 8 * SIZE(AO) # A5, A6 + VLD a4,12 * SIZE(AO) # A7, A8 + + ble L, $L55 +#endif + + .align 4 +$L52: + addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE + VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) + VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) + + LDDE b5, 2 * SIZE(BO) # next B1R + LDDE b6, 3 * SIZE(BO) # next B1I + + addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE + VMAD a2,b1,c05,c05 # C31, C41 + VMAD a2,b2,c06,c06 # C31, C41 + + VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0 + VLD a6, 4 * SIZE(AO) # next A3, A4 + VLD a7, 8 * SIZE(AO) # next A5, A6 + VLD a8,12 * SIZE(AO) # next A7, A8 + + VMAD a3,b1,c09,c09 # C51, C61 + VMAD a3,b2,c10,c10 # C51, C61 + + fillcs 0(PREA) + VMAD a4,b1,c13,c13 # C71, C81 + VMAD a4,b2,c14,c14 # C71, C81 + + subl L, 1, L # + + addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE + VMAD a5,b5,c01,c01 + VMAD a5,b6,c02,c02 + + addl PREA, 16*SIZE, PREA + LDDE b1, 0 * SIZE(BO) + LDDE b2, 1 * SIZE(BO) + + VMAD a6,b5,c05,c05 + VMAD a6,b6,c06,c06 + + VLD a1, 0 * SIZE(AO) + VLD a2, 4 * SIZE(AO) + VLD a3, 8 * SIZE(AO) + VLD a4,12 * SIZE(AO) + + VMAD a7,b5,c09,c09 + VMAD a7,b6,c10,c10 + + fillcs 0(PREA) + VMAD a8,b5,c13,c13 + VMAD a8,b6,c14,c14 + + addl PREA, 16*SIZE, PREA + bne L, $L52 # continue K + +$L55: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L58 # if(K&1) +#else + blbc TEMP, $L58 +#endif + +$L56: + addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE + VMAD a1,b1,c01,c01 # C11R C21R + VMAD a1,b2,c02,c02 # C11I C21I + + addl BO, 2*SIZE, BO + VMAD a2,b1,c05,c05 # C31R C41R + VMAD a2,b2,c06,c06 # C31I C41I + + VMAD a3,b1,c09,c09 # C51R C61R + VMAD a3,b2,c10,c10 # C51I C61I + + VMAD a4,b1,c13,c13 # C71R C81R + VMAD a4,b2,c14,c14 # C71I C81I + +$L58: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # 
a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 0 * SIZE(C1) + LD a2, 1 * SIZE(C1) + LD a3, 2 * SIZE(C1) + LD a4, 3 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 4 * SIZE(C1) + LD a2, 5 * SIZE(C1) + LD a3, 6 * SIZE(C1) + LD a4, 7 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + vextf c09, 0, a1 # a1=C11R_ac + vextf c09, 1, a2 # a2=C11I_bc + vextf c09, 2, a3 # a3=C21R_ac + vextf c09, 3, a4 # a4=C21I_bc + + vextf c10, 0, b1 # b1=C11I_ad + vextf c10, 1, b2 # b2=C11R_bd + vextf c10, 2, b3 # b3=C21I_ad + vextf c10, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 8 * SIZE(C1) + LD a2, 9 * SIZE(C1) + LD a3, 10 * SIZE(C1) + LD a4, 11 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 8 * SIZE(C1) + ST c01, 9 * SIZE(C1) + ST b6, 10 * SIZE(C1) + ST c02, 11 * SIZE(C1) + + vextf c13, 0, a1 # a1=C11R_ac + vextf c13, 1, a2 # a2=C11I_bc + vextf c13, 2, a3 # a3=C21R_ac + vextf c13, 3, a4 # a4=C21I_bc + + vextf c14, 0, b1 # b1=C11I_ad + vextf c14, 1, b2 # b2=C11R_bd + vextf c14, 2, b3 # b3=C21I_ad + vextf c14, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 12 * SIZE(C1) + LD a2, 13 * SIZE(C1) + LD a3, 14 * SIZE(C1) + LD a4, 15 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 12 * SIZE(C1) + ST c01, 13 * SIZE(C1) + ST b6, 14 * SIZE(C1) + ST c02, 15 * SIZE(C1) + +#else + + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, 
$f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + vextf c09, 0, a1 # a1=C11R_ac + vextf c09, 1, a2 # a2=C11I_bc + vextf c09, 2, a3 # a3=C21R_ac + vextf c09, 3, a4 # a4=C21I_bc + + vextf c10, 0, b1 # b1=C11I_ad + vextf c10, 1, b2 # b2=C11R_bd + vextf c10, 2, b3 # b3=C21I_ad + vextf c10, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 8 * SIZE(C1) + ST c01, 9 * SIZE(C1) + ST b6, 10 * SIZE(C1) + ST c02, 11 * SIZE(C1) + + vextf c13, 0, a1 # a1=C11R_ac + vextf c13, 1, a2 # a2=C11I_bc + vextf c13, 2, a3 # a3=C21R_ac + vextf c13, 3, a4 # a4=C21I_bc + + vextf c14, 0, b1 # b1=C11I_ad + vextf c14, 1, b2 # b2=C11R_bd + vextf c14, 2, b3 # b3=C21I_ad + vextf c14, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 12 * SIZE(C1) + ST c01, 13 * SIZE(C1) + ST b6, 14 * SIZE(C1) + ST c02, 15 * SIZE(C1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 8, TEMP +#else + subl TEMP, 1, TEMP +#endif + + sll TEMP, 3 + ZBASE_SHIFT,L + sll TEMP, ZBASE_SHIFT,TEMP + + addl AO, L, AO + addl BO, TEMP, BO +#endif + +#ifdef LEFT + addl KK, 8, KK +#endif +#endif + + jmp $L999 + + + .align 4 +$L60: + and M, 4, I + ble I, $L70 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))\ + || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + ZBASE_SHIFT,L # mr=4 + sll KK, ZBASE_SHIFT,TEMP # nr=1 + + addl AO, L, AO + addl B, TEMP, BO +#endif + + fillcs 0(C1) + fillcs 4*SIZE(C1) + fillcs 8*SIZE(C1) + + vcpys $f31,$f31,c01 # Clear result regs + vcpys $f31,$f31,c02 + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 4, TEMP # mr=4 +#else + addl KK, 1, TEMP # nr=1 +#endif + sra TEMP, 1, L + ble L, $L65 + +#else + + mov B, BO 
# Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fillcs 0(C1) + fillcs 4*SIZE(C1) + fillcs 8*SIZE(C1) + + vcpys $f31,$f31,c01 # Clear result regs + vcpys $f31,$f31,c02 + + vcpys $f31,$f31,c05 + vcpys $f31,$f31,c06 + + LDDE b1, 0 * SIZE(BO) # B1R + LDDE b2, 1 * SIZE(BO) # B1I + + VLD a1, 0 * SIZE(AO) # A1, A2 + VLD a2, 4 * SIZE(AO) # A3, A4 + + ble L, $L65 +#endif + + .align 4 +$L62: + VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) + VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) + + LDDE b5, 2 * SIZE(BO) # next B1R + LDDE b6, 3 * SIZE(BO) # next B1I + + addl BO, 4*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE + VMAD a2,b1,c05,c05 # C31, C41 + VMAD a2,b2,c06,c06 # C31, C41 + + fillcs 0(PREA) + VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0 + VLD a6, 12 * SIZE(AO) # next A3, A4 + + subl L, 1, L # + + addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE + VMAD a5,b5,c01,c01 + VMAD a5,b6,c02,c02 + + addl PREA, 16*SIZE, PREA + LDDE b1, 0 * SIZE(BO) + LDDE b2, 1 * SIZE(BO) + + fillcs 0(PREA) + VMAD a6,b5,c05,c05 + VMAD a6,b6,c06,c06 + + VLD a1, 0 * SIZE(AO) + VLD a2, 4 * SIZE(AO) + + addl PREA, 16*SIZE, PREA + bne L, $L62 # continue K + +$L65: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L68 # if(K&1) +#else + blbc TEMP, $L68 +#endif + +$L66: + addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE + VMAD a1,b1,c01,c01 # C11R C21R + VMAD a1,b2,c02,c02 # C11I C21I + + addl BO, 2*SIZE, BO + VMAD a2,b1,c05,c05 # C31R C41R + VMAD a2,b2,c06,c06 # C31I C41I + +$L68: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 0 * SIZE(C1) + LD a2, 1 * SIZE(C1) + LD a3, 2 * SIZE(C1) + LD a4, 3 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + LD a1, 4 * SIZE(C1) + LD a2, 5 * SIZE(C1) + LD a3, 6 * SIZE(C1) + LD a4, 7 * SIZE(C1) + + FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, a3, a3 + FMAD7 a7, alpha_r, a2, a2 + FMAD7 a8, alpha_r, a4, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + +#else + + vextf c01, 0, a1 # a1=C11R_ac + vextf c01, 1, a2 # a2=C11I_bc + vextf c01, 2, a3 # a3=C21R_ac + vextf c01, 3, a4 # a4=C21I_bc + + vextf c02, 0, b1 # b1=C11I_ad + vextf c02, 1, b2 # b2=C11R_bd + vextf c02, 2, b3 # b3=C21I_ad + vextf c02, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, 
$f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 0 * SIZE(C1) + ST c01, 1 * SIZE(C1) + ST b6, 2 * SIZE(C1) + ST c02, 3 * SIZE(C1) + + vextf c05, 0, a1 # a1=C11R_ac + vextf c05, 1, a2 # a2=C11I_bc + vextf c05, 2, a3 # a3=C21R_ac + vextf c05, 3, a4 # a4=C21I_bc + + vextf c06, 0, b1 # b1=C11I_ad + vextf c06, 1, b2 # b2=C11R_bd + vextf c06, 2, b3 # b3=C21I_ad + vextf c06, 3, b4 # b4=C21R_bd + + ADD1 a1, b2, b5 # ac '+' bd + ADD1 a3, b4, a6 + ADD2 a2, b1, a7 # bc '+' ad + ADD2 a4, b3, a8 + + FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 + FMAD5 a6, alpha_r, $f31, a3 + FMAD7 a7, alpha_r, $f31, a2 + FMAD7 a8, alpha_r, $f31, a4 + + FMAD8 a7, alpha_i, a1, b4 + FMAD8 a8, alpha_i, a3, b6 + FMAD6 b5, alpha_i, a2, c01 + FMAD6 a6, alpha_i, a4, c02 + + ST b4, 4 * SIZE(C1) + ST c01, 5 * SIZE(C1) + ST b6, 6 * SIZE(C1) + ST c02, 7 * SIZE(C1) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK,TEMP +#ifdef LEFT + subl TEMP, 4, TEMP # mr=4 +#else + subl TEMP, 1, TEMP # nr=1 +#endif + + sll TEMP, 2 + ZBASE_SHIFT, L + sll TEMP, ZBASE_SHIFT,TEMP + + addl AO, L, AO + addl BO,TEMP, BO +#endif + +#ifdef LEFT + addl KK,4,KK +#endif +#endif + + addl C1, 8*SIZE, C1 + + + .align 4 +$L70: + and M, 2, I # I=M&2 + ble I, $L80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO + nop +#else + sll KK, 1 + ZBASE_SHIFT, L # mr=2 + sll KK, ZBASE_SHIFT,TEMP # nr=1 + + addl AO, L, AO + addl B, TEMP, BO +#endif + + fillcs 0*SIZE(C1) + fillcs 4*SIZE(C1) + + fclr c01 + fclr c02 # CLEAR 8 register + fclr c03 + fclr c04 + fclr c05 + fclr c06 + fclr c07 + fclr c08 + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + LD a3, 2*SIZE(AO) # a2 real part + LD a4, 3*SIZE(AO) # a2 image part + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 2, TEMP # mr=2 +#else + addl KK, 1, TEMP # nr=1 +#endif + sra TEMP, 1, L + ble L, $L75 + +#else + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fillcs 0*SIZE(C1) + fillcs 4*SIZE(C1) + + fclr c01 + fclr c02 # CLEAR 8 register + fclr c03 + fclr c04 + fclr c05 + fclr c06 + fclr c07 + fclr c08 + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + LD a3, 2*SIZE(AO) # a2 real part + LD a4, 3*SIZE(AO) # a2 image part + + ble L, $L75 +#endif + + .align 4 +$L72: + MAD a1,b1,c01,c01 # C11 real part + MAD a1,b2,c02,c02 # C11 imag part + + LD b5, 2 * SIZE(BO) # next B1R + LD b6, 3 * SIZE(BO) # next B1I + + LD a5, 4 * SIZE(AO) # next A1-A4 real part + LD a6, 5 * SIZE(AO) # next A1-A4 image part + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE + MAD a2,b1,c03,c03 # C11 image part + MAD a2,b2,c04,c04 # C11 real part + + MAD a3,b1,c05,c05 # C12 real part + MAD a3,b2,c06,c06 # C12 imag part + + MAD a4,b1,c07,c07 # C12 image part + MAD a4,b2,c08,c08 # C12 real part + + subl L, 1, L # + + addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE + MAD a5,b5,c01,c01 + MAD a5,b6,c02,c02 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MAD a6,b5,c03,c03 + 
MAD a6,b6,c04,c04 + + MAD a7,b5,c05,c05 + MAD a7,b6,c06,c06 + + MAD a8,b5,c07,c07 + MAD a8,b6,c08,c08 + + bne L, $L72 # continue K + +$L75: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L78 # if(K&1) +#else + blbc TEMP, $L78 +#endif + +$L76: + addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE + MAD a1,b1,c01,c01 # C11 real part + MAD a1,b2,c02,c02 # C11 imag part + + addl BO, 4*SIZE, BO + MAD a2,b1,c03,c03 # C11 image part + MAD a2,b2,c04,c04 # C11 real part + + MAD a3,b1,c05,c05 # C12 real part + MAD a3,b2,c06,c06 # C12 imag part + + MAD a4,b1,c07,c07 # C12 image part + MAD a4,b2,c08,c08 # C12 real part + + + +$L78: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + ADD1 c01, c04, c01 + ADD1 c05, c08, c05 + ADD2 c03, c02, c02 + ADD2 c07, c06, c06 + + LD b1, 0 * SIZE(C1) + LD b2, 1 * SIZE(C1) + LD b3, 2 * SIZE(C1) + LD b4, 3 * SIZE(C1) + + FMAD5 c01, alpha_r, b1, b1 + FMAD5 c05, alpha_r, b3, b3 + FMAD7 c02, alpha_r, b2, b2 + FMAD7 c06, alpha_r, b4, b4 + + FMAD8 c02, alpha_i, b1, b1 + FMAD8 c06, alpha_i, b3, b3 + FMAD6 c01, alpha_i, b2, b2 + FMAD6 c05, alpha_i, b4, b4 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + ST b3, 2 * SIZE(C1) + ST b4, 3 * SIZE(C1) + +#else + + ADD1 c01, c04, c01 + ADD1 c05, c08, c05 + ADD2 c03, c02, c02 + ADD2 c07, c06, c06 + + FMAD5 c01, alpha_r, $f31, b1 + FMAD5 c05, alpha_r, $f31, b3 + FMAD7 c02, alpha_r, $f31, b2 + FMAD7 c06, alpha_r, $f31, b4 + + FMAD8 c02, alpha_i, b1, b1 + FMAD8 c06, alpha_i, b3, b3 + FMAD6 c01, alpha_i, b2, b2 + FMAD6 c05, alpha_i, b4, b4 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + ST b3, 2 * SIZE(C1) + ST b4, 3 * SIZE(C1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 2, TEMP +#else + subl TEMP, 1, TEMP +#endif + + sll TEMP, 1 + ZBASE_SHIFT, L + sll TEMP, ZBASE_SHIFT, TEMP + + addl AO, L, AO + addl BO, TEMP, BO +#endif + +#ifdef LEFT + addl KK, 2, KK +#endif +#endif + + addl C1, 4*SIZE, C1 + + + .align 4 +$L80: + and M, 1, I # I=M&1 + ble I, $L999 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO + nop +#else + sll KK, ZBASE_SHIFT, L # mr=1 + sll KK, ZBASE_SHIFT,TEMP # nr=1 + + addl AO, L, AO + addl B, TEMP, BO +#endif + + fillcs 0*SIZE(C1) + + fclr c01 # CLEAR 8 register + fclr c02 + fclr c03 + fclr c04 + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + subl K, KK, TEMP +#elif defined(LEFT) + addl KK, 1, TEMP # mr=1 +#else + addl KK, 1, TEMP # nr=1 +#endif + sra TEMP, 1, L + ble L, $L85 + +#else + mov B, BO # Set B, (block A x panel Bj) + sra K, 1, L # Unroll K as 2 + + fillcs 0*SIZE(C1) + + fclr c01 # CLEAR 8 register + fclr c02 + fclr c03 + fclr c04 + + LD b1, 0*SIZE(BO) # b1 real part + LD b2, 1*SIZE(BO) # b1 image part + + LD a1, 0*SIZE(AO) # a1 real part + LD a2, 1*SIZE(AO) # a1 image part + + ble L, $L85 +#endif + + .align 4 +$L82: + MAD a1,b1,c01,c01 # C11 real part + MAD a1,b2,c02,c02 # C11 imag part + + LD b5, 2 * SIZE(BO) # next B1R + LD b6, 3 * SIZE(BO) # next B1I + + LD a5, 2 * SIZE(AO) # next A1-A4 real part + LD a6, 3 * SIZE(AO) # next A1-A4 image part + + addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE + MAD a2,b1,c03,c03 # C11 image part + MAD a2,b2,c04,c04 # C11 real part + + subl L, 1, L # + + addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE + MAD a5,b5,c01,c01 + MAD 
a5,b6,c02,c02 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MAD a6,b5,c03,c03 + MAD a6,b6,c04,c04 + + bne L, $L82 # continue K + +$L85: + LD alpha_r, ALPHA_R # $f30==b8 +#ifndef TRMMKERNEL + blbc K, $L88 # if(K&1) +#else + blbc TEMP, $L88 +#endif + +$L86: + addl AO, 2*SIZE, AO # AO+=8mr*1kr*2px*SIZE + MAD a1,b1,c01,c01 # C11 real part + MAD a1,b2,c02,c02 # C11 imag part + + addl BO, 2*SIZE, BO + MAD a2,b1,c03,c03 # C11 image part + MAD a2,b2,c04,c04 # C11 real part + +$L88: # Write back + LD alpha_i, ALPHA_I # $f29==b7 +#ifndef TRMMKERNEL + ADD1 c01, c04, c01 + ADD2 c03, c02, c02 + + LD b1, 0 * SIZE(C1) + LD b2, 1 * SIZE(C1) + + FMAD5 c01, alpha_r, b1, b1 + FMAD7 c02, alpha_r, b2, b2 + FMAD8 c02, alpha_i, b1, b1 + FMAD6 c01, alpha_i, b2, b2 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + +#else + + ADD1 c01, c04, c01 + ADD2 c03, c02, c02 + + FMAD5 c01, alpha_r, $f31, b1 + FMAD7 c02, alpha_r, $f31, b2 + + FMAD8 c02, alpha_i, b1, b1 + FMAD6 c01, alpha_i, b2, b2 + + ST b1, 0 * SIZE(C1) + ST b2, 1 * SIZE(C1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + subl K, KK, TEMP +#ifdef LEFT + subl TEMP, 1, TEMP +#else + subl TEMP, 1, TEMP +#endif + + sll TEMP, ZBASE_SHIFT, L + sll TEMP, ZBASE_SHIFT, TEMP + + addl AO, L, AO + addl BO, TEMP,BO +#endif + +#ifdef LEFT + addl KK, 1, KK +#endif +#endif + + addl C1, 2*SIZE, C1 + + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl $9, 80($sp) + ldl $10,88($sp) + ldl $11,96($sp) + ldl $12,104($sp) + ldl $13,112($sp) + ldl $14,120($sp) + + clr $0 + + ldi $sp, STACKSIZE($sp) + ret $31,($26),1 # + + EPILOGUE diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S new file mode 100644 index 0000000..03d71ee --- /dev/null +++ b/kernel/sw_64/zgemv_n.S @@ -0,0 +1,1040 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define t0 $f2 +#define t1 $f3 +#define t2 $f4 +#define t3 $f5 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl LDA, 0 + STACKSIZE($sp) + ldl X, 8 + STACKSIZE($sp) + ldl INCX, 16 + STACKSIZE($sp) + ldl Y, 24 + STACKSIZE($sp) + ldl INCY, 32 + STACKSIZE($sp) + ldl BUFFER, 40 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCY, 2 * SIZE, $0 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 2, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + ldi I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + addl Y1, 2 * SIZE, Y1 + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 1, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + addl X, INCX, X + LD alpha3, 0 * SIZE(X) + LD alpha4, 1 * SIZE(X) + addl X, INCX, X + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + MUL alpha_r, alpha3, y2 + MUL alpha_r, alpha4, y3 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + addl A, LDA, A2 + MUL alpha_i, alpha4, t2 + addl A2, LDA, A + MUL alpha_i, alpha3, t3 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 + SUB y2, t2, alpha3 + ADD y3, t3, alpha4 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 + ADD y2, t2, alpha3 + SUB y3, t3, alpha4 +#endif + + fillcs 4 * SIZE(X) + + sra M, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * 
SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, $f6 + unop + MUL alpha3, a4, t0 + LD y4, 4 * SIZE(Y1) + + ADD2 y1, t1, $f7 + unop + MUL alpha3, a5, t1 + LD y5, 5 * SIZE(Y1) + + ADD1 y2, t2, $f8 + unop + MUL alpha3, a6, t2 + LD y6, 6 * SIZE(Y1) + + ADD2 y3, t3, $f9 + unop + MUL alpha3, a7, t3 + LD y7, 7 * SIZE(Y1) + + ADD1 $f6, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 5 * SIZE(A1) + + ADD2 $f7, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 4 * SIZE(A1) + + ADD1 $f8, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 7 * SIZE(A1) + + ADD2 $f9, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 6 * SIZE(A1) + + ADD3 y0, t0, $f6 + unop + MUL alpha4, a5, t0 + LD a5, 5 * SIZE(A2) + + ADD4 y1, t1, $f7 + unop + MUL alpha4, a4, t1 + LD a4, 4 * SIZE(A2) + + ADD3 y2, t2, $f8 + unop + MUL alpha4, a7, t2 + LD a7, 7 * SIZE(A2) + + ADD4 y3, t3, $f9 + unop + MUL alpha4, a6, t3 + LD a6, 6 * SIZE(A2) + + ADD3 $f6, t0, y0 + MUL alpha1, a0, t0 + ADD4 $f7, t1, y1 + MUL alpha1, a1, t1 + + ADD3 $f8, t2, y2 + unop + MUL alpha1, a2, t2 + unop + + ADD4 $f9, t3, y3 + ldi I, -1(I) + MUL alpha1, a3, t3 + ble I, $L13 + .align 4 + +$L12: + ADD1 y4, t0, $f6 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + + ADD2 y5, t1, $f7 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + ldi I, -1(I) + + ADD1 y6, t2, $f8 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, $f9 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 $f6, t0, y4 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 $f7, t1, y5 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 $f8, t2, y6 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 $f9, t3, y7 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y4, t0, $f6 + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + MUL alpha4, a5, t0 + LD a5, 9 * SIZE(A2) + + ADD4 y5, t1, $f7 + unop + MUL alpha4, a4, t1 + LD a4, 8 * SIZE(A2) + + ADD3 y6, t2, $f8 + unop + MUL alpha4, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD4 y7, t3, $f9 + unop + MUL alpha4, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 $f6, t0, y4 + unop + MUL alpha1, a0, t0 + LD y0, 8 * SIZE(Y1) + + ADD4 $f7, t1, y5 + unop + MUL alpha1, a1, t1 + LD y1, 9 * SIZE(Y1) + + ADD3 $f8, t2, y6 + unop + MUL alpha1, a2, t2 + LD y2, 10 * SIZE(Y1) + + ADD4 $f9, t3, y7 + unop + MUL alpha1, a3, t3 + LD y3, 11 * SIZE(Y1) + + ADD1 y0, t0, $f6 + ST y4, 4 * SIZE(Y1) + MUL alpha3, a4, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + + ADD2 y1, t1, $f7 + ST y5, 5 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y2, t2, $f8 + ST y6, 6 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y3, t3, $f9 + ST y7, 7 * SIZE(Y1) + MUL alpha3, a7, t3 + ldi Y1, 8 * SIZE(Y1) + + ADD1 $f6, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 13 * SIZE(A1) + + ADD2 $f7, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 12 * SIZE(A1) + + ADD1 $f8, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 15 * SIZE(A1) + + ADD2 $f9, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 14 * SIZE(A1) + + ADD3 y0, t0, $f6 + unop + MUL alpha4, a5, t0 + LD a5, 13 * SIZE(A2) + + ADD4 y1, t1, $f7 + unop + MUL alpha4, a4, t1 + LD a4, 12 * SIZE(A2) + + ADD3 y2, t2, $f8 + unop + MUL alpha4, a7, t2 + LD a7, 15 * SIZE(A2) + + ADD4 y3, t3, $f9 + unop + MUL alpha4, a6, t3 + LD a6, 14 * SIZE(A2) + + ADD3 $f6, t0, y0 + unop + MUL alpha1, a0, t0 + LD y4, 4 * SIZE(Y1) + + ADD4 $f7, t1, y1 + ldi A2, 8 * SIZE(A2) + MUL 
alpha1, a1, t1 + LD y5, 5 * SIZE(Y1) + + ADD3 $f8, t2, y2 + ldi A1, 8 * SIZE(A1) + MUL alpha1, a2, t2 + LD y6, 6 * SIZE(Y1) + + ADD4 $f9, t3, y3 + MUL alpha1, a3, t3 + LD y7, 7 * SIZE(Y1) + bgt I, $L12 + .align 4 + +$L13: + ADD1 y4, t0, $f6 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + unop + + ADD2 y5, t1, $f7 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y6, t2, $f8 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, $f9 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 $f6, t0, y4 + MUL alpha2, a1, t0 + ADD2 $f7, t1, y5 + MUL alpha2, a0, t1 + + ADD1 $f8, t2, y6 + MUL alpha2, a3, t2 + ADD2 $f9, t3, y7 + MUL alpha2, a2, t3 + + ADD3 y4, t0, $f6 + MUL alpha4, a5, t0 + ADD4 y5, t1, $f7 + MUL alpha4, a4, t1 + + ADD3 y6, t2, $f8 + MUL alpha4, a7, t2 + ADD4 y7, t3, $f9 + MUL alpha4, a6, t3 + + ADD3 $f6, t0, y4 + ADD4 $f7, t1, y5 + ADD3 $f8, t2, y6 + ADD4 $f9, t3, y7 + + ST y4, 4 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + ldi A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L15: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, $f6 + MUL alpha3, a4, t0 + ADD2 y1, t1, $f7 + MUL alpha3, a5, t1 + ADD1 y2, t2, $f8 + MUL alpha3, a6, t2 + ADD2 y3, t3, $f9 + MUL alpha3, a7, t3 + + ADD1 $f6, t0, y0 + MUL alpha2, a1, t0 + ADD2 $f7, t1, y1 + MUL alpha2, a0, t1 + + ADD1 $f8, t2, y2 + MUL alpha2, a3, t2 + ADD2 $f9, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, $f6 + MUL alpha4, a5, t0 + ADD4 y1, t1, $f7 + MUL alpha4, a4, t1 + + ADD3 y2, t2, $f8 + MUL alpha4, a7, t2 + ADD4 y3, t3, $f9 + MUL alpha4, a6, t3 + + ADD3 $f6, t0, y0 + ADD4 $f7, t1, y1 + ADD3 $f8, t2, y2 + ADD4 $f9, t3, y3 + + ST y0, 0 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + ldi Y1, 4 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, t0 + MUL alpha1, a1, t1 + + ADD1 y0, t0, $f6 + MUL alpha3, a2, t0 + ADD2 y1, t1, $f7 + MUL alpha3, a3, t1 + + ADD1 $f6, t0, y0 + MUL alpha2, a1, t0 + ADD2 $f7, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, $f6 + MUL alpha4, a3, t0 + ADD4 y1, t1, $f7 + MUL alpha4, a2, t1 + + ADD3 $f6, t0, y0 + ADD4 $f7, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L18: + ldi J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 +#endif + + sra M, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, t0 + LD a4, 4 * SIZE(A1) + MUL alpha1, a1, t1 + LD a5, 5 * SIZE(A1) + MUL alpha1, a2, t2 + LD a6, 6 * SIZE(A1) + MUL alpha1, a3, t3 + LD a7, 7 * SIZE(A1) + + ADD1 y0, t0, $f6 + 
unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y1, t1, $f7 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y2, t2, $f8 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y3, t3, $f9 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 $f6, t0, y0 + unop + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, t0 + + ADD4 $f7, t1, y1 + unop + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, t1 + + ADD3 $f8, t2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, t2 + ldi I, -1(I) + + ADD4 $f9, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + ble I, $L23 + .align 4 + +$L22: + ADD1 y4, t0, $f6 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + LD a5, 13 * SIZE(A1) + + ADD2 y5, t1, $f7 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + LD a4, 12 * SIZE(A1) + + ADD1 y6, t2, $f8 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + LD a7, 15 * SIZE(A1) + + ADD2 y7, t3, $f9 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + LD a6, 14 * SIZE(A1) + + ADD3 $f6, t0, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + + ADD4 $f7, t1, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, t1 + ldi I, -1(I) + + ADD3 $f8, t2, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, t2 + unop + + ADD4 $f9, t3, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, t3 + unop + + ADD1 y0, t0, $f6 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a1, t0 + LD a1, 17 * SIZE(A1) + + ADD2 y1, t1, $f7 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a0, t1 + LD a0, 16 * SIZE(A1) + + ADD1 y2, t2, $f8 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a3, t2 + LD a3, 19 * SIZE(A1) + + ADD2 y3, t3, $f9 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a2, t3 + LD a2, 18 * SIZE(A1) + + ADD3 $f6, t0, y0 + LD y4, 12 * SIZE(Y1) + MUL alpha1, a4, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD4 $f7, t1, y1 + LD y5, 13 * SIZE(Y1) + MUL alpha1, a5, t1 + ldi A1, 8 * SIZE(A1) + + ADD3 $f8, t2, y2 + LD y6, 14 * SIZE(Y1) + MUL alpha1, a6, t2 + ldi Y1, 8 * SIZE(Y1) + + ADD4 $f9, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD1 y4, t0, $f6 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + unop + + ADD2 y5, t1, $f7 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + unop + + ADD1 y6, t2, $f8 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + unop + + ADD2 y7, t3, $f9 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + unop + + ADD3 $f6, t0, y4 + ADD4 $f7, t1, y5 + ADD3 $f8, t2, y6 + ADD4 $f9, t3, y7 + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, $f6 + MUL alpha2, a1, t0 + ADD2 y1, t1, $f7 + MUL alpha2, a0, t1 + ADD1 y2, t2, $f8 + MUL alpha2, a3, t2 + ADD2 y3, t3, $f9 + MUL alpha2, a2, t3 + + ADD3 $f6, t0, y0 + ADD4 $f7, t1, y1 + ADD3 $f8, t2, y2 + ADD4 $f9, t3, y3 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + + ST y2, 2 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y3, 3 * SIZE(Y1) + ldi Y1, 4 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L990 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + ADD1 y0, t0, $f6 + MUL alpha2, a1, t0 + ADD2 y1, t1, $f7 + MUL alpha2, a0, t1 + + ADD3 $f6, t0, y0 + ADD4 $f7, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, 
2 * SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 2, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + LD a3, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + LD a5, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + LD a7, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, $f6 + ADD a1, y1, $f7 + ADD a2, y2, $f8 + ADD a3, y3, $f9 + + fmov $f6, a0 + fmov $f7, a1 + fmov $f8, a2 + fmov $f9, a3 + + ST a0, 0 * SIZE(Y1) + ADD a4, y4, $f6 + ST a1, 1 * SIZE(Y1) + ADD a5, y5, $f7 + addl Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + ADD a6, y6, $f8 + ST a3, 1 * SIZE(Y1) + ADD a7, y7, $f9 + addl Y1, INCY, Y1 + + fmov $f6, a4 + fmov $f7, a5 + fmov $f8, a6 + fmov $f9, a7 + + ST a4, 0 * SIZE(Y1) + ST a5, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + ST a7, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + ldi Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 3, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + ldi Y, 2 * SIZE(Y) + + ADD a0, y0, $f6 + ADD a1, y1, $f7 + + fmov $f6, a0 + fmov $f7, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + bgt I, $L996 + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/zgemv_n.S.bak b/kernel/sw_64/zgemv_n.S.bak new file mode 100644 index 0000000..3dd482e --- /dev/null +++ b/kernel/sw_64/zgemv_n.S.bak @@ -0,0 +1,1027 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define t0 $f2 +#define t1 $f3 +#define t2 $f4 +#define t3 $f5 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl LDA, 0 + STACKSIZE($sp) + ldl X, 8 + STACKSIZE($sp) + ldl INCX, 16 + STACKSIZE($sp) + ldl Y, 24 + STACKSIZE($sp) + ldl INCY, 32 + STACKSIZE($sp) + ldl BUFFER, 40 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCY, 2 * SIZE, $0 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 2, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + ldi I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + addl Y1, 2 * SIZE, Y1 + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 1, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + addl X, INCX, X + LD alpha3, 0 * SIZE(X) + LD alpha4, 1 * SIZE(X) + addl X, INCX, X + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + MUL 
alpha_r, alpha3, y2 + MUL alpha_r, alpha4, y3 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + addl A, LDA, A2 + MUL alpha_i, alpha4, t2 + addl A2, LDA, A + MUL alpha_i, alpha3, t3 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 + SUB y2, t2, alpha3 + ADD y3, t3, alpha4 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 + ADD y2, t2, alpha3 + SUB y3, t3, alpha4 +#endif + + fillcs 4 * SIZE(X) + + sra M, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha3, a4, t0 + LD y4, 4 * SIZE(Y1) + + ADD2 y1, t1, y1 + unop + MUL alpha3, a5, t1 + LD y5, 5 * SIZE(Y1) + + ADD1 y2, t2, y2 + unop + MUL alpha3, a6, t2 + LD y6, 6 * SIZE(Y1) + + ADD2 y3, t3, y3 + unop + MUL alpha3, a7, t3 + LD y7, 7 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 5 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 4 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 7 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 6 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + MUL alpha4, a5, t0 + LD a5, 5 * SIZE(A2) + + ADD4 y1, t1, y1 + unop + MUL alpha4, a4, t1 + LD a4, 4 * SIZE(A2) + + ADD3 y2, t2, y2 + unop + MUL alpha4, a7, t2 + LD a7, 7 * SIZE(A2) + + ADD4 y3, t3, y3 + unop + MUL alpha4, a6, t3 + LD a6, 6 * SIZE(A2) + + ADD3 y0, t0, y0 + MUL alpha1, a0, t0 + ADD4 y1, t1, y1 + MUL alpha1, a1, t1 + + ADD3 y2, t2, y2 + unop + MUL alpha1, a2, t2 + unop + + ADD4 y3, t3, y3 + ldi I, -1(I) + MUL alpha1, a3, t3 + ble I, $L13 + .align 4 + +$L12: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + ldi I, -1(I) + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 y4, t0, y4 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y5, t1, y5 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y6, t2, y6 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y7, t3, y7 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y4, t0, y4 + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + MUL alpha4, a5, t0 + LD a5, 9 * SIZE(A2) + + ADD4 y5, t1, y5 + unop + MUL alpha4, a4, t1 + LD a4, 8 * SIZE(A2) + + ADD3 y6, t2, y6 + unop + MUL alpha4, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD4 y7, t3, y7 + unop + MUL alpha4, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 y4, t0, y4 + unop + MUL alpha1, a0, t0 + LD y0, 8 * SIZE(Y1) + + ADD4 y5, t1, y5 + unop + MUL alpha1, a1, t1 + LD y1, 9 * SIZE(Y1) + + ADD3 y6, t2, y6 + unop + MUL alpha1, a2, t2 + LD y2, 10 * SIZE(Y1) + + ADD4 y7, t3, y7 + unop + MUL alpha1, a3, t3 + LD y3, 11 * SIZE(Y1) + + ADD1 y0, t0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha3, a4, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + + ADD2 y1, t1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y2, t2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y3, t3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha3, a7, t3 + ldi Y1, 8 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 13 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 12 * SIZE(A1) + 
+ ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 15 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 14 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + MUL alpha4, a5, t0 + LD a5, 13 * SIZE(A2) + + ADD4 y1, t1, y1 + unop + MUL alpha4, a4, t1 + LD a4, 12 * SIZE(A2) + + ADD3 y2, t2, y2 + unop + MUL alpha4, a7, t2 + LD a7, 15 * SIZE(A2) + + ADD4 y3, t3, y3 + unop + MUL alpha4, a6, t3 + LD a6, 14 * SIZE(A2) + + ADD3 y0, t0, y0 + unop + MUL alpha1, a0, t0 + LD y4, 4 * SIZE(Y1) + + ADD4 y1, t1, y1 + ldi A2, 8 * SIZE(A2) + MUL alpha1, a1, t1 + LD y5, 5 * SIZE(Y1) + + ADD3 y2, t2, y2 + ldi A1, 8 * SIZE(A1) + MUL alpha1, a2, t2 + LD y6, 6 * SIZE(Y1) + + ADD4 y3, t3, y3 + MUL alpha1, a3, t3 + LD y7, 7 * SIZE(Y1) + bgt I, $L12 + .align 4 + +$L13: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + unop + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 y4, t0, y4 + MUL alpha2, a1, t0 + ADD2 y5, t1, y5 + MUL alpha2, a0, t1 + + ADD1 y6, t2, y6 + MUL alpha2, a3, t2 + ADD2 y7, t3, y7 + MUL alpha2, a2, t3 + + ADD3 y4, t0, y4 + MUL alpha4, a5, t0 + ADD4 y5, t1, y5 + MUL alpha4, a4, t1 + + ADD3 y6, t2, y6 + MUL alpha4, a7, t2 + ADD4 y7, t3, y7 + MUL alpha4, a6, t3 + + ADD3 y4, t0, y4 + ADD4 y5, t1, y5 + ADD3 y6, t2, y6 + ADD4 y7, t3, y7 + + ST y4, 4 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + ldi A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L15: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha3, a4, t0 + ADD2 y1, t1, y1 + MUL alpha3, a5, t1 + ADD1 y2, t2, y2 + MUL alpha3, a6, t2 + ADD2 y3, t3, y3 + MUL alpha3, a7, t3 + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD1 y2, t2, y2 + MUL alpha2, a3, t2 + ADD2 y3, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, y0 + MUL alpha4, a5, t0 + ADD4 y1, t1, y1 + MUL alpha4, a4, t1 + + ADD3 y2, t2, y2 + MUL alpha4, a7, t2 + ADD4 y3, t3, y3 + MUL alpha4, a6, t3 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + ADD3 y2, t2, y2 + ADD4 y3, t3, y3 + + ST y0, 0 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + ldi A2, 4 * SIZE(A2) + + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + ldi Y1, 4 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, t0 + MUL alpha1, a1, t1 + + ADD1 y0, t0, y0 + MUL alpha3, a2, t0 + ADD2 y1, t1, y1 + MUL alpha3, a3, t1 + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, y0 + MUL alpha4, a3, t0 + ADD4 y1, t1, y1 + MUL alpha4, a2, t1 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L18: + ldi J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + mov Y, Y1 + +#ifndef XCONJ + 
SUB y0, t0, alpha1 + ADD y1, t1, alpha2 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 +#endif + + sra M, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, t0 + LD a4, 4 * SIZE(A1) + MUL alpha1, a1, t1 + LD a5, 5 * SIZE(A1) + MUL alpha1, a2, t2 + LD a6, 6 * SIZE(A1) + MUL alpha1, a3, t3 + LD a7, 7 * SIZE(A1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, t0 + + ADD4 y1, t1, y1 + unop + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, t1 + + ADD3 y2, t2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, t2 + ldi I, -1(I) + + ADD4 y3, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + ble I, $L23 + .align 4 + +$L22: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + LD a5, 13 * SIZE(A1) + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + LD a4, 12 * SIZE(A1) + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + LD a7, 15 * SIZE(A1) + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + LD a6, 14 * SIZE(A1) + + ADD3 y4, t0, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + + ADD4 y5, t1, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, t1 + ldi I, -1(I) + + ADD3 y6, t2, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, t2 + unop + + ADD4 y7, t3, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, t3 + unop + + ADD1 y0, t0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a1, t0 + LD a1, 17 * SIZE(A1) + + ADD2 y1, t1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a0, t1 + LD a0, 16 * SIZE(A1) + + ADD1 y2, t2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a3, t2 + LD a3, 19 * SIZE(A1) + + ADD2 y3, t3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a2, t3 + LD a2, 18 * SIZE(A1) + + ADD3 y0, t0, y0 + LD y4, 12 * SIZE(Y1) + MUL alpha1, a4, t0 + fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD4 y1, t1, y1 + LD y5, 13 * SIZE(Y1) + MUL alpha1, a5, t1 + ldi A1, 8 * SIZE(A1) + + ADD3 y2, t2, y2 + LD y6, 14 * SIZE(Y1) + MUL alpha1, a6, t2 + ldi Y1, 8 * SIZE(Y1) + + ADD4 y3, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + unop + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + unop + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + unop + + ADD3 y4, t0, y4 + ADD4 y5, t1, y5 + ADD3 y6, t2, y6 + ADD4 y7, t3, y7 + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + ldi A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + ADD1 y2, t2, y2 + MUL alpha2, a3, t2 + ADD2 y3, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + ADD3 y2, t2, y2 + ADD4 y3, t3, y3 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) 
+ + ST y2, 2 * SIZE(Y1) + ldi A1, 4 * SIZE(A1) + ST y3, 3 * SIZE(Y1) + ldi Y1, 4 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L990 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, 2 * SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 2, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + LD a3, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + LD a5, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + LD a7, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + ADD a2, y2, a2 + ADD a3, y3, a3 + + ST a0, 0 * SIZE(Y1) + ADD a4, y4, a4 + ST a1, 1 * SIZE(Y1) + ADD a5, y5, a5 + addl Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + ADD a6, y6, a6 + ST a3, 1 * SIZE(Y1) + ADD a7, y7, a7 + addl Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + ST a5, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + ST a7, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + ldi Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 3, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addl BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + ldi Y, 2 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + ldi I, -1(I) + bgt I, $L996 + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S new file mode 100644 index 0000000..bf31cb4 --- /dev/null +++ b/kernel/sw_64/zgemv_t.S @@ -0,0 +1,1047 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl LDA, 0 + STACKSIZE($sp) + ldl X, 8 + STACKSIZE($sp) + ldl INCX, 16 + STACKSIZE($sp) + ldl Y, 24 + STACKSIZE($sp) + ldl INCY, 32 + STACKSIZE($sp) + ldl BUFFER, 40 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, 2 * SIZE, $0 + mov X, X1 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + sra M, 2, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + fillcs (PREFETCHSIZE + 0) * SIZE(X1) + ldi I, -1(I) + + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addl X1, INCX, X1 + LD a2, 0 * SIZE(X1) + LD a3, 1 * SIZE(X1) + addl X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + LD a5, 1 * SIZE(X1) + addl X1, INCX, X1 + LD a6, 0 * SIZE(X1) + LD a7, 1 * SIZE(X1) + addl X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * 
SIZE(X1) + LD a1, 1 * SIZE(X1) + addl X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 1, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addl A, LDA, A2 + fclr s1 + + addl A2, LDA, A + unop + mov X, X1 + fillcs 3 * SIZE(Y) + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a10, 4 * SIZE(A2) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + LD a14, 6 * SIZE(A2) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + unop + + ADD3 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x0, a2, t2 + unop + + ADD4 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x1, a3, t2 + LD a3, 9 * SIZE(A2) + + ADD2 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x1, a2, t3 + LD a2, 8 * SIZE(A2) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x2, a5, t1 + ADD3 s2, t2, $f30 + fmov $f30, s2 + MUL x2, a6, t2 + + ADD4 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD1 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x3, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD2 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x3, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + unop + + ADD3 s2, t2, $f30 + fmov $f30, s2 + ldi I, -1(I) + MUL x0, a10, t2 + unop + + ADD4 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x0, a11, t3 + LD x0, 8 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD1 s2, t2, $f30 + fmov $f30, s2 + ldi A1, 8 * SIZE(A1) + MUL x1, a11, t2 + LD a11, 13 * SIZE(A2) + + ADD2 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x1, a10, t3 + LD a10, 12 * SIZE(A2) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(X1) + MUL x2, a13, t1 + ldi A2, 8 * SIZE(A2) + + ADD3 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x2, a14, t2 + unop + + ADD4 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x2, a15, t3 + LD x2, 10 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s1, t1, $f30 + fmov $f30, s1 + ldi X1, 8 * SIZE(X1) + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + + ADD1 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x3, a15, t2 + LD a15, 7 * SIZE(A2) + + 
ADD2 s3, t3, $f30 + fmov $f30, s3 + MUL x3, a14, t3 + LD a14, 6 * SIZE(A2) + bgt I, $L12 + .align 4 + +$L13: + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x0, a1, t1 + ADD3 s2, t2, $f30 + fmov $f30, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + MUL x1, a1, t0 + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x1, a3, t2 + unop + + ADD2 s3, t3, $f30 + fmov $f30, s3 + ldi A1, 8 * SIZE(A1) + MUL x1, a2, t3 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + MUL x2, a4, t0 + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x2, a5, t1 + + ADD3 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x2, a6, t2 + unop + + ADD4 s3, t3, $f30 + fmov $f30, s3 + ldi A2, 8 * SIZE(A2) + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + MUL x3, a5, t0 + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x3, a4, t1 + + ADD1 s2, t2, $f30 + fmov $f30, s2 + unop + MUL x3, a7, t2 + ldi X1, 8 * SIZE(X1) + + ADD2 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x3, a6, t3 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + MUL x0, a8, t0 + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x0, a9, t1 + + ADD3 s2, t2, $f30 + fmov $f30, s2 + MUL x0, a10, t2 + ADD4 s3, t3, $f30 + fmov $f30, s3 + MUL x0, a11, t3 + + ADD1 s0, t0, $f30 + fmov $f30, s0 + MUL x1, a9, t0 + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x1, a8, t1 + + ADD1 s2, t2, $f30 + fmov $f30, s2 + MUL x1, a11, t2 + ADD2 s3, t3, $f30 + fmov $f30, s3 + MUL x1, a10, t3 + + ADD3 s0, t0, $f30 + fmov $f30, s0 + MUL x2, a12, t0 + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x2, a13, t1 + + ADD3 s2, t2, $f30 + fmov $f30, s2 + MUL x2, a14, t2 + ADD4 s3, t3, $f30 + fmov $f30, s3 + MUL x2, a15, t3 + + ADD1 s0, t0, $f30 + fmov $f30, s0 + MUL x3, a13, t0 + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x3, a12, t1 + + ADD1 s2, t2, $f30 + fmov $f30, s2 + MUL x3, a15, t2 + ADD2 s3, t3, $f30 + fmov $f30, s3 + MUL x3, a14, t3 + .align 4 + +$L15: + and M, 3, I + ble I, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD3 s0, t0, $f30 + fmov $f30, s0 + ldi I, -1(I) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x0, a1, t1 + ADD3 s2, t2, $f30 + fmov $f30, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, $f30 + fmov $f30, s3 + unop + MUL x0, a3, t3 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + ldi A2, 2 * SIZE(A2) + MUL x1, a1, t0 + LD a1, 3 * SIZE(A1) + + ADD2 s1, t1, $f30 + fmov $f30, s1 + ldi X1, 2 * SIZE(X1) + MUL x1, a0, t1 + LD a0, 2 * SIZE(A1) + + ADD1 s2, t2, $f30 + fmov $f30, s2 + ldi A1, 2 * SIZE(A1) + MUL x1, a3, t2 + LD a3, 1 * SIZE(A2) + + ADD2 s3, t3, $f30 + fmov $f30, s3 + MUL x1, a2, t3 + LD a2, 0 * SIZE(A2) + bgt I, $L16 + .align 4 + +$L17: + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x0, a1, t1 + unop + + ADD3 s2, t2, $f30 + fmov $f30, s2 + MUL x0, a2, t2 + ADD4 s3, t3, $f30 + fmov $f30, s3 + MUL x0, a3, t3 + + ADD1 s0, t0, $f30 + fmov $f30, s0 + MUL x1, a1, t0 + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, $f30 + fmov $f30, s2 + MUL x1, a3, t2 + ADD2 s3, t3, $f30 + fmov $f30, s3 + MUL x1, a2, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + unop + LD a1, 1 * SIZE(Y) + addl Y, 
INCY, Y + + LD a2, 0 * SIZE(Y) + unop + LD a3, 1 * SIZE(Y) + addl Y, INCY, Y + + ADD3 s0, t0, a8 + ADD4 s1, t1, a9 + ADD3 s2, t2, a10 + ADD4 s3, t3, a11 + + fmov a8, s0 + fmov a9, s1 + fmov a10, s2 + fmov a11, s3 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + MUL alpha_r, s2, t2 + MUL alpha_r, s3, t3 + + ADD a0, t0, a8 + MUL alpha_i, s1, t0 + ADD a1, t1, a9 + MUL alpha_i, s0, t1 + ADD a2, t2, a10 + MUL alpha_i, s3, t2 + ADD a3, t3, a11 + MUL alpha_i, s2, t3 + + SUB a8, t0, a0 + ADD a9, t1, a1 + SUB a10, t2, a2 + ADD a11, t3, a3 + + ST a0, 0 * SIZE(Y1) + fclr t0 + ST a1, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + fclr t1 + ST a3, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + fclr t2 + ldi J, -1(J) + fclr t3 + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L999 + + mov A, A1 + fclr s0 + fclr s1 + mov X, X1 + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + ldi I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD3 s0, t0, $f30 + fmov $f30, s0 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, $f30 + fmov $f30, s2 + ldi I, -1(I) + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s3, t1, $f30 + fmov $f30, s3 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, $f30 + fmov $f30, s2 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s3, t1, $f30 + fmov $f30, s3 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x0, a9, t1 + LD x0, 8 * SIZE(X1) + + ADD1 s2, t0, $f30 + fmov $f30, s2 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s3, t1, $f30 + fmov $f30, s3 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + ldi A1, 8 * SIZE(A1) + MUL x2, a13, t1 + LD x2, 10 * SIZE(X1) + + ADD1 s2, t0, $f30 + fmov $f30, s2 + ldi X1, 8 * SIZE(X1) + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s3, t1, $f30 + fmov $f30, s3 + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + bgt I, $L22 + .align 4 + +$L23: + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, $f30 + fmov $f30, s2 + unop + MUL x1, a1, t0 + ldi A1, 8 * SIZE(A1) + + ADD2 s3, t1, $f30 + fmov $f30, s3 + unop + MUL x1, a0, t1 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x2, a4, t0 + unop + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, $f30 + fmov $f30, s2 + unop + MUL x3, a5, t0 + ldi X1, 8 * SIZE(X1) + + ADD2 s3, t1, $f30 + fmov $f30, s3 + unop + MUL x3, a4, t1 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, $f30 + fmov $f30, s0 + MUL x0, a8, t0 + ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x0, a9, t1 + + ADD1 s2, t0, $f30 + fmov $f30, s2 + MUL x1, a9, t0 + ADD2 s3, t1, $f30 + fmov $f30, s3 + MUL x1, a8, t1 + + ADD3 s0, t0, $f30 + fmov $f30, s0 + MUL x2, a12, t0 + 
ADD4 s1, t1, $f30 + fmov $f30, s1 + MUL x2, a13, t1 + + ADD1 s2, t0, $f30 + fmov $f30, s2 + MUL x3, a13, t0 + ADD2 s3, t1, $f30 + fmov $f30, s3 + MUL x3, a12, t1 + .align 4 + +$L25: + and M, 3, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD3 s0, t0, $f30 + fmov $f30, s0 + ldi A1, 2 * SIZE(A1) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + ldi I, -1(I) + MUL x0, a1, t1 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, $f30 + fmov $f30, s0 + ldi X1, 2 * SIZE(X1) + MUL x1, a1, t0 + LD a1, 1 * SIZE(A1) + + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x1, a0, t1 + LD a0, 0 * SIZE(A1) + bgt I, $L26 + .align 4 + +$L27: + ADD3 s0, t0, $f30 + fmov $f30, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, $f30 + fmov $f30, s1 + unop + MUL x0, a1, t1 + unop + + ADD1 s0, t0, $f30 + fmov $f30, s0 + MUL x1, a1, t0 + ADD2 s1, t1, $f30 + fmov $f30, s1 + MUL x1, a0, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + LD a1, 1 * SIZE(Y) + + ADD3 s0, t0, a8 + ADD4 s1, t1, a9 + ADD3 s2, t2, a10 + ADD4 s3, t3, a11 + + ADD a8, a10, s0 + ADD a9, a11, s1 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + + ADD a0, t0, a8 + MUL alpha_i, s1, t0 + ADD a1, t1, a9 + MUL alpha_i, s0, t1 + + SUB a8, t0, a0 + ADD a9, t1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/zgemv_t.S.bak b/kernel/sw_64/zgemv_t.S.bak new file mode 100644 index 0000000..f857fb7 --- /dev/null +++ b/kernel/sw_64/zgemv_t.S.bak @@ -0,0 +1,922 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + ldi $sp, -STACKSIZE($sp) + ldl LDA, 0 + STACKSIZE($sp) + ldl X, 8 + STACKSIZE($sp) + ldl INCX, 16 + STACKSIZE($sp) + ldl Y, 24 + STACKSIZE($sp) + ldl INCY, 32 + STACKSIZE($sp) + ldl BUFFER, 40 + STACKSIZE($sp) + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, 2 * SIZE, $0 + mov X, X1 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + sra M, 2, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + fillcs (PREFETCHSIZE + 0) * SIZE(X1) + ldi I, -1(I) + + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addl X1, INCX, X1 + LD a2, 0 * SIZE(X1) + LD a3, 1 * SIZE(X1) + addl X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + LD a5, 1 * SIZE(X1) + addl X1, INCX, X1 + LD a6, 0 * SIZE(X1) + LD a7, 1 * SIZE(X1) + addl X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + ldi Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addl X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ldi Y1, 2 * SIZE(Y1) + + ldi I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 1, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addl A, LDA, A2 + fclr s1 + + addl A2, LDA, A + unop + mov X, X1 + fillcs 3 * SIZE(Y) + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a6, 2 * 
SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a10, 4 * SIZE(A2) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + LD a14, 6 * SIZE(A2) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + unop + + ADD3 s2, t2, s2 + unop + MUL x0, a2, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x1, a3, t2 + LD a3, 9 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x1, a2, t3 + LD a2, 8 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x2, a5, t1 + ADD3 s2, t2, s2 + MUL x2, a6, t2 + + ADD4 s3, t3, s3 + unop + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x3, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x3, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + unop + + ADD3 s2, t2, s2 + ldi I, -1(I) + MUL x0, a10, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x0, a11, t3 + LD x0, 8 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD1 s2, t2, s2 + ldi A1, 8 * SIZE(A1) + MUL x1, a11, t2 + LD a11, 13 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x1, a10, t3 + LD a10, 12 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, s1 + fillcs (PREFETCHSIZE + 0) * SIZE(X1) + MUL x2, a13, t1 + ldi A2, 8 * SIZE(A2) + + ADD3 s2, t2, s2 + unop + MUL x2, a14, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x2, a15, t3 + LD x2, 10 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s1, t1, s1 + ldi X1, 8 * SIZE(X1) + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x3, a15, t2 + LD a15, 7 * SIZE(A2) + + ADD2 s3, t3, s3 + MUL x3, a14, t3 + LD a14, 6 * SIZE(A2) + bgt I, $L12 + .align 4 + +$L13: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x0, a1, t1 + ADD3 s2, t2, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, s2 + unop + MUL x1, a3, t2 + unop + + ADD2 s3, t3, s3 + ldi A1, 8 * SIZE(A1) + MUL x1, a2, t3 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x2, a4, t0 + ADD4 s1, t1, s1 + MUL x2, a5, t1 + + ADD3 s2, t2, s2 + unop + MUL x2, a6, t2 + unop + + ADD4 s3, t3, s3 + ldi A2, 8 * SIZE(A2) + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, s0 + MUL x3, a5, t0 + ADD2 s1, t1, s1 + MUL x3, a4, t1 + + ADD1 s2, t2, s2 + unop + MUL x3, a7, t2 + ldi X1, 8 * SIZE(X1) + + ADD2 s3, t3, s3 + unop + MUL x3, a6, t3 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x0, a8, t0 + ADD4 s1, t1, s1 + MUL x0, a9, t1 + + ADD3 s2, t2, s2 + MUL x0, a10, t2 + ADD4 s3, t3, s3 + MUL x0, a11, t3 + + ADD1 s0, t0, s0 + MUL x1, a9, t0 + ADD2 s1, t1, s1 + MUL x1, a8, t1 + + ADD1 s2, t2, 
s2 + MUL x1, a11, t2 + ADD2 s3, t3, s3 + MUL x1, a10, t3 + + ADD3 s0, t0, s0 + MUL x2, a12, t0 + ADD4 s1, t1, s1 + MUL x2, a13, t1 + + ADD3 s2, t2, s2 + MUL x2, a14, t2 + ADD4 s3, t3, s3 + MUL x2, a15, t3 + + ADD1 s0, t0, s0 + MUL x3, a13, t0 + ADD2 s1, t1, s1 + MUL x3, a12, t1 + + ADD1 s2, t2, s2 + MUL x3, a15, t2 + ADD2 s3, t3, s3 + MUL x3, a14, t3 + .align 4 + +$L15: + and M, 3, I + ble I, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD3 s0, t0, s0 + ldi I, -1(I) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x0, a1, t1 + ADD3 s2, t2, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, s0 + ldi A2, 2 * SIZE(A2) + MUL x1, a1, t0 + LD a1, 3 * SIZE(A1) + + ADD2 s1, t1, s1 + ldi X1, 2 * SIZE(X1) + MUL x1, a0, t1 + LD a0, 2 * SIZE(A1) + + ADD1 s2, t2, s2 + ldi A1, 2 * SIZE(A1) + MUL x1, a3, t2 + LD a3, 1 * SIZE(A2) + + ADD2 s3, t3, s3 + MUL x1, a2, t3 + LD a2, 0 * SIZE(A2) + bgt I, $L16 + .align 4 + +$L17: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD3 s2, t2, s2 + MUL x0, a2, t2 + ADD4 s3, t3, s3 + MUL x0, a3, t3 + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, s2 + MUL x1, a3, t2 + ADD2 s3, t3, s3 + MUL x1, a2, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + unop + LD a1, 1 * SIZE(Y) + addl Y, INCY, Y + + LD a2, 0 * SIZE(Y) + unop + LD a3, 1 * SIZE(Y) + addl Y, INCY, Y + + ADD3 s0, t0, s0 + ADD4 s1, t1, s1 + ADD3 s2, t2, s2 + ADD4 s3, t3, s3 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + MUL alpha_r, s2, t2 + MUL alpha_r, s3, t3 + + ADD a0, t0, a0 + MUL alpha_i, s1, t0 + ADD a1, t1, a1 + MUL alpha_i, s0, t1 + ADD a2, t2, a2 + MUL alpha_i, s3, t2 + ADD a3, t3, a3 + MUL alpha_i, s2, t3 + + SUB a0, t0, a0 + ADD a1, t1, a1 + SUB a2, t2, a2 + ADD a3, t3, a3 + + ST a0, 0 * SIZE(Y1) + fclr t0 + ST a1, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + fclr t1 + ST a3, 1 * SIZE(Y1) + addl Y1, INCY, Y1 + + fclr t2 + ldi J, -1(J) + fclr t3 + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L999 + + mov A, A1 + fclr s0 + fclr s1 + mov X, X1 + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + ldi I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD3 s0, t0, s0 + fillcs (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, s2 + ldi I, -1(I) + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a9, t1 + LD x0, 8 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, s1 + ldi 
A1, 8 * SIZE(A1) + MUL x2, a13, t1 + LD x2, 10 * SIZE(X1) + + ADD1 s2, t0, s2 + ldi X1, 8 * SIZE(X1) + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s3, t1, s3 + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + bgt I, $L22 + .align 4 + +$L23: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x1, a1, t0 + ldi A1, 8 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a0, t1 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + unop + + ADD4 s1, t1, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x3, a5, t0 + ldi X1, 8 * SIZE(X1) + + ADD2 s3, t1, s3 + unop + MUL x3, a4, t1 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x0, a8, t0 + ADD4 s1, t1, s1 + MUL x0, a9, t1 + + ADD1 s2, t0, s2 + MUL x1, a9, t0 + ADD2 s3, t1, s3 + MUL x1, a8, t1 + + ADD3 s0, t0, s0 + MUL x2, a12, t0 + ADD4 s1, t1, s1 + MUL x2, a13, t1 + + ADD1 s2, t0, s2 + MUL x3, a13, t0 + ADD2 s3, t1, s3 + MUL x3, a12, t1 + .align 4 + +$L25: + and M, 3, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + + ldi I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD3 s0, t0, s0 + ldi A1, 2 * SIZE(A1) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + ldi I, -1(I) + MUL x0, a1, t1 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, s0 + ldi X1, 2 * SIZE(X1) + MUL x1, a1, t0 + LD a1, 1 * SIZE(A1) + + ADD2 s1, t1, s1 + MUL x1, a0, t1 + LD a0, 0 * SIZE(A1) + bgt I, $L26 + .align 4 + +$L27: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + LD a1, 1 * SIZE(Y) + + ADD3 s0, t0, s0 + ADD4 s1, t1, s1 + ADD3 s2, t2, s2 + ADD4 s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + + ADD a0, t0, a0 + MUL alpha_i, s1, t0 + ADD a1, t1, a1 + MUL alpha_i, s0, t1 + + SUB a0, t0, a0 + ADD a1, t1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + + ldi $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S new file mode 100644 index 0000000..c1b7375 --- /dev/null +++ b/kernel/sw_64/znrm2.S @@ -0,0 +1,441 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stl $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, $f25 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, $f26 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, $f27 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, $f28 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd $f25, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd $f26, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd $f27, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd $f28, t3, a3 + unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + + faddd a0, t0, $f25 + unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1, $f26 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, $f27 + unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, $f28 + unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd $f25, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd $f26, t1, a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd $f27, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd $f28, t3, a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0, $f25 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, $f26 + unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, $f27 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, $f28 + unop + fmuld 
x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd $f25, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd $f26, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd $f27, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd $f28, t3, a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, $f25 + fmuld x0, x0, t0 + faddd a1, t1, $f26 + fmuld x1, x1, t1 + + faddd a2, t2, $f27 + fmuld x2, x2, t2 + faddd a3, t3, $f28 + fmuld x3, x3, t3 + + faddd $f25, t0, a0 + fmuld x4, x4, t0 + faddd $f26, t1, a1 + fmuld x5, x5, t1 + + faddd $f27, t2, a2 + fmuld x6, x6, t2 + faddd $f28, t3, a3 + fmuld x7, x7, t3 + + faddd a2, t2, $f27 + fmov $f27, a2 + faddd a3, t3, $f28 + fmov $f28, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + ldi X, 2 * SIZE(X) + + faddd a0, t0, $f25 + fmov $f25, a0 + fmuld x0, x0, t0 + faddd a1, t1, $f26 + fmov $f26, a1 + fmuld x1, x1, t1 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + ldi I, -1(I) + LD x5, 1 * SIZE(X) + addl X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, $f25 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, $f26 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + unop + + faddd a2, t2, $f27 + LD x1, 1 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, $f28 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + unop + + faddd $f25, t0, a0 + LD x3, 1 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd $f26, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + ldi I, -1(I) + + faddd $f27, t2, a2 + LD x5, 1 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd $f28, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, $f25 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, $f26 + fmuld x1, x1, t1 + faddd a2, t2, $f27 + fmuld x2, x2, t2 + + faddd a3, t3, $f28 + fmuld x3, x3, t3 + faddd $f25, t0, a0 + fmuld x4, x4, t0 + + faddd $f26, t1, a1 + fmuld x5, x5, t1 + faddd $f27, t2, a2 + fmuld x6, x6, t2 + + faddd $f28, t3, a3 + fmuld x7, x7, t3 + + faddd a2, t2, $f27 + fmov $f27, a2 + faddd a3, t3, $f28 + fmov $f28, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + ldi I, -1(I) + LD x1, 1 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, $f25 + fmov $f25, a0 + fmuld x0, x0, t0 + faddd a1, t1, $f26 + fmov $f26, a1 + fmuld x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, $f25 + faddd a1, t1, $f26 + fmov $f25, a0 + fmov $f26, a1 + + faddd a0, a1, $f25 + fmov $f25, a0 + faddd a2, a3, $f26 + fmov $f26, a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2, $f25 + fmov $f25, a0 + fsqrtd a0, $f25 + fmov $f25, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/znrm2.S.bak b/kernel/sw_64/znrm2.S.bak new file mode 100644 index 0000000..b2e80e0 --- /dev/null +++ b/kernel/sw_64/znrm2.S.bak @@ -0,0 +1,426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of 
Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 + + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + faddd a0, t0, a0 + fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + mov X, XX + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x7, x7, t3 + LD x7, 15 * SIZE(X) + 
+ faddd a0, t0, a0 + unop + fmuld x0, x0, t0 + LD x0, 16 * SIZE(X) + + faddd a1, t1, a1 + ldi X, 16 * SIZE(X) + fmuld x1, x1, t1 + LD x1, 17 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 18 * SIZE(XX) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 19 * SIZE(XX) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 20 * SIZE(XX) + + faddd a1, t1, a1 + ldi I, -1(I) + fmuld x5, x5, t1 + LD x5, 21 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 22 * SIZE(XX) + + faddd a3, t3, a3 + fmuld x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + faddd a0, t0, a0 + mov X, XX + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + + faddd a1, t1, a1 + unop + fmuld x1, x1, t1 + LD x1, 9 * SIZE(X) + + faddd a2, t2, a2 + unop + fmuld x2, x2, t2 + LD x2, 10 * SIZE(X) + + faddd a3, t3, a3 + unop + fmuld x3, x3, t3 + LD x3, 11 * SIZE(X) + + faddd a0, t0, a0 + unop + fmuld x4, x4, t0 + LD x4, 12 * SIZE(XX) + + faddd a1, t1, a1 + unop + fmuld x5, x5, t1 + LD x5, 13 * SIZE(XX) + + faddd a2, t2, a2 + unop + fmuld x6, x6, t2 + LD x6, 14 * SIZE(XX) + + faddd a3, t3, a3 + ldi X, 16 * SIZE(X) + fmuld x7, x7, t3 + LD x7, 15 * SIZE(XX) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + faddd a2, t2, a2 + fmuld x2, x2, t2 + faddd a3, t3, a3 + fmuld x3, x3, t3 + + faddd a0, t0, a0 + fmuld x4, x4, t0 + faddd a1, t1, a1 + fmuld x5, x5, t1 + + faddd a2, t2, a2 + fmuld x6, x6, t2 + faddd a3, t3, a3 + fmuld x7, x7, t3 + + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + ldi X, 2 * SIZE(X) + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + ldi I, -1(I) + LD x5, 1 * SIZE(X) + addl X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, a0 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 + unop + + faddd a2, t2, a2 + LD x1, 1 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 + unop + + faddd a0, t0, a0 + LD x3, 1 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 + ldi I, -1(I) + + faddd a2, t2, a2 + LD x5, 1 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, a0 + LD x7, 1 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + fmuld x1, x1, t1 + faddd a2, t2, a2 + fmuld x2, x2, t2 + + faddd a3, t3, a3 + fmuld x3, x3, t3 + faddd a0, t0, a0 + fmuld x4, x4, t0 + + faddd a1, t1, a1 + fmuld x5, x5, t1 + faddd a2, t2, a2 + fmuld x6, x6, t2 + + faddd a3, t3, a3 + fmuld x7, x7, t3 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + ldi I, -1(I) + LD x1, 1 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, a0 + fmuld x0, x0, t0 + faddd a1, t1, a1 + fmuld x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, a0 + faddd a1, t1, a1 + + faddd a0, a1, a0 + faddd a2, a3, a2 + +#if defined(EV4) || defined(EV5) + faddd a0, a2, $f16 + jsr $26, 
($27), sqrt !lituse_jsr!2 + + ldih $29, 0($26) !gpdisp!3 + ldi $29, 0($29) !gpdisp!3 +#else + faddd a0, a2, a0 + fsqrtd a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldl $26, 0($sp) + ldi $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/sw_64/znrm2_simd.S b/kernel/sw_64/znrm2_simd.S new file mode 100644 index 0000000..5a509d4 --- /dev/null +++ b/kernel/sw_64/znrm2_simd.S @@ -0,0 +1,492 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + + PROFCODE + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 #stride access + + +/* test the address of X */ + and X, (VEC_LEN*SIZE-1), $3 + fclr t0 + fclr t1 + bne $3, $UnAlign_ACCESS +/*Align access. Use simd instructions. 
Unloop 8 complex*/ + sra N, 3, I + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t0 #clear s0 vector + VLD a1, 1*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t1 + + VLD a2, 2*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t2 + VLD a3, 3*VEC_LEN*SIZE(X) + vcpys $f31, $f31, t3 + + addl X, 16 * SIZE, X + subl I, 1, I + nop + ble I, $MainLoopEnd +$MainLoop: + fillcs PREFETCHSIZE * SIZE(X) + VMAD a0, a0, t0, t0 + subl I, 1, I + VMAD a1, a1, t1, t1 + + addl X, 16 * SIZE, X + VMAD a2, a2, t2, t2 + nop + VMAD a3, a3, t3, t3 + + VLD a0, -4*VEC_LEN*SIZE(X) + VLD a1, -3*VEC_LEN*SIZE(X) + VLD a2, -2*VEC_LEN*SIZE(X) + VLD a3, -1*VEC_LEN*SIZE(X) + + bgt I, $MainLoop + .align 4 +$MainLoopEnd: + VMAD a0, a0, t0, t0 + VMAD a1, a1, t1, t1 + VMAD a2, a2, t2, t2 + VMAD a3, a3, t3, t3 + + VADD t0, t1, a0 + VADD t2, t3, a1 + nop + VADD a0, a1, t0 + + vextf t0, 1, t1 + vextf t0, 2, t2 + vextf t0, 3, t3 + nop + + ADD t0, t1, a2 + ADD t2, t3, a3 + fclr t1 + ADD a2, a3, t0 + + .align 4 +$Remain: + and N, 7, I + ble I, $End + .align 4 +$RemainLoop: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + + addl X, 2*SIZE, X + MAD a0, a0, t0, t0 + subl I, 1, I + MAD a1, a1, t1, t1 + + bgt I, $RemainLoop + .align 4 + + ADD t0, t1, t0 +$End: + SQRT t0, a0 + ret + .align 4 + +$UnAlign_ACCESS: + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + ldi I, -1(I) + ble I, $L12 + .align 4 + +$L11: + ADD a0, t0, a0 + fillcs (PREFETCHSIZE) * SIZE(X) + MUL x0, x0, t0 + LD x0, 8 * SIZE(X) + + ADD a1, t1, a1 + mov X, XX + MUL x1, x1, t1 + LD x1, 9 * SIZE(X) + + ADD a2, t2, a2 + unop + MUL x2, x2, t2 + LD x2, 10 * SIZE(X) + + ADD a3, t3, a3 + unop + MUL x3, x3, t3 + LD x3, 11 * SIZE(X) + + ADD a0, t0, a0 + unop + MUL x4, x4, t0 + LD x4, 12 * SIZE(X) + + ADD a1, t1, a1 + unop + MUL x5, x5, t1 + LD x5, 13 * SIZE(X) + + ADD a2, t2, a2 + unop + MUL x6, x6, t2 + LD x6, 14 * SIZE(X) + + ADD a3, t3, a3 + unop + MUL x7, x7, t3 + LD x7, 15 * SIZE(X) + + ADD a0, t0, a0 + unop + MUL x0, x0, t0 + LD x0, 16 * SIZE(X) + + ADD a1, t1, a1 + ldi X, 16 * SIZE(X) + MUL x1, x1, t1 + LD x1, 17 * SIZE(XX) + + ADD a2, t2, a2 + unop + MUL x2, x2, t2 + LD x2, 18 * SIZE(XX) + + ADD a3, t3, a3 + unop + MUL x3, x3, t3 + LD x3, 19 * SIZE(XX) + + ADD a0, t0, a0 + unop + MUL x4, x4, t0 + LD x4, 20 * SIZE(XX) + + ADD a1, t1, a1 + ldi I, -1(I) + MUL x5, x5, t1 + LD x5, 21 * SIZE(XX) + + ADD a2, t2, a2 + unop + MUL x6, x6, t2 + LD x6, 22 * SIZE(XX) + + ADD a3, t3, a3 + MUL x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + ADD a0, t0, a0 + mov X, XX + MUL x0, x0, t0 + LD x0, 8 * SIZE(X) + + ADD a1, t1, a1 + unop + MUL x1, x1, t1 + LD x1, 9 * SIZE(X) + + ADD a2, t2, a2 + unop + MUL x2, x2, t2 + LD x2, 10 * SIZE(X) + + ADD a3, t3, a3 + unop + MUL x3, x3, t3 + LD x3, 11 * SIZE(X) + + ADD a0, t0, a0 + unop + MUL x4, x4, t0 + LD x4, 12 * SIZE(XX) + + ADD a1, t1, a1 + unop + MUL x5, x5, t1 + LD x5, 13 * SIZE(XX) + + ADD a2, t2, a2 + unop + MUL x6, x6, t2 + LD x6, 14 * SIZE(XX) + + ADD a3, t3, a3 + ldi X, 16 * SIZE(X) + MUL x7, x7, t3 + LD x7, 15 * SIZE(XX) + + ADD a0, t0, a0 + MUL x0, x0, t0 + ADD a1, t1, a1 + MUL x1, x1, t1 + + ADD a2, t2, a2 + MUL x2, x2, t2 + ADD a3, t3, a3 + MUL x3, x3, t3 + + ADD a0, t0, a0 + MUL x4, x4, t0 + ADD a1, t1, a1 + MUL x5, x5, t1 + + ADD a2, t2, a2 + MUL x6, x6, t2 + ADD a3, t3, a3 + MUL x7, x7, t3 + + ADD a2, t2, a2 + ADD a3, t3, a3 + .align 4 
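/*
 * Reference sketch: every path in this kernel (the aligned SIMD path, the
 * unaligned scalar path above, and the strided code below) accumulates the
 * same quantity, a plain complex 2-norm with no overflow/underflow scaling.
 * A minimal C equivalent, assuming double precision, unit stride and an
 * illustrative helper name, would be:
 *
 *     #include <math.h>
 *
 *     // x holds 2*n doubles, real/imaginary parts interleaved
 *     double znrm2_ref(long n, const double *x)
 *     {
 *         double sum = 0.0;
 *         for (long i = 0; i < 2 * n; i++)   // square both components
 *             sum += x[i] * x[i];
 *         return sqrt(sum);                  // no scaling, as in the assembly
 *     }
 *
 * The SIMD path merely splits this sum across the vector accumulators
 * t0..t3 and reduces the lanes with vextf/ADD before the final SQRT.
 */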
+ +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + ldi X, 2 * SIZE(X) + + ADD a0, t0, a0 + MUL x0, x0, t0 + ADD a1, t1, a1 + MUL x1, x1, t1 + + ldi I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) + ldi I, -1(I) + LD x5, 1 * SIZE(X) + addl X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + ADD a0, t0, a0 + LD x7, 1 * SIZE(X) + MUL x0, x0, t0 + addl X, INCX, X + + ADD a1, t1, a1 + LD x0, 0 * SIZE(X) + MUL x1, x1, t1 + unop + + ADD a2, t2, a2 + LD x1, 1 * SIZE(X) + MUL x2, x2, t2 + addl X, INCX, X + + ADD a3, t3, a3 + LD x2, 0 * SIZE(X) + MUL x3, x3, t3 + unop + + ADD a0, t0, a0 + LD x3, 1 * SIZE(X) + MUL x4, x4, t0 + addl X, INCX, X + + ADD a1, t1, a1 + LD x4, 0 * SIZE(X) + MUL x5, x5, t1 + ldi I, -1(I) + + ADD a2, t2, a2 + LD x5, 1 * SIZE(X) + MUL x6, x6, t2 + addl X, INCX, X + + ADD a3, t3, a3 + LD x6, 0 * SIZE(X) + MUL x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + ADD a0, t0, a0 + LD x7, 1 * SIZE(X) + MUL x0, x0, t0 + addl X, INCX, X + + ADD a1, t1, a1 + MUL x1, x1, t1 + ADD a2, t2, a2 + MUL x2, x2, t2 + + ADD a3, t3, a3 + MUL x3, x3, t3 + ADD a0, t0, a0 + MUL x4, x4, t0 + + ADD a1, t1, a1 + MUL x5, x5, t1 + ADD a2, t2, a2 + MUL x6, x6, t2 + + ADD a3, t3, a3 + MUL x7, x7, t3 + ADD a2, t2, a2 + ADD a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + ldi I, -1(I) + LD x1, 1 * SIZE(X) + addl X, INCX, X + + ADD a0, t0, a0 + MUL x0, x0, t0 + ADD a1, t1, a1 + MUL x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + ADD a0, t0, a0 + ADD a1, t1, a1 + + ADD a0, a1, a0 + ADD a2, a3, a2 + + + + ADD a0, a2, a0 + SQRT a0, a0 + + .align 4 + +$L999: + + ret + EPILOGUE diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S new file mode 100644 index 0000000..9016a00 --- /dev/null +++ b/kernel/sw_64/zrot.S @@ -0,0 +1,689 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define b9 $f29 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + addl INCX, INCX, INCX + addl INCY, INCY, INCY + + cmpeq INCX, 2, $23 + cmpeq INCY, 2, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 2, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + ldi I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, b9 + fmov b9, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, b9 + fmov b9, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + fillcs (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f17, $f23 + fillcs (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, b9 + fmov b9, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, b9 + fmov b9, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, b9 + fmov b9, $f28 + + 
MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + ldi I, -1(I) + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + ldi X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, b9 + fmov b9, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + ldi Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, b9 + fmov b9, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, b9 + fmov b9, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, b9 + fmov b9, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, b9 + fmov b9, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, b9 + fmov b9, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, b9 + fmov b9, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, b9 + fmov b9, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, b9 + fmov b9, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f26, 7*SIZE(X) + ldi X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + ldi Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 3, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, b9 + fmov b9, $f22 + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, b9 + fmov b9, $f26 + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + ldi I, -1(I) + + ST $f26, 1*SIZE(X) + ldi X, 2 * SIZE(X) + ST $f28, 1*SIZE(Y) + ldi Y, 2 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 2, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + LD $f13, 
0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, b9 + fmov b9, $f22 + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, b9 + fmov b9, $f26 + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, b9 + fmov b9, $f22 + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, b9 + fmov b9, $f26 + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, b9 + fmov b9, $f22 + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, b9 + fmov b9, $f26 + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, b9 + fmov b9, $f22 + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, b9 + fmov b9, $f26 + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + ldi I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 3, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, b9 + fmov b9, $f22 + SUB $f23, $f24, b9 + fmov b9, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, b9 + fmov b9, $f26 + SUB $f27, $f28, b9 + fmov b9, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + ldi I, -1(I) + + ST $f26, 1*SIZE(X) + ST $f28, 1*SIZE(Y) + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/zrot.S.bak b/kernel/sw_64/zrot.S.bak new file mode 100644 index 0000000..83dd2b1 --- /dev/null +++ b/kernel/sw_64/zrot.S.bak @@ -0,0 +1,631 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + addl INCX, INCX, INCX + addl INCY, INCY, INCY + + cmpeq INCX, 2, $23 + cmpeq INCY, 2, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 2, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + ldi I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + fillcs (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + fillcs (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + 
unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + ldi I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + ldi X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + ldi Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + ldi X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + ldi Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 3, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + ldi I, -1(I) + + ST $f26, 1*SIZE(X) + ldi X, 2 * SIZE(X) + ST $f28, 1*SIZE(Y) + ldi Y, 2 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + 
ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 2, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + ldi I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 3, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + ldi I, -1(I) + + ST $f26, 1*SIZE(X) + ST $f28, 1*SIZE(Y) + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/zrot_simd.S b/kernel/sw_64/zrot_simd.S new file mode 100644 index 0000000..9e00ebf --- /dev/null +++ b/kernel/sw_64/zrot_simd.S @@ -0,0 +1,799 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define x0 $f12 +#define x1 $f14 +#define x2 $f16 +#define x3 $f18 + +#define y0 $f13 +#define y1 $f15 +#define y2 $f17 +#define y3 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 +#define t4 $f24 +#define t5 $f25 +#define t6 $f26 +#define t7 $f27 + +#define PREFETCHSIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + addl INCX, INCX, INCX + addl INCY, INCY, INCY + + cmpeq INCX, 2, $23 + cmpeq INCY, 2, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + +/* test the address of X */ + and X, (VEC_LEN*SIZE-1), $3 + and Y, (VEC_LEN*SIZE-1), $4 + or $3, $4, $4 + bne $4, $UnAlign_ACCESS + +/*Align Accessing*/ + sra N, 3, I + ble I, $Remain + + vcpyf C, C + vcpyf S, S + + VLD x0, 0*VEC_LEN*SIZE(X) + VLD x1, 1*VEC_LEN*SIZE(X) + VLD x2, 2*VEC_LEN*SIZE(X) + VLD x3, 3*VEC_LEN*SIZE(X) + + VLD y0, 0*VEC_LEN*SIZE(Y) + VLD y1, 1*VEC_LEN*SIZE(Y) + VLD y2, 2*VEC_LEN*SIZE(Y) + VLD y3, 3*VEC_LEN*SIZE(Y) + + addl X, 16 * SIZE, X + addl Y, 16 * SIZE, Y + subl I, 1, I + ble I, $MainLoopEnd + .align 4 + +$MainLoop: + VMUL C, x0, t0 + fillcs (PREFETCHSIZE) * SIZE(X) + VMUL C, x1, t1 + fillcs (PREFETCHSIZE) * SIZE(Y) + + VMUL C, x2, t2 + subl I, 1, I + VMUL C, x3, t3 + nop + + VMUL S, x0, t4 + VLD x0, 0*VEC_LEN*SIZE(X) + VMUL S, x1, t5 + VLD x1, 1*VEC_LEN*SIZE(X) + + VMUL S, x2, t6 + VLD x2, 2*VEC_LEN*SIZE(X) + VMUL S, x3, t7 + VLD x3, 3*VEC_LEN*SIZE(X) + + VMAD S, y0, t0, t0 + VMAD S, y1, t1, t1 + VMAD S, y2, t2, t2 + VMAD S, y3, t3, t3 + + VMSUB C, y0, t4, t4 + VLD y0, 0*VEC_LEN*SIZE(Y) + VMSUB C, y1, t5, t5 + VLD y1, 1*VEC_LEN*SIZE(Y) + + VMSUB C, y2, t6, t6 + VLD y2, 2*VEC_LEN*SIZE(Y) + VMSUB C, y3, t7, t7 + VLD y3, 3*VEC_LEN*SIZE(Y) + + VST t0, -4*VEC_LEN*SIZE(X) + VST t1, -3*VEC_LEN*SIZE(X) + VST t2, -2*VEC_LEN*SIZE(X) + VST t3, 
-1*VEC_LEN*SIZE(X) + + VST t4, -4*VEC_LEN*SIZE(Y) + VST t5, -3*VEC_LEN*SIZE(Y) + VST t6, -2*VEC_LEN*SIZE(Y) + VST t7, -1*VEC_LEN*SIZE(Y) + + addl X, 16 * SIZE, X + addl Y, 16 * SIZE, Y + nop + bgt I, $MainLoop + .align 4 +$MainLoopEnd: + VMUL C, x0, t0 + VMUL C, x1, t1 + VMUL C, x2, t2 + VMUL C, x3, t3 + + VMUL S, x0, t4 + VMUL S, x1, t5 + VMUL S, x2, t6 + VMUL S, x3, t7 + + VMAD S, y0, t0, t0 + VMAD S, y1, t1, t1 + VMAD S, y2, t2, t2 + VMAD S, y3, t3, t3 + + VMSUB C, y0, t4, t4 + VMSUB C, y1, t5, t5 + VMSUB C, y2, t6, t6 + VMSUB C, y3, t7, t7 + + VST t0, -4*VEC_LEN*SIZE(X) + VST t1, -3*VEC_LEN*SIZE(X) + VST t2, -2*VEC_LEN*SIZE(X) + VST t3, -1*VEC_LEN*SIZE(X) + + VST t4, -4*VEC_LEN*SIZE(Y) + VST t5, -3*VEC_LEN*SIZE(Y) + VST t6, -2*VEC_LEN*SIZE(Y) + VST t7, -1*VEC_LEN*SIZE(Y) + + .align 4 +$Remain: + and N, 7, I + ble I, $End +$RemainLoop: + LD x0, 0*SIZE(X) + LD y0, 0*SIZE(Y) + LD x1, 1*SIZE(X) + LD y1, 1*SIZE(Y) + + MUL C, x0, t0 + MUL S, x0, t4 + MAD S, y0, t0, t0 + MSUB C, y0, t4, t4 + + MUL C, x1, t1 + ldi I, -1(I) + MUL S, x1, t5 + ldi X, 2 * SIZE(X) + + MAD S, y1, t1, t1 + ldi Y, 2 * SIZE(Y) + MSUB C, y1, t5, t5 + nop + + ST t0, -2*SIZE(X) + ST t1, -1*SIZE(X) + ST t4, -2*SIZE(Y) + ST t5, -1*SIZE(Y) + + bgt I, $RemainLoop + .align 4 +$End: + clr $0 + ret + .align 4 + +$UnAlign_ACCESS: + sra N, 2, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + ldi I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + fillcs (PREFETCHSIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + fillcs (PREFETCHSIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + ldi I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL 
S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + ldi X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + ldi Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + ldi X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + ldi Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 3, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + ldi I, -1(I) + + ST $f26, 1*SIZE(X) + ldi X, 2 * SIZE(X) + ST $f28, 1*SIZE(Y) + ldi Y, 2 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 2, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 
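/*
 * The four products just formed are c*x, s*y, c*y and s*x for the real
 * components; the ADD/SUB below combine them into the rotated pair
 * x' = c*x + s*y and y' = c*y - s*x, and the same update is then applied
 * to the imaginary components.  A minimal C sketch of the whole routine,
 * assuming double precision, unit stride and an illustrative name:
 *
 *     // x, y each hold 2*n doubles (re/im interleaved); c and s are real
 *     void zrot_ref(long n, double *x, double *y, double c, double s)
 *     {
 *         for (long i = 0; i < 2 * n; i++) {   // identical update for re and im
 *             double xi = x[i], yi = y[i];
 *             x[i] = c * xi + s * yi;
 *             y[i] = c * yi - s * xi;
 *         }
 *     }
 */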
+ + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + ldi I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 3, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + ldi I, -1(I) + + ST $f26, 1*SIZE(X) + ST $f28, 1*SIZE(Y) + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S new file mode 100644 index 0000000..9589624 --- /dev/null +++ b/kernel/sw_64/zscal.S @@ -0,0 +1,255 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $21 +#define INCX $17 + +#define XX $18 +#define I $19 + +#define ALPHA_R $f19 +#define ALPHA_I $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + +#define t4 $f26 +#define t5 $f27 +#define t6 $f28 +#define t7 $f29 + + PROLOGUE + PROFCODE + + ldl INCX, 0($sp) + mov X, XX + ble N, $L999 + + addl INCX, INCX, INCX + + sra N, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a6, 0 * SIZE(X) + LD a7, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_R, t0 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_I, t1 + + MUL a2, ALPHA_I, t2 + LD a0, 0 * SIZE(X) + MUL a3, ALPHA_R, t3 + LD a1, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a4, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a5, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a4, ALPHA_I, t2 + LD a2, 0 * SIZE(X) + MUL a5, ALPHA_R, t3 + LD a3, 1 * SIZE(X) + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + SXADDQ INCX, X, X + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + LD a4, 0 * SIZE(X) + MUL a7, ALPHA_R, t3 + LD a5, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a1, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a0, ALPHA_I, t2 + LD a6, 0 * SIZE(X) + MUL a1, ALPHA_R, t3 + LD a7, 1 * SIZE(X) + + SUB t0, t1, t4 + ldi I, -1(I) + ADD t2, t3, t5 + SXADDQ INCX, XX, XX + + fillcs PREFETCHSIZE * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + MUL a2, ALPHA_R, t0 + MUL a3, ALPHA_I, t1 + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_I, t2 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + unop + + ST t6, 0 * SIZE(XX) + MUL a4, ALPHA_R, t0 + ST t7, 1 * SIZE(XX) + MUL a5, ALPHA_I, t1 + MUL a4, ALPHA_I, t2 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + 
unop + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + + ST t6, 0 * SIZE(XX) + ST t7, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L15: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ST t4, 0 * SIZE(XX) + ST t5, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/zscal.S.bak b/kernel/sw_64/zscal.S.bak new file mode 100644 index 0000000..4525b56 --- /dev/null +++ b/kernel/sw_64/zscal.S.bak @@ -0,0 +1,443 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $21 +#define INCX $17 + +#define XX $18 +#define I $19 + +#define ALPHA_R $f19 +#define ALPHA_I $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + +#define t4 $f26 +#define t5 $f27 +#define t6 $f28 +#define t7 $f29 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + ldl INCX, 0($sp) + mov X, XX + cmpeq INCX, 1, $0 + ble N, $L999 + + beq $0, $Sub + nop + +/* + unloop 4 (4*2=8) +*/ + sra N, 2, I + ble I, $Remain + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + + + MUL a0, ALPHA_R, t0 + MUL a0, ALPHA_I, t2 + + NMAD a1, ALPHA_I, t0, t4 + MAD a1, ALPHA_R, t2, t5 +/* + MUL a1, ALPHA_I, t1 + MUL a1, ALPHA_R, t3 + SUB t0, t1, t4 + ADD t2, t3, t5 +*/ + ldi I, -1(I) + addl X, 8*SIZE, X + + ble I, $MainLoopEnd + .align 4 + +$MainLoop: + MUL a2, ALPHA_R, t0 + ST t4, -8 * SIZE(X) + MUL a2, ALPHA_I, t2 + ST t5, -7 * SIZE(X) + + + NMAD a3, ALPHA_I, t0, t6 + LD a0, 0 * SIZE(X) + MAD a3, ALPHA_R, t2, t7 + LD a1, 1 * SIZE(X) + + ST t6, -6 * SIZE(X) + MUL a4, ALPHA_R, t0 + ST t7, -5 * SIZE(X) + MUL a4, ALPHA_I, t2 + + + NMAD a5, ALPHA_I, t0, t4 + LD a2, 2 * SIZE(X) + MAD a5, ALPHA_R, t2, t5 + LD a3, 3 * SIZE(X) +/* + MUL a5, ALPHA_I, t1 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 +*/ + + MUL a6, ALPHA_R, t0 + ST t4, -4 * SIZE(X) + MUL a6, ALPHA_I, t2 + ST t5, -3 * SIZE(X) + + NMAD a7, ALPHA_I, t0, t6 + LD a4, 4 * SIZE(X) + MAD a7, ALPHA_R, t2, t7 + LD a5, 5 * SIZE(X) +/* + + MUL a7, ALPHA_I, t1 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + ADD t2, t3, t7 +*/ + MUL a0, ALPHA_R, t0 + ST t6, -2 * SIZE(X) + MUL a0, ALPHA_I, t2 + ST t7, -1 * SIZE(X) + + NMAD a1, ALPHA_I, t0, t4 + LD a6, 6 * SIZE(X) + MAD a1, ALPHA_R, t2, t5 + LD a7, 7 * SIZE(X) + + + + fillcs PREFETCHSIZE * SIZE(X) + subl I, 1, I + addl X, 8*SIZE, X + bgt I, $MainLoop + .align 4 + +$MainLoopEnd: + MUL a2, ALPHA_R, t0 + ST t4, -8 * SIZE(X) + MUL a2, ALPHA_I, t2 + ST t5, -7 * SIZE(X) + + + NMAD a3, ALPHA_I, t0, t6 + MAD a3, ALPHA_R, t2, t7 + + + ST t6, -6 * SIZE(X) + MUL a4, ALPHA_R, t0 + ST t7, -5 * SIZE(X) + MUL a4, ALPHA_I, t2 + + + NMAD a5, ALPHA_I, t0, t4 + MAD a5, ALPHA_R, t2, t5 +/* + MUL a5, ALPHA_I, t1 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 +*/ + + MUL a6, ALPHA_R, t0 + ST t4, -4 * SIZE(X) + MUL a6, ALPHA_I, t2 + ST t5, -3 * SIZE(X) + + NMAD a7, ALPHA_I, t0, t6 + MAD a7, ALPHA_R, t2, t7 +/* + + MUL a7, ALPHA_I, t1 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + ADD t2, t3, t7 +*/ + ST t6, -2 * SIZE(X) + ST t7, -1 * SIZE(X) + + .align 4 +$Remain: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$RemainLoop: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + + + MUL a0, ALPHA_R, t0 + MUL a0, ALPHA_I, t2 + + NMAD a1, ALPHA_I, t0, t4 + MAD a1, ALPHA_R, t2, t5 + +/* + MUL a1, ALPHA_I, t1 + MUL a1, ALPHA_R, t3 + SUB t0, t1, t4 + ADD t2, t3, t5 +*/ + ST t4, 0 * SIZE(X) + ST t5, 1 * SIZE(X) + + addl X, 2*SIZE, X + ldi I, -1(I) + bne I, $RemainLoop + nop + + ret + .align 4 + +$Sub: + addl INCX, INCX, INCX + + sra N, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) 
+ SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a6, 0 * SIZE(X) + LD a7, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_R, t0 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_I, t1 + + MUL a2, ALPHA_I, t2 + LD a0, 0 * SIZE(X) + MUL a3, ALPHA_R, t3 + LD a1, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a4, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a5, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a4, ALPHA_I, t2 + LD a2, 0 * SIZE(X) + MUL a5, ALPHA_R, t3 + LD a3, 1 * SIZE(X) + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + SXADDQ INCX, X, X + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + LD a4, 0 * SIZE(X) + MUL a7, ALPHA_R, t3 + LD a5, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a1, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a0, ALPHA_I, t2 + LD a6, 0 * SIZE(X) + MUL a1, ALPHA_R, t3 + LD a7, 1 * SIZE(X) + + SUB t0, t1, t4 + ldi I, -1(I) + ADD t2, t3, t5 + SXADDQ INCX, XX, XX + + fillcs PREFETCHSIZE * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + MUL a2, ALPHA_R, t0 + MUL a3, ALPHA_I, t1 + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_I, t2 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + unop + + ST t6, 0 * SIZE(XX) + MUL a4, ALPHA_R, t0 + ST t7, 1 * SIZE(XX) + MUL a5, ALPHA_I, t1 + MUL a4, ALPHA_I, t2 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + unop + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + + ST t6, 0 * SIZE(XX) + ST t7, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L15: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ST t4, 0 * SIZE(XX) + ST t5, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/zscal_simd.S b/kernel/sw_64/zscal_simd.S new file mode 100644 index 0000000..09d2f38 --- /dev/null +++ b/kernel/sw_64/zscal_simd.S @@ -0,0 +1,579 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 96 + +#define N $16 +#define X $21 +#define INCX $17 + +#define XX $18 +#define I $19 + +#define ALPHA_R $f19 +#define ALPHA_I $f20 + + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + +#define t4 $f26 +#define t5 $f27 +#define t6 $f28 +#define t7 $f29 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + ldl INCX, 0($sp) + mov X, XX + cmpeq INCX, 1, $0 + ble N, $L999 + + beq $0, $Sub + .align 5 + + and X, (VEC_LEN*SIZE-1), $6 + bgt $6, $UnAlign_X_ACCESS + +/* + Unloop 8 (8*2=16) +*/ + sra N, 3, I + vcpyf ALPHA_R, ALPHA_R + vcpyf ALPHA_I, ALPHA_I + ble I, $Remain + + VLD a0, 0*VEC_LEN*SIZE(X) + VLD a1, 1*VEC_LEN*SIZE(X) + VLD a2, 2*VEC_LEN*SIZE(X) + VLD a3, 3*VEC_LEN*SIZE(X) + + subl I, 1, I + addl X, 16*SIZE, X + ble I, $MainLoopEnd + .align 4 + + +$MainLoop: + + vextf a0, 1, a4 + vextf a0, 3, a5 + vextf a1, 0, a6 + vextf a1, 2, a7 + + vextf a2, 1, t0 + vextf a2, 3, t1 + vextf a3, 0, t2 + vextf a3, 2, t3 + + vinsf a4, a1, 0, a1 + vinsf a5, a1, 2, a1 + vinsf a6, a0, 1, a0 + vinsf a7, a0, 3, a0 + + vinsf t0, a3, 0, a3 + vinsf t1, a3, 2, a3 + vinsf t2, a2, 1, a2 + vinsf t3, a2, 3, a2 + + VMUL ALPHA_R, a0, t4 + VMUL ALPHA_I, a0, t5 + VMUL ALPHA_R, a2, t6 + VMUL ALPHA_I, a2, t7 + + VNMAD ALPHA_I, a1, t4, t0 + VLD a0, 0*VEC_LEN*SIZE(X) + VMAD ALPHA_R, a1, t5, t1 + VLD a1, 1*VEC_LEN*SIZE(X) + + VNMAD ALPHA_I, a3, t6, t2 + VLD a2, 2*VEC_LEN*SIZE(X) + VMAD ALPHA_R, a3, t7, t3 + VLD a3, 3*VEC_LEN*SIZE(X) + +/*combine the real(t0,t2) & image(t1,t3) vector to complex vector*/ + vextf t0, 1, a4 + vextf t0, 3, a5 + vextf t1, 0, a6 + vextf t1, 2, a7 + + vextf t2, 1, s0 + vextf t2, 3, s1 + vextf t3, 0, s2 + vextf t3, 2, s3 + + vinsf a4, t1, 0, t1 + vinsf a5, t1, 2, t1 + vinsf a6, t0, 1, t0 + vinsf a7, t0, 3, t0 + + vinsf s0, t3, 0, t3 + vinsf s1, t3, 2, t3 + vinsf s2, t2, 1, t2 + vinsf s3, t2, 3, t2 + + VST t0, -4*VEC_LEN*SIZE(X) + VST t1, -3*VEC_LEN*SIZE(X) + VST t2, -2*VEC_LEN*SIZE(X) + VST t3, -1*VEC_LEN*SIZE(X) + + fillcs PREFETCHSIZE * SIZE(X) + subl I, 1, I + addl X, 16*SIZE, X + bgt I, $MainLoop + 
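The main loop above de-interleaves four complex elements into a real vector and an imaginary vector with the vextf/vinsf shuffles, scales them with VMUL/VNMAD/VMAD, and re-interleaves the results before the stores. A minimal scalar C sketch of the per-element arithmetic this implements — assuming unit stride, double precision, and an illustrative function name rather than the kernel's actual CNAME interface:

#include <stddef.h>

/* Reference for x[i] *= (alpha_r + i*alpha_i) on an interleaved re,im array --
   the same math the VMUL/VNMAD/VMAD sequence applies to the de-interleaved
   real and imaginary vectors (NMAD x, y, z computes z - x*y here). */
static void zscal_ref(size_t n, double alpha_r, double alpha_i, double *x)
{
    for (size_t i = 0; i < n; i++) {
        double re = x[2 * i];       /* real part      */
        double im = x[2 * i + 1];   /* imaginary part */
        x[2 * i]     = alpha_r * re - alpha_i * im;   /* VMUL + VNMAD lane */
        x[2 * i + 1] = alpha_i * re + alpha_r * im;   /* VMUL + VMAD lane  */
    }
}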
.align 4 + +$MainLoopEnd: +/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ + vextf a0, 1, a4 + vextf a0, 3, a5 + vextf a1, 0, a6 + vextf a1, 2, a7 + + vextf a2, 1, t0 + vextf a2, 3, t1 + vextf a3, 0, t2 + vextf a3, 2, t3 + + vinsf a4, a1, 0, a1 + vinsf a5, a1, 2, a1 + vinsf a6, a0, 1, a0 + vinsf a7, a0, 3, a0 + + vinsf t0, a3, 0, a3 + vinsf t1, a3, 2, a3 + vinsf t2, a2, 1, a2 + vinsf t3, a2, 3, a2 + + VMUL ALPHA_R, a0, t4 + VMUL ALPHA_I, a0, t5 + VMUL ALPHA_R, a2, t6 + VMUL ALPHA_I, a2, t7 + + VNMAD ALPHA_I, a1, t4, t0 + VMAD ALPHA_R, a1, t5, t1 + VNMAD ALPHA_I, a3, t6, t2 + VMAD ALPHA_R, a3, t7, t3 + +/*combine the real(t0,t2) & image(t1,t3) vector to complex vector*/ + vextf t0, 1, a4 + vextf t0, 3, a5 + vextf t1, 0, a6 + vextf t1, 2, a7 + + vextf t2, 1, s0 + vextf t2, 3, s1 + vextf t3, 0, s2 + vextf t3, 2, s3 + + vinsf a4, t1, 0, t1 + vinsf a5, t1, 2, t1 + vinsf a6, t0, 1, t0 + vinsf a7, t0, 3, t0 + + vinsf s0, t3, 0, t3 + vinsf s1, t3, 2, t3 + vinsf s2, t2, 1, t2 + vinsf s3, t2, 3, t2 + + VST t0, -4*VEC_LEN*SIZE(X) + VST t1, -3*VEC_LEN*SIZE(X) + VST t2, -2*VEC_LEN*SIZE(X) + VST t3, -1*VEC_LEN*SIZE(X) + +$Remain: + and N, 7, I + unop + unop + ble I, $L999 + .align 5 + +$Remain_loop: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + ST t4, 0 * SIZE(X) + ST t5, 1 * SIZE(X) + + addl X, 2*SIZE, X + ldi I, -1(I) + bne I, $Remain_loop + ret + .align 5 + +$UnAlign_X_ACCESS: +/* + unloop 4 (4*2=8) +*/ + sra N, 2, I + ble I, $Unalign_Remain + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA_R, t0 + LD a5, 5 * SIZE(X) + MUL a0, ALPHA_I, t2 + + LD a6, 6 * SIZE(X) + NMAD a1, ALPHA_I, t0, t4 + LD a7, 7 * SIZE(X) + MAD a1, ALPHA_R, t2, t5 + + + ldi I, -1(I) + addl X, 8*SIZE, X + ble I, $Unalign_MainLoopEnd + .align 4 + +$Unalign_MainLoop: + MUL a2, ALPHA_R, t0 + ST t4, -8 * SIZE(X) + MUL a2, ALPHA_I, t2 + ST t5, -7 * SIZE(X) + + + NMAD a3, ALPHA_I, t0, t6 + LD a0, 0 * SIZE(X) + MAD a3, ALPHA_R, t2, t7 + LD a1, 1 * SIZE(X) + + ST t6, -6 * SIZE(X) + MUL a4, ALPHA_R, t0 + ST t7, -5 * SIZE(X) + MUL a4, ALPHA_I, t2 + + + NMAD a5, ALPHA_I, t0, t4 + LD a2, 2 * SIZE(X) + MAD a5, ALPHA_R, t2, t5 + LD a3, 3 * SIZE(X) + + MUL a6, ALPHA_R, t0 + ST t4, -4 * SIZE(X) + MUL a6, ALPHA_I, t2 + ST t5, -3 * SIZE(X) + + NMAD a7, ALPHA_I, t0, t6 + LD a4, 4 * SIZE(X) + MAD a7, ALPHA_R, t2, t7 + LD a5, 5 * SIZE(X) + + MUL a0, ALPHA_R, t0 + ST t6, -2 * SIZE(X) + MUL a0, ALPHA_I, t2 + ST t7, -1 * SIZE(X) + + NMAD a1, ALPHA_I, t0, t4 + LD a6, 6 * SIZE(X) + MAD a1, ALPHA_R, t2, t5 + LD a7, 7 * SIZE(X) + + + + fillcs PREFETCHSIZE * SIZE(X) + subl I, 1, I + addl X, 8*SIZE, X + bgt I, $Unalign_MainLoop + .align 4 + +$Unalign_MainLoopEnd: + MUL a2, ALPHA_R, t0 + ST t4, -8 * SIZE(X) + MUL a2, ALPHA_I, t2 + ST t5, -7 * SIZE(X) + + + NMAD a3, ALPHA_I, t0, t6 + MAD a3, ALPHA_R, t2, t7 + + + ST t6, -6 * SIZE(X) + MUL a4, ALPHA_R, t0 + ST t7, -5 * SIZE(X) + MUL a4, ALPHA_I, t2 + + + NMAD a5, ALPHA_I, t0, t4 + MAD a5, ALPHA_R, t2, t5 + + MUL a6, ALPHA_R, t0 + ST t4, -4 * SIZE(X) + MUL a6, ALPHA_I, t2 + ST t5, -3 * SIZE(X) + + NMAD a7, ALPHA_I, t0, t6 + MAD a7, ALPHA_R, t2, t7 + ST t6, -2 * SIZE(X) + ST t7, -1 * SIZE(X) + + .align 4 +$Unalign_Remain: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$Unalign_RemainLoop: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + + + MUL a0, ALPHA_R, t0 + MUL a0, ALPHA_I, t2 + + NMAD 
a1, ALPHA_I, t0, t4 + MAD a1, ALPHA_R, t2, t5 + + ST t4, 0 * SIZE(X) + ST t5, 1 * SIZE(X) + + addl X, 2*SIZE, X + ldi I, -1(I) + bne I, $Unalign_RemainLoop + nop + + ret + .align 4 + +$Sub: + addl INCX, INCX, INCX + + sra N, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a6, 0 * SIZE(X) + LD a7, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ldi I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_R, t0 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_I, t1 + + MUL a2, ALPHA_I, t2 + LD a0, 0 * SIZE(X) + MUL a3, ALPHA_R, t3 + LD a1, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a4, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a5, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a4, ALPHA_I, t2 + LD a2, 0 * SIZE(X) + MUL a5, ALPHA_R, t3 + LD a3, 1 * SIZE(X) + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + SXADDQ INCX, X, X + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + LD a4, 0 * SIZE(X) + MUL a7, ALPHA_R, t3 + LD a5, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a1, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a0, ALPHA_I, t2 + LD a6, 0 * SIZE(X) + MUL a1, ALPHA_R, t3 + LD a7, 1 * SIZE(X) + + SUB t0, t1, t4 + ldi I, -1(I) + ADD t2, t3, t5 + SXADDQ INCX, XX, XX + + fillcs PREFETCHSIZE * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + MUL a2, ALPHA_R, t0 + MUL a3, ALPHA_I, t1 + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_I, t2 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + unop + + ST t6, 0 * SIZE(XX) + MUL a4, ALPHA_R, t0 + ST t7, 1 * SIZE(XX) + MUL a5, ALPHA_I, t1 + MUL a4, ALPHA_I, t2 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + unop + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + + ST t6, 0 * SIZE(XX) + ST t7, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L15: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ST t4, 0 * SIZE(XX) + ST t5, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + + ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S new file mode 100644 index 0000000..7b8570c --- /dev/null +++ b/kernel/sw_64/zsum.S @@ -0,0 +1,234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addw INCX, INCX, $20 + mov $20,INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + ldi I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, $f24 + fmov $f24,s0 + ldl $31, PREFETCHSIZE * SIZE(X) + fmov a0, t0 + ldi I, -1(I) + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + unop + + ADD s2, t2, $f24 + fmov $f24,s2 + LD a7, 1 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, $f24 + fmov $f24,s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + unop + + ADD s0, t0, $f24 + fmov $f24,s0 + LD a1, 1 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + unop + + ADD s2, t2, $f24 + fmov $f24,s2 + LD a3, 1 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, $f24 + fmov $f24,s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, $f24 + fmov $f24,s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a7, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, $f24 + fmov $f24,s2 + fmov a2, t2 + ADD s3, t3, $f24 + fmov $f24,s3 + fmov a3, t3 + + ADD s0, t0, $f24 + fmov $f24,s0 + fmov a4, t0 + ADD s1, t1, $f24 + fmov $f24,s1 + fmov a5, t1 + ADD s2, t2, $f24 + fmov $f24,s2 + fmov a6, t2 + ADD s3, t3, $f24 + fmov $f24,s3 + fmov a7, t3 + + ADD s2, t2, $f24 + 
fmov $f24,s2 + ADD s3, t3, $f24 + fmov $f24,s3 + + .align 4 + +$L15: + ADD s0, s2, $f24 + fmov $f24,s0 + and N, 3, I + ADD s1, s3, $f24 + fmov $f24,s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, $f24 + fmov $f24,s0 + LD a0, 0 * SIZE(X) + fmov a0, t0 + ldi I, -1(I) + + ADD s1, t1, $f24 + fmov $f24,s1 + LD a1, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, $f24 + fmov $f24,s0 + ADD s1, t1, $f24 + fmov $f24,s1 + + ADD s0, s1, $f24 + fmov $f24,s0 + ret + EPILOGUE diff --git a/kernel/sw_64/zswap.S.bak b/kernel/sw_64/zswap.S.bak new file mode 100644 index 0000000..f0b19dd --- /dev/null +++ b/kernel/sw_64/zswap.S.bak @@ -0,0 +1,244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $21, $17 + ldl $18, 0($sp) + ldl $19, 8($sp) + ldl $20, 16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ble $16, $SubEnd # if n <= 0 goto $End + + cmpeq $18, 1, $1 + addl $18, $18, $18 + cmpeq $20, 1, $2 + addl $20, $20, $20 + + sra $16, 2, $21 + and $1, $2, $1 + and $16, 3, $22 + beq $1, $Sub + + ble $21, $MainRemain + .align 4 + +$MainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f12, 2*SIZE($19) + LD $f13, 3*SIZE($19) + LD $f14, 4*SIZE($19) + LD $f15, 5*SIZE($19) + LD $f16, 6*SIZE($19) + LD $f17, 7*SIZE($19) + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + LD $f22, 2*SIZE($17) + LD $f23, 3*SIZE($17) + LD $f24, 4*SIZE($17) + LD $f25, 5*SIZE($17) + LD $f26, 6*SIZE($17) + LD $f27, 7*SIZE($17) + + fillcs 16*SIZE($17) + unop + fillcs 16*SIZE($19) + subl $21, 1, $21 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f12, 2*SIZE($17) + ST $f13, 3*SIZE($17) + ST $f14, 4*SIZE($17) + ST $f15, 5*SIZE($17) + ST $f16, 6*SIZE($17) + ST $f17, 7*SIZE($17) + + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + ST $f22, 2*SIZE($19) + ST $f23, 3*SIZE($19) + ST $f24, 4*SIZE($19) + ST $f25, 5*SIZE($19) + ST $f26, 6*SIZE($19) + ST $f27, 7*SIZE($19) + + ldi $17, 8*SIZE($17) + ldi $19, 8*SIZE($19) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + ldi $17, 2*SIZE($17) + ldi $19, 2*SIZE($19) + subl $22, 1, $22 + ST $f10, -2*SIZE($17) + ST $f11, -1*SIZE($17) + ST $f20, -2*SIZE($19) + ST $f21, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + mov $17, $23 + mov $19, $24 + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + LD $f13, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + LD $f15, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + LD $f17, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + LD $f23, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + LD $f25, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + LD $f27, 1*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + ST $f11, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + ST $f13, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + ST $f15, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + ST $f17, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + ST $f21, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + ST $f23, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + ST $f25, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + ST $f27, 1*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git 
a/kernel/sw_64/zswap.c b/kernel/sw_64/zswap.c new file mode 100644 index 0000000..ae4760a --- /dev/null +++ b/kernel/sw_64/zswap.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/sw_64/zswap_simd.S b/kernel/sw_64/zswap_simd.S new file mode 100644 index 0000000..e49c95b --- /dev/null +++ b/kernel/sw_64/zswap_simd.S @@ -0,0 +1,306 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 64 +#define X $17 +#define Y $19 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $21, $17 + ldl $18, 0($sp) + ldl $19, 8($sp) + ldl $20, 16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ble $16, $SubEnd # if n <= 0 goto $End + + cmpeq $18, 1, $1 + addl $18, $18, $18 + cmpeq $20, 1, $2 + addl $20, $20, $20 + +/* + Unloop 8 complex, 16 real +*/ + + sra $16, 3, $21 + and $1, $2, $1 + and $16, 7, $22 + beq $1, $Sub + +/* + test the address of Y & X +*/ + and Y, (VEC_LEN*SIZE-1), $4 + and X, (VEC_LEN*SIZE-1), $3 + or $3, $4, $4 + bne $4, $UnAlign_ACCESS + +/* align access*/ + + ble $21, $MainRemain + .align 4 + +$MainLoop: + VLD $f10, 0*VEC_LEN*SIZE(Y) + VLD $f11, 1*VEC_LEN*SIZE(Y) + VLD $f12, 2*VEC_LEN*SIZE(Y) + VLD $f13, 3*VEC_LEN*SIZE(Y) + + VLD $f20, 0*VEC_LEN*SIZE(X) + VLD $f21, 1*VEC_LEN*SIZE(X) + VLD $f22, 2*VEC_LEN*SIZE(X) + VLD $f23, 3*VEC_LEN*SIZE(X) + + fillcs PREFETCHSIZE * SIZE(X) + unop + fillcs PREFETCHSIZE * SIZE(Y) + subl $21, 1, $21 + + VST $f10, 0*VEC_LEN*SIZE(X) + VST $f11, 1*VEC_LEN*SIZE(X) + VST $f12, 2*VEC_LEN*SIZE(X) + VST $f13, 3*VEC_LEN*SIZE(X) + + VST $f20, 0*VEC_LEN*SIZE(Y) + VST $f21, 1*VEC_LEN*SIZE(Y) + VST $f22, 2*VEC_LEN*SIZE(Y) + VST $f23, 3*VEC_LEN*SIZE(Y) + + ldi $17, 16*SIZE(X) + ldi $19, 16*SIZE(Y) + bgt $21, $MainLoop + .align 4 + + jmp $MainRemain + .align 4 + +$UnAlign_ACCESS: + sra $16, 2, $21 + and $16, 3, $22 + nop + ble $21, $MainRemain + .align 4 +$UnAlign_ACCESS_MainLoop: + + LD $f10, 0*SIZE(Y) + LD $f11, 1*SIZE(Y) + LD $f12, 2*SIZE(Y) + LD $f13, 3*SIZE(Y) + LD $f14, 4*SIZE(Y) + LD $f15, 5*SIZE(Y) + LD $f16, 6*SIZE(Y) + LD $f17, 7*SIZE(Y) + + LD $f20, 0*SIZE(X) + LD $f21, 1*SIZE(X) + LD $f22, 2*SIZE(X) + LD $f23, 3*SIZE(X) + LD $f24, 4*SIZE(X) + LD $f25, 5*SIZE(X) + LD $f26, 6*SIZE(X) + LD $f27, 7*SIZE(X) + + fillcs 16*SIZE(X) + unop + fillcs 16*SIZE(Y) + subl $21, 1, $21 + + ST $f10, 0*SIZE(X) + ST $f11, 1*SIZE(X) + ST $f12, 2*SIZE(X) + ST $f13, 3*SIZE(X) + ST $f14, 4*SIZE(X) + ST $f15, 5*SIZE(X) + ST $f16, 6*SIZE(X) + ST 
$f17, 7*SIZE(X) + + ST $f20, 0*SIZE(Y) + ST $f21, 1*SIZE(Y) + ST $f22, 2*SIZE(Y) + ST $f23, 3*SIZE(Y) + ST $f24, 4*SIZE(Y) + ST $f25, 5*SIZE(Y) + ST $f26, 6*SIZE(Y) + ST $f27, 7*SIZE(Y) + + ldi X, 8*SIZE(X) + ldi Y, 8*SIZE(Y) + bgt $21, $UnAlign_ACCESS_MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE(Y) + LD $f11, 1*SIZE(Y) + LD $f20, 0*SIZE(X) + LD $f21, 1*SIZE(X) + + ldi X, 2*SIZE(X) + ldi Y, 2*SIZE(Y) + subl $22, 1, $22 + ST $f10, -2*SIZE(X) + ST $f11, -1*SIZE(X) + ST $f20, -2*SIZE(Y) + ST $f21, -1*SIZE(Y) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + sra $16, 2, $21 + and $16, 3, $22 + + mov $17, $23 + mov $19, $24 + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + LD $f13, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + LD $f15, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + LD $f17, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + LD $f23, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + LD $f25, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + LD $f27, 1*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + ST $f11, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + ST $f13, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + ST $f15, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + ST $f17, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + ST $f21, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + ST $f23, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + ST $f25, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + ST $f27, 1*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S new file mode 100644 index 0000000..3a14e58 --- /dev/null +++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S @@ -0,0 +1,2593 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(SW6) +#error "Architecture is not specified." +#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch sw6a + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 88 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define tmp $9 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) + ldl OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + 
fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + stl tmp, 72($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addl M, M, TMP2 + mull TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + addl TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + + subl C, LDC, C2 + subl C2, LDC, C1 + subl C2, LDC, C +#else + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, b5 + fmov b5, c09 +// unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, b5 + fmov b5, c10 +// unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 +// unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, b5 + fmov b5, c14 +// unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 +// unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 +// unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD1 c09, t1, b5 + fmov b5, c09 +// unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 +// unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 +// unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 +// unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, b5 + fmov b5, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, b5 + fmov b5, c10 +// unop + MUL a2, b1, t2 + LD b1, 0 * 
SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 +// unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, b5 + fmov b5, c14 +// unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 +// unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 +// unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 +// unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, b5 + fmov b5, c10 + MUL a2, b1, t2 + ADD4 c13, t3, b5 + fmov b5, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, b5 + fmov b5, c14 + MUL a2, b2, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b3, t2 + ADD4 c05, t3, b5 + fmov b5, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c09, t1, b5 + fmov b5, c09 + ADD3 c10, t2, b5 + fmov b5, c10 + ADD4 c13, t3, b5 + fmov b5, c13 + ADD2 c14, t4, b5 + fmov b5, c14 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c09, c14, b5 + fmov b5, c09 + ADD c10, c13, b5 + fmov b5, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, b5 + fmov b5, c09 + ADD5 c10, t2, b5 + fmov b5, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 
+ + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L20: + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L29 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, b5 + fmov b5, c12 +// unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, b5 + fmov b5, c16 +// unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, b5 + fmov b5, c15 +// unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp + +/* 2 */ + ADD1 c01, t1, b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, b5 + fmov b5, c06 +// unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, b5 + fmov b5, c05 +// unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, b5 + fmov b5, c03 +// unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, b5 + fmov b5, c04 +// unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 +// unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 +// unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, b5 + fmov b5, c09 +// unop + MUL b3, a3, t1 + LD a6, 2 * 
SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 +// unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 +// unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 +// unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, b5 + fmov b5, c11 +// unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, b5 + fmov b5, c16 +// unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, b5 + fmov b5, c15 +// unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, b5 + fmov b5, c01 +// unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD2 c06, t3, b5 + fmov b5, c06 +// unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, b5 + fmov b5, c05 +// unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 +// unop + + ADD3 c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 +// unop + + ADD2 c08, t3, b5 + fmov b5, c08 +// unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 +// unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, b5 + fmov b5, c09 +// unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 +// unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 +// unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, b5 + fmov b5, c11 +// unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD2 c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + ADD4 c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, b5 + fmov b5, c03 +// unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, b5 + fmov b5, c04 +// unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 +// unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 +// unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, b5 + fmov b5, c09 +// unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 +// unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 +// unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 +// unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD2 c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL b1, a4, t2 + ADD2 c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + ADD1 c03, t1, b5 + fmov b5, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, b5 + fmov b5, c04 + MUL b3, a2, t2 + ADD2 c08, t3, b5 + fmov b5, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, b5 + fmov b5, c13 + MUL b2, a3, t4 
+ ADD1 c09, t1,b5 + fmov b5, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, b5 + fmov b5, c10 + MUL b3, a4, t2 + ADD2 c14, t3, b5 + fmov b5, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, b5 + fmov b5, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c11, t1, b5 + fmov b5, c11 + ADD3 c12, t2, b5 + fmov b5, c12 + ADD2 c16, t3, b5 + fmov b5, c16 + ADD4 c15, t4, b5 + fmov b5, c15 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c03, c08, b5 + fmov b5, c03 + ADD c04, c07, b5 + fmov b5, c04 + + ADD c09, c14, b5 + fmov b5, c09 + ADD c10, c13, b5 + fmov b5, c10 + ADD c11, c16, b5 + fmov b5, c11 + ADD c12, c15, b5 + fmov b5, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 + + SUB b1, c03, b5 + fmov b5, c03 + SUB b2, c04, b5 + fmov b5, c04 + SUB b3, c11, b5 + fmov b5, c11 + SUB b4, c12, b5 + fmov b5, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c09, b5 + fmov b5, c09 + SUB b2, c10, b5 + fmov b5, c10 + SUB b3, c11, b5 + fmov b5, c11 + SUB b4, c12, b5 + fmov b5, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c09, t3, b5 + fmov b5, c09 + SUB c10, t4, b5 + fmov b5, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + ADD6 c09, t3, b5 + fmov b5, c09 + ADD5 c10, t4, b5 + fmov b5, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 
+ + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c04, t2, b5 + fmov b5, c04 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, b5 + fmov b5, c03 + ADD5 c04, t2, b5 + fmov b5, c04 + ADD6 c11, t3, b5 + fmov b5, c11 + ADD5 c12, t4, b5 + fmov b5, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, b5 + fmov b5, c09 + ADD5 c10, t2, b5 + fmov b5, c10 + ADD6 c11, t3, b5 + fmov b5, c11 + ADD5 c12, t4, b5 + fmov b5, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + ADD6 c03, t3, b5 + fmov b5, c03 + ADD5 c04, t4, b5 + fmov b5, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * 
SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + fclr c01 + fclr c05 + + ldi I, -1(I) + bgt I, $L11 + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C1 + subl C, LDC, C +#else + mov C, C1 + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L50 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 +// unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 +// unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) +// unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 +// unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + 
+$L57: + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b1, t2 + ADD4 c05, t3, b5 + fmov b5, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + ADD3 c02, t2, b5 + fmov b5, c02 + ADD4 c05, t3, b5 + fmov b5, c05 + ADD2 c06, t4, b5 + fmov b5, c06 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + +$L58: +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L50: + sra M, 1, I + ble I, $L59 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, b5 + fmov b5, c05 +// unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, t2 +// unop + + ADD4 c07, t3, b5 + fmov b5, c07 +// unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, b5 + fmov b5, c08 +// unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 +// unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + 
+ ADD3 c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 +// unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 +// unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, b5 + fmov b5, c05 +// unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, b5 + fmov b5, c06 +// unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, b5 + fmov b5, c07 +// unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, b5 + fmov b5, c08 +// unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 +// unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, b5 + fmov b5, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD4 c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, b5 + fmov b5, c08 +// unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 +// unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 +// unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 +// unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD4 c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, b5 + fmov b5, c08 + MUL a4, b1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b2, t2 + ADD1 c03, t3, b5 + fmov b5, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, b5 + fmov b5, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD4 c05, t1, b5 + fmov b5, c05 + ADD2 c06, t2, b5 + fmov b5, c06 + ADD4 c07, t3, b5 + fmov b5, c07 + ADD2 c08, t4, b5 + fmov b5, c08 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c03, c08, b5 + fmov b5, c03 + ADD c04, c07, b5 + fmov b5, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, b5 + fmov 
b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c04, t2, b5 + fmov b5, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, b5 + fmov b5, c03 + ADD5 c04, t2, b5 + fmov b5, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + + ldi I, -1(I) + bgt I, $L41 + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl tmp, 72($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak new file mode 100644 index 0000000..71202d8 --- /dev/null +++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak @@ -0,0 +1,2230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + +#if !defined(SW2B) +#error "Architecture is not specified." +#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) + ldl OFFSET, 24 + STACKSIZE($sp) + + sll LDC, 
ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addl M, M, TMP2 + mull TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + addl TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + + subl C, LDC, C2 + subl C2, LDC, C1 + subl C2, LDC, C +#else + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + 
ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + 
subl KK, 1, KK +#endif + .align 4 + +$L20: + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L29 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: 
+ ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, 
c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * 
SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + fclr c01 + fclr c05 + + ldi I, -1(I) + bgt I, $L11 + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C1 + subl C, LDC, C +#else + mov C, C1 + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L50 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD1 
c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L50: + sra M, 1, I + ble I, $L59 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + 
ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + 
ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + + ldi I, -1(I) + bgt I, $L41 + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S new file mode 100644 index 0000000..bb38b56 --- /dev/null +++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S @@ -0,0 +1,2624 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(SW6) +#error "Architecture is not specified." 
+#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch sw6a + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 88 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define tmp $9 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) + ldl OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + stl tmp, 72($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addl M, M, TMP2 + mull TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + addl TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + + subl C, LDC, C2 + subl C2, LDC, C1 + subl C2, LDC, C +#else + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + 
fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp +/* 2 */ + ADD1 c01, t1, b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, b5 + fmov b5, c11 + unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, b5 + fmov b5, c01 + unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD2 c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + 
.align 4 + +$L15: + ADD1 c11, t1, b5 + fmov b5, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD2 c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + ADD4 c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD2 c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL b1, a4, t2 + ADD2 c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + ADD1 c03, t1, b5 + fmov b5, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, b5 + fmov b5, c04 + MUL b3, a2, t2 + ADD2 c08, t3, b5 + fmov b5, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, b5 + fmov b5, c13 + MUL b2, a3, t4 + ADD1 c09, t1, b5 + fmov b5, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, b5 + fmov b5, c10 + MUL b3, a4, t2 + ADD2 c14, t3, b5 + fmov b5, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, b5 + fmov b5, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c11, t1, b5 + fmov b5, c11 + ADD3 c12, t2, b5 + fmov b5, c12 + ADD2 c16, t3, b5 + fmov b5, c16 + ADD4 c15, t4, b5 + fmov b5, c15 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c03, c08, b5 + fmov b5, c03 + ADD c04, c07, b5 + fmov b5, c04 + + ADD c09, c14, b5 + fmov b5, c09 + ADD c10, c13, b5 + fmov b5, c10 + ADD c11, c16, b5 + fmov b5, c11 + ADD c12, c15, b5 + fmov b5, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 + + SUB b1, c03, b5 + fmov b5, c03 + SUB b2, c04, b5 + fmov b5, c04 + SUB b3, c11, b5 + fmov b5, c11 + SUB b4, c12, b5 + fmov b5, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + 
SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c09, b5 + fmov b5, c09 + SUB b2, c10, b5 + fmov b5, c10 + SUB b3, c11, b5 + fmov b5, c11 + SUB b4, c12, b5 + fmov b5, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c09, t3, b5 + fmov b5, c09 + SUB c10, t4, b5 + fmov b5, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + ADD6 c09, t3, b5 + fmov b5, c09 + ADD5 c10, t4, b5 + fmov b5, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c04, t2, b5 + fmov b5, c04 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, b5 + fmov b5, c03 + ADD5 c04, t2, b5 + fmov b5, c04 + ADD6 c11, t3, b5 + fmov b5, c11 + ADD5 c12, t4, b5 + fmov b5, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, b5 + fmov b5, c09 + ADD5 c10, t2, 
b5 + fmov b5, c10 + ADD6 c11, t3, b5 + fmov b5, c11 + ADD5 c12, t4, b5 + fmov b5, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + ADD6 c03, t3, b5 + fmov b5, c03 + ADD5 c04, t4, b5 + fmov b5, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + fclr c01 + fclr c05 + + ldi I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + 
LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, b5 + fmov b5, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, b5 + fmov b5, c10 + MUL a2, b1, t2 + ADD4 c13, t3, b5 + fmov b5, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, b5 + fmov b5, c14 + MUL a2, b2, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b3, t2 + ADD4 c05, t3, b5 + fmov b5, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c09, t1, b5 + fmov b5, c09 + ADD3 c10, t2, b5 + fmov b5, c10 + ADD4 c13, t3, b5 + fmov b5, c13 + ADD2 c14, t4, b5 + fmov b5, c14 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c09, c14, b5 + fmov b5, c09 + ADD c10, c13, b5 + fmov b5, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) 
+ LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, b5 + fmov b5, c09 + ADD5 c10, t2, b5 + fmov b5, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C1 + subl C, LDC, C +#else + mov C, C1 + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK 
+#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, b5 + fmov b5, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, b5 + fmov b5, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD4 c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD4 c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, b5 + fmov b5, c08 + MUL a4, b1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + 
MUL a1, b2, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b2, t2 + ADD1 c03, t3, b5 + fmov b5, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, b5 + fmov b5, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD4 c05, t1, b5 + fmov b5, c05 + ADD2 c06, t2, b5 + fmov b5, c06 + ADD4 c07, t3, b5 + fmov b5, c07 + ADD2 c08, t4, b5 + fmov b5, c08 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c03, c08, b5 + fmov b5, c03 + ADD c04, c07, b5 + fmov b5, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c04, t2, b5 + fmov b5, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, b5 + fmov b5, c03 + ADD5 c04, t2, b5 + fmov b5, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll 
K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + + ldi I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b1, t2 + ADD4 c05, t3, b5 + fmov b5, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + ADD3 c02, t2, b5 + fmov b5, c02 + ADD4 c05, t3, b5 + fmov b5, c05 + ADD2 c06, t4, b5 + fmov b5, c06 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + +$L58: +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#endif + 
+#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl tmp, 72($sp) + + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak new file mode 100644 index 0000000..f4a2c13 --- /dev/null +++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak @@ -0,0 +1,2222 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(SW2B) +#error "Architecture is not specified." +#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) + ldl OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addl M, M, TMP2 + mull TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + addl TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + + subl C, LDC, C2 + subl C2, LDC, C1 + subl C2, LDC, C +#else + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || 
defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + 
ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, 
c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * 
SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + fclr c01 + fclr c05 + + ldi I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + ldi 
AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C1 + subl C, LDC, C +#else + mov C, C1 + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK 
+#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, 
TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + + ldi I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * 
SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git 
a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S new file mode 100644 index 0000000..97dbc16 --- /dev/null +++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S @@ -0,0 +1,2623 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(SW6) +#error "Architecture is not specified." 
+#endif + +#ifdef SW6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch sw6a + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 88 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define tmp $9 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) + ldl OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + stl tmp, 72($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addl M, M, TMP2 + mull TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + addl TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + and N, 1, J + ble J, $L30 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C1 + subl C, LDC, C +#else + mov C, C1 + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl 
AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, b5 + fmov b5, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, b5 + fmov b5, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, b5 + fmov b5, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, b5 + fmov b5, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, b5 + fmov b5, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, b5 + fmov b5, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD4 c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, b5 + fmov b5, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, b5 + fmov b5, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, b5 + fmov b5, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, b5 + fmov b5, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, b5 + fmov b5, c06 + MUL a2, b1, t2 + ADD4 c07, t3, b5 + fmov b5, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, b5 + fmov b5, c08 + MUL a4, b1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b2, t2 + ADD1 c03, t3, b5 + fmov b5, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, b5 + fmov b5, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD4 c05, t1, b5 + fmov b5, c05 + ADD2 c06, t2, b5 + fmov b5, c06 + ADD4 c07, t3, b5 + fmov b5, c07 + ADD2 c08, t4, b5 + fmov b5, c08 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c03, c08, b5 + fmov b5, c03 + ADD c04, c07, b5 + fmov b5, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll 
TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c04, t2, b5 + fmov b5, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, b5 + fmov b5, c03 + ADD5 c04, t2, b5 + fmov b5, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + + ldi I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * 
SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, b5 + fmov b5, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, b5 + fmov b5, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b1, t2 + ADD4 c05, t3, b5 + fmov b5, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + ADD3 c02, t2, b5 + fmov b5, c02 + ADD4 c05, t3, b5 + fmov b5, c05 + ADD2 c06, t4, b5 + fmov b5, c06 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + +$L58: +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) 
+#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L30: + sra N, 1, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + + subl C, LDC, C2 + subl C2, LDC, C1 + subl C2, LDC, C +#else + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, b5 + fmov b5, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, b5 + fmov b5, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + FIMOVD b5, tmp +/* 2 */ + ADD1 c01, t1, b5 + fmov b5, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, b5 + fmov b5, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL 
b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, b5 + fmov b5, c11 + unop + IFMOVD tmp, b5 + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, b5 + fmov b5, c12 + ldi L, -2(L) + IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, b5 + fmov b5, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, b5 + fmov b5, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, b5 + fmov b5, c01 + unop + IFMOVD tmp, b5 + MUL b5, a6, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + IFMOVD tmp, b5 + MUL b5, a4, t2 + unop + + ADD2 c06, t3, b5 + fmov b5, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, b5 + fmov b5, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, b5 + fmov b5, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, b5 + fmov b5, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, b5 + fmov b5, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD2 c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + ADD4 c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, b5 + fmov b5, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, b5 + fmov b5, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, b5 + fmov b5, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, b5 + fmov b5, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, b5 + fmov b5, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, b5 + fmov b5, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, b5 + fmov b5, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, b5 + fmov b5, c12 + MUL b1, a2, t2 + ADD2 c16, t3, b5 + fmov b5, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, b5 + fmov b5, c15 + MUL b2, a1, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL b1, a4, t2 + ADD2 c06, t3, b5 + fmov b5, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, b5 + fmov b5, c05 + MUL b4, a1, t4 + ADD1 c03, t1, b5 + fmov b5, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, b5 + fmov b5, c04 + MUL b3, a2, t2 + ADD2 c08, t3, b5 + fmov b5, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, b5 + fmov b5, c13 + MUL b2, a3, t4 + ADD1 c09, t1, b5 + fmov b5, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, b5 + fmov b5, c10 + MUL b3, a4, t2 + ADD2 
c14, t3, b5 + fmov b5, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, b5 + fmov b5, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c11, t1, b5 + fmov b5, c11 + ADD3 c12, t2, b5 + fmov b5, c12 + ADD2 c16, t3, b5 + fmov b5, c16 + ADD4 c15, t4, b5 + fmov b5, c15 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c03, c08, b5 + fmov b5, c03 + ADD c04, c07, b5 + fmov b5, c04 + + ADD c09, c14, b5 + fmov b5, c09 + ADD c10, c13, b5 + fmov b5, c10 + ADD c11, c16, b5 + fmov b5, c11 + ADD c12, c15, b5 + fmov b5, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 + + SUB b1, c03, b5 + fmov b5, c03 + SUB b2, c04, b5 + fmov b5, c04 + SUB b3, c11, b5 + fmov b5, c11 + SUB b4, c12, b5 + fmov b5, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c03, b5 + fmov b5, c03 + SUB a4, c04, b5 + fmov b5, c04 + + SUB b1, c09, b5 + fmov b5, c09 + SUB b2, c10, b5 + fmov b5, c10 + SUB b3, c11, b5 + fmov b5, c11 + SUB b4, c12, b5 + fmov b5, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c09, t3, b5 + fmov b5, c09 + SUB c10, t4, b5 + fmov b5, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + ADD6 c09, t3, b5 + fmov b5, c09 + ADD5 c10, t4, b5 + fmov b5, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, 
t3 + MUL a3, c10, t4 + + SUB c03, t1, b5 + fmov b5, c03 + SUB c04, t2, b5 + fmov b5, c04 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, b5 + fmov b5, c03 + ADD5 c04, t2, b5 + fmov b5, c04 + ADD6 c11, t3, b5 + fmov b5, c11 + ADD5 c12, t4, b5 + fmov b5, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c03, t1, b5 + fmov b5, c03 + ADD6 c04, t2, b5 + fmov b5, c04 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + SUB c11, t3, b5 + fmov b5, c11 + SUB c12, t4, b5 + fmov b5, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, b5 + fmov b5, c09 + ADD5 c10, t2, b5 + fmov b5, c10 + ADD6 c11, t3, b5 + fmov b5, c11 + ADD5 c12, t4, b5 + fmov b5, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + MUL a1, c11, b5 + fmov b5, c11 + MUL a1, c12, b5 + fmov b5, c12 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + ADD5 c11, t3, b5 + fmov b5, c11 + ADD6 c12, t4, b5 + fmov b5, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + SUB c03, t3, b5 + fmov b5, c03 + SUB c04, t4, b5 + fmov b5, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + ADD6 c03, t3, b5 + fmov b5, c03 + ADD5 c04, t4, b5 + fmov b5, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c03, b5 + fmov b5, c03 + MUL a1, c04, b5 + fmov b5, c04 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c03, t3, b5 + fmov b5, c03 + ADD6 c04, t4, b5 + fmov b5, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + 
ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + fclr c01 + fclr c05 + + ldi I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + FIMOVD b5, tmp + + ADD1 c09, t1, b5 + fmov b5, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, b5 + fmov b5, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + IFMOVD tmp, b5 + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + IFMOVD tmp, b5 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, b5 + fmov b5, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, b5 + fmov b5, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, b5 + fmov b5, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 
c14, t4, b5 + fmov b5, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, b5 + fmov b5, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, b5 + fmov b5, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, b5 + fmov b5, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, b5 + fmov b5, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, b5 + fmov b5, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, b5 + fmov b5, c10 + MUL a2, b1, t2 + ADD4 c13, t3, b5 + fmov b5, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, b5 + fmov b5, c14 + MUL a2, b2, t4 + ADD1 c01, t1, b5 + fmov b5, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, b5 + fmov b5, c02 + MUL a2, b3, t2 + ADD4 c05, t3, b5 + fmov b5, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, b5 + fmov b5, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c09, t1, b5 + fmov b5, c09 + ADD3 c10, t2, b5 + fmov b5, c10 + ADD4 c13, t3, b5 + fmov b5, c13 + ADD2 c14, t4, b5 + fmov b5, c14 + + ADD c01, c06, b5 + fmov b5, c01 + ADD c02, c05, b5 + fmov b5, c02 + ADD c09, c14, b5 + fmov b5, c09 + ADD c10, c13, b5 + fmov b5, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, b5 + fmov b5, c01 + SUB a2, c02, b5 + fmov b5, c02 + SUB a3, c09, b5 + fmov b5, c09 + SUB a4, c10, b5 + fmov b5, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + ADD5 c09, t3, b5 + fmov b5, c09 + ADD6 c10, t4, b5 + fmov b5, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, b5 + fmov b5, c09 + SUB c10, t2, b5 + fmov b5, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, b5 + fmov b5, c09 + ADD5 c10, t2, b5 + fmov b5, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, b5 + fmov b5, c09 + MUL a1, c10, b5 + fmov b5, c10 + + ADD5 c09, t1, b5 + fmov b5, c09 + ADD6 c10, t2, b5 + fmov b5, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, b5 + fmov b5, c01 + SUB c02, t2, b5 + fmov b5, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + 
ADD6 c01, t1, b5 + fmov b5, c01 + ADD5 c02, t2, b5 + fmov b5, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, b5 + fmov b5, c01 + MUL a1, c02, b5 + fmov b5, c02 + + ADD5 c01, t1, b5 + fmov b5, c01 + ADD6 c02, t2, b5 + fmov b5, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + ldl tmp, 72($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak new file mode 100644 index 0000000..4d4f59d --- /dev/null +++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak @@ -0,0 +1,2223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + +#if !defined(SW2B) +#error "Architecture is not specified." +#endif + +#ifdef SW2B +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + ldi $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ldi $sp, -STACKSIZE($sp) + + ldl B, 0 + STACKSIZE($sp) + ldl C, 8 + STACKSIZE($sp) + ldl LDC, 16 + STACKSIZE($sp) + ldl OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) + fstd $f4, 16($sp) + fstd $f5, 24($sp) + fstd $f6, 32($sp) + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addl M, M, TMP2 + mull TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negl OFFSET, KK +#endif + +#ifdef RT + mull N, K, TMP1 + addl TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mull N, LDC, TMP1 + addl TMP1, C, C + + subl N, OFFSET, KK +#endif + + and N, 1, J + ble J, $L30 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + + subl C, LDC, C1 + subl C, LDC, C +#else + mov C, C1 + addl C, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + 
fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi BO, 2 * SIZE(B) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi BO, 2 * SIZE(BO) + fclr c03 + ldi AO, 4 * SIZE(AO) + fclr c07 + + ldi L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + ldi AO, 4 * SIZE(AO) + MUL a4, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * 
SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + + ldi I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + 
ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + ldi AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subl KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 1, KK +#endif + +#ifdef RT + subl KK, 1, KK +#endif + .align 4 + +$L30: + sra N, 1, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + + subl C, LDC, C2 + subl C2, LDC, C1 + subl C2, LDC, C +#else + mov C, C1 + addl C, LDC, C2 + addl C2, LDC, C +#endif + +#ifdef LN + addl M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD 
a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + ldi BO, 4 * SIZE(B) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + ldi BO, 4 * SIZE(BO) + fclr c11 + ldi AO, 4 * SIZE(AO) + fclr c15 + + fillcs 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + + fillcs 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + ldi L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, 
t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + ldi AO, 4 * SIZE(AO) + MUL b4, a3, t4 + ldi BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + 
ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) + ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * 
SIZE(C2) + +#ifndef LN + ldi C1, 4 * SIZE(C1) + ldi C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + +#ifdef LT + addl KK, 2, KK +#endif + +#ifdef LN + subl KK, 2, KK +#endif + fclr c01 + fclr c05 + + ldi I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + ldi AO, 2 * SIZE(AO) + MUL a2, b4, t4 + ldi BO, 4 
* SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 1, TMP1 +#else + subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) + ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) + ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) + ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT + addl KK, 1, KK +#endif + +#ifdef LN + subl KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addl KK, 2, KK +#endif + +#ifdef RT + subl KK, 2, KK +#endif + + ldi J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git 
a/lapack/laswp/sw_64/Makefile b/lapack/laswp/sw_64/Makefile
new file mode 100644
index 0000000..af1f019
--- /dev/null
+++ b/lapack/laswp/sw_64/Makefile
@@ -0,0 +1,8 @@
+TOPDIR = ../../..
+include ../../../Makefile.system
+
+LASWP = ../generic/laswp_k_1.c
+ZLASWP = ../generic/zlaswp_k_1.c
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index ee4640f..1a5f361 100644
--- a/param.h
+++ b/param.h
@@ -2128,7 +2128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(EV4) || defined(EV5) || defined(EV6)
+#if defined(EV4) || defined(EV5) || defined(SW6)
 
 #ifdef EV4
 #define SNUMOPT 1
@@ -2140,7 +2140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define GEMM_DEFAULT_OFFSET_A 512
 #define GEMM_DEFAULT_OFFSET_B 512
-#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+//#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
 
 #define SGEMM_DEFAULT_UNROLL_M 4
 #define SGEMM_DEFAULT_UNROLL_N 4
@@ -2185,7 +2186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_Q 64
 #endif
 
-#ifdef EV6
+#ifdef SW6
 
 #define SGEMM_DEFAULT_P 256
 #define SGEMM_DEFAULT_Q 512
-- 
2.31.1
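
A note on the GEMM_DEFAULT_ALIGN change in the param.h hunk above: the value the patch keeps, 0x0ffff, is an alignment mask (64 KB - 1), and the usual way such a mask is consumed is the add-then-clear idiom shown in the minimal C sketch below. The sketch is illustrative only; it is not code from this patch or from the library's buffer-partitioning logic, and the helper name align_up is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Same mask value the patch keeps in param.h: 64 KB - 1. */
#define GEMM_DEFAULT_ALIGN 0x0ffffUL

/* Hypothetical helper: round an address up to the next 64 KB boundary
   by adding the mask and clearing the low 16 bits. */
static void *align_up(void *p)
{
    uintptr_t addr = (uintptr_t)p;
    return (void *)((addr + GEMM_DEFAULT_ALIGN) & ~(uintptr_t)GEMM_DEFAULT_ALIGN);
}

int main(void)
{
    static char scratch[3 * 65536];
    printf("raw %p -> aligned %p\n", (void *)scratch, align_up(scratch));
    return 0;
}

The idiom needs only the numeric value of the mask, so it behaves the same with or without the (BLASLONG) cast that the patch drops from the definition.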