diff --git a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp index 7080ea10d..62a8ab7bd 100644 --- a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp @@ -919,6 +919,126 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { } } +void LIRGenerator::do_dgemm_dgemm(Intrinsic* x) { + assert(x->number_of_arguments() == 16, "wrong type"); + + LIRItem ta(x->argument_at(0), this); + LIRItem tb(x->argument_at(1), this); + LIRItem m(x->argument_at(2), this); + LIRItem n(x->argument_at(3), this); + LIRItem k(x->argument_at(4), this); + LIRItem alpha(x->argument_at(5), this); + LIRItem a(x->argument_at(6), this); + LIRItem a_offset(x->argument_at(7), this); + LIRItem lda(x->argument_at(8), this); + LIRItem b(x->argument_at(9), this); + LIRItem b_offset(x->argument_at(10), this); + LIRItem ldb(x->argument_at(11), this); + LIRItem beta(x->argument_at(12), this); + LIRItem c(x->argument_at(13), this); + LIRItem c_offset(x->argument_at(14), this); + LIRItem ldc(x->argument_at(15), this); + + ta.load_item(); + tb.load_item(); + m.load_item(); + n.load_item(); + k.load_item(); + alpha.load_item(); + a.load_item(); + a_offset.load_nonconstant(); + lda.load_item(); + b.load_item(); + b_offset.load_nonconstant(); + ldb.load_item(); + beta.load_item(); + c.load_item(); + c_offset.load_nonconstant(); + ldc.load_item(); + + LIR_Opr ta_base = ta.result(); + LIR_Opr tb_base = tb.result(); + LIR_Opr r_m = m.result(); + LIR_Opr r_n = n.result(); + LIR_Opr r_k = k.result(); + LIR_Opr r_alpha = alpha.result(); + LIR_Opr a_base = a.result(); + LIR_Opr r_a_offset = a_offset.result(); + LIR_Opr r_lda = lda.result(); + LIR_Opr b_base = b.result(); + LIR_Opr r_b_offset = b_offset.result(); + LIR_Opr r_ldb = ldb.result(); + LIR_Opr r_beta = beta.result(); + LIR_Opr c_base = c.result(); + LIR_Opr r_c_offset = c_offset.result(); + LIR_Opr r_ldc = ldc.result(); + + LIR_Opr 
ta_value = load_String_value(ta_base); + LIR_Opr ta_offset = load_String_offset(ta_base); + LIR_Opr tb_value = load_String_value(tb_base); + LIR_Opr tb_offset = load_String_offset(tb_base); + + LIR_Address* addr_ta = emit_array_address(ta_value, ta_offset, T_CHAR, false); + LIR_Address* addr_tb = emit_array_address(tb_value, tb_offset, T_CHAR, false); + LIR_Address* addr_a = emit_array_address(a_base, r_a_offset, T_DOUBLE, false); + LIR_Address* addr_b = emit_array_address(b_base, r_b_offset, T_DOUBLE, false); + LIR_Address* addr_c = emit_array_address(c_base, r_c_offset, T_DOUBLE, false); + + LIR_Opr tmp = new_pointer_register(); + LIR_Opr ta_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_ta), tmp); + __ move(tmp, ta_addr); + tmp = new_pointer_register(); + LIR_Opr tb_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_tb), tmp); + __ move(tmp, tb_addr); + tmp = new_pointer_register(); + LIR_Opr a_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_a), tmp); + __ move(tmp, a_addr); + tmp = new_pointer_register(); + LIR_Opr b_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_b), tmp); + __ move(tmp, b_addr); + tmp = new_pointer_register(); + LIR_Opr c_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_c), tmp); + __ move(tmp, c_addr); + + BasicTypeList signature(13); + signature.append(T_ADDRESS); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_INT); + signature.append(T_INT); + signature.append(T_DOUBLE); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_DOUBLE); + signature.append(T_ADDRESS); + signature.append(T_INT); + + LIR_OprList* args = new LIR_OprList(); + args->append(ta_addr); + args->append(tb_addr); + args->append(r_m); + args->append(r_n); + args->append(r_k); + args->append(r_alpha); + args->append(a_addr); + args->append(r_lda); + 
args->append(b_addr); + args->append(r_ldb); + args->append(r_beta); + args->append(c_addr); + args->append(r_ldc); + + assert(StubRoutines::dgemmDgemm() != NULL, "invalid stub entry"); + call_runtime(&signature, args, StubRoutines::dgemmDgemm(), voidType, NULL); + set_no_result(x); +} void LIRGenerator::do_ArrayCopy(Intrinsic* x) { assert(x->number_of_arguments() == 5, "wrong type"); @@ -1038,6 +1158,114 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) { } } +void LIRGenerator::do_dgemv_dgemv(Intrinsic* x) { + assert(x->number_of_arguments() == 14, "wrong type"); + + LIRItem trans(x->argument_at(0), this); + LIRItem m(x->argument_at(1), this); + LIRItem n(x->argument_at(2), this); + LIRItem alpha(x->argument_at(3), this); + LIRItem array_a(x->argument_at(4), this); + LIRItem array_a_offset(x->argument_at(5), this); + LIRItem lda(x->argument_at(6), this); + LIRItem array_x(x->argument_at(7), this); + LIRItem array_x_offset(x->argument_at(8), this); + LIRItem incx(x->argument_at(9), this); + LIRItem beta(x->argument_at(10), this); + LIRItem array_y(x->argument_at(11), this); + LIRItem array_y_offset(x->argument_at(12), this); + LIRItem incy(x->argument_at(13), this); + + trans.load_item(); + m.load_item(); + n.load_item(); + alpha.load_item(); + array_a.load_item(); + array_a_offset.load_nonconstant(); + lda.load_item(); + array_x.load_item(); + array_x_offset.load_nonconstant(); + incx.load_item(); + beta.load_item(); + array_y.load_item(); + array_y_offset.load_nonconstant(); + incy.load_item(); + + LIR_Opr res_trans_base = trans.result(); + LIR_Opr res_m = m.result(); + LIR_Opr res_n = n.result(); + LIR_Opr res_alpha = alpha.result(); + LIR_Opr res_a_base = array_a.result(); + LIR_Opr res_a_offset = array_a_offset.result(); + LIR_Opr res_lda = lda.result(); + LIR_Opr res_x_base = array_x.result(); + LIR_Opr res_x_offset = array_x_offset.result(); + LIR_Opr res_incx = incx.result(); + LIR_Opr res_beta = beta.result(); + LIR_Opr res_y_base = array_y.result(); + 
LIR_Opr res_y_offset = array_y_offset.result(); + LIR_Opr res_incy = incy.result(); + + LIR_Opr addr_trans_base = LIRGenerator::load_String_value(res_trans_base); + LIR_Opr addr_trans_offset = LIRGenerator::load_String_offset(res_trans_base); + LIR_Address* addr_trans = emit_array_address(addr_trans_base, addr_trans_offset, T_CHAR, false); + + LIR_Address* addr_a = emit_array_address(res_a_base, res_a_offset, T_DOUBLE, false); + LIR_Address* addr_x = emit_array_address(res_x_base, res_x_offset, T_DOUBLE, false); + LIR_Address* addr_y = emit_array_address(res_y_base, res_y_offset, T_DOUBLE, false); + + // load addr to register + LIR_Opr tmp = new_pointer_register(); + LIR_Opr trans_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_trans), tmp); + __ move(tmp, trans_addr); + + LIR_Opr tmp1 = new_pointer_register(); + LIR_Opr a_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_a), tmp1); + __ move(tmp1, a_addr); + + LIR_Opr tmp2 = new_pointer_register(); + LIR_Opr x_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_x), tmp2); + __ move(tmp2, x_addr); + + LIR_Opr tmp3 = new_pointer_register(); + LIR_Opr y_addr = new_register(T_ADDRESS); + __ leal(LIR_OprFact::address(addr_y), tmp3); + __ move(tmp3, y_addr); + + BasicTypeList signature(11); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_INT); + signature.append(T_DOUBLE); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_DOUBLE); + signature.append(T_ADDRESS); + signature.append(T_INT); + + LIR_OprList* args = new LIR_OprList(); + args->append(trans_addr); + args->append(res_m); + args->append(res_n); + args->append(res_alpha); + args->append(a_addr); + args->append(res_lda); + args->append(x_addr); + args->append(res_incx); + args->append(res_beta); + args->append(y_addr); + args->append(res_incy); + + assert(StubRoutines::dgemvDgemv() != NULL, 
"invalid stub entry"); + call_runtime(&signature, args, StubRoutines::dgemvDgemv(), voidType, NULL); + set_no_result(x); +} + // _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f // _i2b, _i2c, _i2s void LIRGenerator::do_Convert(Convert* x) { diff --git a/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp index c0aaa1de4..a275a6a99 100644 --- a/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp @@ -50,6 +50,11 @@ void generate_transcendental_entry(AbstractInterpreter::MethodKind kind, int fpa address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind); void lock_method(void); void generate_stack_overflow_check(void); + void load_String_value(Register src, Register dst); + void load_String_offset(Register src, Register dst); + void emit_array_address(Register src, Register idx, Register dst, BasicType type); + address generate_Dgemm_dgemm_entry(); + address generate_Dgemv_dgemv_entry(); void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue); void generate_counter_overflow(Label* do_continue); diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp index c5ec637a1..125983179 100644 --- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp @@ -3221,6 +3221,44 @@ class StubGenerator: public StubCodeGenerator { return start; } + address load_BLAS_library() { + // Try to load BLAS library. + const char library_name[] = "openblas"; + char err_buf[1024] = {0}; + char path[JVM_MAXPATHLEN] = {0}; + os::jvm_path(path, sizeof(path)); + int jvm_offset = -1; + + // Match "jvm[^/]*" in jvm_path. + const char* last_name = strrchr(path, '/'); + last_name = last_name ? 
last_name : path; + const char* last_lib_name = strstr(last_name, "jvm"); + if (last_lib_name != NULL) { + jvm_offset = last_lib_name - path; + } + + address library = NULL; + // Find the BLAS shared library: overwrite the "jvm..." tail of the JVM library path with "openblas" plus the platform extension. + // Search path: same directory as the JVM library, presumably <java_home>/jre/lib/<arch>/<vm>/libopenblas.so + if (jvm_offset >= 0) { + if (jvm_offset + strlen(library_name) + strlen(os::dll_file_extension()) < JVM_MAXPATHLEN) { + strncpy(&path[jvm_offset], library_name, JVM_MAXPATHLEN - jvm_offset); + strncat(path, os::dll_file_extension(), JVM_MAXPATHLEN - strlen(path) - 1); + library = (address)os::dll_load(path, err_buf, sizeof(err_buf)); + } + } + return library; + } + + address get_BLAS_func_entry(address library, const char* func_name) { + if (library == NULL) { + return NULL; + } + + // Try to find BLAS function entry. + return (address)os::dll_lookup((void*)library, func_name); + } + /** * Arguments: * @@ -3254,6 +3292,218 @@ class StubGenerator: public StubCodeGenerator { return start; } + // Parameter conversion from JVM to native BLAS + // + // Register: + // r0: transa r0: transa + // r1: transb r1: transb + // r2: m r2: &m + // r3: n r3: &n + // r4: k =========> r4: &k + // r5: A r5: &alpha + // r6: lda r6: A + // r7: B r7: &lda + // v0: alpha + // v1: beta + // + // Stack: + // |-------| |-------| + // | ldc | | ldc | + // |-------| |-------| + // | C | | C | + // |-------| |-------| + // | ldb | | ldb | + // |-------| <-- sp |-------| + // | | | m | + // |-------| |-------| + // | | | n | + // |-------| |-------| + // | | | k | + // |-------| |-------| + // | | | lda | + // |-------| |-------| + // | | | alpha | + // |-------| |-------| + // | | | beta | + // |-------| =========> |-------| + // | | | lr | + // |-------| |-------| + // | | | rfp | + // |-------| |-------| <-- fp + // | ... | | ... 
| + // |-------| |-------| + // | | | &ldc | + // |-------| |-------| + // | | | C | + // |-------| |-------| + // | | | &beta | + // |-------| |-------| + // | | | &ldb | + // |-------| |-------| + // | | | B | + // |-------| |-------| <-- sp + address generate_dgemmDgemm(address library) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "dgemm_dgemm"); + + address fn = get_BLAS_func_entry(library, "dgemm_"); + if (fn == NULL) return NULL; + + address start = __ pc(); + + const Register transa = c_rarg0; + const Register transb = c_rarg1; + const Register m = c_rarg2; + const Register n = c_rarg3; + const Register k = c_rarg4; + const FloatRegister alpha = c_farg0; + const Register A = c_rarg5; + const Register lda = c_rarg6; + const Register B = c_rarg7; + const FloatRegister beta = c_farg1; + + BLOCK_COMMENT("Entry:"); + + // extend stack + __ sub(sp, sp, 0x60); + __ stp(rfp, lr, Address(sp, 48)); + __ add(rfp, sp, 0x30); + // load BLAS function entry + __ mov(rscratch1, fn); + // C + __ ldr(rscratch2, Address(rfp, 56)); + // store m / n to stack + __ stpw(n, m, Address(rfp, 40)); + // &beta + __ add(r2, rfp, 0x10); + // store k / lda to stack + __ stpw(lda, k, Address(rfp, 32)); + // &ldc (address of the incoming ldc stack slot) + __ add(r3, rfp, 0x40); + // store &beta / C + __ stp(r2, rscratch2, Address(sp, 16)); + // &ldb + __ add(r2, rfp, 0x30); + // store B + __ str(B, Address(sp)); + // move A from r5 to r6 + __ mov(r6, A); + // store &ldc + __ str(r3, Address(sp, 32)); + // &alpha + __ add(r5, rfp, 0x18); + // store &ldb + __ str(r2, Address(sp, 8)); + // &k + __ add(r4, rfp, 0x24); + // store alpha / beta + __ stpd(beta, alpha, Address(rfp, 16)); + // load &lda to r7 + __ add(r7, rfp, 0x20); + // load &n + __ add(r3, rfp, 0x28); + // load &m + __ add(r2, rfp, 0x2c); + // call dgemm + __ blr(rscratch1); + + // restore rfp and lr + __ ldp(rfp, lr, Address(sp, 48)); + // exit stack + __ add(sp, sp, 0x60); + __ ret(lr); + + return start; + } + + /** + * public void 
dgemv(String trans, int m, int n, + * double alpha, double[] a, int lda, + * double[] x, int incx, + * double beta, double[] y, int incy) + * + * Arguments: + * + * Inputs: + * c_rarg0 - char* trans + * c_rarg1 - int m + * c_rarg2 - int n + * d0/c_farg0 - double alpha + * c_rarg3 - double[] a + * c_rarg4 - int lda + * c_rarg5 - double[] x + * c_rarg6 - int incx + * d1/c_farg1 - double beta + * c_rarg7 - double[] y + * [sp] - int incy + * + * Output: + * null + * + */ + + address generate_dgemvDgemv(address library) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "dgemv_dgemv"); + + address fn = get_BLAS_func_entry(library, "dgemv_"); + if (fn == NULL) return NULL; + + address start = __ pc(); + BLOCK_COMMENT("Entry: "); + + Register trans = c_rarg0; + Register m = c_rarg1; + Register n = c_rarg2; + Register a = c_rarg3; + Register lda = c_rarg4; + Register x = c_rarg5; + Register incx = c_rarg6; + Register y = c_rarg7; + + FloatRegister alpha = c_farg0; + FloatRegister beta = c_farg1; + + __ sub(sp, sp, 0x50); + __ stp(rfp, lr, Address(sp, 32)); + __ add(rfp, sp, 0x20); + + // no need for saving trans to tmp register, keep it in register x0 + __ strw(m, Address(rfp, 44)); + __ strw(n, Address(rfp, 40)); + __ strd(alpha, Address(rfp, 32)); + __ strw(lda, Address(rfp, 28)); + __ strw(incx, Address(rfp, 24)); + __ strd(beta, Address(rfp, 16)); + + // pre call + // load incy and push on stack, order incy --> y --> beta + __ add(r1, rfp, 0x30); + __ str(r1, Address(sp, 16)); + __ str(y, Address(sp, 8)); + __ add(r1, rfp, 0x10); + __ str(r1, Address(sp)); + + __ add(r7, rfp, 0x18); + __ mov(r6, x); + __ add(r5, rfp, 0x1c); + __ mov(r4, a); + __ add(r3, rfp, 0x20); + __ add(r2, rfp, 0x28); + __ add(r1, rfp, 0x2c); + + __ mov(rscratch1, fn); + __ blr(rscratch1); + + __ ldp(rfp, lr, Address(sp, 32)); + __ add(sp, sp, 0x50); + __ ret(lr); + + return start; + } + + + /** * Arguments: * @@ -4252,6 +4502,14 @@ class StubGenerator: public 
StubCodeGenerator { StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + + if (UseF2jBLASIntrinsics) { + StubRoutines::_BLAS_library = load_BLAS_library(); + // F2jBLAS intrinsics will use the implementations in the BLAS dynamic library + StubRoutines::_ddotF2jBLAS = generate_ddotF2jBLAS(); + StubRoutines::_dgemmDgemm = generate_dgemmDgemm(StubRoutines::_BLAS_library); + StubRoutines::_dgemvDgemv = generate_dgemvDgemv(StubRoutines::_BLAS_library); + } } void generate_all() { @@ -4296,10 +4554,6 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_montgomerySquare = g.generate_multiply(); } - if (UseF2jBLASIntrinsics) { - StubRoutines::_ddotF2jBLAS = generate_ddotF2jBLAS(); - } - if (UseAESIntrinsics) { StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); diff --git a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp index ae5cb3f32..924b6670f 100644 --- a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp @@ -856,6 +856,250 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret return generate_native_entry(false); } +// Access the char-array of String +void InterpreterGenerator::load_String_value(Register src, Register dst) { + // Need to cooperate with JDK-8243996 + int value_offset = java_lang_String::value_offset_in_bytes(); + + // Fold the field offset into the address so that src keeps pointing at the String. + __ load_heap_oop(dst, Address(src, value_offset)); +} + +void InterpreterGenerator::load_String_offset(Register src, Register dst) { + __ mov(dst, 0); + + // Get String value offset, because of order of initialization for Interpreter, + // we have to hardcode the offset for String value. 
(JDK-8243996) + if (java_lang_String::has_offset_field()) { + int offset_offset = java_lang_String::offset_offset_in_bytes(); + // Leave src untouched: it must still be the String itself, not the address of one of its fields. + __ ldrw(dst, Address(src, offset_offset)); + } +} + +void InterpreterGenerator::emit_array_address(Register src, Register idx, + Register dst, BasicType type) { + int offset_in_bytes = arrayOopDesc::base_offset_in_bytes(type); + int elem_size = type2aelembytes(type); + int shift = exact_log2(elem_size); + + __ lsl(idx, idx, shift); + __ add(idx, idx, offset_in_bytes); + __ add(dst, src, idx); +} + +/** + * Stub Arguments: + * + * c_rarg0 - char* transa + * c_rarg1 - char* transb + * c_rarg2 - int m + * c_rarg3 - int n + * c_rarg4 - int k + * d0 - double alpha + * c_rarg5 - double[] A + * c_rarg6 - int lda + * c_rarg7 - double[] B + * d1 - double beta + * [sp + 16] - int ldc + * [sp + 8] - double[] C + * [sp] - int ldb + * + */ +address InterpreterGenerator::generate_Dgemm_dgemm_entry() { + if (!UseF2jBLASIntrinsics || (StubRoutines::dgemmDgemm() == NULL)) return NULL; + + address entry = __ pc(); + + // r13: senderSP must be preserved for slow path + + // Arguments are reversed on java expression stack + const Register ta = c_rarg0; + const Register tb = c_rarg1; + const Register m = c_rarg2; + const Register n = c_rarg3; + const Register k = c_rarg4; + const FloatRegister alpha = c_farg0; + const Register A = c_rarg5; + const Register lda = c_rarg6; + const Register B = c_rarg7; + const FloatRegister beta = c_farg1; + const Register tmp1 = rscratch1; + const Register tmp2 = rscratch2; + + // transa + __ ldr(ta, Address(esp, 17 * wordSize)); + load_String_value(ta, tmp1); + load_String_offset(ta, tmp2); + emit_array_address(tmp1, tmp2, ta, T_CHAR); + // transb + __ ldr(tb, Address(esp, 16 * wordSize)); + load_String_value(tb, tmp1); + load_String_offset(tb, tmp2); + emit_array_address(tmp1, tmp2, tb, T_CHAR); + // m, n, k + __ ldrw(m, Address(esp, 15 * wordSize)); + __ ldrw(n, Address(esp, 14 * wordSize)); + __ ldrw(k, 
Address(esp, 13 * wordSize)); + // alpha + __ ldrd(alpha, Address(esp, 11 * wordSize)); + // A + __ ldr(tmp1, Address(esp, 10 * wordSize)); + __ mov(tmp2, 0); + __ ldrw(tmp2, Address(esp, 9 * wordSize)); + emit_array_address(tmp1, tmp2, A, T_DOUBLE); + // lda + __ ldrw(lda, Address(esp, 8 * wordSize)); + // B + __ ldr(tmp1, Address(esp, 7 * wordSize)); + __ ldrw(tmp2, Address(esp, 6 * wordSize)); + emit_array_address(tmp1, tmp2, B, T_DOUBLE); + // beta + __ ldrd(beta, Address(esp, 3 * wordSize)); + // Start pushing arguments to machine stack. NOTE(review): the stores below write beneath sp before sp is lowered -- confirm no signal can clobber them. + // + // Remove the incoming args, peeling the machine SP back to where it + // was in the caller. This is not strictly necessary, but unless we + // do so the stack frame may have a garbage FP; this ensures a + // correct call stack that we can always unwind. The ANDR should be + // unnecessary because the sender SP in r13 is always aligned, but + // it doesn't hurt. + __ andr(sp, r13, -16); + __ str(lr, Address(sp, -wordSize)); + // ldc + __ ldrw(tmp1, Address(esp, 0x0)); + __ strw(tmp1, Address(sp, 2 * -wordSize)); + // C + __ ldr(tmp1, Address(esp, 2 * wordSize)); + __ ldrw(tmp2, Address(esp, wordSize)); + emit_array_address(tmp1, tmp2, tmp1, T_DOUBLE); + __ str(tmp1, Address(sp, 3 * -wordSize)); + // ldb + __ ldrw(tmp2, Address(esp, 5 * wordSize)); + __ strw(tmp2, Address(sp, 4 * -wordSize)); + + // Call function + __ add(sp, sp, 4 * -wordSize); + address fn = CAST_FROM_FN_PTR(address, StubRoutines::dgemmDgemm()); + __ mov(tmp1, fn); + __ blr(tmp1); + + __ ldr(lr, Address(sp, 3 * wordSize)); + // For assert(Rd != sp || imm % 16 == 0) + __ add(sp, sp, 4 * wordSize); + __ br(lr); + + return entry; +} + +address InterpreterGenerator::generate_Dgemv_dgemv_entry() { + if (!UseF2jBLASIntrinsics || (StubRoutines::dgemvDgemv() == NULL)) return NULL; + address entry = __ pc(); + + const Register trans = c_rarg0; // trans + const Register m = c_rarg1; // m + const Register n = c_rarg2; // n + const Register a = c_rarg3; // array a addr + const Register lda 
= c_rarg4; // lda + const Register x = c_rarg5; // array x addr + const Register incx = c_rarg6; // incx + const Register y = c_rarg7; // array y addr + + const FloatRegister alpha = v0; // alpha + const FloatRegister beta = v1; // beta + + const Register tmp1 = rscratch1; + const Register tmp2 = rscratch2; + + // esp: expression stack of caller + // dgemv parameter ---> the position in stack ---> move to register + // | char* trans | | esp + 15 | | r0 | + // | int m | | esp + 14 | | r1 | + // | int n | | esp + 13 | | r2 | + // | double alpha | | esp + 11 | | v0 | + // ---------------- ------------ -------- + // | double* a | | esp + 10 | | | + // | | | | | r3 | + // | int a_offset | | esp + 9 | | | + // ---------------- ------------ -------- + // | int lda | | esp + 8 | | r4 | + // ---------------- ------------ -------- + // | double* x | | esp + 7 | | | + // | | | | | r5 | + // | int x_offset | | esp + 6 | | | + // ---------------- ------------ -------- + // | int incx | | esp + 5 | | r6 | + // | double beta | | esp + 3 | | v1 | + // ---------------- ------------ -------- + // | double* y | | esp + 2 | | | + // | | | | | r7 | + // | int y_offset | | esp + 1 | | | + // ---------------- ------------ -------- + // | int incy | | esp | | [sp] | + + + // trans + __ ldr(trans, Address(esp, 15 * wordSize)); + load_String_value(trans, tmp1); + load_String_offset(trans, tmp2); + emit_array_address(tmp1, tmp2, trans, T_CHAR); + // m, n + __ ldrw(m, Address(esp, 14 * wordSize)); + __ ldrw(n, Address(esp, 13 * wordSize)); + + // alpha + __ ldrd(alpha, Address(esp, 11 * wordSize)); + + // a + __ ldr(tmp1, Address(esp, 10 * wordSize)); + __ mov(tmp2, zr); + __ ldrw(tmp2, Address(esp, 9 * wordSize)); + emit_array_address(tmp1, tmp2, a, T_DOUBLE); + + // lda + __ ldrw(lda, Address(esp, 8 * wordSize)); + + // x + __ ldr(tmp1, Address(esp, 7 * wordSize)); + __ mov(tmp2, zr); + __ ldrw(tmp2, Address(esp, 6 * wordSize)); + emit_array_address(tmp1, tmp2, x, T_DOUBLE); + + // incx + 
__ ldrw(incx, Address(esp, 5 * wordSize)); + + // beta + __ ldrd(beta, Address(esp, 3 * wordSize)); + + // y + __ ldr(tmp1, Address(esp, 2 * wordSize)); + __ mov(tmp2, zr); + __ ldrw(tmp2, Address(esp, wordSize)); + emit_array_address(tmp1, tmp2, y, T_DOUBLE); + + // resume sp, restore lr + __ andr(sp, r13, -16); + __ str(lr, Address(sp, -wordSize)); + + // incy, push on stack + __ ldrw(tmp1, Address(esp, 0)); + __ strw(tmp1, Address(sp, 2 * -wordSize)); + + __ add(sp, sp, -2 * wordSize); + + // call function + address fn = CAST_FROM_FN_PTR(address, StubRoutines::dgemvDgemv()); + __ mov(tmp1, fn); + __ blr(tmp1); + + // resume lr + __ ldr(lr, Address(sp, wordSize)); + __ add(sp, sp, 2 * wordSize); + __ br(lr); + + return entry; +} + void InterpreterGenerator::bang_stack_shadow_pages(bool native_call) { // Bang each page in the shadow zone. We can't assume it's been done for // an interpreter frame with greater than a page of locals, so each page @@ -1575,6 +1819,10 @@ address AbstractInterpreterGenerator::generate_method_entry( : // fall thru case Interpreter::java_util_zip_CRC32_updateByteBuffer : entry_point = ((InterpreterGenerator*)this)->generate_CRC32_updateBytes_entry(kind); break; + case Interpreter::org_netlib_blas_Dgemm_dgemm + : entry_point = ((InterpreterGenerator*)this)->generate_Dgemm_dgemm_entry(); break; + case Interpreter::org_netlib_blas_Dgemv_dgemv + : entry_point = ((InterpreterGenerator*)this)->generate_Dgemv_dgemv_entry(); break; default : ShouldNotReachHere(); break; } diff --git a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp index f1160792a..477c6e550 100644 --- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp +++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp @@ -754,6 +754,13 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { } } +void LIRGenerator::do_dgemm_dgemm(Intrinsic* x) { + fatal("BLAS intrinsics are not implemented on this platform!"); +} + +void 
LIRGenerator::do_dgemv_dgemv(Intrinsic* x) { + fatal("BLAS intrinsics are not implemented on this platform!"); +} void LIRGenerator::do_ArrayCopy(Intrinsic* x) { assert(x->number_of_arguments() == 5, "wrong type"); diff --git a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp index dd23f005b..d1ecbaeb4 100644 --- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp +++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp @@ -896,6 +896,13 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { } } +void LIRGenerator::do_dgemm_dgemm(Intrinsic* x) { + fatal("BLAS intrinsics are not implemented on this platform!"); +} + +void LIRGenerator::do_dgemv_dgemv(Intrinsic *x) { + fatal("Blas intrinsics are not implemented on this platform!"); +} void LIRGenerator::do_ArrayCopy(Intrinsic* x) { assert(x->number_of_arguments() == 5, "wrong type"); diff --git a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp index 459315cb7..79b2b2bb1 100644 --- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp +++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp @@ -3672,6 +3672,20 @@ bool GraphBuilder::try_inline_intrinsics(ciMethod* callee) { case vmIntrinsics::_fullFence : break; + case vmIntrinsics::_dgemm_dgemm: + if (!UseF2jBLASIntrinsics || (StubRoutines::dgemmDgemm() == NULL)) { + return false; + } + cantrap = false; + preserves_state = true; + break; + + case vmIntrinsics::_dgemv_dgemv: + if (!UseF2jBLASIntrinsics || (StubRoutines::dgemvDgemv() == NULL)) return false; + cantrap = false; + preserves_state = true; + break; + default : return false; // do not inline } // create intrinsic node diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp index 65c04e3e5..070fd8052 100644 --- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp +++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp @@ -1208,7 +1208,7 @@ void LIRGenerator::do_Return(Return* x) { set_no_result(x); } -// 
Examble: ref.get() +// Example: ref.get() // Combination of LoadField and g1 pre-write barrier void LIRGenerator::do_Reference_get(Intrinsic* x) { @@ -1220,7 +1220,7 @@ void LIRGenerator::do_Reference_get(Intrinsic* x) { LIRItem reference(x->argument_at(0), this); reference.load_item(); - // need to perform the null check on the reference objecy + // need to perform the null check on the reference object CodeEmitInfo* info = NULL; if (x->needs_null_check()) { info = state_for(x); @@ -1422,6 +1422,35 @@ LIR_Opr LIRGenerator::load_constant(LIR_Const* c) { return result; } +// Access the char-array of String +LIR_Opr LIRGenerator::load_String_value(LIR_Opr str) { + int value_offset = java_lang_String::value_offset_in_bytes(); + LIR_Opr value = new_register(T_ARRAY); + LIR_Opr tmp = new_pointer_register(); + + __ add(str, LIR_OprFact::intConst(value_offset), tmp); + LIR_Address* array_addr = new LIR_Address(tmp, T_ARRAY); + __ load(array_addr, value); + + return value; +} + +LIR_Opr LIRGenerator::load_String_offset(LIR_Opr str) { + LIR_Opr offset = new_register(T_INT); + + if (java_lang_String::has_offset_field()) { + LIR_Opr tmp = new_pointer_register(); + int offset_offset = java_lang_String::offset_offset_in_bytes(); + __ add(str, LIR_OprFact::intConst(offset_offset), tmp); + LIR_Address* addr = new LIR_Address(tmp, T_INT); + __ load(addr, offset); + } else { + offset = LIR_OprFact::intConst(0); + } + + return offset; +} + // Various barriers void LIRGenerator::pre_barrier(LIR_Opr addr_opr, LIR_Opr pre_val, @@ -3290,6 +3328,14 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) { do_update_CRC32(x); break; + case vmIntrinsics::_dgemm_dgemm: + do_dgemm_dgemm(x); + break; + + case vmIntrinsics::_dgemv_dgemv: + do_dgemv_dgemv(x); + break; + default: ShouldNotReachHere(); break; } } diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp index 24d072b36..57d675c5b 100644 --- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp +++ 
b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp @@ -210,6 +210,10 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure { // Given an immediate value, return an operand usable in logical ops. LIR_Opr load_immediate(int x, BasicType type); + // Get String value and offset + LIR_Opr load_String_value(LIR_Opr str); + LIR_Opr load_String_offset(LIR_Opr str); + void set_result(Value x, LIR_Opr opr) { assert(opr->is_valid(), "must set to valid value"); assert(x->operand()->is_illegal(), "operand should never change"); @@ -251,6 +255,8 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure { void do_FPIntrinsics(Intrinsic* x); void do_Reference_get(Intrinsic* x); void do_update_CRC32(Intrinsic* x); + void do_dgemm_dgemm(Intrinsic* x); + void do_dgemv_dgemv(Intrinsic* x); void do_UnsafePrefetch(UnsafePrefetch* x, bool is_store); diff --git a/hotspot/src/share/vm/c1/c1_Runtime1.cpp b/hotspot/src/share/vm/c1/c1_Runtime1.cpp index f379a0395..3ece7f6ea 100644 --- a/hotspot/src/share/vm/c1/c1_Runtime1.cpp +++ b/hotspot/src/share/vm/c1/c1_Runtime1.cpp @@ -305,6 +305,8 @@ const char* Runtime1::name_for_address(address entry) { FUNCTION_CASE(entry, JFR_TIME_FUNCTION); #endif FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32()); + FUNCTION_CASE(entry, StubRoutines::dgemmDgemm()); + FUNCTION_CASE(entry, StubRoutines::dgemvDgemv()); #undef FUNCTION_CASE diff --git a/hotspot/src/share/vm/classfile/vmSymbols.cpp b/hotspot/src/share/vm/classfile/vmSymbols.cpp index a5f89dbf8..34514022a 100644 --- a/hotspot/src/share/vm/classfile/vmSymbols.cpp +++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp @@ -333,6 +333,8 @@ bool vmIntrinsics::should_be_pinned(vmIntrinsics::ID id) { #endif case vmIntrinsics::_currentTimeMillis: case vmIntrinsics::_nanoTime: + case vmIntrinsics::_dgemm_dgemm: + case vmIntrinsics::_dgemv_dgemv: return true; default: return false; diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp 
index 6bd8dbedd..942d172a1 100644 --- a/hotspot/src/share/vm/classfile/vmSymbols.hpp +++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp @@ -857,6 +857,14 @@ do_intrinsic(_f2jblas_ddot, com_github_fommil_netlib_f2jblas, ddot_name, ddot_signature, F_R) \ do_name( ddot_name, "ddot") \ do_signature(ddot_signature, "(I[DI[DI)D") \ + do_class(org_netlib_blas_dgemm, "org/netlib/blas/Dgemm") \ + do_intrinsic(_dgemm_dgemm, org_netlib_blas_dgemm, dgemm_name, dgemm_signature, F_S) \ + do_name( dgemm_name, "dgemm") \ + do_signature(dgemm_signature, "(Ljava/lang/String;Ljava/lang/String;IIID[DII[DIID[DII)V") \ + do_class(org_netlib_blas_dgemv, "org/netlib/blas/Dgemv") \ + do_intrinsic(_dgemv_dgemv, org_netlib_blas_dgemv, dgemv_name, dgemv_signature, F_S) \ + do_name( dgemv_name, "dgemv") \ + do_signature(dgemv_signature, "(Ljava/lang/String;IID[DII[DIID[DII)V") \ \ /* support for sun.security.provider.SHA2 */ \ do_class(sun_security_provider_sha2, "sun/security/provider/SHA2") \ diff --git a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp index e14c50bf0..293382b3c 100644 --- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp +++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp @@ -100,6 +100,8 @@ class AbstractInterpreter: AllStatic { java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update() java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes() java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer() + org_netlib_blas_Dgemm_dgemm, // implementation of org.netlib.blas.Dgemm.dgemm() + org_netlib_blas_Dgemv_dgemv, // implementation of org.netlib.blas.Dgemv.dgemv() number_of_method_entries, invalid = -1 }; diff --git a/hotspot/src/share/vm/interpreter/cppInterpreter.cpp b/hotspot/src/share/vm/interpreter/cppInterpreter.cpp index 0007aa8be..9e48a1d94 100644 --- 
a/hotspot/src/share/vm/interpreter/cppInterpreter.cpp +++ b/hotspot/src/share/vm/interpreter/cppInterpreter.cpp @@ -31,17 +31,20 @@ #ifdef CC_INTERP # define __ _masm-> -void CppInterpreter::initialize() { +void CppInterpreter::initialize_stub() { if (_code != NULL) return; + int code_size = InterpreterCodeSize; + NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space + _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, + "Interpreter"); +} + +void CppInterpreter::initialize_code() { AbstractInterpreter::initialize(); // generate interpreter { ResourceMark rm; TraceTime timer("Interpreter generation", TraceStartupTime); - int code_size = InterpreterCodeSize; - NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space - _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, - "Interpreter"); InterpreterGenerator g(_code); if (PrintInterpreter) print(); } diff --git a/hotspot/src/share/vm/interpreter/cppInterpreter.hpp b/hotspot/src/share/vm/interpreter/cppInterpreter.hpp index 6a6447503..58efcfaf2 100644 --- a/hotspot/src/share/vm/interpreter/cppInterpreter.hpp +++ b/hotspot/src/share/vm/interpreter/cppInterpreter.hpp @@ -54,7 +54,8 @@ class CppInterpreter: public AbstractInterpreter { public: // Initialization/debugging - static void initialize(); + static void initialize_stub(); + static void initialize_code(); // this only returns whether a pc is within generated code for the interpreter. // This is a moderately dubious interface for the c++ interpreter. 
Only diff --git a/hotspot/src/share/vm/interpreter/interpreter.cpp b/hotspot/src/share/vm/interpreter/interpreter.cpp index 7ce4bdbb3..a313f2e63 100644 --- a/hotspot/src/share/vm/interpreter/interpreter.cpp +++ b/hotspot/src/share/vm/interpreter/interpreter.cpp @@ -85,8 +85,6 @@ void InterpreterCodelet::print_on(outputStream* st) const { // Implementation of platform independent aspects of Interpreter void AbstractInterpreter::initialize() { - if (_code != NULL) return; - // make sure 'imported' classes are initialized if (CountBytecodes || TraceBytecodes || StopInterpreterAt) BytecodeCounter::reset(); if (PrintBytecodeHistogram) BytecodeHistogram::reset(); @@ -114,8 +112,22 @@ void AbstractInterpreter::print() { } -void interpreter_init() { - Interpreter::initialize(); +// The reason that interpreter initialization is split into two parts is that the first part +// needs to run before methods are loaded (which with CDS implies linked also), and the other +// part needs to run after. The reason is that when methods are loaded (with CDS) or linked +// (without CDS), the i2c adapters are generated that assert we are currently in the interpreter. +// Asserting that requires knowledge about where the interpreter is in memory. Therefore, +// establishing the interpreter address must be done before methods are loaded. However, +// we would like to actually generate the interpreter after methods are loaded. That allows +// us to remove otherwise hardcoded offsets regarding fields that are needed in the interpreter +// code. This leads to a split of 1. reserving the memory for the interpreter, 2. loading methods +// and 3. generating the interpreter. 
+void interpreter_init_stub() { + Interpreter::initialize_stub(); +} + +void interpreter_init_code() { + Interpreter::initialize_code(); #ifndef PRODUCT if (TraceBytecodes) BytecodeTracer::set_closure(BytecodeTracer::std_closure()); #endif // PRODUCT @@ -251,6 +263,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m) return java_lang_ref_reference_get; } + if (UseF2jBLASIntrinsics) { + switch (m->intrinsic_id()) { + case vmIntrinsics::_dgemm_dgemm: return org_netlib_blas_Dgemm_dgemm; + case vmIntrinsics::_dgemv_dgemv: return org_netlib_blas_Dgemv_dgemv; + } + } + // Accessor method? if (m->is_accessor()) { assert(m->size_of_parameters() == 1, "fast code for accessors assumes parameter size = 1"); @@ -311,6 +330,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) { case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break; case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break; case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break; + case org_netlib_blas_Dgemm_dgemm : tty->print("org_netlib_blas_Dgemm_dgemm"); break; + case org_netlib_blas_Dgemv_dgemv : tty->print("org_netlib_blas_Dgemv_dgemv"); break; default: if (kind >= method_handle_invoke_FIRST && kind <= method_handle_invoke_LAST) { diff --git a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp index 1520c7b1c..f38f05117 100644 --- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp +++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp @@ -32,12 +32,20 @@ # define __ _masm-> -void TemplateInterpreter::initialize() { +void TemplateInterpreter::initialize_stub() { if (_code != NULL) return; // assertions assert((int)Bytecodes::number_of_codes <= (int)DispatchTable::length, "dispatch table too small"); + // allocate interpreter + int code_size = InterpreterCodeSize; + 
NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space + _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, + "Interpreter"); +} + +void TemplateInterpreter::initialize_code() { AbstractInterpreter::initialize(); TemplateTable::initialize(); @@ -45,10 +53,6 @@ void TemplateInterpreter::initialize() { // generate interpreter { ResourceMark rm; TraceTime timer("Interpreter generation", TraceStartupTime); - int code_size = InterpreterCodeSize; - NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space - _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, - "Interpreter"); InterpreterGenerator g(_code); if (PrintInterpreter) print(); } @@ -401,6 +405,11 @@ void TemplateInterpreterGenerator::generate_all() { method_entry(java_util_zip_CRC32_updateByteBuffer) } + if (UseF2jBLASIntrinsics) { + method_entry(org_netlib_blas_Dgemm_dgemm) + method_entry(org_netlib_blas_Dgemv_dgemv) + } + initialize_method_handle_entries(); // all native method kinds (must be one contiguous block) diff --git a/hotspot/src/share/vm/interpreter/templateInterpreter.hpp b/hotspot/src/share/vm/interpreter/templateInterpreter.hpp index 5f76dca8a..96da6353c 100644 --- a/hotspot/src/share/vm/interpreter/templateInterpreter.hpp +++ b/hotspot/src/share/vm/interpreter/templateInterpreter.hpp @@ -132,7 +132,8 @@ class TemplateInterpreter: public AbstractInterpreter { public: // Initialization/debugging - static void initialize(); + static void initialize_stub(); + static void initialize_code(); // this only returns whether a pc is within generated code for the interpreter. 
static bool contains(address pc) { return _code != NULL && _code->contains(pc); } diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp index 68631dbf2..0e0cc1028 100644 --- a/hotspot/src/share/vm/opto/escape.cpp +++ b/hotspot/src/share/vm/opto/escape.cpp @@ -979,7 +979,9 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 || strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 || strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 || - strcmp(call->as_CallLeaf()->_name, "f2jblas_ddot") == 0) + strcmp(call->as_CallLeaf()->_name, "f2jblas_ddot") == 0 || + strcmp(call->as_CallLeaf()->_name, "dgemm_dgemm") == 0 || + strcmp(call->as_CallLeaf()->_name, "dgemv_dgemv") == 0) ))) { call->dump(); fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name)); diff --git a/hotspot/src/share/vm/opto/graphKit.cpp b/hotspot/src/share/vm/opto/graphKit.cpp index 41a067ce2..1c3bc2e8c 100644 --- a/hotspot/src/share/vm/opto/graphKit.cpp +++ b/hotspot/src/share/vm/opto/graphKit.cpp @@ -2372,7 +2372,11 @@ Node* GraphKit::make_runtime_call(int flags, Node* parm0, Node* parm1, Node* parm2, Node* parm3, Node* parm4, Node* parm5, - Node* parm6, Node* parm7) { + Node* parm6, Node* parm7, + Node* parm8, Node* parm9, + Node* parm10, Node* parm11, + Node* parm12, Node* parm13, + Node* parm14, Node* parm15) { // Slow-path call bool is_leaf = !(flags & RC_NO_LEAF); bool has_io = (!is_leaf && !(flags & RC_NO_IO)); @@ -2415,7 +2419,15 @@ Node* GraphKit::make_runtime_call(int flags, if (parm5 != NULL) { call->init_req(TypeFunc::Parms+5, parm5); if (parm6 != NULL) { call->init_req(TypeFunc::Parms+6, parm6); if (parm7 != NULL) { call->init_req(TypeFunc::Parms+7, parm7); - /* close each nested if ===> */ } } } } } } } } + if (parm8 != NULL) { call->init_req(TypeFunc::Parms+8, parm8); + if (parm9 != NULL) { call->init_req(TypeFunc::Parms+9, parm9); + if (parm10 != NULL) { 
call->init_req(TypeFunc::Parms+10, parm10); + if (parm11 != NULL) { call->init_req(TypeFunc::Parms+11, parm11); + if (parm12 != NULL) { call->init_req(TypeFunc::Parms+12, parm12); + if (parm13 != NULL) { call->init_req(TypeFunc::Parms+13, parm13); + if (parm14 != NULL) { call->init_req(TypeFunc::Parms+14, parm14); + if (parm15 != NULL) { call->init_req(TypeFunc::Parms+15, parm15); + /* close each nested if ===> */ } } } } } } } } } } } } } } } } assert(call->in(call->req()-1) != NULL, "must initialize all parms"); if (!is_leaf) { diff --git a/hotspot/src/share/vm/opto/graphKit.hpp b/hotspot/src/share/vm/opto/graphKit.hpp index 7a363fd33..e9a061acf 100644 --- a/hotspot/src/share/vm/opto/graphKit.hpp +++ b/hotspot/src/share/vm/opto/graphKit.hpp @@ -818,7 +818,11 @@ class GraphKit : public Phase { Node* parm0 = NULL, Node* parm1 = NULL, Node* parm2 = NULL, Node* parm3 = NULL, Node* parm4 = NULL, Node* parm5 = NULL, - Node* parm6 = NULL, Node* parm7 = NULL); + Node* parm6 = NULL, Node* parm7 = NULL, + Node* parm8 = NULL, Node* parm9 = NULL, + Node* parm10 = NULL, Node* parm11 = NULL, + Node* parm12 = NULL, Node* parm13 = NULL, + Node* parm14 = NULL, Node* parm15 = NULL); enum { // flag values for make_runtime_call RC_NO_FP = 1, // CallLeafNoFPNode RC_NO_IO = 2, // do not hook IO edges diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp index 5cbc0f012..10eeea217 100644 --- a/hotspot/src/share/vm/opto/library_call.cpp +++ b/hotspot/src/share/vm/opto/library_call.cpp @@ -336,6 +336,8 @@ class LibraryCallKit : public GraphKit { bool inline_montgomeryMultiply(); bool inline_montgomerySquare(); bool inline_ddotF2jBLAS(); + bool inline_dgemmDgemm(); + bool inline_dgemvDgemv(); bool inline_profileBoolean(); }; @@ -589,6 +591,8 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { break; case vmIntrinsics::_f2jblas_ddot: + case vmIntrinsics::_dgemm_dgemm: + case vmIntrinsics::_dgemv_dgemv: if 
(!UseF2jBLASIntrinsics) return NULL; break; @@ -988,9 +992,13 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_profileBoolean: return inline_profileBoolean(); + case vmIntrinsics::_f2jblas_ddot: return inline_ddotF2jBLAS(); - + case vmIntrinsics::_dgemm_dgemm: + return inline_dgemmDgemm(); + case vmIntrinsics::_dgemv_dgemv: + return inline_dgemvDgemv(); default: // If you get here, it may be that someone has added a new intrinsic // to the list in vmSymbols.hpp without implementing it here. @@ -6354,6 +6362,144 @@ bool LibraryCallKit::inline_ddotF2jBLAS() { } /** + * void org.netlib.blas.Dgemm.dgemm(java.lang.String transa, + * java.lang.String transb, int m, int n, int k, + * double alpha, double[] a, int offset_a, int lda, + * double[] b, int offset_b, int ldb, double beta, + * double[] c, int offset_c, int Ldc) + */ +bool LibraryCallKit::inline_dgemmDgemm() { + assert(callee()->signature()->count() == 16, "Dgemm.dgemm has 16 parameters"); + + address stubAddr = StubRoutines::dgemmDgemm(); + if (stubAddr == NULL) return false; + + Node* transa = argument(0); + Node* transb = argument(1); + Node* m = argument(2); + Node* n = argument(3); + Node* k = argument(4); + Node* alpha = round_double_node(argument(5)); + Node* a = argument(7); + Node* a_offset = argument(8); + Node* lda = argument(9); + Node* b = argument(10); + Node* b_offset = argument(11); + Node* ldb = argument(12); + Node* beta = round_double_node(argument(13)); + Node* c = argument(15); + Node* c_offset = argument(16); + Node* ldc = argument(17); + + const Type* a_type = a->Value(&_gvn); + const Type* b_type = b->Value(&_gvn); + const Type* c_type = c->Value(&_gvn); + const TypeAryPtr* a_base_type = a_type->isa_aryptr(); + const TypeAryPtr* b_base_type = b_type->isa_aryptr(); + const TypeAryPtr* c_base_type = c_type->isa_aryptr(); + if (a_base_type == NULL || b_base_type == NULL || c_base_type == NULL) return false; + + ciKlass* a_klass = a_base_type->klass(); + ciKlass* 
b_klass = b_base_type->klass(); + ciKlass* c_klass = c_base_type->klass(); + if (a_klass == NULL || b_klass == NULL || c_klass == NULL) return false; + + BasicType a_elem_type = a_klass->as_array_klass()->element_type()->basic_type(); + BasicType b_elem_type = b_klass->as_array_klass()->element_type()->basic_type(); + BasicType c_elem_type = c_klass->as_array_klass()->element_type()->basic_type(); // element type of c itself + if (a_elem_type != T_DOUBLE || b_elem_type != T_DOUBLE || c_elem_type != T_DOUBLE) return false; + + // get array a/b/c's addr + Node* a_start = array_element_address(a, a_offset, a_elem_type); + Node* b_start = array_element_address(b, b_offset, b_elem_type); + Node* c_start = array_element_address(c, c_offset, c_elem_type); + + // Get start addr of string + Node* transa_value = load_String_value(NULL, transa); + Node* transa_offset = load_String_offset(NULL, transa); + Node* transa_start = array_element_address(transa_value, transa_offset, T_CHAR); + Node* transb_value = load_String_value(NULL, transb); + Node* transb_offset = load_String_offset(NULL, transb); + Node* transb_start = array_element_address(transb_value, transb_offset, T_CHAR); + + const char *stubName = "dgemm_dgemm"; + make_runtime_call(RC_LEAF, OptoRuntime::dgemmDgemm_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + transa_start, transb_start, m, n, k, alpha, top(), + a_start, lda, b_start, ldb, beta, top(), c_start, ldc); + + return true; +} + +/** + * void org.netlib.blas.Dgemv.dgemv(java.lang.String trans, int m, int n, double alpha, + * double[] a, int _a_offset, int lda, + * double[] x, int _x_offset, int incx, double beta, + * double[] y, int _y_offset, int incy) + */ +bool LibraryCallKit::inline_dgemvDgemv() { + assert(callee()->signature()->count() == 14, "Dgemv.dgemv has 14 parameters"); + Node* trans = argument(0); + Node* m = argument(1); + Node* n = argument(2); + Node* alpha = round_double_node(argument(3)); + Node* a = argument(5); + Node* a_offset = argument(6); + Node* lda = argument(7); + 
Node* x = argument(8); + Node* x_offset = argument(9); + Node* incx = argument(10); + Node* beta = round_double_node(argument(11)); + Node* y = argument(13); + Node* y_offset = argument(14); + Node* incy = argument(15); + + const Type* a_type = a->Value(&_gvn); + const Type* x_type = x->Value(&_gvn); + const Type* y_type = y->Value(&_gvn); + const TypeAryPtr* a_base_type = a_type->isa_aryptr(); + const TypeAryPtr* x_base_type = x_type->isa_aryptr(); + const TypeAryPtr* y_base_type = y_type->isa_aryptr(); + if (a_base_type == NULL || x_base_type == NULL || y_base_type == NULL) return false; + + ciKlass* a_klass = a_base_type->klass(); + ciKlass* x_klass = x_base_type->klass(); + ciKlass* y_klass = y_base_type->klass(); + + if (a_klass == NULL || x_klass == NULL || y_klass == NULL) return false; + + BasicType a_elem_type = a_klass->as_array_klass()->element_type()->basic_type(); + BasicType x_elem_type = x_klass->as_array_klass()->element_type()->basic_type(); + BasicType y_elem_type = y_klass->as_array_klass()->element_type()->basic_type(); + + if (a_elem_type != T_DOUBLE || x_elem_type != T_DOUBLE || y_elem_type != T_DOUBLE) return false; + + + address stubAddr = StubRoutines::dgemvDgemv(); + if (stubAddr == NULL) return false; + + // 'a_start' points to array a + scaled offset + Node* a_start = array_element_address(a, a_offset, a_elem_type); + // 'x_start' points to array x + scaled offset + Node* x_start = array_element_address(x, x_offset, x_elem_type); + // 'y_start' points to array y + scaled offset + Node* y_start = array_element_address(y, y_offset, y_elem_type); + + Node* no_ctrl = NULL; + + // get start addr of string + Node* trans_value = load_String_value(no_ctrl, trans); + Node* trans_offset = load_String_offset(no_ctrl, trans); + Node* trans_start = array_element_address(trans_value, trans_offset, T_CHAR); + + const char *stubName = "dgemv_dgemv"; + Node* call = make_runtime_call(RC_LEAF, OptoRuntime::dgemvDgemv_Type(), stubAddr, stubName, + 
TypePtr::BOTTOM, trans_start, m, n, alpha, top(), a_start, + lda, x_start, incx, beta, top(), y_start, incy); + return true; +} + +/** + * Calculate CRC32 for ByteBuffer. * int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len) */ diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp index f1fe4d666..dc8f0c774 100644 --- a/hotspot/src/share/vm/opto/runtime.cpp +++ b/hotspot/src/share/vm/opto/runtime.cpp @@ -944,6 +944,81 @@ const TypeFunc* OptoRuntime::ddotF2jBLAS_Type() { return TypeFunc::make(domain, range); } +/** + * void org.netlib.blas.Dgemm.dgemm(java.lang.String transa, + * java.lang.String transb, int m, int n, int k, + * double alpha, double[] a, int offset_a, int lda, + * double[] b, int offset_b, int ldb, double beta, + * double[] c, int offset_c, int Ldc) + */ +const TypeFunc* OptoRuntime::dgemmDgemm_Type() { + // create input type (domain) + int num_args = 15; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + + fields[argp++] = TypeAryPtr::CHARS; // char[] + fields[argp++] = TypeAryPtr::CHARS; // char[] + fields[argp++] = TypeInt::INT; // int m + fields[argp++] = TypeInt::INT; // int n + fields[argp++] = TypeInt::INT; // int k + fields[argp++] = Type::DOUBLE; // double alpha + fields[argp++] = Type::HALF; + fields[argp++] = TypeAryPtr::DOUBLES; // double[] a + fields[argp++] = TypeInt::INT; // int lda + fields[argp++] = TypeAryPtr::DOUBLES; // double[] b + fields[argp++] = TypeInt::INT; // int ldb + fields[argp++] = Type::DOUBLE; // double beta + fields[argp++] = Type::HALF; + fields[argp++] = TypeAryPtr::DOUBLES; // double[] c + fields[argp++] = TypeInt::INT; // int ldc + assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms + 0] = NULL; // void + const TypeTuple* 
range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + +/** + * void dgemv(String trans, int m, int n, double alpha, + * double[] a, int _a_offset, int lda, + * double[] x, int _x_offset, int incx, double beta, + * double[] y, int _y_offset, int incy) + */ +const TypeFunc* OptoRuntime::dgemvDgemv_Type() { + // create input type (domain) + int num_args = 13; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + + fields[argp++] = TypeAryPtr::CHARS; // char[] + fields[argp++] = TypeInt::INT; // int m + fields[argp++] = TypeInt::INT; // int n + fields[argp++] = Type::DOUBLE; // double alpha + fields[argp++] = Type::HALF; + fields[argp++] = TypeAryPtr::DOUBLES; // double[] a + fields[argp++] = TypeInt::INT; // int lda + fields[argp++] = TypeAryPtr::DOUBLES; // double[] x + fields[argp++] = TypeInt::INT; // int incx + fields[argp++] = Type::DOUBLE; // double beta + fields[argp++] = Type::HALF; + fields[argp++] = TypeAryPtr::DOUBLES; // double[] y + fields[argp++] = TypeInt::INT; // int incy + assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms + 0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + // for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { // create input type (domain) diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp index 66d393c5c..e07c34c15 100644 --- a/hotspot/src/share/vm/opto/runtime.hpp +++ b/hotspot/src/share/vm/opto/runtime.hpp @@ -318,6 +318,8 @@ private: static const TypeFunc* updateBytesCRC32_Type(); static const TypeFunc* ddotF2jBLAS_Type(); + 
static const TypeFunc* dgemmDgemm_Type(); + static const TypeFunc* dgemvDgemv_Type(); // leaf on stack replacement interpreter accessor types static const TypeFunc* osr_end_Type(); diff --git a/hotspot/src/share/vm/runtime/init.cpp b/hotspot/src/share/vm/runtime/init.cpp index 1512ccc96..4c133bd4e 100644 --- a/hotspot/src/share/vm/runtime/init.cpp +++ b/hotspot/src/share/vm/runtime/init.cpp @@ -54,7 +54,8 @@ void VM_Version_init(); void os_init_globals(); // depends on VM_Version_init, before universe_init void stubRoutines_init1(); jint universe_init(); // depends on codeCache_init and stubRoutines_init -void interpreter_init(); // before any methods loaded +void interpreter_init_stub(); // before any methods loaded +void interpreter_init_code(); // after methods loaded, but before they are linked void invocationCounter_init(); // before any methods loaded void marksweep_init(); void accessFlags_init(); @@ -106,7 +107,7 @@ jint init_globals() { if (status != JNI_OK) return status; - interpreter_init(); // before any methods loaded + interpreter_init_stub(); // before methods get loaded invocationCounter_init(); // before any methods loaded marksweep_init(); accessFlags_init(); @@ -114,6 +115,7 @@ jint init_globals() { InterfaceSupport_init(); SharedRuntime::generate_stubs(); universe2_init(); // dependent on codeCache_init and stubRoutines_init1 + interpreter_init_code(); // after universe2_init and before any method gets linked referenceProcessor_init(); jni_handles_init(); #if INCLUDE_VM_STRUCTS diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp index 10f438bc5..f2106d13a 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.cpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp @@ -136,7 +136,10 @@ address StubRoutines::_sha512_implCompressMB = NULL; address StubRoutines::_updateBytesCRC32 = NULL; address StubRoutines::_crc_table_adr = NULL; +address StubRoutines::_BLAS_library = NULL; address 
StubRoutines::_ddotF2jBLAS = NULL; +address StubRoutines::_dgemmDgemm = NULL; +address StubRoutines::_dgemvDgemv = NULL; address StubRoutines::_multiplyToLen = NULL; address StubRoutines::_squareToLen = NULL; diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp index a4eeb910d..16075d9f4 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.hpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp @@ -214,7 +214,10 @@ class StubRoutines: AllStatic { static address _updateBytesCRC32; static address _crc_table_adr; + static address _BLAS_library; static address _ddotF2jBLAS; + static address _dgemmDgemm; + static address _dgemvDgemv; static address _multiplyToLen; static address _squareToLen; @@ -380,6 +383,8 @@ class StubRoutines: AllStatic { static address crc_table_addr() { return _crc_table_adr; } static address ddotF2jBLAS() { return _ddotF2jBLAS; } + static address dgemmDgemm() { return _dgemmDgemm; } + static address dgemvDgemv() { return _dgemvDgemv; } static address multiplyToLen() {return _multiplyToLen; } static address squareToLen() {return _squareToLen; } -- 2.12.3