diff --git a/luajit-riscv-makefile.patch b/luajit-riscv-makefile.patch
new file mode 100644
index 0000000..aa45161
--- /dev/null
+++ b/luajit-riscv-makefile.patch
@@ -0,0 +1,12 @@
+Index: wrk-4.2.0/Makefile
+===================================================================
+--- wrk-4.2.0.orig/Makefile
++++ wrk-4.2.0/Makefile
+@@ -87,7 +87,6 @@ $(ODIR)/$(OPENSSL): deps/$(OPENSSL).tar.
+ $(ODIR)/lib/libluajit-5.1.a: $(ODIR)/$(LUAJIT)
+ 	@echo Building LuaJIT...
+ 	@$(MAKE) -C $< PREFIX=$(abspath $(ODIR)) BUILDMODE=static install
+-	@cd $(ODIR)/bin && ln -s luajit-2.1.0-beta3 luajit
+ 
+ $(ODIR)/lib/libssl.a: $(ODIR)/$(OPENSSL)
+ 	@echo Building OpenSSL...
diff --git a/luajit-riscv.patch b/luajit-riscv.patch
new file mode 100644
index 0000000..f4e4652
--- /dev/null
+++ b/luajit-riscv.patch
@@ -0,0 +1,34768 @@
+Index: wrk-4.2.0/obj/LuaJIT-2.1/.gitattributes
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/.gitattributes
+@@ -0,0 +1 @@
++/.relver export-subst
+Index: wrk-4.2.0/obj/LuaJIT-2.1/.relver
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/.relver
+@@ -0,0 +1 @@
++$Format:%ct$
+Index: wrk-4.2.0/obj/LuaJIT-2.1/COPYRIGHT
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/COPYRIGHT
++++ wrk-4.2.0/obj/LuaJIT-2.1/COPYRIGHT
+@@ -1,7 +1,7 @@
+ ===============================================================================
+ LuaJIT -- a Just-In-Time Compiler for Lua. https://luajit.org/
+ 
+-Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+Index: wrk-4.2.0/obj/LuaJIT-2.1/Makefile
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/Makefile
++++ wrk-4.2.0/obj/LuaJIT-2.1/Makefile
+@@ -10,16 +10,21 @@
+ # For MSVC, please follow the instructions given in src/msvcbuild.bat.
+ # For MinGW and Cygwin, cd to src and run make with the Makefile there.
+ #
+-# Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++# Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ ##############################################################################
+ 
+ MAJVER=  2
+ MINVER=  1
+-RELVER=  0
+-PREREL=  -beta3
+-VERSION= $(MAJVER).$(MINVER).$(RELVER)$(PREREL)
+ ABIVER=  5.1
+ 
++# LuaJIT uses rolling releases. The release version is based on the time of
++# the latest git commit. The 'git' command must be available during the build.
++RELVER= $(shell cat src/luajit_relver.txt 2>/dev/null || : )
++# Note: setting it with := doesn't work, since it will change during the build.
++
++MMVERSION= $(MAJVER).$(MINVER)
++VERSION= $(MMVERSION).$(RELVER)
++
+ ##############################################################################
+ #
+ # Change the installation path as needed. This automatically adjusts
+@@ -33,9 +38,10 @@ DPREFIX= $(DESTDIR)$(PREFIX)
+ INSTALL_BIN=   $(DPREFIX)/bin
+ INSTALL_LIB=   $(DPREFIX)/$(MULTILIB)
+ INSTALL_SHARE= $(DPREFIX)/share
+-INSTALL_INC=   $(DPREFIX)/include/luajit-$(MAJVER).$(MINVER)
++INSTALL_DEFINC= $(DPREFIX)/include/luajit-$(MMVERSION)
++INSTALL_INC=   $(INSTALL_DEFINC)
+ 
+-INSTALL_LJLIBD= $(INSTALL_SHARE)/luajit-$(VERSION)
++INSTALL_LJLIBD= $(INSTALL_SHARE)/luajit-$(MMVERSION)
+ INSTALL_JITLIB= $(INSTALL_LJLIBD)/jit
+ INSTALL_LMODD= $(INSTALL_SHARE)/lua
+ INSTALL_LMOD= $(INSTALL_LMODD)/$(ABIVER)
+@@ -49,10 +55,10 @@ INSTALL_TSYMNAME= luajit
+ INSTALL_ANAME= libluajit-$(ABIVER).a
+ INSTALL_SOSHORT1= libluajit-$(ABIVER).so
+ INSTALL_SOSHORT2= libluajit-$(ABIVER).so.$(MAJVER)
+-INSTALL_SONAME= $(INSTALL_SOSHORT2).$(MINVER).$(RELVER)
++INSTALL_SONAME= libluajit-$(ABIVER).so.$(VERSION)
+ INSTALL_DYLIBSHORT1= libluajit-$(ABIVER).dylib
+ INSTALL_DYLIBSHORT2= libluajit-$(ABIVER).$(MAJVER).dylib
+-INSTALL_DYLIBNAME= libluajit-$(ABIVER).$(MAJVER).$(MINVER).$(RELVER).dylib
++INSTALL_DYLIBNAME= libluajit-$(ABIVER).$(VERSION).dylib
+ INSTALL_PCNAME= luajit.pc
+ 
+ INSTALL_STATIC= $(INSTALL_LIB)/$(INSTALL_ANAME)
+@@ -77,7 +83,11 @@ INSTALL_F= install -m 0644
+ UNINSTALL= $(RM)
+ LDCONFIG= ldconfig -n 2>/dev/null
+ SED_PC= sed -e "s|^prefix=.*|prefix=$(PREFIX)|" \
+-            -e "s|^multilib=.*|multilib=$(MULTILIB)|"
++	    -e "s|^multilib=.*|multilib=$(MULTILIB)|" \
++	    -e "s|^relver=.*|relver=$(RELVER)|"
++ifneq ($(INSTALL_DEFINC),$(INSTALL_INC))
++  SED_PC+= -e "s|^includedir=.*|includedir=$(INSTALL_INC)|"
++endif
+ 
+ FILE_T= luajit
+ FILE_A= libluajit.a
+@@ -88,7 +98,10 @@ FILES_INC= lua.h lualib.h lauxlib.h luac
+ FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \
+ 	      dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \
+ 	      dis_arm64be.lua dis_ppc.lua dis_mips.lua dis_mipsel.lua \
+-	      dis_mips64.lua dis_mips64el.lua vmdef.lua
++	      dis_mips64.lua dis_mips64el.lua \
++	      dis_mips64r6.lua dis_mips64r6el.lua \
++		  dis_riscv.lua dis_riscv64.lua \
++	      vmdef.lua
+ 
+ ifeq (,$(findstring Windows,$(OS)))
+   HOST_SYS:= $(shell uname -s)
+@@ -109,9 +122,9 @@ endif
+ INSTALL_DEP= src/luajit
+ 
+ default all $(INSTALL_DEP):
+-	@echo "==== Building LuaJIT $(VERSION) ===="
++	@echo "==== Building LuaJIT $(MMVERSION) ===="
+ 	$(MAKE) -C src
+-	@echo "==== Successfully built LuaJIT $(VERSION) ===="
++	@echo "==== Successfully built LuaJIT $(MMVERSION) ===="
+ 
+ install: $(INSTALL_DEP)
+ 	@echo "==== Installing LuaJIT $(VERSION) to $(PREFIX) ===="
+@@ -130,18 +143,12 @@ install: $(INSTALL_DEP)
+ 	  $(RM) $(FILE_PC).tmp
+ 	cd src && $(INSTALL_F) $(FILES_INC) $(INSTALL_INC)
+ 	cd src/jit && $(INSTALL_F) $(FILES_JITLIB) $(INSTALL_JITLIB)
++	$(SYMLINK) $(INSTALL_TNAME) $(INSTALL_TSYM)
+ 	@echo "==== Successfully installed LuaJIT $(VERSION) to $(PREFIX) ===="
+-	@echo ""
+-	@echo "Note: the development releases deliberately do NOT install a symlink for luajit"
+-	@echo "You can do this now by running this command (with sudo):"
+-	@echo ""
+-	@echo "  $(SYMLINK) $(INSTALL_TNAME) $(INSTALL_TSYM)"
+-	@echo ""
+-
+ 
+ uninstall:
+ 	@echo "==== Uninstalling LuaJIT $(VERSION) from $(PREFIX) ===="
+-	$(UNINSTALL) $(INSTALL_T) $(INSTALL_STATIC) $(INSTALL_DYN) $(INSTALL_SHORT1) $(INSTALL_SHORT2) $(INSTALL_MAN)/$(FILE_MAN) $(INSTALL_PC)
++	$(UNINSTALL) $(INSTALL_TSYM) $(INSTALL_T) $(INSTALL_STATIC) $(INSTALL_DYN) $(INSTALL_SHORT1) $(INSTALL_SHORT2) $(INSTALL_MAN)/$(FILE_MAN) $(INSTALL_PC)
+ 	for file in $(FILES_JITLIB); do \
+ 	  $(UNINSTALL) $(INSTALL_JITLIB)/$$file; \
+ 	  done
+@@ -155,8 +162,9 @@ uninstall:
+ ##############################################################################
+ 
+ amalg:
+-	@echo "Building LuaJIT $(VERSION)"
++	@echo "==== Building LuaJIT $(MMVERSION) (amalgamation) ===="
+ 	$(MAKE) -C src amalg
++	@echo "==== Successfully built LuaJIT $(MMVERSION) (amalgamation) ===="
+ 
+ clean:
+ 	$(MAKE) -C src clean
+Index: wrk-4.2.0/obj/LuaJIT-2.1/README
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/README
++++ wrk-4.2.0/obj/LuaJIT-2.1/README
+@@ -1,11 +1,11 @@
+-README for LuaJIT 2.1.0-beta3
+------------------------------
++README for LuaJIT 2.1
++---------------------
+ 
+ LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language.
+ 
+ Project Homepage: https://luajit.org/
+ 
+-LuaJIT is Copyright (C) 2005-2021 Mike Pall.
++LuaJIT is Copyright (C) 2005-2023 Mike Pall.
+ LuaJIT is free software, released under the MIT license.
+ See full Copyright Notice in the COPYRIGHT file or in luajit.h.
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/README.md
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/README.md
+@@ -0,0 +1,31 @@
++# LJRV - LuaJIT RISC-V 64 Port
++
++LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language.
++RISC-V is a free and open ISA enabling a new era of processor innovation.
++
++## Introduction
++
++LJRV is an ongoing porting project of LuaJIT to the RISC-V 64-bit architecture by PLCT Lab, ISCAS.
++The ultimate goal is to provide a RISC-V 64 LuaJIT implementation and have it upstreamed to the official LuaJIT repository.
++
++## Progress
++
++- [x] Interpreter Runtime
++- [x] JIT Compiler
++
++LJRV is still of beta quality, particularly the JIT compiler.
++For production use, we suggest disabling the JIT compiler at build time by setting `XCFLAGS+= -DLUAJIT_DISABLE_JIT` in the Makefile or as an environment variable.
++
++## Bug Report
++
++Please report bugs to [Issues](https://github.com/ruyisdk/LuaJIT/issues).
++
++## Copyright
++
++LuaJIT is Copyright (C) 2005-2023 Mike Pall.
++LuaJIT is free software, released under the MIT license.
++See full Copyright Notice in the COPYRIGHT file or in luajit.h.
++
++LJRV is Copyright (C) 2022-2023 PLCT Lab, ISCAS. Contributed by gns.
++LJRV is free software, released under the MIT license.
++LJRV is part of RuyiSDK.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/bluequad-print.css
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/bluequad-print.css
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/bluequad-print.css
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2004-2021 Mike Pall.
++/* Copyright (C) 2004-2023 Mike Pall.
+  *
+  * You are welcome to use the general ideas of this design for your own sites.
+  * But please do not steal the stylesheet, the layout or the color scheme.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/bluequad.css
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/bluequad.css
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/bluequad.css
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2004-2021 Mike Pall.
++/* Copyright (C) 2004-2023 Mike Pall.
+  *
+  * You are welcome to use the general ideas of this design for your own sites.
+  * But please do not steal the stylesheet, the layout or the color scheme.
+@@ -206,11 +206,9 @@ img.right {
+ .ext {
+   color: #ff8000;
+ }
+-.new {
+-  font-size: 6pt;
+-  vertical-align: middle;
+-  background: #ff8000;
+-  color: #ffffff;
++.note {
++  padding: 0.5em 1em;
++  border-left: 3px solid #bfcfff;
+ }
+ #site {
+   clear: both;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/contact.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/contact.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/contact.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>Contact</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -37,6 +37,8 @@
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -44,11 +46,9 @@
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -84,10 +84,17 @@ xD("fyZKB8xv\"FJytmz8.KAB0u52D")
+ </p>
+ </noscript>
+ 
++<p><i>
++Note: I cannot reply to GMail, Google Workspace, Outlook or Office365
++mail addresses, since they prefer to mindlessly filter out mails sent
++from small domains using independent mail servers, such as mine. If you
++don't like that, please complain to Google or Microsoft, not me.
++</i></p>
++
+ <h2>Copyright</h2>
+ <p>
+ All documentation is
+-Copyright &copy; 2005-2021 Mike Pall.
++Copyright &copy; 2005-2023 Mike Pall.
+ </p>
+ 
+ 
+@@ -95,7 +102,7 @@ Copyright &copy; 2005-2021 Mike Pall.
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_buffer.html
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_buffer.html
+@@ -0,0 +1,689 @@
++<!DOCTYPE html>
++<html>
++<head>
++<title>String Buffer Library</title>
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
++<meta name="Language" content="en">
++<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
++<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
++<style type="text/css">
++.lib {
++  vertical-align: middle;
++  margin-left: 5px;
++  padding: 0 5px;
++  font-size: 60%;
++  border-radius: 5px;
++  background: #c5d5ff;
++  color: #000;
++}
++</style>
++</head>
++<body>
++<div id="site">
++<a href="https://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
++</div>
++<div id="head">
++<h1>String Buffer Library</h1>
++</div>
++<div id="nav">
++<ul><li>
++<a href="luajit.html">LuaJIT</a>
++<ul><li>
++<a href="https://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
++</li><li>
++<a href="install.html">Installation</a>
++</li><li>
++<a href="running.html">Running</a>
++</li></ul>
++</li><li>
++<a href="extensions.html">Extensions</a>
++<ul><li>
++<a href="ext_ffi.html">FFI Library</a>
++<ul><li>
++<a href="ext_ffi_tutorial.html">FFI Tutorial</a>
++</li><li>
++<a href="ext_ffi_api.html">ffi.* API</a>
++</li><li>
++<a href="ext_ffi_semantics.html">FFI Semantics</a>
++</li></ul>
++</li><li>
++<a class="current" href="ext_buffer.html">String Buffers</a>
++</li><li>
++<a href="ext_jit.html">jit.* Library</a>
++</li><li>
++<a href="ext_c_api.html">Lua/C API</a>
++</li><li>
++<a href="ext_profiler.html">Profiler</a>
++</li></ul>
++</li><li>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
++</li><li>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
++</li><li>
++<a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
++</li></ul>
++</div>
++<div id="main">
++<p>
++The string buffer library allows <b>high-performance manipulation of
++string-like data</b>.
++</p>
++<p>
++Unlike Lua strings, which are constants, string buffers are
++<b>mutable</b> sequences of 8-bit (binary-transparent) characters. Data
++can be stored, formatted and encoded into a string buffer and later
++converted, extracted or decoded.
++</p>
++<p>
++The convenient string buffer API simplifies common string manipulation
++tasks, that would otherwise require creating many intermediate strings.
++String buffers improve performance by eliminating redundant memory
++copies, object creation, string interning and garbage collection
++overhead. In conjunction with the FFI library, they allow zero-copy
++operations.
++</p>
++<p>
++The string buffer library also includes a high-performance
++<a href="#serialize">serializer</a> for Lua objects.
++</p>
++
++<h2 id="use">Using the String Buffer Library</h2>
++<p>
++The string buffer library is built into LuaJIT by default, but it's not
++loaded by default. Add this to the start of every Lua file that needs
++one of its functions:
++</p>
++<pre class="code">
++local buffer = require("string.buffer")
++</pre>
++<p>
++The convention for the syntax shown on this page is that <tt>buffer</tt>
++refers to the buffer library and <tt>buf</tt> refers to an individual
++buffer object.
++</p>
++<p>
++Please note the difference between a Lua function call, e.g.
++<tt>buffer.new()</tt> (with a dot) and a Lua method call, e.g.
++<tt>buf:reset()</tt> (with a colon).
++</p>
++
++<h3 id="buffer_object">Buffer Objects</h3>
++<p>
++A buffer object is a garbage-collected Lua object. After creation with
++<tt>buffer.new()</tt>, it can (and should) be reused for many operations.
++When the last reference to a buffer object is gone, it will eventually
++be freed by the garbage collector, along with the allocated buffer
++space.
++</p>
++<p>
++Buffers operate like a FIFO (first-in first-out) data structure. Data
++can be appended (written) to the end of the buffer and consumed (read)
++from the front of the buffer. These operations may be freely mixed.
++</p>
++<p>
++The buffer space that holds the characters is managed automatically
++&mdash; it grows as needed and already consumed space is recycled. Use
++<tt>buffer.new(size)</tt> and <tt>buf:free()</tt>, if you need more
++control.
++</p>
++<p>
++The maximum size of a single buffer is the same as the maximum size of a
++Lua string, which is slightly below two gigabytes. For huge data sizes,
++neither strings nor buffers are the right data structure &mdash; use the
++FFI library to directly map memory or files up to the virtual memory
++limit of your OS.
++</p>
++
++<h3 id="buffer_overview">Buffer Method Overview</h3>
++<ul>
++<li>
++The <tt>buf:put*()</tt>-like methods append (write) characters to the
++end of the buffer.
++</li>
++<li>
++The <tt>buf:get*()</tt>-like methods consume (read) characters from the
++front of the buffer.
++</li>
++<li>
++Other methods, like <tt>buf:tostring()</tt> only read the buffer
++contents, but don't change the buffer.
++</li>
++<li>
++The <tt>buf:set()</tt> method allows zero-copy consumption of a string
++or an FFI cdata object as a buffer.
++</li>
++<li>
++The FFI-specific methods allow zero-copy read/write-style operations or
++modifying the buffer contents in-place. Please check the
++<a href="#ffi_caveats">FFI caveats</a> below, too.
++</li>
++<li>
++Methods that don't need to return anything specific, return the buffer
++object itself as a convenience. This allows method chaining, e.g.:
++<tt>buf:reset():encode(obj)</tt> or <tt>buf:skip(len):get()</tt>
++</li>
++</ul>
++
++<h2 id="create">Buffer Creation and Management</h2>
++
++<h3 id="buffer_new"><tt>local buf = buffer.new([size [,options]])<br>
++local buf = buffer.new([options])</tt></h3>
++<p>
++Creates a new buffer object.
++</p>
++<p>
++The optional <tt>size</tt> argument ensures a minimum initial buffer
++size. This is strictly an optimization when the required buffer size is
++known beforehand. The buffer space will grow as needed, in any case.
++</p>
++<p>
++The optional table <tt>options</tt> sets various
++<a href="#serialize_options">serialization options</a>.
++</p>
++
++<h3 id="buffer_reset"><tt>buf = buf:reset()</tt></h3>
++<p>
++Reset (empty) the buffer. The allocated buffer space is not freed and
++may be reused.
++</p>
++
++<h3 id="buffer_free"><tt>buf = buf:free()</tt></h3>
++<p>
++The buffer space of the buffer object is freed. The object itself
++remains intact, empty and may be reused.
++</p>
++<p>
++Note: you normally don't need to use this method. The garbage collector
++automatically frees the buffer space, when the buffer object is
++collected. Use this method, if you need to free the associated memory
++immediately.
++</p>
++
++<h2 id="write">Buffer Writers</h2>
++
++<h3 id="buffer_put"><tt>buf = buf:put([str|num|obj] [,…])</tt></h3>
++<p>
++Appends a string <tt>str</tt>, a number <tt>num</tt> or any object
++<tt>obj</tt> with a <tt>__tostring</tt> metamethod to the buffer.
++Multiple arguments are appended in the given order.
++</p>
++<p>
++Appending a buffer to a buffer is possible and short-circuited
++internally. But it still involves a copy. Better combine the buffer
++writes to use a single buffer.
++</p>
++
++<h3 id="buffer_putf"><tt>buf = buf:putf(format, …)</tt></h3>
++<p>
++Appends the formatted arguments to the buffer. The <tt>format</tt>
++string supports the same options as <tt>string.format()</tt>.
++</p>
++
++<h3 id="buffer_putcdata"><tt>buf = buf:putcdata(cdata, len)</tt><span class="lib">FFI</span></h3>
++<p>
++Appends the given <tt>len</tt> number of bytes from the memory pointed
++to by the FFI <tt>cdata</tt> object to the buffer. The object needs to
++be convertible to a (constant) pointer.
++</p>
++
++<h3 id="buffer_set"><tt>buf = buf:set(str)<br>
++buf = buf:set(cdata, len)</tt><span class="lib">FFI</span></h3>
++<p>
++This method allows zero-copy consumption of a string or an FFI cdata
++object as a buffer. It stores a reference to the passed string
++<tt>str</tt> or the FFI <tt>cdata</tt> object in the buffer. Any buffer
++space originally allocated is freed. This is <i>not</i> an append
++operation, unlike the <tt>buf:put*()</tt> methods.
++</p>
++<p>
++After calling this method, the buffer behaves as if
++<tt>buf:free():put(str)</tt> or <tt>buf:free():putcdata(cdata,&nbsp;len)</tt>
++had been called. However, the data is only referenced and not copied, as
++long as the buffer is only consumed.
++</p>
++<p>
++In case the buffer is written to later on, the referenced data is copied
++and the object reference is removed (copy-on-write semantics).
++</p>
++<p>
++The stored reference is an anchor for the garbage collector and keeps the
++originally passed string or FFI cdata object alive.
++</p>
++
++<h3 id="buffer_reserve"><tt>ptr, len = buf:reserve(size)</tt><span class="lib">FFI</span><br>
++<tt>buf = buf:commit(used)</tt><span class="lib">FFI</span></h3>
++<p>
++The <tt>reserve</tt> method reserves at least <tt>size</tt> bytes of
++write space in the buffer. It returns an <tt>uint8_t&nbsp;*</tt> FFI
++cdata pointer <tt>ptr</tt> that points to this space.
++</p>
++<p>
++The available length in bytes is returned in <tt>len</tt>. This is at
++least <tt>size</tt> bytes, but may be more to facilitate efficient
++buffer growth. You can either make use of the additional space or ignore
++<tt>len</tt> and only use <tt>size</tt> bytes.
++</p>
++<p>
++The <tt>commit</tt> method appends the <tt>used</tt> bytes of the
++previously returned write space to the buffer data.
++</p>
++<p>
++This pair of methods allows zero-copy use of C read-style APIs:
++</p>
++<pre class="code">
++local MIN_SIZE = 65536
++repeat
++  local ptr, len = buf:reserve(MIN_SIZE)
++  local n = C.read(fd, ptr, len)
++  if n == 0 then break end -- EOF.
++  if n &lt; 0 then error("read error") end
++  buf:commit(n)
++until false
++</pre>
++<p>
++The reserved write space is <i>not</i> initialized. At least the
++<tt>used</tt> bytes <b>must</b> be written to before calling the
++<tt>commit</tt> method. There's no need to call the <tt>commit</tt>
++method, if nothing is added to the buffer (e.g. on error).
++</p>
++
++<h2 id="read">Buffer Readers</h2>
++
++<h3 id="buffer_length"><tt>len = #buf</tt></h3>
++<p>
++Returns the current length of the buffer data in bytes.
++</p>
++
++<h3 id="buffer_concat"><tt>res = str|num|buf .. str|num|buf […]</tt></h3>
++<p>
++The Lua concatenation operator <tt>..</tt> also accepts buffers, just
++like strings or numbers. It always returns a string and not a buffer.
++</p>
++<p>
++Note that although this is supported for convenience, this thwarts one
++of the main reasons to use buffers, which is to avoid string
++allocations. Rewrite it with <tt>buf:put()</tt> and <tt>buf:get()</tt>.
++</p>
++<p>
++Mixing this with unrelated objects that have a <tt>__concat</tt>
++metamethod may not work, since these probably only expect strings.
++</p>
++
++<h3 id="buffer_skip"><tt>buf = buf:skip(len)</tt></h3>
++<p>
++Skips (consumes) <tt>len</tt> bytes from the buffer up to the current
++length of the buffer data.
++</p>
++
++<h3 id="buffer_get"><tt>str, … = buf:get([len|nil] [,…])</tt></h3>
++<p>
++Consumes the buffer data and returns one or more strings. If called
++without arguments, the whole buffer data is consumed. If called with a
++number, up to <tt>len</tt> bytes are consumed. A <tt>nil</tt> argument
++consumes the remaining buffer space (this only makes sense as the last
++argument). Multiple arguments consume the buffer data in the given
++order.
++</p>
++<p>
++Note: a zero length or no remaining buffer data returns an empty string
++and not <tt>nil</tt>.
++</p>
++
++<h3 id="buffer_tostring"><tt>str = buf:tostring()<br>
++str = tostring(buf)</tt></h3>
++<p>
++Creates a string from the buffer data, but doesn't consume it. The
++buffer remains unchanged.
++</p>
++<p>
++Buffer objects also define a <tt>__tostring</tt> metamethod. This means
++buffers can be passed to the global <tt>tostring()</tt> function and
++many other functions that accept this in place of strings. The important
++internal uses in functions like <tt>io.write()</tt> are short-circuited
++to avoid the creation of an intermediate string object.
++</p>
++
++<h3 id="buffer_ref"><tt>ptr, len = buf:ref()</tt><span class="lib">FFI</span></h3>
++<p>
++Returns an <tt>uint8_t&nbsp;*</tt> FFI cdata pointer <tt>ptr</tt> that
++points to the buffer data. The length of the buffer data in bytes is
++returned in <tt>len</tt>.
++</p>
++<p>
++The returned pointer can be directly passed to C functions that expect a
++buffer and a length. You can also do bytewise reads
++(<tt>local&nbsp;x&nbsp;=&nbsp;ptr[i]</tt>) or writes
++(<tt>ptr[i]&nbsp;=&nbsp;0x40</tt>) of the buffer data.
++</p>
++<p>
++In conjunction with the <tt>skip</tt> method, this allows zero-copy use
++of C write-style APIs:
++</p>
++<pre class="code">
++repeat
++  local ptr, len = buf:ref()
++  if len == 0 then break end
++  local n = C.write(fd, ptr, len)
++  if n &lt; 0 then error("write error") end
++  buf:skip(n)
++until n >= len
++</pre>
++<p>
++Unlike Lua strings, buffer data is <i>not</i> implicitly
++zero-terminated. It's not safe to pass <tt>ptr</tt> to C functions that
++expect zero-terminated strings. If you're not using <tt>len</tt>, then
++you're doing something wrong.
++</p>
++
++<h2 id="serialize">Serialization of Lua Objects</h2>
++<p>
++The following functions and methods allow <b>high-speed serialization</b>
++(encoding) of a Lua object into a string and decoding it back to a Lua
++object. This allows convenient storage and transport of <b>structured
++data</b>.
++</p>
++<p>
++The encoded data is in an <a href="#serialize_format">internal binary
++format</a>. The data can be stored in files, binary-transparent
++databases or transmitted to other LuaJIT instances across threads,
++processes or networks.
++</p>
++<p>
++Encoding speed can reach up to 1 Gigabyte/second on a modern desktop- or
++server-class system, even when serializing many small objects. Decoding
++speed is mostly constrained by object creation cost.
++</p>
++<p>
++The serializer handles most Lua types, common FFI number types and
++nested structures. Functions, thread objects, other FFI cdata and full
++userdata cannot be serialized (yet).
++</p>
++<p>
++The encoder serializes nested structures as trees. Multiple references
++to a single object will be stored separately and create distinct objects
++after decoding. Circular references cause an error.
++</p>
++
++<h3 id="serialize_methods">Serialization Functions and Methods</h3>
++
++<h3 id="buffer_encode"><tt>str = buffer.encode(obj)<br>
++buf = buf:encode(obj)</tt></h3>
++<p>
++Serializes (encodes) the Lua object <tt>obj</tt>. The stand-alone
++function returns a string <tt>str</tt>. The buffer method appends the
++encoding to the buffer.
++</p>
++<p>
++<tt>obj</tt> can be any of the supported Lua types &mdash; it doesn't
++need to be a Lua table.
++</p>
++<p>
++This function may throw an error when attempting to serialize
++unsupported object types, circular references or deeply nested tables.
++</p>
++
++<h3 id="buffer_decode"><tt>obj = buffer.decode(str)<br>
++obj = buf:decode()</tt></h3>
++<p>
++The stand-alone function deserializes (decodes) the string
++<tt>str</tt>, the buffer method deserializes one object from the
++buffer. Both return a Lua object <tt>obj</tt>.
++</p>
++<p>
++The returned object may be any of the supported Lua types &mdash;
++even <tt>nil</tt>.
++</p>
++<p>
++This function may throw an error when fed with malformed or incomplete
++encoded data. The stand-alone function throws when there's left-over
++data after decoding a single top-level object. The buffer method leaves
++any left-over data in the buffer.
++</p>
++<p>
++Attempting to deserialize an FFI type will throw an error, if the FFI
++library is not built-in or has not been loaded, yet.
++</p>
++
++<h3 id="serialize_options">Serialization Options</h3>
++<p>
++The <tt>options</tt> table passed to <tt>buffer.new()</tt> may contain
++the following members (all optional):
++</p>
++<ul>
++<li>
++<tt>dict</tt> is a Lua table holding a <b>dictionary of strings</b> that
++commonly occur as table keys of objects you are serializing. These keys
++are compactly encoded as indexes during serialization. A well-chosen
++dictionary saves space and improves serialization performance.
++</li>
++<li>
++<tt>metatable</tt> is a Lua table holding a <b>dictionary of metatables</b>
++for the table objects you are serializing.
++</li>
++</ul>
++<p>
++<tt>dict</tt> needs to be an array of strings and <tt>metatable</tt> needs
++to be an array of tables. Both starting at index 1 and without holes (no
++<tt>nil</tt> in between). The tables are anchored in the buffer object and
++internally modified into a two-way index (don't do this yourself, just pass
++a plain array). The tables must not be modified after they have been passed
++to <tt>buffer.new()</tt>.
++</p>
++<p>
++The <tt>dict</tt> and <tt>metatable</tt> tables used by the encoder and
++decoder must be the same. Put the most common entries at the front. Extend
++at the end to ensure backwards-compatibility &mdash; older encodings can
++then still be read. You may also set some indexes to <tt>false</tt> to
++explicitly drop backwards-compatibility. Old encodings that use these
++indexes will throw an error when decoded.
++</p>
++<p>
++Metatables that are not found in the <tt>metatable</tt> dictionary are
++ignored when encoding. Decoding returns a table with a <tt>nil</tt>
++metatable.
++</p>
++<p>
++Note: parsing and preparation of the options table is somewhat
++expensive. Create a buffer object only once and recycle it for multiple
++uses. Avoid mixing encoder and decoder buffers, since the
++<tt>buf:set()</tt> method frees the already allocated buffer space:
++</p>
++<pre class="code">
++local options = {
++  dict = { "commonly", "used", "string", "keys" },
++}
++local buf_enc = buffer.new(options)
++local buf_dec = buffer.new(options)
++
++local function encode(obj)
++  return buf_enc:reset():encode(obj):get()
++end
++
++local function decode(str)
++  return buf_dec:set(str):decode()
++end
++</pre>
++
++<h3 id="serialize_stream">Streaming Serialization</h3>
++<p>
++In some contexts, it's desirable to do piecewise serialization of large
++datasets, also known as <i>streaming</i>.
++</p>
++<p>
++This serialization format can be safely concatenated and supports streaming.
++Multiple encodings can simply be appended to a buffer and later decoded
++individually:
++</p>
++<pre class="code">
++local buf = buffer.new()
++buf:encode(obj1)
++buf:encode(obj2)
++local copy1 = buf:decode()
++local copy2 = buf:decode()
++</pre>
++<p>
++Here's how to iterate over a stream:
++</p>
++<pre class="code">
++while #buf ~= 0 do
++  local obj = buf:decode()
++  -- Do something with obj.
++end
++</pre>
++<p>
++Since the serialization format doesn't prepend a length to its encoding,
++network applications may need to transmit the length, too.
++</p>
++
++<h3 id="serialize_format">Serialization Format Specification</h3>
++<p>
++This serialization format is designed for <b>internal use</b> by LuaJIT
++applications. Serialized data is upwards-compatible and portable across
++all supported LuaJIT platforms.
++</p>
++<p>
++It's an <b>8-bit binary format</b> and not human-readable. It uses e.g.
++embedded zeroes and stores embedded Lua string objects unmodified, which
++are 8-bit-clean, too. Encoded data can be safely concatenated for
++streaming and later decoded one top-level object at a time.
++</p>
++<p>
++The encoding is reasonably compact, but tuned for maximum performance,
++not for minimum space usage. It compresses well with any of the common
++byte-oriented data compression algorithms.
++</p>
++<p>
++Although documented here for reference, this format is explicitly
++<b>not</b> intended to be a 'public standard' for structured data
++interchange across computer languages (like JSON or MessagePack). Please
++do not use it as such.
++</p>
++<p>
++The specification is given below as a context-free grammar with a
++top-level <tt>object</tt> as the starting point. Alternatives are
++separated by the <tt>|</tt> symbol and <tt>*</tt> indicates repeats.
++Grouping is implicit or indicated by <tt>{…}</tt>. Terminals are
++either plain hex numbers, encoded as bytes, or have a <tt>.format</tt>
++suffix.
++</p>
++<pre>
++object    → nil | false | true
++          | null | lightud32 | lightud64
++          | int | num | tab | tab_mt
++          | int64 | uint64 | complex
++          | string
++
++nil       → 0x00
++false     → 0x01
++true      → 0x02
++
++null      → 0x03                            // NULL lightuserdata
++lightud32 → 0x04 data.I                   // 32 bit lightuserdata
++lightud64 → 0x05 data.L                   // 64 bit lightuserdata
++
++int       → 0x06 int.I                                 // int32_t
++num       → 0x07 double.L
++
++tab       → 0x08                                   // Empty table
++          | 0x09 h.U h*{object object}          // Key/value hash
++          | 0x0a a.U a*object                    // 0-based array
++          | 0x0b a.U a*object h.U h*{object object}      // Mixed
++          | 0x0c a.U (a-1)*object                // 1-based array
++          | 0x0d a.U (a-1)*object h.U h*{object object}  // Mixed
++tab_mt    → 0x0e (index-1).U tab          // Metatable dict entry
++
++int64     → 0x10 int.L                             // FFI int64_t
++uint64    → 0x11 uint.L                           // FFI uint64_t
++complex   → 0x12 re.L im.L                         // FFI complex
++
++string    → (0x20+len).U len*char.B
++          | 0x0f (index-1).U                 // String dict entry
++
++.B = 8 bit
++.I = 32 bit little-endian
++.L = 64 bit little-endian
++.U = prefix-encoded 32 bit unsigned number n:
++     0x00..0xdf   → n.B
++     0xe0..0x1fdf → (0xe0|(((n-0xe0)>>8)&0x1f)).B ((n-0xe0)&0xff).B
++   0x1fe0..       → 0xff n.I
++</pre>
++
++<h2 id="error">Error handling</h2>
++<p>
++Many of the buffer methods can throw an error. Out-of-memory or usage
++errors are best caught with an outer wrapper for larger parts of code.
++There's not much one can do after that, anyway.
++</p>
++<p>
++OTOH, you may want to catch some errors individually. Buffer methods need
++to receive the buffer object as the first argument. The Lua colon-syntax
++<tt>obj:method()</tt> does that implicitly. But to wrap a method with
++<tt>pcall()</tt>, the arguments need to be passed like this:
++</p>
++<pre class="code">
++local ok, err = pcall(buf.encode, buf, obj)
++if not ok then
++  -- Handle error in err.
++end
++</pre>
++
++<h2 id="ffi_caveats">FFI caveats</h2>
++<p>
++The string buffer library has been designed to work well together with
++the FFI library. But due to the low-level nature of the FFI library,
++some care needs to be taken:
++</p>
++<p>
++First, please remember that FFI pointers are zero-indexed. The space
++returned by <tt>buf:reserve()</tt> and <tt>buf:ref()</tt> starts at the
++returned pointer and ends before <tt>len</tt> bytes after that.
++</p>
++<p>
++I.e. the first valid index is <tt>ptr[0]</tt> and the last valid index
++is <tt>ptr[len-1]</tt>. If the returned length is zero, there's no valid
++index at all. The returned pointer may even be <tt>NULL</tt>.
++</p>
++<p>
++The space pointed to by the returned pointer is only valid as long as
++the buffer is not modified in any way (neither append, nor consume, nor
++reset, etc.). The pointer is also not a GC anchor for the buffer object
++itself.
++</p>
++<p>
++Buffer data is only guaranteed to be byte-aligned. Casting the returned
++pointer to a data type with higher alignment may cause unaligned
++accesses. It depends on the CPU architecture whether this is allowed or
++not (it's always OK on x86/x64 and mostly OK on other modern
++architectures).
++</p>
++<p>
++FFI pointers or references do not count as GC anchors for an underlying
++object. E.g. an <tt>array</tt> allocated with <tt>ffi.new()</tt> is
++anchored by <tt>buf:set(array,&nbsp;len)</tt>, but not by
++<tt>buf:set(array+offset,&nbsp;len)</tt>. The addition of the offset
++creates a new pointer, even when the offset is zero. In this case, you
++need to make sure there's still a reference to the original array as
++long as its contents are in use by the buffer.
++</p>
++<p>
++Even though each LuaJIT VM instance is single-threaded (but you can
++create multiple VMs), FFI data structures can be accessed concurrently.
++Be careful when reading/writing FFI cdata from/to buffers to avoid
++concurrent accesses or modifications. In particular, the memory
++referenced by <tt>buf:set(cdata,&nbsp;len)</tt> must not be modified
++while buffer readers are working on it. Shared, but read-only memory
++mappings of files are OK, but only if the file does not change.
++</p>
++<br class="flush">
++</div>
++<div id="foot">
++<hr class="hide">
++Copyright &copy; 2005-2023
++<span class="noprint">
++&middot;
++<a href="contact.html">Contact</a>
++</span>
++</div>
++</body>
++</html>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_c_api.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_c_api.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_c_api.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>Lua/C API Extensions</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -37,6 +37,8 @@
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a class="current" href="ext_c_api.html">Lua/C API</a>
+@@ -44,11 +46,9 @@
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -105,7 +105,7 @@ Turn the whole JIT compiler on or off or
+ This sets the mode for the function at the stack index <tt>idx</tt> or
+ the parent of the calling function (<tt>idx = 0</tt>). It either
+ enables JIT compilation for a function, disables it and flushes any
+-already compiled code or only flushes already compiled code. This
++already compiled code, or only flushes already compiled code. This
+ applies recursively to all sub-functions of the function with
+ <tt>LUAJIT_MODE_ALLFUNC</tt> or only to the sub-functions with
+ <tt>LUAJIT_MODE_ALLSUBFUNC</tt>.
+@@ -124,7 +124,7 @@ traces which link to it.
+ This mode defines a wrapper function for calls to C functions. If
+ called with <tt>LUAJIT_MODE_ON</tt>, the stack index at <tt>idx</tt>
+ must be a <tt>lightuserdata</tt> object holding a pointer to the wrapper
+-function. From now on all C functions are called through the wrapper
++function. From now on, all C functions are called through the wrapper
+ function. If called with <tt>LUAJIT_MODE_OFF</tt> this mode is turned
+ off and all C functions are directly called.
+ </p>
+@@ -173,7 +173,7 @@ Also note that this mechanism is not wit
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_ffi.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>FFI Library</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -37,6 +37,8 @@
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -44,11 +46,9 @@
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -155,7 +155,7 @@ call the binding function. Phew!
+ <h2 id="cdata">Motivating Example: Using C Data Structures</h2>
+ <p>
+ The FFI library allows you to create and access C&nbsp;data
+-structures. Of course the main use for this is for interfacing with
++structures. Of course, the main use for this is for interfacing with
+ C&nbsp;functions. But they can be used stand-alone, too.
+ </p>
+ <p>
+@@ -167,7 +167,7 @@ implemented with a big table holding lot
+ both a substantial memory overhead as well as a performance overhead.
+ </p>
+ <p>
+-Here's a sketch of a library that operates on color images plus a
++Here's a sketch of a library that operates on color images, plus a
+ simple benchmark. First, the plain Lua version:
+ </p>
+ <pre class="code">
+@@ -182,7 +182,7 @@ local function image_ramp_green(n)
+   return img
+ end
+ 
+-local function image_to_grey(img, n)
++local function image_to_gray(img, n)
+   for i=1,n do
+     local y = floor(0.3*img[i].red + 0.59*img[i].green + 0.11*img[i].blue)
+     img[i].red = y; img[i].green = y; img[i].blue = y
+@@ -192,14 +192,14 @@ end
+ local N = 400*400
+ local img = image_ramp_green(N)
+ for i=1,1000 do
+-  image_to_grey(img, N)
++  image_to_gray(img, N)
+ end
+ </pre>
+ <p>
+ This creates a table with 160.000 pixels, each of which is a table
+-holding four number values in the range of 0-255. First an image with
++holding four number values in the range of 0-255. First, an image with
+ a green ramp is created (1D for simplicity), then the image is
+-converted to greyscale 1000 times. Yes, that's silly, but I was in
++converted to grayscale 1000 times. Yes, that's silly, but I was in
+ need of a simple example ...
+ </p>
+ <p>
+@@ -306,7 +306,7 @@ be more compact and faster. This is cert
+ ~1.7x). Switching to a struct-of-arrays would help, too.
+ </p>
+ <p style="font-size: 8pt;">
+-However the resulting code would be less idiomatic and rather
++However, the resulting code would be less idiomatic and rather
+ error-prone. And it still doesn't get even close to the performance of
+ the FFI version of the code. Also, high-level data structures cannot
+ be easily passed to other C&nbsp;functions, especially I/O functions,
+@@ -316,7 +316,7 @@ without undue conversion penalties.
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi_api.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_ffi_api.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi_api.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>ffi.* API Functions</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -42,6 +42,8 @@ td.abiparam { font-weight: bold; width:
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -49,11 +51,9 @@ td.abiparam { font-weight: bold; width:
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -119,7 +119,7 @@ separated by semicolons. The trailing se
+ declaration may be omitted.
+ </p>
+ <p>
+-Please note that external symbols are only <em>declared</em>, but they
++Please note that external symbols are only <em>declared</em>, but they
+ are <em>not bound</em> to any specific address, yet. Binding is
+ achieved with C&nbsp;library namespaces (see below).
+ </p>
+@@ -207,7 +207,7 @@ parse the cdecl only once and get its ct
+ <tt>ffi.typeof()</tt>. Then use the ctype as a constructor repeatedly.
+ </p>
+ <p style="font-size: 8pt;">
+-Please note that an anonymous <tt>struct</tt> declaration implicitly
++Please note that an anonymous <tt>struct</tt> declaration implicitly
+ creates a new and distinguished ctype every time you use it for
+ <tt>ffi.new()</tt>. This is probably <b>not</b> what you want,
+ especially if you create more than one cdata object. Different anonymous
+@@ -254,12 +254,12 @@ afterwards. Neither the contents of the
+ contents of an <tt>__index</tt> table (if any) may be modified
+ afterwards. The associated metatable automatically applies to all uses
+ of this type, no matter how the objects are created or where they
+-originate from. Note that pre-defined operations on types have
++originate from. Note that predefined operations on types have
+ precedence (e.g. declared field names cannot be overridden).
+ </p>
+ <p>
+ All standard Lua metamethods are implemented. These are called directly,
+-without shortcuts and on any mix of types. For binary operations, the
++without shortcuts, and on any mix of types. For binary operations, the
+ left operand is checked first for a valid ctype metamethod. The
+ <tt>__gc</tt> metamethod only applies to <tt>struct</tt>/<tt>union</tt>
+ types and performs an implicit <a href="#ffi_gc"><tt>ffi.gc()</tt></a>
+@@ -463,8 +463,10 @@ otherwise. The following parameters are
+ <tr class="odd">
+ <td class="abiparam">win</td><td class="abidesc">Windows variant of the standard ABI</td></tr>
+ <tr class="even">
+-<td class="abiparam">uwp</td><td class="abidesc">Universal Windows Platform</td></tr>
++<td class="abiparam">pauth</td><td class="abidesc">Pointer authentication ABI</td></tr>
+ <tr class="odd">
++<td class="abiparam">uwp</td><td class="abidesc">Universal Windows Platform</td></tr>
++<tr class="even">
+ <td class="abiparam">gc64</td><td class="abidesc">64 bit GC references</td></tr>
+ </table>
+ 
+@@ -490,7 +492,7 @@ have some extra methods:
+ <p>
+ Free the resources associated with a callback. The associated Lua
+ function is unanchored and may be garbage collected. The callback
+-function pointer is no longer valid and must not be called anymore
++function pointer is no longer valid and must not be called again
+ (it may be reused by a subsequently created callback).
+ </p>
+ 
+@@ -556,7 +558,7 @@ named <tt>i</tt>.
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi_semantics.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_ffi_semantics.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi_semantics.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>FFI Semantics</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -42,6 +42,8 @@ td.convop { font-style: italic; width: 4
+ <a class="current" href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -49,11 +51,9 @@ td.convop { font-style: italic; width: 4
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -86,7 +86,7 @@ footprint. It's used by the <a href="ext
+ functions</a> to declare C&nbsp;types or external symbols.
+ </p>
+ <p>
+-It's only purpose is to parse C&nbsp;declarations, as found e.g. in
++Its only purpose is to parse C&nbsp;declarations, as found e.g. in
+ C&nbsp;header files. Although it does evaluate constant expressions,
+ it's <em>not</em> a C&nbsp;compiler. The body of <tt>inline</tt>
+ C&nbsp;function definitions is simply ignored.
+@@ -163,7 +163,7 @@ function declarations.</li>
+ 
+ </ul>
+ <p>
+-The following C&nbsp;types are pre-defined by the C&nbsp;parser (like
++The following C&nbsp;types are predefined by the C&nbsp;parser (like
+ a <tt>typedef</tt>, except re-declarations will be ignored):
+ </p>
+ <ul>
+@@ -581,9 +581,9 @@ ffi.new("struct nested", {x=1,y={2,3}})
+ 
+ <h2 id="cdata_ops">Operations on cdata Objects</h2>
+ <p>
+-All of the standard Lua operators can be applied to cdata objects or a
++All standard Lua operators can be applied to cdata objects or a
+ mix of a cdata object and another Lua object. The following list shows
+-the pre-defined operations.
++the predefined operations.
+ </p>
+ <p>
+ Reference types are dereferenced <em>before</em> performing each of
+@@ -591,7 +591,7 @@ the operations below &mdash; the operati
+ C&nbsp;type pointed to by the reference.
+ </p>
+ <p>
+-The pre-defined operations are always tried first before deferring to a
++The predefined operations are always tried first before deferring to a
+ metamethod or index table (if any) for the corresponding ctype (except
+ for <tt>__new</tt>). An error is raised if the metamethod lookup or
+ index table lookup fails.
+@@ -641,7 +641,7 @@ assigning to an index of a vector raises
+ </ul>
+ <p>
+ A ctype object can be indexed with a string key, too. The only
+-pre-defined operation is reading scoped constants of
++predefined operation is reading scoped constants of
+ <tt>struct</tt>/<tt>union</tt> types. All other accesses defer
+ to the corresponding metamethods or index tables (if any).
+ </p>
+@@ -654,7 +654,7 @@ certain optimizations.
+ <p>
+ As a consequence, the <em>elements</em> of complex numbers and
+ vectors are immutable. But the elements of an aggregate holding these
+-types <em>may</em> be modified of course. I.e. you cannot assign to
++types <em>may</em> be modified, of course. I.e. you cannot assign to
+ <tt>foo.c.im</tt>, but you can assign a (newly created) complex number
+ to <tt>foo.c</tt>.
+ </p>
+@@ -673,8 +673,8 @@ through unions is explicitly detected an
+ to <tt>ffi.new(ct, ...)</tt>, unless a <tt>__new</tt> metamethod is
+ defined. The <tt>__new</tt> metamethod is called with the ctype object
+ plus any other arguments passed to the constructor. Note that you have to
+-use <tt>ffi.new</tt> inside of it, since calling <tt>ct(...)</tt> would
+-cause infinite recursion.</li>
++use <tt>ffi.new</tt> inside the metamethod, since calling <tt>ct(...)</tt>
++would cause infinite recursion.</li>
+ 
+ <li><b>C&nbsp;function call</b>: a cdata function or cdata function
+ pointer can be called. The passed arguments are
+@@ -685,7 +685,7 @@ variable argument part of vararg C&nbsp;
+ C&nbsp;function is called and the return value (if any) is
+ <a href="#convert_tolua">converted to a Lua object</a>.<br>
+ On Windows/x86 systems, <tt>__stdcall</tt> functions are automatically
+-detected and a function declared as <tt>__cdecl</tt> (the default) is
++detected, and a function declared as <tt>__cdecl</tt> (the default) is
+ silently fixed up after the first call.</li>
+ 
+ </ul>
+@@ -695,7 +695,7 @@ silently fixed up after the first call.<
+ 
+ <li><b>Pointer arithmetic</b>: a cdata pointer/array and a cdata
+ number or a Lua number can be added or subtracted. The number must be
+-on the right hand side for a subtraction. The result is a pointer of
++on the right-hand side for a subtraction. The result is a pointer of
+ the same type with an address plus or minus the number value
+ multiplied by the element size in bytes. An error is raised if the
+ element size is undefined.</li>
+@@ -710,7 +710,7 @@ operators (<tt>+&nbsp;-&nbsp;*&nbsp;/&nb
+ minus) can be applied to two cdata numbers, or a cdata number and a
+ Lua number. If one of them is an <tt>uint64_t</tt>, the other side is
+ converted to an <tt>uint64_t</tt> and an unsigned arithmetic operation
+-is performed. Otherwise both sides are converted to an
++is performed. Otherwise, both sides are converted to an
+ <tt>int64_t</tt> and a signed arithmetic operation is performed. The
+ result is a boxed 64&nbsp;bit cdata object.<br>
+ 
+@@ -757,7 +757,7 @@ which is compatible with any other point
+ <li><b>64&nbsp;bit integer comparison</b>: two cdata numbers, or a
+ cdata number and a Lua number can be compared with each other. If one
+ of them is an <tt>uint64_t</tt>, the other side is converted to an
+-<tt>uint64_t</tt> and an unsigned comparison is performed. Otherwise
++<tt>uint64_t</tt> and an unsigned comparison is performed. Otherwise,
+ both sides are converted to an <tt>int64_t</tt> and a signed
+ comparison is performed.<br>
+ 
+@@ -782,9 +782,9 @@ keys!</b>
+ A cdata object is treated like any other garbage-collected object and
+ is hashed and compared by its address for table indexing. Since
+ there's no interning for cdata value types, the same value may be
+-boxed in different cdata objects with different addresses. Thus
++boxed in different cdata objects with different addresses. Thus,
+ <tt>t[1LL+1LL]</tt> and <tt>t[2LL]</tt> usually <b>do not</b> point to
+-the same hash slot and they certainly <b>do not</b> point to the same
++the same hash slot, and they certainly <b>do not</b> point to the same
+ hash slot as <tt>t[2]</tt>.
+ </p>
+ <p>
+@@ -806,7 +806,7 @@ the resulting Lua number as a key when i
+ One obvious benefit: <tt>t[tonumber(2LL)]</tt> <b>does</b> point to
+ the same slot as <tt>t[2]</tt>.</li>
+ 
+-<li>Otherwise use either <tt>tostring()</tt> on 64&nbsp;bit integers
++<li>Otherwise, use either <tt>tostring()</tt> on 64&nbsp;bit integers
+ or complex numbers or combine multiple fields of a cdata aggregate to
+ a Lua string (e.g. with
+ <a href="ext_ffi_api.html#ffi_string"><tt>ffi.string()</tt></a>). Then
+@@ -814,7 +814,7 @@ use the resulting Lua string as a key wh
+ 
+ <li>Create your own specialized hash table implementation using the
+ C&nbsp;types provided by the FFI library, just like you would in
+-C&nbsp;code. Ultimately this may give much better performance than the
++C&nbsp;code. Ultimately, this may give much better performance than the
+ other alternatives or what a generic by-value hash table could
+ possibly provide.</li>
+ 
+@@ -880,7 +880,7 @@ garbage collector will automatically fre
+ the end of the next GC cycle).
+ </p>
+ <p>
+-Please note that pointers themselves are cdata objects, however they
++Please note that pointers themselves are cdata objects; however, they
+ are <b>not</b> followed by the garbage collector. So e.g. if you
+ assign a cdata array to a pointer, you must keep the cdata object
+ holding the array alive as long as the pointer is still in use:
+@@ -929,18 +929,18 @@ of the function pointer and the Lua func
+ </p>
+ <p>
+ This can happen implicitly due to the usual conversions, e.g. when
+-passing a Lua function to a function pointer argument. Or you can use
++passing a Lua function to a function pointer argument. Or, you can use
+ <tt>ffi.cast()</tt> to explicitly cast a Lua function to a
+ C&nbsp;function pointer.
+ </p>
+ <p>
+-Currently only certain C&nbsp;function types can be used as callback
++Currently, only certain C&nbsp;function types can be used as callback
+ functions. Neither C&nbsp;vararg functions nor functions with
+ pass-by-value aggregate argument or result types are supported. There
+-are no restrictions for the kind of Lua functions that can be called
++are no restrictions on the kind of Lua functions that can be called
+ from the callback &mdash; no checks for the proper number of arguments
+ are made. The return value of the Lua function will be converted to the
+-result type and an error will be thrown for invalid conversions.
++result type, and an error will be thrown for invalid conversions.
+ </p>
+ <p>
+ It's allowed to throw errors across a callback invocation, but it's not
+@@ -1001,7 +1001,7 @@ convention cannot be automatically detec
+ <tt>__stdcall</tt> calls <em>to</em> Windows functions.
+ </p>
+ <p>
+-For some use cases it's necessary to free up the resources or to
++For some use cases, it's necessary to free up the resources or to
+ dynamically redirect callbacks. Use an explicit cast to a
+ C&nbsp;function pointer and keep the resulting cdata object. Then use
+ the <a href="ext_ffi_api.html#callback_free"><tt>cb:free()</tt></a>
+@@ -1054,7 +1054,7 @@ GUI application, which waits for user in
+ </p>
+ <p>
+ For new designs <b>avoid push-style APIs</b>: a C&nbsp;function repeatedly
+-calling a callback for each result. Instead <b>use pull-style APIs</b>:
++calling a callback for each result. Instead, <b>use pull-style APIs</b>:
+ call a C&nbsp;function repeatedly to get a new result. Calls from Lua
+ to C via the FFI are much faster than the other way round. Most well-designed
+ libraries already use pull-style APIs (read/write, get/put).
+@@ -1073,7 +1073,7 @@ function.
+ </p>
+ <p>
+ Indexing a C&nbsp;library namespace object with a symbol name (a Lua
+-string) automatically binds it to the library. First the symbol type
++string) automatically binds it to the library. First, the symbol type
+ is resolved &mdash; it must have been declared with
+ <a href="ext_ffi_api.html#ffi_cdef"><tt>ffi.cdef</tt></a>. Then the
+ symbol address is resolved by searching for the symbol name in the
+@@ -1128,7 +1128,7 @@ Performance notice: the JIT compiler spe
+ namespace objects and to the strings used to index it. This
+ effectively turns function cdata objects into constants. It's not
+ useful and actually counter-productive to explicitly cache these
+-function objects, e.g. <tt>local strlen = ffi.C.strlen</tt>. OTOH it
++function objects, e.g. <tt>local strlen = ffi.C.strlen</tt>. OTOH, it
+ <em>is</em> useful to cache the namespace itself, e.g. <tt>local C =
+ ffi.C</tt>.
+ </p>
+@@ -1153,14 +1153,14 @@ This behavior is inevitable, since the g
+ interoperability with C&nbsp;code. Adding extra safety measures, like
+ bounds checks, would be futile. There's no way to detect
+ misdeclarations of C&nbsp;functions, since shared libraries only
+-provide symbol names, but no type information. Likewise there's no way
++provide symbol names, but no type information. Likewise, there's no way
+ to infer the valid range of indexes for a returned pointer.
+ </p>
+ <p>
+ Again: the FFI library is a low-level library. This implies it needs
+ to be used with care, but it's flexibility and performance often
+ outweigh this concern. If you're a C or C++ developer, it'll be easy
+-to apply your existing knowledge. OTOH writing code for the FFI
++to apply your existing knowledge. OTOH, writing code for the FFI
+ library is not for the faint of heart and probably shouldn't be the
+ first exercise for someone with little experience in Lua, C or C++.
+ </p>
+@@ -1188,7 +1188,7 @@ currently incomplete:
+ <li>C&nbsp;declarations are not passed through a C&nbsp;pre-processor,
+ yet.</li>
+ <li>The C&nbsp;parser is able to evaluate most constant expressions
+-commonly found in C&nbsp;header files. However it doesn't handle the
++commonly found in C&nbsp;header files. However, it doesn't handle the
+ full range of C&nbsp;expression semantics and may fail for some
+ obscure constructs.</li>
+ <li><tt>static const</tt> declarations only work for integer types
+@@ -1246,7 +1246,7 @@ compiled.</li>
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi_tutorial.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_ffi_tutorial.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_ffi_tutorial.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>FFI Tutorial</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -44,6 +44,8 @@ td.idiomlua b { font-weight: normal; col
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -51,11 +53,9 @@ td.idiomlua b { font-weight: normal; col
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -83,7 +83,7 @@ of its functions:
+ local ffi = require("ffi")
+ </pre>
+ <p>
+-Please note this doesn't define an <tt>ffi</tt> variable in the table
++Please note, this doesn't define an <tt>ffi</tt> variable in the table
+ of globals &mdash; you really need to use the local variable. The
+ <tt>require</tt> function ensures the library is only loaded once.
+ </p>
+@@ -192,7 +192,7 @@ don't need to declare them as such.
+ <span class="mark">&#9316;</span> The <tt>poll()</tt>
+ function takes a couple more arguments we're not going to use. You can
+ simply use <tt>nil</tt> to pass a <tt>NULL</tt> pointer and <tt>0</tt>
+-for the <tt>nfds</tt> parameter. Please note that the
++for the <tt>nfds</tt> parameter. Please note, that the
+ number&nbsp;<tt>0</tt> <em>does not convert to a pointer value</em>,
+ unlike in C++. You really have to pass pointers to pointer arguments
+ and numbers to number arguments.
+@@ -289,12 +289,12 @@ Here's the step-by-step explanation:
+ <p>
+ <span class="mark">&#9312;</span> This defines some of the
+ C&nbsp;functions provided by zlib. For the sake of this example, some
+-type indirections have been reduced and it uses the pre-defined
++type indirections have been reduced and it uses the predefined
+ fixed-size integer types, while still adhering to the zlib API/ABI.
+ </p>
+ <p>
+ <span class="mark">&#9313;</span> This loads the zlib shared
+-library. On POSIX systems it's named <tt>libz.so</tt> and usually
++library. On POSIX systems, it's named <tt>libz.so</tt> and usually
+ comes pre-installed. Since <tt>ffi.load()</tt> automatically adds any
+ missing standard prefixes/suffixes, we can simply load the
+ <tt>"z"</tt> library. On Windows it's named <tt>zlib1.dll</tt> and
+@@ -322,7 +322,7 @@ actual length that was used.
+ <p>
+ In C you'd pass in the address of a local variable
+ (<tt>&amp;buflen</tt>). But since there's no address-of operator in
+-Lua, we'll just pass in a one-element array. Conveniently it can be
++Lua, we'll just pass in a one-element array. Conveniently, it can be
+ initialized with the maximum buffer size in one step. Calling the
+ actual <tt>zlib.compress2</tt> function is then straightforward.
+ </p>
+@@ -346,7 +346,7 @@ for garbage collection and string intern
+ <span class="mark">&#9317;</span> The <tt>uncompress</tt>
+ functions does the exact opposite of the <tt>compress</tt> function.
+ The compressed data doesn't include the size of the original string,
+-so this needs to be passed in. Otherwise no surprises here.
++so this needs to be passed in. Otherwise, no surprises here.
+ </p>
+ <p>
+ <span class="mark">&#9318;</span> The code, that makes use
+@@ -380,7 +380,7 @@ Ok, so the <tt>ffi.*</tt> functions gene
+ wherever you'd want to use a number. That's why we get a away with
+ passing <tt>n</tt> to <tt>ffi.string()</tt> above. But other Lua
+ library functions or modules don't know how to deal with this. So for
+-maximum portability one needs to use <tt>tonumber()</tt> on returned
++maximum portability, one needs to use <tt>tonumber()</tt> on returned
+ <tt>long</tt> results before passing them on. Otherwise the
+ application might work on some systems, but would fail in a POSIX/x64
+ environment.
+@@ -452,7 +452,7 @@ the origin.
+ </p>
+ <p>
+ <span class="mark">&#9315;</span> If we run out of operators, we can
+-define named methods, too. Here the <tt>__index</tt> table defines an
++define named methods, too. Here, the <tt>__index</tt> table defines an
+ <tt>area</tt> function. For custom indexing needs, one might want to
+ define <tt>__index</tt> and <tt>__newindex</tt> <em>functions</em> instead.
+ </p>
+@@ -466,13 +466,13 @@ be used e.g. to create an array of point
+ apply to any and all uses of this type.
+ </p>
+ <p>
+-Please note that the association with a metatable is permanent and
++Please note, that the association with a metatable is permanent and
+ <b>the metatable must not be modified afterwards!</b> Ditto for the
+ <tt>__index</tt> table.
+ </p>
+ <p>
+ <span class="mark">&#9317;</span> Here are some simple usage examples
+-for the point type and their expected results. The pre-defined
++for the point type and their expected results. The predefined
+ operations (such as <tt>a.x</tt>) can be freely mixed with the newly
+ defined metamethods. Note that <tt>area</tt> is a method and must be
+ called with the Lua syntax for methods: <tt>a:area()</tt>, not
+@@ -481,7 +481,7 @@ called with the Lua syntax for methods:
+ <p>
+ The C&nbsp;type metamethod mechanism is most useful when used in
+ conjunction with C&nbsp;libraries that are written in an object-oriented
+-style. Creators return a pointer to a new instance and methods take an
++style. Creators return a pointer to a new instance, and methods take an
+ instance pointer as the first argument. Sometimes you can just point
+ <tt>__index</tt> to the library namespace and <tt>__gc</tt> to the
+ destructor and you're done. But often enough you'll want to add
+@@ -567,7 +567,7 @@ end
+ </pre>
+ <p>
+ This turns them into indirect calls and generates bigger and slower
+-machine code. Instead you'll want to cache the namespace itself and
++machine code. Instead, you'll want to cache the namespace itself and
+ rely on the JIT compiler to eliminate the lookups:
+ </p>
+ <pre class="code">
+@@ -587,7 +587,7 @@ it to a local variable in the function s
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_jit.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_jit.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_jit.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>jit.* Library</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -37,6 +37,8 @@
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a class="current" href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -44,11 +46,9 @@
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -135,7 +135,9 @@ Contains the LuaJIT version string.
+ <h3 id="jit_version_num"><tt>jit.version_num</tt></h3>
+ <p>
+ Contains the version number of the LuaJIT core. Version xx.yy.zz
+-is represented by the decimal number xxyyzz.
++is represented by the decimal number xxyyzz.<br>
++<b>DEPRECATED after the switch to
++<a href="https://luajit.org/status.html#release"><span class="ext">&raquo;</span>&nbsp;rolling releases</a>. zz is frozen at 99.</b>
+ </p>
+ 
+ <h3 id="jit_os"><tt>jit.os</tt></h3>
+@@ -152,7 +154,7 @@ Contains the target architecture name:
+ 
+ <h2 id="jit_opt"><tt>jit.opt.*</tt> &mdash; JIT compiler optimization control</h2>
+ <p>
+-This sub-module provides the backend for the <tt>-O</tt> command line
++This submodule provides the backend for the <tt>-O</tt> command line
+ option.
+ </p>
+ <p>
+@@ -172,7 +174,7 @@ which was one of the ways to enable opti
+ 
+ <h2 id="jit_util"><tt>jit.util.*</tt> &mdash; JIT compiler introspection</h2>
+ <p>
+-This sub-module holds functions to introspect the bytecode, generated
++This submodule holds functions to introspect the bytecode, generated
+ traces, the IR and the generated machine code. The functionality
+ provided by this module is still in flux and therefore undocumented.
+ </p>
+@@ -185,7 +187,7 @@ if you want to know more.
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_profiler.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/ext_profiler.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/ext_profiler.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>Profiler</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -37,6 +37,8 @@
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -44,11 +46,9 @@
+ <a class="current" href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -156,7 +156,7 @@ To see how much time is spent in differe
+ Combinations of <tt>v/z</tt> with <tt>f/F/l</tt> produce two-level
+ views, e.g. <tt>-jp=vf</tt> or <tt>-jp=fv</tt>. This shows the time
+ spent in a VM state or zone vs. hotspots. This can be used to answer
+-questions like "Which time consuming functions are only interpreted?" or
++questions like "Which time-consuming functions are only interpreted?" or
+ "What's the garbage collector overhead for a specific function?".
+ </p>
+ <p>
+@@ -215,7 +215,7 @@ local profile = require("jit.profile")
+ This module can be used to implement your own higher-level profiler.
+ A typical profiling run starts the profiler, captures stack dumps in
+ the profiler callback, adds them to a hash table to aggregate the number
+-of samples, stops the profiler and then analyzes all of the captured
++of samples, stops the profiler and then analyzes all captured
+ stack dumps. Other parameters can be sampled in the profiler callback,
+ too. But it's important not to spend too much time in the callback,
+ since this may skew the statistics.
+@@ -269,9 +269,9 @@ returns a string with a stack dump for t
+ formatted according to the <tt>fmt</tt> argument:
+ </p>
+ <ul>
+-<li><tt>p</tt> &mdash; Preserve the full path for module names. Otherwise
++<li><tt>p</tt> &mdash; Preserve the full path for module names. Otherwise,
+ only the file name is used.</li>
+-<li><tt>f</tt> &mdash; Dump the function name if it can be derived. Otherwise
++<li><tt>f</tt> &mdash; Dump the function name if it can be derived. Otherwise,
+ use module:line.</li>
+ <li><tt>F</tt> &mdash; Ditto, but dump module:name.</li>
+ <li><tt>l</tt> &mdash; Dump module:line.</li>
+@@ -349,7 +349,7 @@ use.
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/extensions.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/extensions.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/extensions.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>Extensions</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -54,6 +54,8 @@ td.excinterop {
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -61,16 +63,15 @@ td.excinterop {
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+ </div>
+ <div id="main">
++
+ <p>
+ LuaJIT is fully upwards-compatible with Lua 5.1. It supports all
+ <a href="https://www.lua.org/manual/5.1/manual.html#5"><span class="ext">&raquo;</span>&nbsp;standard Lua
+@@ -86,7 +87,7 @@ or LuaJIT.
+ </p>
+ <p>
+ LuaJIT extends the standard Lua VM with new functionality and adds
+-several extension modules. Please note this page is only about
++several extension modules. Please note, this page is only about
+ <em>functional</em> enhancements and not about performance enhancements,
+ such as the optimized VM, the faster interpreter or the JIT compiler.
+ </p>
+@@ -195,7 +196,7 @@ usage. See also the
+ </p>
+ <p>
+ The generated bytecode is portable and can be loaded on any architecture
+-that LuaJIT supports, independent of word size or endianess. However the
++that LuaJIT supports, independent of word size or endianness. However, the
+ bytecode compatibility versions must match. Bytecode stays compatible
+ for dot releases (x.y.0 &rarr; x.y.1), but may change with major or
+ minor releases (2.0 &rarr; 2.1) or between any beta release. Foreign
+@@ -227,7 +228,7 @@ avoids managing backlinks, saves an allo
+ incremental array/hash part growth.
+ </p>
+ <p>
+-Please note this function is meant for very specific situations. In most
++Please note, this function is meant for very specific situations. In most
+ cases it's better to replace the (usually single) link with a new table
+ and let the GC do its work.
+ </p>
+@@ -237,7 +238,7 @@ and let the GC do its work.
+ LuaJIT uses a Tausworthe PRNG with period 2^223 to implement
+ <tt>math.random()</tt> and <tt>math.randomseed()</tt>. The quality of
+ the PRNG results is much superior compared to the standard Lua
+-implementation which uses the platform-specific ANSI rand().
++implementation, which uses the platform-specific ANSI rand().
+ </p>
+ <p>
+ The PRNG generates the same sequences from the same seeds on all
+@@ -255,7 +256,7 @@ Important: Neither this nor any other PR
+ <h3 id="io"><tt>io.*</tt> functions handle 64&nbsp;bit file offsets</h3>
+ <p>
+ The file I/O functions in the standard <tt>io.*</tt> library handle
+-64&nbsp;bit file offsets. In particular this means it's possible
++64&nbsp;bit file offsets. In particular, this means it's possible
+ to open files larger than 2&nbsp;Gigabytes and to reposition or obtain
+ the current file position for offsets beyond 2&nbsp;GB
+ (<tt>fp:seek()</tt> method).
+@@ -392,29 +393,19 @@ the toolchain used to compile LuaJIT:
+ <td class="excinterop">Interoperability</td>
+ </tr>
+ <tr class="odd separate">
+-<td class="excplatform">POSIX/x64, DWARF2 unwinding</td>
+-<td class="exccompiler">GCC 4.3+, Clang</td>
++<td class="excplatform">External frame unwinding</td>
++<td class="exccompiler">GCC, Clang, MSVC</td>
+ <td class="excinterop"><b style="color: #00a000;">Full</b></td>
+ </tr>
+ <tr class="even">
+-<td class="excplatform">ARM <tt>-DLUAJIT_UNWIND_EXTERNAL</tt></td>
+-<td class="exccompiler">GCC, Clang</td>
+-<td class="excinterop"><b style="color: #00a000;">Full</b></td>
+-</tr>
+-<tr class="odd">
+-<td class="excplatform">Other platforms, DWARF2 unwinding</td>
++<td class="excplatform">Internal frame unwinding + DWARF2</td>
+ <td class="exccompiler">GCC, Clang</td>
+ <td class="excinterop"><b style="color: #c06000;">Limited</b></td>
+ </tr>
+-<tr class="even">
+-<td class="excplatform">Windows/x64</td>
+-<td class="exccompiler">MSVC</td>
+-<td class="excinterop"><b style="color: #00a000;">Full</b></td>
+-</tr>
+ <tr class="odd">
+-<td class="excplatform">Windows/x86</td>
+-<td class="exccompiler">Any</td>
+-<td class="excinterop"><b style="color: #00a000;">Full</b></td>
++<td class="excplatform">Windows 64 bit</td>
++<td class="exccompiler">non-MSVC</td>
++<td class="excinterop"><b style="color: #c06000;">Limited</b></td>
+ </tr>
+ <tr class="even">
+ <td class="excplatform">Other platforms</td>
+@@ -470,7 +461,7 @@ C++ destructors.</li>
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/faq.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/faq.html
++++ /dev/null
+@@ -1,185 +0,0 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+-<html>
+-<head>
+-<title>Frequently Asked Questions (FAQ)</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
+-<meta name="Language" content="en">
+-<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+-<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+-<style type="text/css">
+-dd { margin-left: 1.5em; }
+-</style>
+-</head>
+-<body>
+-<div id="site">
+-<a href="https://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+-</div>
+-<div id="head">
+-<h1>Frequently Asked Questions (FAQ)</h1>
+-</div>
+-<div id="nav">
+-<ul><li>
+-<a href="luajit.html">LuaJIT</a>
+-<ul><li>
+-<a href="https://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+-</li><li>
+-<a href="install.html">Installation</a>
+-</li><li>
+-<a href="running.html">Running</a>
+-</li></ul>
+-</li><li>
+-<a href="extensions.html">Extensions</a>
+-<ul><li>
+-<a href="ext_ffi.html">FFI Library</a>
+-<ul><li>
+-<a href="ext_ffi_tutorial.html">FFI Tutorial</a>
+-</li><li>
+-<a href="ext_ffi_api.html">ffi.* API</a>
+-</li><li>
+-<a href="ext_ffi_semantics.html">FFI Semantics</a>
+-</li></ul>
+-</li><li>
+-<a href="ext_jit.html">jit.* Library</a>
+-</li><li>
+-<a href="ext_c_api.html">Lua/C API</a>
+-</li><li>
+-<a href="ext_profiler.html">Profiler</a>
+-</li></ul>
+-</li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a class="current" href="faq.html">FAQ</a>
+-</li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
+-</li><li>
+-<a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+-</li></ul>
+-</div>
+-<div id="main">
+-<dl id="info">
+-<dt>Q: Where can I learn more about LuaJIT and Lua?</dt>
+-<dd>
+-<ul style="padding: 0;">
+-<li>The <a href="https://luajit.org/list.html"><span class="ext">&raquo;</span>&nbsp;LuaJIT mailing list</a> focuses on topics
+-related to LuaJIT.</li>
+-<li>The <a href="http://wiki.luajit.org/"><span class="ext">&raquo;</span>&nbsp;LuaJIT wiki</a> gathers community
+-resources about LuaJIT.</li>
+-<li>News about Lua itself can be found at the
+-<a href="https://www.lua.org/lua-l.html"><span class="ext">&raquo;</span>&nbsp;Lua mailing list</a>.
+-The mailing list archives are worth checking out for older postings
+-about LuaJIT.</li>
+-<li>The <a href="https://lua.org"><span class="ext">&raquo;</span>&nbsp;main Lua.org site</a> has complete
+-<a href="https://www.lua.org/docs.html"><span class="ext">&raquo;</span>&nbsp;documentation</a> of the language
+-and links to books and papers about Lua.</li>
+-<li>The community-managed <a href="http://lua-users.org/wiki/"><span class="ext">&raquo;</span>&nbsp;Lua Wiki</a>
+-has information about diverse topics.</li>
+-</ul>
+-</dl>
+-
+-<dl id="tech">
+-<dt>Q: Where can I learn more about the compiler technology used by LuaJIT?</dt>
+-<dd>
+-Please use the following Google Scholar searches to find relevant papers:<br>
+-Search for: <a href="https://scholar.google.com/scholar?q=Trace+Compiler"><span class="ext">&raquo;</span>&nbsp;Trace Compiler</a><br>
+-Search for: <a href="https://scholar.google.com/scholar?q=JIT+Compiler"><span class="ext">&raquo;</span>&nbsp;JIT Compiler</a><br>
+-Search for: <a href="https://scholar.google.com/scholar?q=Dynamic+Language+Optimizations"><span class="ext">&raquo;</span>&nbsp;Dynamic Language Optimizations</a><br>
+-Search for: <a href="https://scholar.google.com/scholar?q=SSA+Form"><span class="ext">&raquo;</span>&nbsp;SSA Form</a><br>
+-Search for: <a href="https://scholar.google.com/scholar?q=Linear+Scan+Register+Allocation"><span class="ext">&raquo;</span>&nbsp;Linear Scan Register Allocation</a><br>
+-Here is a list of the <a href="http://lua-users.org/lists/lua-l/2009-11/msg00089.html"><span class="ext">&raquo;</span>&nbsp;innovative features in LuaJIT</a>.<br>
+-And, you know, reading the source is of course the only way to enlightenment.
+-</dd>
+-</dl>
+-
+-<dl id="arg">
+-<dt>Q: Why do I get this error: "attempt to index global 'arg' (a nil value)"?<br>
+-Q: My vararg functions fail after switching to LuaJIT!</dt>
+-<dd>LuaJIT is compatible to the Lua 5.1 language standard. It doesn't
+-support the implicit <tt>arg</tt> parameter for old-style vararg
+-functions from Lua 5.0.<br>Please convert your code to the
+-<a href="https://www.lua.org/manual/5.1/manual.html#2.5.9"><span class="ext">&raquo;</span>&nbsp;Lua 5.1
+-vararg syntax</a>.</dd>
+-</dl>
+-
+-<dl id="x87">
+-<dt>Q: Why do I get this error: "bad FPU precision"?<br>
+-<dt>Q: I get weird behavior after initializing Direct3D.<br>
+-<dt>Q: Some FPU operations crash after I load a Delphi DLL.<br>
+-</dt>
+-<dd>
+-
+-DirectX/Direct3D (up to version 9) sets the x87 FPU to single-precision
+-mode by default. This violates the Windows ABI and interferes with the
+-operation of many programs &mdash; LuaJIT is affected, too. Please make
+-sure you always use the <tt>D3DCREATE_FPU_PRESERVE</tt> flag when
+-initializing Direct3D.<br>
+-
+-Direct3D version 10 or higher do not show this behavior anymore.
+-Consider testing your application with older versions, too.<br>
+-
+-Similarly, the Borland/Delphi runtime modifies the FPU control word and
+-enables FP exceptions. Of course this violates the Windows ABI, too.
+-Please check the Delphi docs for the Set8087CW method.
+-
+-</dl>
+-
+-<dl id="ctrlc">
+-<dt>Q: Sometimes Ctrl-C fails to stop my Lua program. Why?</dt>
+-<dd>The interrupt signal handler sets a Lua debug hook. But this is
+-ignored by compiled code. If your program is running in a tight loop
+-and never falls back to the interpreter, the debug hook never runs and
+-can't throw the "interrupted!" error.<br>
+-You have to press Ctrl-C twice to get stop your program. That's similar
+-to when it's stuck running inside a C function under the Lua interpreter.</dd>
+-</dl>
+-
+-<dl id="sandbox">
+-<dt>Q: Can Lua code be safely sandboxed?</dt>
+-<dd>
+-Maybe for an extremly restricted subset of Lua and if you relentlessly
+-scrutinize every single interface function you offer to the untrusted code.<br>
+-
+-Although Lua provides some sandboxing functionality (<tt>setfenv()</tt>, hooks),
+-it's very hard to get this right even for the Lua core libraries. Of course,
+-you'll need to inspect any extension library, too. And there are libraries
+-that are inherently unsafe, e.g. the <a href="ext_ffi.html">FFI library</a>.<br>
+-
+-More reading material at the <a href="http://lua-users.org/wiki/SandBoxes"><span class="ext">&raquo;</span>&nbsp;Lua Wiki</a> and <a href="https://en.wikipedia.org/wiki/Sandbox_(computer_security)"><span class="ext">&raquo;</span>&nbsp;Wikipedia</a>.<br><br>
+-
+-Relatedly, <b>loading untrusted bytecode is not safe!</b><br>
+-
+-It's trivial to crash the Lua or LuaJIT VM with maliciously crafted bytecode.
+-This is well known and there's no bytecode verification on purpose, so please
+-don't report a bug about it. Check the <tt>mode</tt> parameter for the
+-<tt>load*()</tt> functions to disable loading of bytecode.<br><br>
+-
+-<b>In general, the only promising approach is to sandbox Lua code at the
+-process level and not the VM level.</b>
+-</dd>
+-</dl>
+-
+-<dl id="arch">
+-<dt>Q: Lua runs everywhere. Why doesn't LuaJIT support my CPU?</dt>
+-<dd>Because it's a compiler &mdash; it needs to generate native
+-machine code. This means the code generator must be ported to each
+-architecture. And the fast interpreter is written in assembler and
+-must be ported, too. This is quite an undertaking.<br>
+-The <a href="install.html">install documentation</a> shows the supported
+-architectures.<br>
+-Other architectures may follow based on sufficient user demand and
+-market-relevance of the architecture. Sponsoring is required to develop
+-the port itself, to integrate it and to continuously maintain it in the
+-actively developed branches.</dd>
+-</dl>
+-<br class="flush">
+-</div>
+-<div id="foot">
+-<hr class="hide">
+-Copyright &copy; 2005-2021
+-<span class="noprint">
+-&middot;
+-<a href="contact.html">Contact</a>
+-</span>
+-</div>
+-</body>
+-</html>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/install.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/install.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/install.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>Installation</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -14,25 +14,20 @@ table.compat {
+ }
+ table.compat td {
+   border: 1px solid #bfcfff;
+-  height: 2.5em;
++  height: 1.5em;
+ }
+ table.compat tr.compathead td {
+   font-weight: bold;
+   border-bottom: 2px solid #bfcfff;
+ }
+-tr.compathead td.compatos {
+-  vertical-align: top;
++td.compatname {
++  width: 10%;
+ }
+-table.compat td.compatcpu {
+-  width: 18%;
+-  border-right: 2px solid #bfcfff;
++td.compatbits {
++  width: 5%;
+ }
+-td.compatos {
++td.compatx {
+   width: 21%;
+-  vertical-align: middle;
+-}
+-td.compatno {
+-  background-color: #d0d0d0;
+ }
+ </style>
+ </head>
+@@ -65,6 +60,8 @@ td.compatno {
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -72,20 +69,18 @@ td.compatno {
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+ </div>
+ <div id="main">
+ <p>
+-LuaJIT is only distributed as a source package. This page explains
+-how to build and install LuaJIT with different operating systems
+-and C&nbsp;compilers.
++LuaJIT is only distributed as source code &mdash; get it from the
++<a href="https://luajit.org/download.html"><span class="ext">&raquo;</span>&nbsp;git repository</a>. This page explains how to build
++and install the LuaJIT binary and library for different operating systems.
+ </p>
+ <p>
+ For the impatient (on POSIX systems):
+@@ -93,62 +88,24 @@ For the impatient (on POSIX systems):
+ <pre class="code">
+ make &amp;&amp; sudo make install
+ </pre>
++
++<h2 id="req">Requirements</h2>
+ <p>
+-LuaJIT currently builds out-of-the box on most systems.
+-Here's the compatibility matrix for the supported combinations of
+-operating systems, CPUs and compilers:
++LuaJIT currently builds out of the box on most systems. Please check the
++supported operating systems and CPU architectures on the
++<a href="https://luajit.org/status.html"><span class="ext">&raquo;</span>&nbsp;status page</a>.
++</p>
++<p>
++Building LuaJIT requires a recent toolchain based on GCC, Clang/LLVM or
++MSVC++.
++</p>
++<p>
++The Makefile-based build system requires GNU Make and supports
++cross-builds.
++</p>
++<p>
++Batch files are provided for MSVC++ builds and console cross-builds.
+ </p>
+-<table class="compat">
+-<tr class="compathead">
+-<td class="compatcpu">CPU / OS</td>
+-<td class="compatos"><a href="#posix">Linux</a> or<br><a href="#android">Android</a></td>
+-<td class="compatos"><a href="#posix">*BSD, Other</a></td>
+-<td class="compatos"><a href="#posix">macOS 10.4+</a> or<br><a href="#ios">iOS 3.0+</a></td>
+-<td class="compatos"><a href="#windows">Windows 7<br>or later</a></td>
+-</tr>
+-<tr class="odd separate">
+-<td class="compatcpu">x86 (32 bit)</td>
+-<td class="compatos">GCC 4.2+</td>
+-<td class="compatos">GCC 4.2+</td>
+-<td class="compatos">XCode 5.0+<br>Clang</td>
+-<td class="compatos">MSVC<br>MinGW, Cygwin</td>
+-</tr>
+-<tr class="even">
+-<td class="compatcpu">x64 (64 bit)</td>
+-<td class="compatos">GCC 4.2+</td>
+-<td class="compatos">GCC 4.2+<br>ORBIS (<a href="#ps4">PS4</a>)</td>
+-<td class="compatos">XCode 5.0+<br>Clang</td>
+-<td class="compatos">MSVC<br>Durango (<a href="#xboxone">Xbox One</a>)</td>
+-</tr>
+-<tr class="odd">
+-<td class="compatcpu"><a href="#cross2">ARMv5+<br>ARM9E+</a></td>
+-<td class="compatos">GCC 4.2+</td>
+-<td class="compatos">GCC 4.2+<br>PSP2 (<a href="#psvita">PS VITA</a>)</td>
+-<td class="compatos">XCode 5.0+<br>Clang</td>
+-<td class="compatos compatno">&nbsp;</td>
+-</tr>
+-<tr class="even">
+-<td class="compatcpu"><a href="#cross2">ARM64<br>ARM64be</a></td>
+-<td class="compatos">GCC 4.8+</td>
+-<td class="compatos compatno">&nbsp;</td>
+-<td class="compatos">XCode 6.0+<br>Clang 3.5+</td>
+-<td class="compatos compatno">&nbsp;</td>
+-</tr>
+-<tr class="odd">
+-<td class="compatcpu"><a href="#cross2">PPC</a></td>
+-<td class="compatos">GCC 4.3+</td>
+-<td class="compatos">GCC 4.3+<br>GCC 4.1 (<a href="#ps3">PS3</a>)</td>
+-<td class="compatos compatno">&nbsp;</td>
+-<td class="compatos">XEDK (<a href="#xbox360">Xbox 360</a>)</td>
+-</tr>
+-<tr class="even">
+-<td class="compatcpu"><a href="#cross2">MIPS32<br>MIPS64<br>MIPS64r6</a></td>
+-<td class="compatos">GCC 4.3+</td>
+-<td class="compatos">GCC 4.3+</td>
+-<td class="compatos compatno">&nbsp;</td>
+-<td class="compatos compatno">&nbsp;</td>
+-</tr>
+-</table>
+ 
+ <h2>Configuring LuaJIT</h2>
+ <p>
+@@ -157,7 +114,6 @@ Usually there is no need to tweak the se
+ hold all user-configurable settings:
+ </p>
+ <ul>
+-<li><tt>src/luaconf.h</tt> sets some configuration variables.</li>
+ <li><tt>Makefile</tt> has settings for <b>installing</b> LuaJIT (POSIX
+ only).</li>
+ <li><tt>src/Makefile</tt> has settings for <b>compiling</b> LuaJIT
+@@ -180,20 +136,12 @@ Please check the note about the
+ <h2 id="posix">POSIX Systems (Linux, macOS, *BSD etc.)</h2>
+ <h3>Prerequisites</h3>
+ <p>
+-Depending on your distribution, you may need to install a package for
+-GCC, the development headers and/or a complete SDK. E.g. on a current
+-Debian/Ubuntu, install <tt>libc6-dev</tt> with the package manager.
++Depending on your distribution, you may need to install a package for a
++compiler (GCC or Clang/LLVM), the development headers and/or a complete SDK.
++E.g. on a current Debian/Ubuntu, install <tt>build-essential</tt> with the
++package manager.
+ </p>
+-<p>
+-The recommended way to fetch the latest version is to do a pull from
+-the git repository. Alternatively download the latest source package of
+-LuaJIT (pick the .tar.gz). Move it to a directory of your choice,
+-open a terminal window and change to this directory. Now unpack the archive
+-and change to the newly created directory:
+-</p>
+-<pre class="code">
+-tar zxf LuaJIT-2.1.0-beta3.tar.gz
+-cd LuaJIT-2.1.0-beta3</pre>
++
+ <h3>Building LuaJIT</h3>
+ <p>
+ The supplied Makefiles try to auto-detect the settings needed for your
+@@ -253,15 +201,10 @@ Either install one of the open source SD
+ GCC plus the required development headers.
+ Or install Microsoft's Visual Studio (MSVC).
+ </p>
+-<p>
+-Next, pull from the git repository or download the source package and
+-unpack it using an archive manager (e.g. the Windows Explorer) to
+-a directory of your choice.
+-</p>
+ <h3>Building with MSVC</h3>
+ <p>
+-Open a "Visual Studio Command Prompt" (either x86 or x64), <tt>cd</tt> to the
+-directory where you've unpacked the sources and run these commands:
++Open a "Visual Studio Command Prompt" (x86, x64 or ARM64), <tt>cd</tt> to the
++directory with the source code and run these commands:
+ </p>
+ <pre class="code">
+ cd src
+@@ -271,11 +214,14 @@ msvcbuild
+ Check the <tt>msvcbuild.bat</tt> file for more options.
+ Then follow the installation instructions below.
+ </p>
++<p>
++For an x64 to ARM64 cross-build run this first: <tt>vcvarsall.bat x64_arm64</tt>
++</p>
+ <h3>Building with MinGW or Cygwin</h3>
+ <p>
+ Open a command prompt window and make sure the MinGW or Cygwin programs
+-are in your path. Then <tt>cd</tt> to the directory of the git repository
+-or where you've unpacked the sources. Then run this command for MinGW:
++are in your path. Then <tt>cd</tt> to the directory of the git repository.
++Then run this command for MinGW:
+ </p>
+ <pre class="code">
+ mingw32-make
+@@ -377,15 +323,15 @@ make HOST_CC="gcc -m32" CROSS=arm-linux-
+ make HOST_CC="gcc -m32" CROSS=arm-linux-gnueabihf-
+ 
+ # ARM64
+-make CROSS=aarch64-linux-
++make CROSS=aarch64-linux-gnu-
+ 
+ # PPC
+ make HOST_CC="gcc -m32" CROSS=powerpc-linux-gnu-
+ 
+ # MIPS32 big-endian
+-make HOST_CC="gcc -m32" CROSS=mips-linux-
++make HOST_CC="gcc -m32" CROSS=mips-linux-gnu-
+ # MIPS32 little-endian
+-make HOST_CC="gcc -m32" CROSS=mipsel-linux-
++make HOST_CC="gcc -m32" CROSS=mipsel-linux-gnu-
+ 
+ # MIPS64 big-endian
+ make CROSS=mips-linux- TARGET_CFLAGS="-mips64r2 -mabi=64"
+@@ -405,7 +351,8 @@ NDKCROSS=$NDKBIN/aarch64-linux-android-
+ NDKCC=$NDKBIN/aarch64-linux-android21-clang
+ make CROSS=$NDKCROSS \
+      STATIC_CC=$NDKCC DYNAMIC_CC="$NDKCC -fPIC" \
+-     TARGET_LD=$NDKCC
++     TARGET_LD=$NDKCC TARGET_AR="$NDKBIN/llvm-ar rcus" \
++     TARGET_STRIP=$NDKBIN/llvm-strip
+ 
+ # Android/ARM, armeabi-v7a (ARMv7 VFP), Android 4.1+ (JB)
+ NDKDIR=/opt/android/ndk
+@@ -414,7 +361,8 @@ NDKCROSS=$NDKBIN/arm-linux-androideabi-
+ NDKCC=$NDKBIN/armv7a-linux-androideabi16-clang
+ make HOST_CC="gcc -m32" CROSS=$NDKCROSS \
+      STATIC_CC=$NDKCC DYNAMIC_CC="$NDKCC -fPIC" \
+-     TARGET_LD=$NDKCC
++     TARGET_LD=$NDKCC TARGET_AR="$NDKBIN/llvm-ar rcus" \
++     TARGET_STRIP=$NDKBIN/llvm-strip
+ </pre>
+ <p>
+ You can cross-compile for <b id="ios">iOS 3.0+</b> (iPhone/iPad) using the <a href="https://developer.apple.com/ios/"><span class="ext">&raquo;</span>&nbsp;iOS SDK</a>:
+@@ -438,8 +386,7 @@ make DEFAULT_CC=clang CROSS="$(dirname $
+ <h3 id="consoles">Cross-compiling for consoles</h3>
+ <p>
+ Building LuaJIT for consoles requires both a supported host compiler
+-(x86 or x64) and a cross-compiler (to PPC or ARM) from the official
+-console SDK.
++(x86 or x64) and a cross-compiler from the official console SDK.
+ </p>
+ <p>
+ Due to restrictions on consoles, the JIT compiler is disabled and only
+@@ -460,45 +407,58 @@ To cross-compile for <b id="ps3">PS3</b>
+ make HOST_CC="gcc -m32" CROSS=ppu-lv2-
+ </pre>
+ <p>
+-To cross-compile for <b id="ps4">PS4</b> from a Windows host,
+-open a "Visual Studio .NET Command Prompt" (64&nbsp;bit host compiler),
+-<tt>cd</tt> to the directory where you've unpacked the sources and
+-run the following commands:
++To cross-compile for the other consoles from a Windows host, open a
++"Native Tools Command Prompt for VS". You need to choose either the 32
++or the 64&nbsp;bit version of the host compiler to match the target.
++Then <tt>cd</tt> to the <tt>src</tt> directory below the source code
++and run the build command given in the table:
+ </p>
+-<pre class="code">
+-cd src
+-ps4build
+-</pre>
+-<p>
+-To cross-compile for <b id="psvita">PS Vita</b> from a Windows host,
+-open a "Visual Studio .NET Command Prompt" (32&nbsp;bit host compiler),
+-<tt>cd</tt> to the directory where you've unpacked the sources and
+-run the following commands:
+-</p>
+-<pre class="code">
+-cd src
+-psvitabuild
+-</pre>
+-<p>
+-To cross-compile for <b id="xbox360">Xbox 360</b> from a Windows host,
+-open a "Visual Studio .NET Command Prompt" (32&nbsp;bit host compiler),
+-<tt>cd</tt> to the directory where you've unpacked the sources and run
+-the following commands:
+-</p>
+-<pre class="code">
+-cd src
+-xedkbuild
+-</pre>
++<table class="compat">
++<tr class="compathead">
++<td class="compatname">Console</td>
++<td class="compatbits">Bits</td>
++<td class="compatx">Build Command</td>
++</tr>
++<tr class="odd separate">
++<td class="compatname"><b id="ps4">PS4</b></td>
++<td class="compatbits">64</td>
++<td class="compatx"><tt>ps4build</tt></td>
++</tr>
++<tr class="even">
++<td class="compatname"><b id="ps5">PS5</b></td>
++<td class="compatbits">64</td>
++<td class="compatx"><tt>ps5build</tt></td>
++</tr>
++<tr class="odd">
++<td class="compatname"><b id="psvita">PS Vita</b></td>
++<td class="compatbits">32</td>
++<td class="compatx"><tt>psvitabuild</tt></td>
++</tr>
++<tr class="even">
++<td class="compatname"><b id="xbox360">Xbox 360</b></td>
++<td class="compatbits">32</td>
++<td class="compatx"><tt>xedkbuild</tt></td>
++</tr>
++<tr class="odd">
++<td class="compatname"><b id="xboxone">Xbox One</b></td>
++<td class="compatbits">64</td>
++<td class="compatx"><tt>xb1build</tt></td>
++</tr>
++<tr class="even">
++<td class="compatname"><b id="nx32">Nintendo Switch NX32</b></td>
++<td class="compatbits">32</td>
++<td class="compatx"><tt>nxbuild</tt></td>
++</tr>
++<tr class="odd">
++<td class="compatname"><b id="nx64">Nintendo Switch NX64</b></td>
++<td class="compatbits">64</td>
++<td class="compatx"><tt>nxbuild</tt></td>
++</tr>
++</table>
+ <p>
+-To cross-compile for <b id="xboxone">Xbox One</b> from a Windows host,
+-open a "Visual Studio .NET Command Prompt" (64&nbsp;bit host compiler),
+-<tt>cd</tt> to the directory where you've unpacked the sources and run
+-the following commands:
++Please check out the comments in the corresponding <tt>*.bat</tt>
++file for more options.
+ </p>
+-<pre class="code">
+-cd src
+-xb1build
+-</pre>
+ 
+ <h2 id="embed">Embedding LuaJIT</h2>
+ <p>
+@@ -540,7 +500,7 @@ allocator from your system (no support f
+ of calling <tt>luaopen_base</tt> etc. directly.</li>
+ <li>To change or extend the list of standard libraries to load, copy
+ <tt>src/lib_init.c</tt> to your project and modify it accordingly.
+-Make sure the <tt>jit</tt> library is loaded or the JIT compiler
++Make sure the <tt>jit</tt> library is loaded, or the JIT compiler
+ will not be activated.</li>
+ <li>The <tt>bit.*</tt> module for bitwise operations
+ is already built-in. There's no need to statically link
+@@ -559,7 +519,7 @@ in unspeakable ways.
+ There should be absolutely no need to patch <tt>luaconf.h</tt> or any
+ of the Makefiles. And please do not hand-pick files for your packages &mdash;
+ simply use whatever <tt>make install</tt> creates. There's a reason
+-for all of the files <em>and</em> directories it creates.
++for all the files <em>and</em> directories it creates.
+ </p>
+ <p>
+ The build system uses GNU make and auto-detects most settings based on
+@@ -611,7 +571,7 @@ to me (the upstream) and not you (the pa
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/luajit.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/luajit.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/luajit.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>LuaJIT</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -67,30 +67,6 @@ table.fcompat td {
+   background-image: -o-linear-gradient(#41bfbf 10%, #b0ffff 95%);
+   background-image: -ms-linear-gradient(#41bfbf 10%, #b0ffff 95%);
+ }
+-table.stats td {
+-  color: #ffffff;
+-  background: #a0a0a0;
+-  background-image: linear-gradient(#808080 10%, #d0d0d0 95%);
+-  background-image: -moz-linear-gradient(#808080 10%, #d0d0d0 95%);
+-  background-image: -webkit-linear-gradient(#808080 10%, #d0d0d0 95%);
+-  background-image: -o-linear-gradient(#808080 10%, #d0d0d0 95%);
+-  background-image: -ms-linear-gradient(#808080 10%, #d0d0d0 95%);
+-}
+-table.stats td.speed {
+-  color: #ff4020;
+-}
+-table.stats td.kb {
+-  color: #ffff80;
+-  background: #808080;
+-  background-image: linear-gradient(#606060 10%, #c0c0c0 95%);
+-  background-image: -moz-linear-gradient(#606060 10%, #c0c0c0 95%);
+-  background-image: -webkit-linear-gradient(#606060 10%, #c0c0c0 95%);
+-  background-image: -o-linear-gradient(#606060 10%, #c0c0c0 95%);
+-  background-image: -ms-linear-gradient(#606060 10%, #c0c0c0 95%);
+-}
+-table.feature small {
+-  font-size: 50%;
+-}
+ </style>
+ </head>
+ <body>
+@@ -122,6 +98,8 @@ table.feature small {
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -129,11 +107,9 @@ table.feature small {
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="faq.html">FAQ</a>
+-</li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+@@ -146,7 +122,7 @@ Lua is a powerful, dynamic and light-wei
+ It may be embedded or used as a general-purpose, stand-alone language.
+ </p>
+ <p>
+-LuaJIT is Copyright &copy; 2005-2021 Mike Pall, released under the
++LuaJIT is Copyright &copy; 2005-2023 Mike Pall, released under the
+ <a href="https://www.opensource.org/licenses/mit-license.php"><span class="ext">&raquo;</span>&nbsp;MIT open source license</a>.
+ </p>
+ <p>
+@@ -160,7 +136,7 @@ LuaJIT is Copyright &copy; 2005-2021 Mik
+ <tr><td><span style="font-size:90%;">Embedded</span></td><td>Android</td><td>iOS</td></tr>
+ </table>
+ <table class="feature os os3">
+-<tr><td>PS3</td><td>PS4</td><td>PS Vita</td><td>Xbox 360</td><td>Xbox One</td></tr>
++<tr><td>PS3</td><td>PS4<br>PS5</td><td>PS Vita</td><td>Xbox 360</td><td>Xbox One</td><td>Nintendo<br>Switch</td></tr>
+ </table>
+ <table class="feature compiler">
+ <tr><td>GCC</td><td>Clang<br>LLVM</td><td>MSVC</td></tr>
+@@ -173,23 +149,20 @@ LuaJIT is Copyright &copy; 2005-2021 Mik
+ </table>
+ 
+ <h2>Overview</h2>
+-<table class="feature stats">
+-<tr>
+-<td class="speed">3x<br>-&nbsp;&nbsp;100x</td>
+-<td class="kb">115&nbsp;<small>KB</small><br>VM</td>
+-<td class="kb">90&nbsp;<small>KB</small><br>JIT</td>
+-<td class="kloc">63&nbsp;<small>KLOC</small><br>C</td>
+-<td class="kloc">24&nbsp;<small>KLOC</small><br>ASM</td>
+-<td class="kloc">11&nbsp;<small>KLOC</small><br>Lua</td>
+-</tr>
+-</table>
+ <p style="margin-top: 1em;">
+ LuaJIT has been successfully used as a <b>scripting middleware</b> in
+ games, appliances, network and graphics apps, numerical simulations,
+-trading platforms and many other specialty applications. It scales from
+-embedded devices, smartphones, desktops up to server farms. It combines
+-high flexibility with high performance
+-and an unmatched <b>low memory footprint</b>.
++trading platforms and many other specialty applications.
++</p>
++<p>
++LuaJIT is part of a hundred million web sites, huge SaaS installations,
++network switches, set-top boxes and other embedded devices. You've probably
++already used LuaJIT without knowing about it.
++</p>
++<p>
++LuaJIT scales from embedded devices, smartphones, desktops up to server
++farms. It combines high flexibility with high performance and an unmatched
++<b>low memory footprint</b>.
+ </p>
+ <p>
+ LuaJIT has been in continuous development since 2005. It's widely
+@@ -220,7 +193,7 @@ Please select a sub-topic in the navigat
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/running.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/running.html
++++ wrk-4.2.0/obj/LuaJIT-2.1/doc/running.html
+@@ -1,9 +1,9 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
++<!DOCTYPE html>
+ <html>
+ <head>
+ <title>Running LuaJIT</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
++<meta charset="utf-8">
++<meta name="Copyright" content="Copyright (C) 2005-2023">
+ <meta name="Language" content="en">
+ <link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+ <link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+@@ -59,6 +59,8 @@ td.param_default {
+ <a href="ext_ffi_semantics.html">FFI Semantics</a>
+ </li></ul>
+ </li><li>
++<a href="ext_buffer.html">String Buffers</a>
++</li><li>
+ <a href="ext_jit.html">jit.* Library</a>
+ </li><li>
+ <a href="ext_c_api.html">Lua/C API</a>
+@@ -66,16 +68,15 @@ td.param_default {
+ <a href="ext_profiler.html">Profiler</a>
+ </li></ul>
+ </li><li>
+-<a href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
++<a href="https://luajit.org/status.html">Status <span class="ext">&raquo;</span></a>
+ </li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
++<a href="https://luajit.org/faq.html">FAQ <span class="ext">&raquo;</span></a>
+ </li><li>
+ <a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+ </li></ul>
+ </div>
+ <div id="main">
++
+ <p>
+ LuaJIT has only a single stand-alone executable, called <tt>luajit</tt> on
+ POSIX systems or <tt>luajit.exe</tt> on Windows. It can be used to run simple
+@@ -109,6 +110,7 @@ are accepted:
+ <li><tt>-t type</tt> &mdash; Set output file type (default: auto-detect from output name).</li>
+ <li><tt>-a arch</tt> &mdash; Override architecture for object files (default: native).</li>
+ <li><tt>-o os</tt> &mdash; Override OS for object files (default: native).</li>
++<li><tt>-F name</tt> &mdash; Override filename (default: input filename).</li>
+ <li><tt>-e chunk</tt> &mdash; Use chunk string as input.</li>
+ <li><tt>-</tt> (a single minus sign) &mdash; Use stdin as input and/or stdout as output.</li>
+ </ul>
+@@ -118,7 +120,8 @@ file name:
+ </p>
+ <ul>
+ <li><tt>c</tt> &mdash; C source file, exported bytecode data.</li>
+-<li><tt>h</tt> &mdash; C header file, static bytecode data.</li>
++<li><tt>cc</tt> &mdash; C++ source file, exported bytecode data.</li>
++<li><tt>h</tt> &mdash; C/C++ header file, static bytecode data.</li>
+ <li><tt>obj</tt> or <tt>o</tt> &mdash; Object file, exported bytecode data
+ (OS- and architecture-specific).</li>
+ <li><tt>raw</tt> or any other extension &mdash; Raw bytecode file (portable).
+@@ -182,9 +185,9 @@ written in Lua. They are mainly used for
+ itself. For a description of their options and output format, please
+ read the comment block at the start of their source.
+ They can be found in the <tt>lib</tt> directory of the source
+-distribution or installed under the <tt>jit</tt> directory. By default
+-this is <tt>/usr/local/share/luajit-2.1.0-beta3/jit</tt> on POSIX
+-systems.
++distribution or installed under the <tt>jit</tt> directory. By default,
++this is <tt>/usr/local/share/luajit-XX.YY.ZZ/jit</tt> on POSIX
++systems (replace XX.YY.ZZ by the installed version).
+ </p>
+ 
+ <h3 id="opt_O"><tt>-O[level]</tt><br>
+@@ -214,11 +217,17 @@ to a specific value.
+ You can either use this option multiple times (like <tt>-Ocse
+ -O-dce -Ohotloop=10</tt>) or separate several settings with a comma
+ (like <tt>-O+cse,-dce,hotloop=10</tt>). The settings are applied from
+-left to right and later settings override earlier ones. You can freely
++left to right, and later settings override earlier ones. You can freely
+ mix the three forms, but note that setting an optimization level
+ overrides all earlier flags.
+ </p>
+ <p>
++Note that <tt>-Ofma</tt> is not enabled by default at any level,
++because it affects floating-point result accuracy. Only enable this
++if you fully understand the trade-offs of FMA for performance (higher),
++determinism (lower) and numerical accuracy (higher).
++</p>
++<p>
+ Here are the available flags and at what optimization levels they
+ are enabled:
+ </p>
+@@ -250,6 +259,8 @@ are enabled:
+ <td class="flag_name">sink</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
+ <tr class="even">
+ <td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
++<tr class="odd">
++<td class="flag_name">fma</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_desc">Fused multiply-add</td></tr>
+ </table>
+ <p>
+ Here are the parameters and their default settings:
+@@ -293,7 +304,7 @@ Here are the parameters and their defaul
+ </div>
+ <div id="foot">
+ <hr class="hide">
+-Copyright &copy; 2005-2021
++Copyright &copy; 2005-2023
+ <span class="noprint">
+ &middot;
+ <a href="contact.html">Contact</a>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/doc/status.html
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/doc/status.html
++++ /dev/null
+@@ -1,111 +0,0 @@
+-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+-<html>
+-<head>
+-<title>Status</title>
+-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+-<meta name="Copyright" content="Copyright (C) 2005-2021">
+-<meta name="Language" content="en">
+-<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+-<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+-<style type="text/css">
+-ul li { padding-bottom: 0.3em; }
+-</style>
+-</head>
+-<body>
+-<div id="site">
+-<a href="https://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+-</div>
+-<div id="head">
+-<h1>Status</h1>
+-</div>
+-<div id="nav">
+-<ul><li>
+-<a href="luajit.html">LuaJIT</a>
+-<ul><li>
+-<a href="https://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+-</li><li>
+-<a href="install.html">Installation</a>
+-</li><li>
+-<a href="running.html">Running</a>
+-</li></ul>
+-</li><li>
+-<a href="extensions.html">Extensions</a>
+-<ul><li>
+-<a href="ext_ffi.html">FFI Library</a>
+-<ul><li>
+-<a href="ext_ffi_tutorial.html">FFI Tutorial</a>
+-</li><li>
+-<a href="ext_ffi_api.html">ffi.* API</a>
+-</li><li>
+-<a href="ext_ffi_semantics.html">FFI Semantics</a>
+-</li></ul>
+-</li><li>
+-<a href="ext_jit.html">jit.* Library</a>
+-</li><li>
+-<a href="ext_c_api.html">Lua/C API</a>
+-</li><li>
+-<a href="ext_profiler.html">Profiler</a>
+-</li></ul>
+-</li><li>
+-<a class="current" href="status.html">Status</a>
+-</li><li>
+-<a href="faq.html">FAQ</a>
+-</li><li>
+-<a href="http://wiki.luajit.org/">Wiki <span class="ext">&raquo;</span></a>
+-</li><li>
+-<a href="https://luajit.org/list.html">Mailing List <span class="ext">&raquo;</span></a>
+-</li></ul>
+-</div>
+-<div id="main">
+-<p>
+-This documentation is for LuaJIT 2.1.0-beta3. Please check the <tt>doc</tt>
+-directory in each git branch for the version-specific documentation.
+-</p>
+-<p>
+-The currently developed branches are LuaJIT&nbsp;2.1 and LuaJIT&nbsp;2.0.
+-</p>
+-<p>
+-LuaJIT&nbsp;2.0 is in feature-freeze &mdash; new features will only
+-be added to LuaJIT&nbsp;2.1.
+-</p>
+-
+-<h2>Current Status</h2>
+-<p>
+-LuaJIT ought to run all Lua&nbsp;5.1-compatible source code just fine.
+-It's considered a serious bug if the VM crashes or produces unexpected
+-results &mdash; please report this.
+-</p>
+-<p>
+-Known incompatibilities and issues in LuaJIT&nbsp;2.0:
+-</p>
+-<ul>
+-<li>
+-There are some differences in <b>implementation-defined</b> behavior.
+-These either have a good reason, are arbitrary design choices
+-or are due to quirks in the VM. The latter cases may get fixed if a
+-demonstrable need is shown.
+-</li>
+-<li>
+-The Lua <b>debug API</b> is missing a couple of features (return
+-hooks for non-Lua functions) and shows slightly different behavior
+-in LuaJIT (no per-coroutine hooks, no tail call counting).
+-</li>
+-<li>
+-Currently some <b>out-of-memory</b> errors from <b>on-trace code</b> are not
+-handled correctly. The error may fall through an on-trace
+-<tt>pcall</tt> or it may be passed on to the function set with
+-<tt>lua_atpanic</tt> on x64.
+-</li>
+-</ul>
+-<br class="flush">
+-</div>
+-<div id="foot">
+-<hr class="hide">
+-Copyright &copy; 2005-2021
+-<span class="noprint">
+-&middot;
+-<a href="contact.html">Contact</a>
+-</span>
+-</div>
+-</body>
+-</html>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_arm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** DynASM ARM encoding engine.
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ ** Released under the MIT license. See dynasm.lua for full copyright notice.
+ */
+ 
+@@ -70,7 +70,7 @@ struct dasm_State {
+   size_t lgsize;
+   int *pclabels;		/* PC label chains/pos ptrs. */
+   size_t pcsize;
+-  void **globals;		/* Array of globals (bias -10). */
++  void **globals;		/* Array of globals. */
+   dasm_Section *section;	/* Pointer to active section. */
+   size_t codesize;		/* Total size of all code sections. */
+   int maxsection;		/* 0 <= sectionidx < maxsection. */
+@@ -87,7 +87,6 @@ void dasm_init(Dst_DECL, int maxsection)
+ {
+   dasm_State *D;
+   size_t psz = 0;
+-  int i;
+   Dst_REF = NULL;
+   DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
+   D = Dst_REF;
+@@ -98,12 +97,7 @@ void dasm_init(Dst_DECL, int maxsection)
+   D->pcsize = 0;
+   D->globals = NULL;
+   D->maxsection = maxsection;
+-  for (i = 0; i < maxsection; i++) {
+-    D->sections[i].buf = NULL;  /* Need this for pass3. */
+-    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
+-    D->sections[i].bsize = 0;
+-    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
+-  }
++  memset((void *)D->sections, 0, maxsection * sizeof(dasm_Section));
+ }
+ 
+ /* Free DynASM state. */
+@@ -123,7 +117,7 @@ void dasm_free(Dst_DECL)
+ void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
+ {
+   dasm_State *D = Dst_REF;
+-  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
++  D->globals = gl;
+   DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
+ }
+ 
+@@ -148,6 +142,7 @@ void dasm_setup(Dst_DECL, const void *ac
+   if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
+   for (i = 0; i < D->maxsection; i++) {
+     D->sections[i].pos = DASM_SEC2POS(i);
++    D->sections[i].rbuf = D->sections[i].buf - D->sections[i].pos;
+     D->sections[i].ofs = 0;
+   }
+ }
+@@ -294,7 +289,7 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 
+   { /* Handle globals not defined in this translation unit. */
+     int idx;
+-    for (idx = 20; idx*sizeof(int) < D->lgsize; idx++) {
++    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
+       int n = D->lglabels[idx];
+       /* Undefined label: Collapse rel chain and replace with marker (< 0). */
+       while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
+@@ -371,7 +366,10 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0xe1a00000;
+ 	  break;
+ 	case DASM_REL_LG:
+-	  CK(n >= 0, UNDEF_LG);
++	  if (n < 0) {
++	    n = (int)((ptrdiff_t)D->globals[-n-10] - (ptrdiff_t)cp - 4);
++	    goto patchrel;
++	  }
+ 	  /* fallthrough */
+ 	case DASM_REL_PC:
+ 	  CK(n >= 0, UNDEF_PC);
+@@ -393,7 +391,7 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  }
+ 	  break;
+ 	case DASM_LABEL_LG:
+-	  ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n);
++	  ins &= 2047; if (ins >= 20) D->globals[ins-20] = (void *)(base + n);
+ 	  break;
+ 	case DASM_LABEL_PC: break;
+ 	case DASM_IMM:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_arm.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM ARM module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ ------------------------------------------------------------------------------
+ 
+@@ -9,9 +9,9 @@
+ local _info = {
+   arch =	"arm",
+   description =	"DynASM ARM module",
+-  version =	"1.4.0",
+-  vernum =	 10400,
+-  release =	"2015-10-18",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
+   author =	"Mike Pall",
+   license =	"MIT",
+ }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm64.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_arm64.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm64.h
+@@ -1,6 +1,6 @@
+ /*
+ ** DynASM ARM64 encoding engine.
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ ** Released under the MIT license. See dynasm.lua for full copyright notice.
+ */
+ 
+@@ -21,8 +21,9 @@ enum {
+   /* The following actions need a buffer position. */
+   DASM_ALIGN, DASM_REL_LG, DASM_LABEL_LG,
+   /* The following actions also have an argument. */
+-  DASM_REL_PC, DASM_LABEL_PC,
++  DASM_REL_PC, DASM_LABEL_PC, DASM_REL_A,
+   DASM_IMM, DASM_IMM6, DASM_IMM12, DASM_IMM13W, DASM_IMM13X, DASM_IMML,
++  DASM_IMMV, DASM_VREG,
+   DASM__MAX
+ };
+ 
+@@ -39,6 +40,7 @@ enum {
+ #define DASM_S_RANGE_LG		0x13000000
+ #define DASM_S_RANGE_PC		0x14000000
+ #define DASM_S_RANGE_REL	0x15000000
++#define DASM_S_RANGE_VREG	0x16000000
+ #define DASM_S_UNDEF_LG		0x21000000
+ #define DASM_S_UNDEF_PC		0x22000000
+ 
+@@ -70,7 +72,7 @@ struct dasm_State {
+   size_t lgsize;
+   int *pclabels;		/* PC label chains/pos ptrs. */
+   size_t pcsize;
+-  void **globals;		/* Array of globals (bias -10). */
++  void **globals;		/* Array of globals. */
+   dasm_Section *section;	/* Pointer to active section. */
+   size_t codesize;		/* Total size of all code sections. */
+   int maxsection;		/* 0 <= sectionidx < maxsection. */
+@@ -87,7 +89,6 @@ void dasm_init(Dst_DECL, int maxsection)
+ {
+   dasm_State *D;
+   size_t psz = 0;
+-  int i;
+   Dst_REF = NULL;
+   DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
+   D = Dst_REF;
+@@ -98,12 +99,7 @@ void dasm_init(Dst_DECL, int maxsection)
+   D->pcsize = 0;
+   D->globals = NULL;
+   D->maxsection = maxsection;
+-  for (i = 0; i < maxsection; i++) {
+-    D->sections[i].buf = NULL;  /* Need this for pass3. */
+-    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
+-    D->sections[i].bsize = 0;
+-    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
+-  }
++  memset((void *)D->sections, 0, maxsection * sizeof(dasm_Section));
+ }
+ 
+ /* Free DynASM state. */
+@@ -123,7 +119,7 @@ void dasm_free(Dst_DECL)
+ void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
+ {
+   dasm_State *D = Dst_REF;
+-  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
++  D->globals = gl;
+   DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
+ }
+ 
+@@ -148,6 +144,7 @@ void dasm_setup(Dst_DECL, const void *ac
+   if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
+   for (i = 0; i < D->maxsection; i++) {
+     D->sections[i].pos = DASM_SEC2POS(i);
++    D->sections[i].rbuf = D->sections[i].buf - D->sections[i].pos;
+     D->sections[i].ofs = 0;
+   }
+ }
+@@ -156,10 +153,10 @@ void dasm_setup(Dst_DECL, const void *ac
+ #ifdef DASM_CHECKS
+ #define CK(x, st) \
+   do { if (!(x)) { \
+-    D->status = DASM_S_##st|(p-D->actionlist-1); return; } } while (0)
++    D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0)
+ #define CKPL(kind, st) \
+   do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
+-    D->status = DASM_S_RANGE_##st|(p-D->actionlist-1); return; } } while (0)
++    D->status = DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0)
+ #else
+ #define CK(x, st)	((void)0)
+ #define CKPL(kind, st)	((void)0)
+@@ -188,7 +185,9 @@ static int dasm_imm13(int lo, int hi)
+   unsigned long long n = (((unsigned long long)hi) << 32) | (unsigned int)lo;
+   unsigned long long m = 1ULL, a, b, c;
+   if (n & 1) { n = ~n; inv = 1; }
+-  a = n & -n; b = (n+a)&-(n+a); c = (n+a-b)&-(n+a-b);
++  a = n & (unsigned long long)-(long long)n;
++  b = (n+a)&(unsigned long long)-(long long)(n+a);
++  c = (n+a-b)&(unsigned long long)-(long long)(n+a-b);
+   xa = dasm_ffs(a); xb = dasm_ffs(b);
+   if (c) {
+     w = dasm_ffs(c) - xa;
+@@ -247,7 +246,7 @@ void dasm_put(Dst_DECL, int start, ...)
+ 	n = (ins & 255); CK(n < D->maxsection, RANGE_SEC);
+ 	D->section = &D->sections[n]; goto stop;
+       case DASM_ESC: p++; ofs += 4; break;
+-      case DASM_REL_EXT: break;
++      case DASM_REL_EXT: if ((ins & 0x8000)) ofs += 8; break;
+       case DASM_ALIGN: ofs += (ins & 255); b[pos++] = ofs; break;
+       case DASM_REL_LG:
+ 	n = (ins & 2047) - 10; pl = D->lglabels + n;
+@@ -268,6 +267,11 @@ void dasm_put(Dst_DECL, int start, ...)
+ 	  *pl = pos;
+ 	}
+ 	pos++;
++	if ((ins & 0x8000)) ofs += 8;
++	break;
++      case DASM_REL_A:
++	b[pos++] = n;
++	b[pos++] = va_arg(ap, int);
+ 	break;
+       case DASM_LABEL_LG:
+ 	pl = D->lglabels + (ins & 2047) - 10; CKPL(lg, LG); goto putlabel;
+@@ -312,13 +316,21 @@ void dasm_put(Dst_DECL, int start, ...)
+ 	}
+       case DASM_IMML: {
+ #ifdef DASM_CHECKS
+-	int scale = (p[-2] >> 30);
++	int scale = (ins & 3);
+ 	CK((!(n & ((1<<scale)-1)) && (unsigned int)(n>>scale) < 4096) ||
+ 	   (unsigned int)(n+256) < 512, RANGE_I);
+ #endif
+ 	b[pos++] = n;
+ 	break;
+ 	}
++      case DASM_IMMV:
++	ofs += 4;
++	b[pos++] = n;
++	break;
++      case DASM_VREG:
++	CK(n < 32, RANGE_VREG);
++	b[pos++] = n;
++	break;
+       }
+     }
+   }
+@@ -348,7 +360,7 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 
+   { /* Handle globals not defined in this translation unit. */
+     int idx;
+-    for (idx = 20; idx*sizeof(int) < D->lgsize; idx++) {
++    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
+       int n = D->lglabels[idx];
+       /* Undefined label: Collapse rel chain and replace with marker (< 0). */
+       while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
+@@ -375,8 +387,8 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 	case DASM_REL_LG: case DASM_REL_PC: pos++; break;
+ 	case DASM_LABEL_LG: case DASM_LABEL_PC: b[pos++] += ofs; break;
+ 	case DASM_IMM: case DASM_IMM6: case DASM_IMM12: case DASM_IMM13W:
+-	case DASM_IMML: pos++; break;
+-	case DASM_IMM13X: pos += 2; break;
++	case DASM_IMML: case DASM_IMMV: case DASM_VREG: pos++; break;
++	case DASM_IMM13X: case DASM_REL_A: pos += 2; break;
+ 	}
+       }
+       stop: (void)0;
+@@ -391,7 +403,7 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 
+ #ifdef DASM_CHECKS
+ #define CK(x, st) \
+-  do { if (!(x)) return DASM_S_##st|(p-D->actionlist-1); } while (0)
++  do { if (!(x)) return DASM_S_##st|(int)(p-D->actionlist-1); } while (0)
+ #else
+ #define CK(x, st)	((void)0)
+ #endif
+@@ -423,10 +435,15 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  n = DASM_EXTERN(Dst, (unsigned char *)cp, (ins&2047), !(ins&2048));
+ 	  goto patchrel;
+ 	case DASM_ALIGN:
+-	  ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0xe1a00000;
++	  ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0xd503201f;
+ 	  break;
+ 	case DASM_REL_LG:
+-	  CK(n >= 0, UNDEF_LG);
++	  if (n < 0) {
++	    ptrdiff_t na = (ptrdiff_t)D->globals[-n-10] - (ptrdiff_t)cp + 4;
++	    n = (int)na;
++	    CK((ptrdiff_t)n == na, RANGE_REL);
++	    goto patchrel;
++	  }
+ 	  /* fallthrough */
+ 	case DASM_REL_PC:
+ 	  CK(n >= 0, UNDEF_PC);
+@@ -446,10 +463,26 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  } else if ((ins & 0x1000)) {  /* TBZ, TBNZ */
+ 	    CK((n & 3) == 0 && ((n+0x00008000) >> 16) == 0, RANGE_REL);
+ 	    cp[-1] |= ((n << 3) & 0x0007ffe0);
++	  } else if ((ins & 0x8000)) {  /* absolute */
++	    cp[0] = (unsigned int)((ptrdiff_t)cp - 4 + n);
++	    cp[1] = (unsigned int)(((ptrdiff_t)cp - 4 + n) >> 32);
++	    cp += 2;
+ 	  }
+ 	  break;
++	case DASM_REL_A: {
++	  ptrdiff_t na = (((ptrdiff_t)(*b++) << 32) | (unsigned int)n);
++	  if ((ins & 0x3000) == 0x3000) {  /* ADRP */
++	    ins &= ~0x1000;
++	    na = (na >> 12) - (((ptrdiff_t)cp - 4) >> 12);
++	  } else {
++	    na = na - (ptrdiff_t)cp + 4;
++	  }
++	  n = (int)na;
++	  CK((ptrdiff_t)n == na, RANGE_REL);
++	  goto patchrel;
++	}
+ 	case DASM_LABEL_LG:
+-	  ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n);
++	  ins &= 2047; if (ins >= 20) D->globals[ins-20] = (void *)(base + n);
+ 	  break;
+ 	case DASM_LABEL_PC: break;
+ 	case DASM_IMM:
+@@ -468,11 +501,17 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  cp[-1] |= (dasm_imm13(n, *b++) << 10);
+ 	  break;
+ 	case DASM_IMML: {
+-	  int scale = (p[-2] >> 30);
++	  int scale = (ins & 3);
+ 	  cp[-1] |= (!(n & ((1<<scale)-1)) && (unsigned int)(n>>scale) < 4096) ?
+ 	    ((n << (10-scale)) | 0x01000000) : ((n & 511) << 12);
+ 	  break;
+ 	  }
++	case DASM_IMMV:
++	  *cp++ = n;
++	  break;
++	case DASM_VREG:
++	  cp[-1] |= (n & 0x1f) << (ins & 0x1f);
++	  break;
+ 	default: *cp++ = ins; break;
+ 	}
+       }
+@@ -512,7 +551,7 @@ int dasm_checkstep(Dst_DECL, int secmatc
+   }
+   if (D->status == DASM_S_OK && secmatch >= 0 &&
+       D->section != &D->sections[secmatch])
+-    D->status = DASM_S_MATCH_SEC|(D->section-D->sections);
++    D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections);
+   return D->status;
+ }
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm64.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_arm64.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_arm64.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM ARM64 module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ ------------------------------------------------------------------------------
+ 
+@@ -9,9 +9,9 @@
+ local _info = {
+   arch =	"arm",
+   description =	"DynASM ARM64 module",
+-  version =	"1.4.0",
+-  vernum =	 10400,
+-  release =	"2015-10-18",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
+   author =	"Mike Pall",
+   license =	"MIT",
+ }
+@@ -23,12 +23,12 @@ local _M = { _info = _info }
+ local type, tonumber, pairs, ipairs = type, tonumber, pairs, ipairs
+ local assert, setmetatable, rawget = assert, setmetatable, rawget
+ local _s = string
+-local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
++local format, byte, char = _s.format, _s.byte, _s.char
+ local match, gmatch, gsub = _s.match, _s.gmatch, _s.gsub
+ local concat, sort, insert = table.concat, table.sort, table.insert
+ local bit = bit or require("bit")
+ local band, shl, shr, sar = bit.band, bit.lshift, bit.rshift, bit.arshift
+-local ror, tohex = bit.ror, bit.tohex
++local ror, tohex, tobit = bit.ror, bit.tohex, bit.tobit
+ 
+ -- Inherited tables and callbacks.
+ local g_opt, g_arch
+@@ -39,7 +39,9 @@ local wline, werror, wfatal, wwarn
+ local action_names = {
+   "STOP", "SECTION", "ESC", "REL_EXT",
+   "ALIGN", "REL_LG", "LABEL_LG",
+-  "REL_PC", "LABEL_PC", "IMM", "IMM6", "IMM12", "IMM13W", "IMM13X", "IMML",
++  "REL_PC", "LABEL_PC", "REL_A",
++  "IMM", "IMM6", "IMM12", "IMM13W", "IMM13X", "IMML", "IMMV",
++  "VREG",
+ }
+ 
+ -- Maximum number of section buffer positions for dasm_put().
+@@ -246,9 +248,12 @@ local map_cond = {
+ 
+ local parse_reg_type
+ 
+-local function parse_reg(expr)
++local function parse_reg(expr, shift, no_vreg)
+   if not expr then werror("expected register name") end
+   local tname, ovreg = match(expr, "^([%w_]+):(@?%l%d+)$")
++  if not tname then
++    tname, ovreg = match(expr, "^([%w_]+):(R[xwqdshb]%b())$")
++  end
+   local tp = map_type[tname or expr]
+   if tp then
+     local reg = ovreg or tp.reg
+@@ -266,18 +271,28 @@ local function parse_reg(expr)
+       elseif parse_reg_type ~= rt then
+ 	werror("register size mismatch")
+       end
+-      return r, tp
++      return shl(r, shift), tp
+     end
+   end
++  local vrt, vreg = match(expr, "^R([xwqdshb])(%b())$")
++  if vreg then
++    if not parse_reg_type then
++      parse_reg_type = vrt
++    elseif parse_reg_type ~= vrt then
++      werror("register size mismatch")
++    end
++    if not no_vreg then waction("VREG", shift, vreg) end
++    return 0
++  end
+   werror("bad register name `"..expr.."'")
+ end
+ 
+ local function parse_reg_base(expr)
+   if expr == "sp" then return 0x3e0 end
+-  local base, tp = parse_reg(expr)
++  local base, tp = parse_reg(expr, 5)
+   if parse_reg_type ~= "x" then werror("bad register type") end
+   parse_reg_type = false
+-  return shl(base, 5), tp
++  return base, tp
+ end
+ 
+ local parse_ctx = {}
+@@ -297,7 +312,7 @@ local function parse_number(n)
+   local code = loadenv("return "..n)
+   if code then
+     local ok, y = pcall(code)
+-    if ok then return y end
++    if ok and type(y) == "number" then return y end
+   end
+   return nil
+ end
+@@ -403,7 +418,7 @@ local function parse_imm_load(imm, scale
+     end
+     werror("out of range immediate `"..imm.."'")
+   else
+-    waction("IMML", 0, imm)
++    waction("IMML", scale, imm)
+     return 0
+   end
+ end
+@@ -462,6 +477,7 @@ end
+ 
+ local function parse_load(params, nparams, n, op)
+   if params[n+2] then werror("too many operands") end
++  local scale = shr(op, 30)
+   local pn, p2 = params[n], params[n+1]
+   local p1, wb = match(pn, "^%[%s*(.-)%s*%](!?)$")
+   if not p1 then
+@@ -470,14 +486,13 @@ local function parse_load(params, nparam
+       if reg and tailr ~= "" then
+ 	local base, tp = parse_reg_base(reg)
+ 	if tp then
+-	  waction("IMML", 0, format(tp.ctypefmt, tailr))
++	  waction("IMML", scale, format(tp.ctypefmt, tailr))
+ 	  return op + base
+ 	end
+       end
+     end
+     werror("expected address operand")
+   end
+-  local scale = shr(op, 30)
+   if p2 then
+     if wb == "!" then werror("bad use of '!'") end
+     op = op + parse_reg_base(p1) + parse_imm(p2, 9, 12, 0, true) + 0x400
+@@ -494,7 +509,7 @@ local function parse_load(params, nparam
+ 	op = op + parse_imm_load(imm, scale)
+       else
+ 	local p2b, p3b, p3s = match(p2a, "^,%s*([^,%s]*)%s*,?%s*(%S*)%s*(.*)$")
+-	op = op + shl(parse_reg(p2b), 16) + 0x00200800
++	op = op + parse_reg(p2b, 16) + 0x00200800
+ 	if parse_reg_type ~= "x" and parse_reg_type ~= "w" then
+ 	  werror("bad index register type")
+ 	end
+@@ -534,7 +549,7 @@ end
+ local function parse_load_pair(params, nparams, n, op)
+   if params[n+2] then werror("too many operands") end
+   local pn, p2 = params[n], params[n+1]
+-  local scale = shr(op, 30) == 0 and 2 or 3
++  local scale = 2 + shr(op, 31 - band(shr(op, 26), 1))
+   local p1, wb = match(pn, "^%[%s*(.-)%s*%](!?)$")
+   if not p1 then
+     if not p2 then
+@@ -561,14 +576,14 @@ local function parse_load_pair(params, n
+ end
+ 
+ local function parse_label(label, def)
+-  local prefix = sub(label, 1, 2)
++  local prefix = label:sub(1, 2)
+   -- =>label (pc label reference)
+   if prefix == "=>" then
+-    return "PC", 0, sub(label, 3)
++    return "PC", 0, label:sub(3)
+   end
+   -- ->name (global label reference)
+   if prefix == "->" then
+-    return "LG", map_global[sub(label, 3)]
++    return "LG", map_global[label:sub(3)]
+   end
+   if def then
+     -- [1-9] (local label definition)
+@@ -586,8 +601,11 @@ local function parse_label(label, def)
+     if extname then
+       return "EXT", map_extern[extname]
+     end
++    -- &expr (pointer)
++    if label:sub(1, 1) == "&" then
++      return "A", 0, format("(ptrdiff_t)(%s)", label:sub(2))
++    end
+   end
+-  werror("bad label `"..label.."'")
+ end
+ 
+ local function branch_type(op)
+@@ -620,24 +638,24 @@ local function alias_bfx(p)
+ end
+ 
+ local function alias_bfiz(p)
+-  parse_reg(p[1])
++  parse_reg(p[1], 0, true)
+   if parse_reg_type == "w" then
+-    p[3] = "#-("..p[3]:sub(2)..")%32"
++    p[3] = "#(32-("..p[3]:sub(2).."))%32"
+     p[4] = "#("..p[4]:sub(2)..")-1"
+   else
+-    p[3] = "#-("..p[3]:sub(2)..")%64"
++    p[3] = "#(64-("..p[3]:sub(2).."))%64"
+     p[4] = "#("..p[4]:sub(2)..")-1"
+   end
+ end
+ 
+ local alias_lslimm = op_alias("ubfm_4", function(p)
+-  parse_reg(p[1])
++  parse_reg(p[1], 0, true)
+   local sh = p[3]:sub(2)
+   if parse_reg_type == "w" then
+-    p[3] = "#-("..sh..")%32"
++    p[3] = "#(32-("..sh.."))%32"
+     p[4] = "#31-("..sh..")"
+   else
+-    p[3] = "#-("..sh..")%64"
++    p[3] = "#(64-("..sh.."))%64"
+     p[4] = "#63-("..sh..")"
+   end
+ end)
+@@ -788,8 +806,8 @@ map_op = {
+   ["ldrsw_*"] = "98000000DxB|b8800000DxL",
+   -- NOTE: ldur etc. are handled by ldr et al.
+ 
+-  ["stp_*"]   = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP",
+-  ["ldp_*"]   = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP",
++  ["stp_*"]   = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP|ac000000DAqP",
++  ["ldp_*"]   = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP|ac400000DAqP",
+   ["ldpsw_*"] = "68400000DAxP",
+ 
+   -- Branches.
+@@ -805,6 +823,13 @@ map_op = {
+   tbz_3  = "36000000DTBw|36000000DTBx",
+   tbnz_3 = "37000000DTBw|37000000DTBx",
+ 
++  -- ARM64e: Pointer authentication codes (PAC).
++  blraaz_1  = "d63f081fNx",
++  braa_2    = "d71f0800NDx",
++  braaz_1   = "d61f081fNx",
++  pacibsp_0 = "d503237f",
++  retab_0   = "d65f0fff",
++
+   -- Miscellaneous instructions.
+   -- TODO: hlt, hvc, smc, svc, eret, dcps[123], drps, mrs, msr
+   -- TODO: sys, sysl, ic, dc, at, tlbi
+@@ -881,25 +906,25 @@ end
+ 
+ -- Handle opcodes defined with template strings.
+ local function parse_template(params, template, nparams, pos)
+-  local op = tonumber(sub(template, 1, 8), 16)
++  local op = tonumber(template:sub(1, 8), 16)
+   local n = 1
+   local rtt = {}
+ 
+   parse_reg_type = false
+ 
+   -- Process each character.
+-  for p in gmatch(sub(template, 9), ".") do
++  for p in gmatch(template:sub(9), ".") do
+     local q = params[n]
+     if p == "D" then
+-      op = op + parse_reg(q); n = n + 1
++      op = op + parse_reg(q, 0); n = n + 1
+     elseif p == "N" then
+-      op = op + shl(parse_reg(q), 5); n = n + 1
++      op = op + parse_reg(q, 5); n = n + 1
+     elseif p == "M" then
+-      op = op + shl(parse_reg(q), 16); n = n + 1
++      op = op + parse_reg(q, 16); n = n + 1
+     elseif p == "A" then
+-      op = op + shl(parse_reg(q), 10); n = n + 1
++      op = op + parse_reg(q, 10); n = n + 1
+     elseif p == "m" then
+-      op = op + shl(parse_reg(params[n-1]), 16)
++      op = op + parse_reg(params[n-1], 16)
+ 
+     elseif p == "p" then
+       if q == "sp" then params[n] = "@x31" end
+@@ -917,7 +942,7 @@ local function parse_template(params, te
+ 	werror("bad register type")
+       end
+       parse_reg_type = false
+-    elseif p == "x" or p == "w" or p == "d" or p == "s" then
++    elseif p == "x" or p == "w" or p == "d" or p == "s" or p == "q" then
+       if parse_reg_type ~= p then
+ 	werror("register size mismatch")
+       end
+@@ -930,8 +955,14 @@ local function parse_template(params, te
+ 
+     elseif p == "B" then
+       local mode, v, s = parse_label(q, false); n = n + 1
++      if not mode then werror("bad label `"..q.."'") end
+       local m = branch_type(op)
+-      waction("REL_"..mode, v+m, s, 1)
++      if mode == "A" then
++	waction("REL_"..mode, v+m, format("(unsigned int)(%s)", s))
++	actargs[#actargs+1] = format("(unsigned int)((%s)>>32)", s)
++      else
++	waction("REL_"..mode, v+m, s, 1)
++      end
+ 
+     elseif p == "I" then
+       op = op + parse_imm12(q); n = n + 1
+@@ -977,8 +1008,8 @@ function op_template(params, template, n
+   if not params then return template:gsub("%x%x%x%x%x%x%x%x", "") end
+ 
+   -- Limit number of section buffer positions used by a single dasm_put().
+-  -- A single opcode needs a maximum of 3 positions.
+-  if secpos+3 > maxsecpos then wflush() end
++  -- A single opcode needs a maximum of 4 positions.
++  if secpos+4 > maxsecpos then wflush() end
+   local pos = wpos()
+   local lpos, apos, spos = #actlist, #actargs, secpos
+ 
+@@ -990,9 +1021,11 @@ function op_template(params, template, n
+     actlist[lpos+1] = nil
+     actlist[lpos+2] = nil
+     actlist[lpos+3] = nil
++    actlist[lpos+4] = nil
+     actargs[apos+1] = nil
+     actargs[apos+2] = nil
+     actargs[apos+3] = nil
++    actargs[apos+4] = nil
+   end
+   error(err, 0)
+ end
+@@ -1036,23 +1069,50 @@ map_op[".label_1"] = function(params)
+   if not params then return "[1-9] | ->global | =>pcexpr" end
+   if secpos+1 > maxsecpos then wflush() end
+   local mode, n, s = parse_label(params[1], true)
+-  if mode == "EXT" then werror("bad label definition") end
++  if not mode or mode == "EXT" then werror("bad label definition") end
+   waction("LABEL_"..mode, n, s, 1)
+ end
+ 
+ ------------------------------------------------------------------------------
+ 
+ -- Pseudo-opcodes for data storage.
+-map_op[".long_*"] = function(params)
++local function op_data(params)
+   if not params then return "imm..." end
++  local sz = params.op == ".long" and 4 or 8
+   for _,p in ipairs(params) do
+-    local n = tonumber(p)
+-    if not n then werror("bad immediate `"..p.."'") end
+-    if n < 0 then n = n + 2^32 end
+-    wputw(n)
++    local imm = parse_number(p)
++    if imm then
++      local n = tobit(imm)
++      if n == imm or (n < 0 and n + 2^32 == imm) then
++	wputw(n < 0 and n + 2^32 or n)
++	if sz == 8 then
++	  wputw(imm < 0 and 0xffffffff or 0)
++	end
++      elseif sz == 4 then
++	werror("bad immediate `"..p.."'")
++      else
++	imm = nil
++      end
++    end
++    if not imm then
++      local mode, v, s = parse_label(p, false)
++      if sz == 4 then
++	if mode then werror("label does not fit into .long") end
++	waction("IMMV", 0, p)
++      elseif mode and mode ~= "A" then
++	waction("REL_"..mode, v+0x8000, s, 1)
++      else
++	if mode == "A" then p = s end
++	waction("IMMV", 0, format("(unsigned int)(%s)", p))
++	waction("IMMV", 0, format("(unsigned int)((unsigned long long)(%s)>>32)", p))
++      end
++    end
+     if secpos+2 > maxsecpos then wflush() end
+   end
+ end
++map_op[".long_*"] = op_data
++map_op[".quad_*"] = op_data
++map_op[".addr_*"] = op_data
+ 
+ -- Alignment pseudo-opcode.
+ map_op[".align_1"] = function(params)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_mips.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_mips.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_mips.h
+@@ -1,6 +1,6 @@
+ /*
+ ** DynASM MIPS encoding engine.
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ ** Released under the MIT license. See dynasm.lua for full copyright notice.
+ */
+ 
+@@ -69,7 +69,7 @@ struct dasm_State {
+   size_t lgsize;
+   int *pclabels;		/* PC label chains/pos ptrs. */
+   size_t pcsize;
+-  void **globals;		/* Array of globals (bias -10). */
++  void **globals;		/* Array of globals. */
+   dasm_Section *section;	/* Pointer to active section. */
+   size_t codesize;		/* Total size of all code sections. */
+   int maxsection;		/* 0 <= sectionidx < maxsection. */
+@@ -86,7 +86,6 @@ void dasm_init(Dst_DECL, int maxsection)
+ {
+   dasm_State *D;
+   size_t psz = 0;
+-  int i;
+   Dst_REF = NULL;
+   DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
+   D = Dst_REF;
+@@ -97,12 +96,7 @@ void dasm_init(Dst_DECL, int maxsection)
+   D->pcsize = 0;
+   D->globals = NULL;
+   D->maxsection = maxsection;
+-  for (i = 0; i < maxsection; i++) {
+-    D->sections[i].buf = NULL;  /* Need this for pass3. */
+-    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
+-    D->sections[i].bsize = 0;
+-    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
+-  }
++  memset((void *)D->sections, 0, maxsection * sizeof(dasm_Section));
+ }
+ 
+ /* Free DynASM state. */
+@@ -122,7 +116,7 @@ void dasm_free(Dst_DECL)
+ void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
+ {
+   dasm_State *D = Dst_REF;
+-  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
++  D->globals = gl;
+   DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
+ }
+ 
+@@ -147,6 +141,7 @@ void dasm_setup(Dst_DECL, const void *ac
+   if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
+   for (i = 0; i < D->maxsection; i++) {
+     D->sections[i].pos = DASM_SEC2POS(i);
++    D->sections[i].rbuf = D->sections[i].buf - D->sections[i].pos;
+     D->sections[i].ofs = 0;
+   }
+ }
+@@ -155,10 +150,10 @@ void dasm_setup(Dst_DECL, const void *ac
+ #ifdef DASM_CHECKS
+ #define CK(x, st) \
+   do { if (!(x)) { \
+-    D->status = DASM_S_##st|(p-D->actionlist-1); return; } } while (0)
++    D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0)
+ #define CKPL(kind, st) \
+   do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
+-    D->status = DASM_S_RANGE_##st|(p-D->actionlist-1); return; } } while (0)
++    D->status = DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0)
+ #else
+ #define CK(x, st)	((void)0)
+ #define CKPL(kind, st)	((void)0)
+@@ -273,7 +268,7 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 
+   { /* Handle globals not defined in this translation unit. */
+     int idx;
+-    for (idx = 20; idx*sizeof(int) < D->lgsize; idx++) {
++    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
+       int n = D->lglabels[idx];
+       /* Undefined label: Collapse rel chain and replace with marker (< 0). */
+       while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
+@@ -314,7 +309,7 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 
+ #ifdef DASM_CHECKS
+ #define CK(x, st) \
+-  do { if (!(x)) return DASM_S_##st|(p-D->actionlist-1); } while (0)
++  do { if (!(x)) return DASM_S_##st|(int)(p-D->actionlist-1); } while (0)
+ #else
+ #define CK(x, st)	((void)0)
+ #endif
+@@ -349,7 +344,10 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0x60000000;
+ 	  break;
+ 	case DASM_REL_LG:
+-	  CK(n >= 0, UNDEF_LG);
++	  if (n < 0) {
++	    n = (int)((ptrdiff_t)D->globals[-n-10] - (ptrdiff_t)cp);
++	    goto patchrel;
++	  }
+ 	  /* fallthrough */
+ 	case DASM_REL_PC:
+ 	  CK(n >= 0, UNDEF_PC);
+@@ -366,7 +364,7 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  }
+ 	  break;
+ 	case DASM_LABEL_LG:
+-	  ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n);
++	  ins &= 2047; if (ins >= 20) D->globals[ins-20] = (void *)(base + n);
+ 	  break;
+ 	case DASM_LABEL_PC: break;
+ 	case DASM_IMMS:
+@@ -414,7 +412,7 @@ int dasm_checkstep(Dst_DECL, int secmatc
+   }
+   if (D->status == DASM_S_OK && secmatch >= 0 &&
+       D->section != &D->sections[secmatch])
+-    D->status = DASM_S_MATCH_SEC|(D->section-D->sections);
++    D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections);
+   return D->status;
+ }
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_mips.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_mips.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_mips.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM MIPS32/MIPS64 module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ ------------------------------------------------------------------------------
+ 
+@@ -12,9 +12,9 @@ local mipsr6 = _map_def.MIPSR6
+ local _info = {
+   arch =	mips64 and "mips64" or "mips",
+   description =	"DynASM MIPS32/MIPS64 module",
+-  version =	"1.4.0",
+-  vernum =	 10400,
+-  release =	"2020-01-20",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
+   author =	"Mike Pall",
+   license =	"MIT",
+ }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_mips64.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_mips64.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_mips64.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM MIPS64 module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ ------------------------------------------------------------------------------
+ -- This module just sets 64 bit mode for the combined MIPS/MIPS64 module.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_ppc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_ppc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_ppc.h
+@@ -1,6 +1,6 @@
+ /*
+ ** DynASM PPC/PPC64 encoding engine.
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ ** Released under the MIT license. See dynasm.lua for full copyright notice.
+ */
+ 
+@@ -69,7 +69,7 @@ struct dasm_State {
+   size_t lgsize;
+   int *pclabels;		/* PC label chains/pos ptrs. */
+   size_t pcsize;
+-  void **globals;		/* Array of globals (bias -10). */
++  void **globals;		/* Array of globals. */
+   dasm_Section *section;	/* Pointer to active section. */
+   size_t codesize;		/* Total size of all code sections. */
+   int maxsection;		/* 0 <= sectionidx < maxsection. */
+@@ -86,7 +86,6 @@ void dasm_init(Dst_DECL, int maxsection)
+ {
+   dasm_State *D;
+   size_t psz = 0;
+-  int i;
+   Dst_REF = NULL;
+   DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
+   D = Dst_REF;
+@@ -97,12 +96,7 @@ void dasm_init(Dst_DECL, int maxsection)
+   D->pcsize = 0;
+   D->globals = NULL;
+   D->maxsection = maxsection;
+-  for (i = 0; i < maxsection; i++) {
+-    D->sections[i].buf = NULL;  /* Need this for pass3. */
+-    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
+-    D->sections[i].bsize = 0;
+-    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
+-  }
++  memset((void *)D->sections, 0, maxsection * sizeof(dasm_Section));
+ }
+ 
+ /* Free DynASM state. */
+@@ -122,7 +116,7 @@ void dasm_free(Dst_DECL)
+ void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
+ {
+   dasm_State *D = Dst_REF;
+-  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
++  D->globals = gl;
+   DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
+ }
+ 
+@@ -147,6 +141,7 @@ void dasm_setup(Dst_DECL, const void *ac
+   if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
+   for (i = 0; i < D->maxsection; i++) {
+     D->sections[i].pos = DASM_SEC2POS(i);
++    D->sections[i].rbuf = D->sections[i].buf - D->sections[i].pos;
+     D->sections[i].ofs = 0;
+   }
+ }
+@@ -277,7 +272,7 @@ int dasm_link(Dst_DECL, size_t *szp)
+ 
+   { /* Handle globals not defined in this translation unit. */
+     int idx;
+-    for (idx = 20; idx*sizeof(int) < D->lgsize; idx++) {
++    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
+       int n = D->lglabels[idx];
+       /* Undefined label: Collapse rel chain and replace with marker (< 0). */
+       while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
+@@ -353,7 +348,10 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0x60000000;
+ 	  break;
+ 	case DASM_REL_LG:
+-	  CK(n >= 0, UNDEF_LG);
++	  if (n < 0) {
++	    n = (int)((ptrdiff_t)D->globals[-n-10] - (ptrdiff_t)cp);
++	    goto patchrel;
++	  }
+ 	  /* fallthrough */
+ 	case DASM_REL_PC:
+ 	  CK(n >= 0, UNDEF_PC);
+@@ -365,7 +363,7 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  cp[-1] |= ((n+4) & ((ins & 2048) ? 0x0000fffc: 0x03fffffc));
+ 	  break;
+ 	case DASM_LABEL_LG:
+-	  ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n);
++	  ins &= 2047; if (ins >= 20) D->globals[ins-20] = (void *)(base + n);
+ 	  break;
+ 	case DASM_LABEL_PC: break;
+ 	case DASM_IMM:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_ppc.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_ppc.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_ppc.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM PPC/PPC64 module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ --
+ -- Support for various extensions contributed by Caio Souza Oliveira.
+@@ -11,9 +11,9 @@
+ local _info = {
+   arch =	"ppc",
+   description =	"DynASM PPC module",
+-  version =	"1.4.0",
+-  vernum =	 10400,
+-  release =	"2015-10-18",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
+   author =	"Mike Pall",
+   license =	"MIT",
+ }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_proto.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_proto.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_proto.h
+@@ -1,6 +1,6 @@
+ /*
+ ** DynASM encoding engine prototypes.
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ ** Released under the MIT license. See dynasm.lua for full copyright notice.
+ */
+ 
+@@ -10,8 +10,8 @@
+ #include <stddef.h>
+ #include <stdarg.h>
+ 
+-#define DASM_IDENT	"DynASM 1.4.0"
+-#define DASM_VERSION	10400	/* 1.4.0 */
++#define DASM_IDENT	"DynASM 1.5.0"
++#define DASM_VERSION	10500	/* 1.5.0 */
+ 
+ #ifndef Dst_DECL
+ #define Dst_DECL	dasm_State **Dst
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv.h
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv.h
+@@ -0,0 +1,433 @@
++/*
++** DynASM RISC-V encoding engine.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
++** Released under the MIT license. See dynasm.lua for full copyright notice.
++*/
++
++#include <stddef.h>
++#include <stdarg.h>
++#include <string.h>
++#include <stdlib.h>
++
++#define DASM_ARCH		"riscv"
++
++#ifndef DASM_EXTERN
++#define DASM_EXTERN(a,b,c,d)	0
++#endif
++
++/* Action definitions. */
++enum {
++  DASM_STOP, DASM_SECTION, DASM_ESC, DASM_REL_EXT,
++  /* The following actions need a buffer position. */
++  DASM_ALIGN, DASM_REL_LG, DASM_LABEL_LG,
++  /* The following actions also have an argument. */
++  DASM_REL_PC, DASM_LABEL_PC, DASM_IMM, DASM_IMMS,
++  DASM__MAX
++};
++
++/* Maximum number of section buffer positions for a single dasm_put() call. */
++#define DASM_MAXSECPOS		25
++
++/* DynASM encoder status codes. Action list offset or number are or'ed in. */
++#define DASM_S_OK		0x00000000
++#define DASM_S_NOMEM		0x01000000
++#define DASM_S_PHASE		0x02000000
++#define DASM_S_MATCH_SEC	0x03000000
++#define DASM_S_RANGE_I		0x11000000
++#define DASM_S_RANGE_SEC	0x12000000
++#define DASM_S_RANGE_LG		0x13000000
++#define DASM_S_RANGE_PC		0x14000000
++#define DASM_S_RANGE_REL	0x15000000
++#define DASM_S_UNDEF_LG		0x21000000
++#define DASM_S_UNDEF_PC		0x22000000
++
++/* Macros to convert positions (8 bit section + 24 bit index). */
++#define DASM_POS2IDX(pos)	((pos)&0x00ffffff)
++#define DASM_POS2BIAS(pos)	((pos)&0xff000000)
++#define DASM_SEC2POS(sec)	((sec)<<24)
++#define DASM_POS2SEC(pos)	((pos)>>24)
++#define DASM_POS2PTR(D, pos)	(D->sections[DASM_POS2SEC(pos)].rbuf + (pos))
++
++/* Action list type. */
++typedef const unsigned int *dasm_ActList;
++
++/* Per-section structure. */
++typedef struct dasm_Section {
++  int *rbuf;		/* Biased buffer pointer (negative section bias). */
++  int *buf;		/* True buffer pointer. */
++  size_t bsize;		/* Buffer size in bytes. */
++  int pos;		/* Biased buffer position. */
++  int epos;		/* End of biased buffer position - max single put. */
++  int ofs;		/* Byte offset into section. */
++} dasm_Section;
++
++/* Core structure holding the DynASM encoding state. */
++struct dasm_State {
++  size_t psize;			/* Allocated size of this structure. */
++  dasm_ActList actionlist;	/* Current actionlist pointer. */
++  int *lglabels;		/* Local/global chain/pos ptrs. */
++  size_t lgsize;
++  int *pclabels;		/* PC label chains/pos ptrs. */
++  size_t pcsize;
++  void **globals;		/* Array of globals. */
++  dasm_Section *section;	/* Pointer to active section. */
++  size_t codesize;		/* Total size of all code sections. */
++  int maxsection;		/* 0 <= sectionidx < maxsection. */
++  int status;			/* Status code. */
++  dasm_Section sections[1];	/* All sections. Alloc-extended. */
++};
++
++/* The size of the core structure depends on the max. number of sections. */
++#define DASM_PSZ(ms)	(sizeof(dasm_State)+(ms-1)*sizeof(dasm_Section))
++
++
++/* Initialize DynASM state. */
++void dasm_init(Dst_DECL, int maxsection)
++{
++  dasm_State *D;
++  size_t psz = 0;
++  Dst_REF = NULL;
++  DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
++  D = Dst_REF;
++  D->psize = psz;
++  D->lglabels = NULL;
++  D->lgsize = 0;
++  D->pclabels = NULL;
++  D->pcsize = 0;
++  D->globals = NULL;
++  D->maxsection = maxsection;
++  memset((void *)D->sections, 0, maxsection * sizeof(dasm_Section));
++}
++
++/* Free DynASM state. */
++void dasm_free(Dst_DECL)
++{
++  dasm_State *D = Dst_REF;
++  int i;
++  for (i = 0; i < D->maxsection; i++)
++    if (D->sections[i].buf)
++      DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize);
++  if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize);
++  if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize);
++  DASM_M_FREE(Dst, D, D->psize);
++}
++
++/* Setup global label array. Must be called before dasm_setup(). */
++void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
++{
++  dasm_State *D = Dst_REF;
++  D->globals = gl;
++  DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
++}
++
++/* Grow PC label array. Can be called after dasm_setup(), too. */
++void dasm_growpc(Dst_DECL, unsigned int maxpc)
++{
++  dasm_State *D = Dst_REF;
++  size_t osz = D->pcsize;
++  DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc*sizeof(int));
++  memset((void *)(((unsigned char *)D->pclabels)+osz), 0, D->pcsize-osz);
++}
++
++/* Setup encoder: bind the action list, clear status/labels, rewind sections. */
++void dasm_setup(Dst_DECL, const void *actionlist)
++{
++  dasm_State *D = Dst_REF;
++  int i;
++  D->actionlist = (dasm_ActList)actionlist;
++  D->status = DASM_S_OK;
++  D->section = &D->sections[0];  /* Encoding always starts in section 0. */
++  if (D->lglabels) memset((void *)D->lglabels, 0, D->lgsize);  /* May be NULL. */
++  if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
++  for (i = 0; i < D->maxsection; i++) {
++    D->sections[i].pos = DASM_SEC2POS(i);  /* Section index in the top 8 bits. */
++    D->sections[i].rbuf = D->sections[i].buf - D->sections[i].pos;
++    D->sections[i].ofs = 0;
++  }
++}
++
++
++#ifdef DASM_CHECKS
++#define CK(x, st) \
++  do { if (!(x)) { \
++    D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0)
++#define CKPL(kind, st) \
++  do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
++    D->status = DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0)
++#else
++#define CK(x, st)	((void)0)
++#define CKPL(kind, st)	((void)0)
++#endif
++
++static int dasm_imms(int n)  /* n if it fits 12 signed bits, else marker 4096. */
++{
++  return (n < -2048 || n > 2047) ? 4096 : n;
++}
++/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */
++void dasm_put(Dst_DECL, int start, ...)
++{
++  va_list ap;
++  dasm_State *D = Dst_REF;
++  dasm_ActList p = D->actionlist + start;
++  dasm_Section *sec = D->section;
++  int pos = sec->pos, ofs = sec->ofs;
++  int *b;
++
++  if (pos >= sec->epos) {
++    DASM_M_GROW(Dst, int, sec->buf, sec->bsize,
++      sec->bsize + 2*DASM_MAXSECPOS*sizeof(int));
++    sec->rbuf = sec->buf - DASM_POS2BIAS(pos);
++    sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos);
++  }
++
++  b = sec->rbuf;
++  b[pos++] = start;
++
++  va_start(ap, start);
++  while (1) {
++    unsigned int ins = *p++;
++    unsigned int action = (ins >> 20);
++    if (action >= DASM__MAX || (ins & 0xf)) {
++      ofs += 4;
++    } else {
++      ins >>= 4;
++      int *pl, n = action >= DASM_REL_PC ? va_arg(ap, int) : 0;
++      switch (action) {
++      case DASM_STOP: goto stop;
++      case DASM_SECTION:
++	n = (ins & 255); CK(n < D->maxsection, RANGE_SEC);
++	D->section = &D->sections[n]; goto stop;
++      case DASM_ESC: p++; ofs += 4; break;
++      case DASM_REL_EXT: break;
++      case DASM_ALIGN: ofs += (ins & 255); b[pos++] = ofs; break;
++      case DASM_REL_LG:
++	n = (ins & 2047) - 10; pl = D->lglabels + n;
++	/* Bkwd rel or global. */
++	if (n >= 0) { CK(n>=10||*pl<0, RANGE_LG); CKPL(lg, LG); goto putrel; }
++	pl += 10; n = *pl;
++	if (n < 0) n = 0;  /* Start new chain for fwd rel if label exists. */
++	goto linkrel;
++      case DASM_REL_PC:
++	pl = D->pclabels + n; CKPL(pc, PC);
++      putrel:
++	n = *pl;
++	if (n < 0) {  /* Label exists. Get label pos and store it. */
++	  b[pos] = -n;
++	} else {
++      linkrel:
++	  b[pos] = n;  /* Else link to rel chain, anchored at label. */
++	  *pl = pos;
++	}
++	pos++;
++	break;
++      case DASM_LABEL_LG:
++	pl = D->lglabels + (ins & 2047) - 10; CKPL(lg, LG); goto putlabel;
++      case DASM_LABEL_PC:
++	pl = D->pclabels + n; CKPL(pc, PC);
++      putlabel:
++	n = *pl;  /* n > 0: Collapse rel chain and replace with label pos. */
++	while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos;
++  }
++	*pl = -pos;  /* Label exists now. */
++	b[pos++] = ofs;  /* Store pass1 offset estimate. */
++	break;
++      case DASM_IMM:
++#ifdef DASM_CHECKS
++	CK((n & ((1<<((ins>>10)&31))-1)) == 0, RANGE_I);
++#endif
++	n >>= ((ins>>10)&31);
++#ifdef DASM_CHECKS
++	if (ins & 0x8000)
++	  CK(((n + (1<<(((ins>>5)&31)-1)))>>((ins>>5)&31)) == 0, RANGE_I);
++	else
++	  CK((n>>((ins>>5)&31)) == 0, RANGE_I);
++#endif
++	b[pos++] = n;
++	break;
++      case DASM_IMMS:
++#ifdef DASM_CHECKS
++        CK(dasm_imms(n) != 4096, RANGE_I);
++#endif
++	      b[pos++] = n;
++	      break;
++      }
++    }
++  }
++stop:
++  va_end(ap);
++  sec->pos = pos;
++  sec->ofs = ofs;
++}
++#undef CK
++
++/* Pass 2: Link sections, shrink aligns, fix label offsets. */
++int dasm_link(Dst_DECL, size_t *szp)
++{
++  dasm_State *D = Dst_REF;
++  int secnum;
++  int ofs = 0;
++
++#ifdef DASM_CHECKS
++  *szp = 0;
++  if (D->status != DASM_S_OK) return D->status;
++  {
++    int pc;
++    for (pc = 0; pc*sizeof(int) < D->pcsize; pc++)
++      if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc;
++  }
++#endif
++
++  { /* Handle globals not defined in this translation unit. */
++    int idx;
++    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
++      int n = D->lglabels[idx];
++      /* Undefined label: Collapse rel chain and replace with marker (< 0). */
++      while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
++    }
++  }
++
++  /* Combine all code sections. No support for data sections (yet). */
++  for (secnum = 0; secnum < D->maxsection; secnum++) {
++    dasm_Section *sec = D->sections + secnum;
++    int *b = sec->rbuf;
++    int pos = DASM_SEC2POS(secnum);
++    int lastpos = sec->pos;
++
++    while (pos != lastpos) {
++      dasm_ActList p = D->actionlist + b[pos++];
++      while (1) {
++	  unsigned int ins = *p++;
++	  unsigned int action = (ins >> 20);
++	  if (ins & 0xf) continue; else ins >>= 4;
++	  switch (action) {
++	  case DASM_STOP: case DASM_SECTION: goto stop;
++	  case DASM_ESC: p++; break;
++	  case DASM_REL_EXT: break;
++	  case DASM_ALIGN: ofs -= (b[pos++] + ofs) & (ins & 255); break;
++	  case DASM_REL_LG: case DASM_REL_PC: pos++; break;
++	  case DASM_LABEL_LG: case DASM_LABEL_PC: b[pos++] += ofs; break;
++	  case DASM_IMM: case DASM_IMMS: pos++; break;
++	  }
++      }
++      stop: (void)0;
++    }
++    ofs += sec->ofs;  /* Next section starts right after current section. */
++  }
++
++  D->codesize = ofs;  /* Total size of all code sections */
++  *szp = ofs;
++  return DASM_S_OK;
++}
++
++#ifdef DASM_CHECKS
++#define CK(x, st) \
++  do { if (!(x)) return DASM_S_##st|(int)(p-D->actionlist-1); } while (0)
++#else
++#define CK(x, st)	((void)0)
++#endif
++
++/* Pass 3: Encode all sections into buffer. Returns DASM_S_OK or an error. */
++int dasm_encode(Dst_DECL, void *buffer)
++{
++  dasm_State *D = Dst_REF;
++  char *base = (char *)buffer;
++  unsigned int *cp = (unsigned int *)buffer;
++  int secnum;
++
++  /* Encode all code sections. No support for data sections (yet). */
++  for (secnum = 0; secnum < D->maxsection; secnum++) {
++    dasm_Section *sec = D->sections + secnum;
++    int *b = sec->buf;
++    int *endb = sec->rbuf + sec->pos;
++
++    while (b != endb) {
++      dasm_ActList p = D->actionlist + *b++;
++      while (1) {
++	unsigned int ins = *p++;
++	if (ins & 0xf) { *cp++ = ins; continue; }  /* Plain instruction word. */
++	unsigned int action = (ins >> 20);
++	unsigned int val = (ins >> 4);
++	int n = (action >= DASM_ALIGN && action < DASM__MAX) ? *b++ : 0;
++	switch (action) {
++	case DASM_STOP: case DASM_SECTION: goto stop;
++	case DASM_ESC: *cp++ = *p++; break;
++	case DASM_REL_EXT:
++	  n = DASM_EXTERN(Dst, (unsigned char *)cp, (val & 2047), 1);
++	  goto patchrel;
++	case DASM_ALIGN:  /* Pad with the canonical NOP (addi x0, x0, 0). */
++	  val &= 255; while ((((char *)cp - base) & val)) *cp++ = 0x00000013;
++	  break;
++	case DASM_REL_LG:
++	  if (n < 0) {  /* Marker written by dasm_link(): resolve via globals. */
++	    n = (int)((ptrdiff_t)D->globals[-n-10] - (ptrdiff_t)cp + 4);
++	    goto patchrel;
++	  }
++	  /* fallthrough */
++	case DASM_REL_PC:
++	  CK(n >= 0, UNDEF_PC);
++	  n = *DASM_POS2PTR(D, n) - (int)((char *)cp - base) + 4;
++	patchrel:  /* n is relative to the already emitted instruction cp[-1]. */
++	  if (val & 2048) { /* B */
++	    CK((n & 1) == 0 && ((n + 0x1000) >> 13) == 0, RANGE_REL);
++	    cp[-1] |= ((n << 19) & 0x80000000) | ((n << 20) & 0x7e000000)
++	           |  ((n << 7)  & 0x00000f00) | ((n >> 4)  & 0x00000080);
++	  } else { /* J */
++	    CK((n & 1) == 0 && ((n+0x00100000) >> 21) == 0, RANGE_REL);
++	    cp[-1] |= ((n << 11) & 0x80000000) | ((n << 20) & 0x7fe00000)
++	           |  ((n << 9)  & 0x00100000) | (n & 0x000ff000);
++	  }
++	  break;
++	case DASM_LABEL_LG:
++	  val &= 2047; if (val >= 20) D->globals[val-20] = (void *)(base + n);
++	  break;
++	case DASM_LABEL_PC: break;
++	case DASM_IMM:
++	  cp[-1] |= (n & ((1<<((val>>5)&31))-1)) << (val&31);
++	  break;
++	case DASM_IMMS:  /* Split immediate: bits 11:5 -> 31:25, bits 4:0 -> 11:7. */
++	  cp[-1] |= (((n << 20) & 0xfe000000) | ((n << 7) & 0x00000f80));
++	  break;
++	default: *cp++ = ins; break;
++	}
++      }
++      stop: (void)0;
++    }
++  }
++
++  if (base + D->codesize != (char *)cp)  /* Check for phase errors. */
++    return DASM_S_PHASE;
++  return DASM_S_OK;
++}
++#undef CK
++
++/* Get PC label offset; -1 if referenced but undefined, -2 if unused/invalid. */
++int dasm_getpclabel(Dst_DECL, unsigned int pc)
++{
++  dasm_State *D = Dst_REF;
++  int lpos;
++  if (pc*sizeof(int) >= D->pcsize) return -2;  /* Beyond the PC label array. */
++  lpos = D->pclabels[pc];
++  if (lpos == 0) return -2;  /* Slot was never touched. */
++  if (lpos > 0) return -1;  /* Still a pending reference chain: undefined. */
++  return *DASM_POS2PTR(D, -lpos);  /* Defined: offset stored at the label pos. */
++}
++
++#ifdef DASM_CHECKS
++/* Optional sanity check between encoding steps; returns the current status. */
++int dasm_checkstep(Dst_DECL, int secmatch)
++{
++  dasm_State *D = Dst_REF;
++  int lbl;
++  if (D->status != DASM_S_OK) return D->status;  /* Keep the earlier error. */
++  /* Slots 1-9 hold forward local label chains; > 0 means still unresolved. */
++  for (lbl = 1; lbl <= 9; lbl++) {
++    if (D->lglabels[lbl] > 0) return (D->status = DASM_S_UNDEF_LG|lbl);
++    D->lglabels[lbl] = 0;
++  }
++  /* A negative secmatch disables the active-section check. */
++  if (secmatch >= 0 && D->section != &D->sections[secmatch])
++    D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections);
++  return D->status;
++}
++#endif
++
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv.lua
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv.lua
+@@ -0,0 +1,981 @@
++------------------------------------------------------------------------------
++-- DynASM RISC-V module.
++--
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
++-- See dynasm.lua for full copyright notice.
++------------------------------------------------------------------------------
++
++local riscv32 = riscv32
++local riscv64 = riscv64
++
++-- Module information:
++local _info = {
++  arch =	riscv32 and "riscv32" or riscv64 and "riscv64",
++  description =	"DynASM RISC-V module",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2022-07-12",
++  author =	"Mike Pall",
++  license =	"MIT",
++}
++
++-- Exported glue functions for the arch-specific module.
++local _M = { _info = _info }
++
++-- Cache library functions.
++local type, tonumber, pairs, ipairs = type, tonumber, pairs, ipairs
++local assert, setmetatable = assert, setmetatable
++local _s = string
++local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
++local match, gmatch = _s.match, _s.gmatch
++local concat, sort = table.concat, table.sort
++local bit = bit or require("bit")
++local band, shl, shr, sar = bit.band, bit.lshift, bit.rshift, bit.arshift
++local tohex = bit.tohex
++
++local function __orderedIndexGen(t)
++    local orderedIndex = {}
++    for key in pairs(t) do
++        table.insert(orderedIndex, key)
++    end
++    table.sort( orderedIndex )
++    return orderedIndex
++end
++
++local function __orderedNext(t, state)
++    -- Stateless iterator step for opairs(): yields the key that follows
++    -- `state` in sorted order. The sorted key list is cached in
++    -- t.__orderedIndex during the traversal and removed when it ends.
++    local key = nil
++    if state == nil then
++        t.__orderedIndex = __orderedIndexGen(t)
++        key = t.__orderedIndex[1]
++    else
++        local index = t.__orderedIndex
++        for i = 1, #index do
++            if index[i] == state then key = index[i+1]; break end
++        end
++    end
++
++    if key then
++        return key, t[key]
++    end
++
++    t.__orderedIndex = nil
++    return
++end
++
++local function opairs(t)
++    return __orderedNext, t, nil
++end
++
++-- Inherited tables and callbacks.
++local g_opt, g_arch
++local wline, werror, wfatal, wwarn
++
++-- Action name list.
++-- CHECK: Keep this in sync with the C code!
++local action_names = {
++  "STOP", "SECTION", "ESC", "REL_EXT",
++  "ALIGN", "REL_LG", "LABEL_LG",
++  "REL_PC", "LABEL_PC", "IMM", "IMMS",
++}
++
++-- Maximum number of section buffer positions for dasm_put().
++-- CHECK: Keep this in sync with the C code!
++local maxsecpos = 25 -- Keep this low, to avoid excessively long C lines.
++
++-- Action name -> action number.
++local map_action = {}
++for n,name in ipairs(action_names) do
++  map_action[name] = n-1
++end
++
++-- Action list buffer.
++local actlist = {}
++
++-- Argument list for next dasm_put(). Start with offset 0 into action list.
++local actargs = { 0 }
++
++-- Current number of section buffer positions for dasm_put().
++local secpos = 1
++
++------------------------------------------------------------------------------
++
++-- Dump action names and numbers.
++local function dumpactions(out)
++  out:write("DynASM encoding engine action codes:\n")
++  for n,name in ipairs(action_names) do
++    local num = map_action[name]
++    out:write(format("  %-10s %02X  %d\n", name, num, num))
++  end
++  out:write("\n")
++end
++
++-- Write action list buffer as a huge static C array (lone STOP if empty).
++local function writeactions(out, name)
++  local nn, last = #actlist, actlist[#actlist]
++  if nn == 0 then nn = 1; last = map_action.STOP end -- Never print a nil word.
++  out:write("static const unsigned int ", name, "[", nn, "] = {\n")
++  for i = 1,nn-1 do
++    assert(out:write("0x", tohex(actlist[i]), ",\n"))
++  end
++  assert(out:write("0x", tohex(last), "\n};\n\n"))
++end
++
++------------------------------------------------------------------------------
++
++-- Add word to action list.
++local function wputxw(n)
++  assert(n >= 0 and n <= 0xffffffff and n % 1 == 0, "word out of range")
++  actlist[#actlist+1] = n
++end
++
++-- Add action to list with optional arg. Advance buffer pos, too.
++local function waction(action, val, a, num)
++  local w = assert(map_action[action], "bad action name `"..action.."'")
++  wputxw(w * 0x100000 + (val or 0) * 16)
++  if a then actargs[#actargs+1] = a end
++  if a or num then secpos = secpos + (num or 1) end
++end
++
++-- Flush action list (intervening C code or buffer pos overflow).
++local function wflush(term)
++  if #actlist == actargs[1] then return end -- Nothing to flush.
++  if not term then waction("STOP") end -- Terminate action list.
++  wline(format("dasm_put(Dst, %s);", concat(actargs, ", ")), true)
++  actargs = { #actlist } -- Actionlist offset is 1st arg to next dasm_put().
++  secpos = 1 -- The actionlist offset occupies a buffer position, too.
++end
++
++-- Put escaped word.
++local function wputw(n)
++  if band(n, 0xf) == 0 then waction("ESC") end
++  wputxw(n)
++end
++
++-- Reserve position for word.
++local function wpos()
++  local pos = #actlist+1
++  actlist[pos] = ""
++  return pos
++end
++
++-- Store word to reserved position.
++local function wputpos(pos, n)
++  assert(n >= -0x80000000 and n <= 0xffffffff and n % 1 == 0, "word out of range")
++  actlist[pos] = n
++end
++
++------------------------------------------------------------------------------
++
++-- Global label name -> global label number. With auto assignment on 1st use.
++local next_global = 20
++local map_global = setmetatable({}, { __index = function(t, name)
++  if not match(name, "^[%a_][%w_]*$") then werror("bad global label") end
++  local n = next_global
++  if n > 2047 then werror("too many global labels") end
++  next_global = n + 1
++  t[name] = n
++  return n
++end})
++
++-- Dump global labels.
++local function dumpglobals(out, lvl)
++  local t = {}
++  for name, n in pairs(map_global) do t[n] = name end
++  out:write("Global labels:\n")
++  for i=20,next_global-1 do
++    out:write(format("  %s\n", t[i]))
++  end
++  out:write("\n")
++end
++
++-- Write global label enum.
++local function writeglobals(out, prefix)
++  local t = {}
++  for name, n in pairs(map_global) do t[n] = name end
++  out:write("enum {\n")
++  for i=20,next_global-1 do
++    out:write("  ", prefix, t[i], ",\n")
++  end
++  out:write("  ", prefix, "_MAX\n};\n")
++end
++
++-- Write global label names.
++local function writeglobalnames(out, name)
++  local t = {}
++  for name, n in pairs(map_global) do t[n] = name end
++  out:write("static const char *const ", name, "[] = {\n")
++  for i=20,next_global-1 do
++    out:write("  \"", t[i], "\",\n")
++  end
++  out:write("  (const char *)0\n};\n")
++end
++
++------------------------------------------------------------------------------
++
++-- Extern label name -> extern label number. With auto assignment on 1st use.
++local next_extern = 0
++local map_extern_ = {}
++local map_extern = setmetatable({}, { __index = function(t, name)
++  -- No restrictions on the name for now.
++  local n = next_extern
++  if n > 2047 then werror("too many extern labels") end
++  next_extern = n + 1
++  t[name] = n
++  map_extern_[n] = name
++  return n
++end})
++
++-- Dump extern labels.
++local function dumpexterns(out, lvl)
++  out:write("Extern labels:\n")
++  for i=0,next_extern-1 do
++    out:write(format("  %s\n", map_extern_[i]))
++  end
++  out:write("\n")
++end
++
++-- Write extern label names.
++local function writeexternnames(out, name)
++  out:write("static const char *const ", name, "[] = {\n")
++  for i=0,next_extern-1 do
++    out:write("  \"", map_extern_[i], "\",\n")
++  end
++  out:write("  (const char *)0\n};\n")
++end
++
++------------------------------------------------------------------------------
++
++-- Arch-specific maps.
++local map_archdef = {
++  ra = "x1", sp = "x2",
++} -- Ext. register name -> int. name.
++
++local map_type = {}		-- Type name -> { ctype, reg }
++local ctypenum = 0		-- Type number (for Dt... macros).
++
++-- Reverse defines for registers.
++function _M.revdef(s)
++  if s == "x1" then return "ra"
++  elseif s == "x2" then return "sp" end
++  return s
++end
++
++------------------------------------------------------------------------------
++
++-- Template strings for RISC-V instructions.
++local map_op = {}
++
++local map_op_rv32imafd = {
++
++  -- DASM pseudo-instrs
++  empty_0 = "ffffffff",
++  call_1 = "7fffffffJ",
++
++  -- RV32I
++  lui_2 = "00000037DU",
++  auipc_2 = "00000017DA",
++
++  jal_2  = "0000006fDJ",
++  jalr_3 = "00000067DRJ",
++  -- pseudo-instrs
++  j_1 = "0000006fJ",
++  jal_1 = "000000efJ",
++  jr_1 = "00000067R",
++  jalr_1 = "000000e7R",
++  jalr_2 = "000000e7RJ",
++
++  beq_3  = "00000063RrB",
++  bne_3  = "00001063RrB",
++  blt_3  = "00004063RrB",
++  bge_3  = "00005063RrB",
++  bltu_3 = "00006063RrB",
++  bgeu_3 = "00007063RrB",
++  -- pseudo-instrs
++  bnez_2 = "00001063RB",
++  beqz_2 = "00000063RB",
++  blez_2 = "00005063rB",
++  bgez_2 = "00005063RB",
++  bltz_2 = "00004063RB",
++  bgtz_2 = "00004063rB",
++  bgt_3 = "00004063rRB",
++  ble_3 = "00005063rRB",
++  bgtu_3 = "00006063rRB",
++  bleu_3 = "00007063rRB",
++
++  lb_2  = "00000003DL",
++  lh_2  = "00001003DL",
++  lw_2  = "00002003DL",
++  lbu_2 = "00004003DL",
++  lhu_2 = "00005003DL",
++
++  sb_2 = "00000023rS",
++  sh_2 = "00001023rS",
++  sw_2 = "00002023rS",
++
++  addi_3  = "00000013DRI",
++  slti_3  = "00002013DRI",
++  sltiu_3 = "00003013DRI",
++  xori_3 = "00004013DRI",
++  ori_3 = "00006013DRI",
++  andi_3 = "00007013DRI",
++  slli_3 = "00001013DRi",
++  srli_3 = "00005013DRi",
++  srai_3 = "40005013DRi",
++  -- pseudo-instrs
++  seqz_2 = "00103013DR",
++  ["zext.b_2"] = "0ff07013DR",
++
++  add_3 = "00000033DRr",
++  sub_3 = "40000033DRr",
++  sll_3 = "00001033DRr",
++  slt_3 = "00002033DRr",
++  sltu_3 = "00003033DRr",
++  xor_3 = "00004033DRr",
++  srl_3 = "00005033DRr",
++  sra_3 = "40005033DRr",
++  or_3 = "00006033DRr",
++  and_3 = "00007033DRr",
++  -- pseudo-instrs
++  snez_2 = "00003033Dr",
++  sltz_2 = "00002033DR",
++  sgtz_2 = "00002033Dr",
++
++  ecall_0 = "00000073",
++  ebreak_0 = "00100073",
++
++  nop_0 = "00000013",
++  li_2 = "00000013DI",
++  mv_2 = "00000013DR",
++  not_2 = "fff04013DR",
++  neg_2 = "40000033Dr",
++  ret_0 = "00008067",
++
++  -- RV32M
++  mul_3    = "02000033DRr",
++  mulh_3   = "02001033DRr",
++  mulhsu_3 = "02002033DRr",
++  mulhu_3  = "02003033DRr",
++  div_3  = "02004033DRr",
++  divu_3 = "02005033DRr",
++  rem_3  = "02006033DRr",
++  remu_3 = "02007033DRr",
++
++  -- RV32A
++  ["lr.w_2"] = "c0000053FR",
++  ["sc.w_2"] = "c0001053FRr",
++  ["amoswap.w_3"] = "c0002053FRr",
++  ["amoadd.w_3"] = "c0003053FRr",
++  ["amoxor.w_3"] = "c0004053FRr",
++  ["amoor.w_3"] = "c0005053FRr",
++  ["amoand.w_3"] = "c0006053FRr",
++  ["amomin.w_3"] = "c0007053FRr",
++  ["amomax.w_3"] = "c0008053FRr",
++  ["amominu.w_3"] = "c0009053FRr",
++  ["amomaxu.w_3"] = "c000a053FRr",
++
++  -- RV32F
++  ["flw_2"] = "00002007FL",
++  ["fsw_2"] = "00002027gS",
++
++  ["fmadd.s_4"]  = "00000043FGgH",
++  ["fmsub.s_4"]  = "00000047FGgH",
++  ["fnmsub.s_4"] = "0000004bFGgH",
++  ["fnmadd.s_4"] = "0000004fFGgH",
++  ["fmadd.s_5"]  = "00000043FGgHM",
++  ["fmsub.s_5"]  = "00000047FGgHM",
++  ["fnmsub.s_5"] = "0000004bFGgHM",
++  ["fnmadd.s_5"] = "0000004fFGgHM",
++
++  ["fadd.s_3"]  = "00000053FGg",
++  ["fsub.s_3"]  = "08000053FGg",
++  ["fmul.s_3"]  = "10000053FGg",
++  ["fdiv.s_3"]  = "18000053FGg",
++  ["fsqrt.s_2"] = "58000053FG",
++  ["fadd.s_4"]  = "00000053FGgM",
++  ["fsub.s_4"]  = "08000053FGgM",
++  ["fmul.s_4"]  = "10000053FGgM",
++  ["fdiv.s_4"]  = "18000053FGgM",
++  ["fsqrt.s_3"] = "58000053FGM",
++
++  ["fsgnj.s_3"]  = "20000053FGg",
++  ["fsgnjn.s_3"] = "20001053FGg",
++  ["fsgnjx.s_3"] = "20002053FGg",
++
++  ["fmin.s_3"] = "28000053FGg",
++  ["fmax.s_3"] = "28001053FGg",
++
++  ["fcvt.w.s_2"]  = "c0000053DG",
++  ["fcvt.wu.s_2"] = "c0100053DG",
++  ["fcvt.w.s_3"]  = "c0000053DGM",
++  ["fcvt.wu.s_3"] = "c0100053DGM",
++  ["fmv.x.w_2"] = "e0000053DG",
++
++  ["feq.s_3"] = "a0002053DGg",
++  ["flt.s_3"] = "a0001053DGg",
++  ["fle.s_3"] = "a0000053DGg",
++
++  ["fclass.s_2"] = "e0001053DG",
++
++  ["fcvt.s.w_2"]  = "d0000053FR",
++  ["fcvt.s.wu_2"] = "d0100053FR",
++  ["fcvt.s.w_3"]  = "d0000053FRM",
++  ["fcvt.s.wu_3"] = "d0100053FRM",
++  ["fmv.w.x_2"] = "f0000053FR",
++
++  -- RV32D
++  ["fld_2"] = "00003007FL",
++  ["fsd_2"] = "00003027gS",
++  
++  ["fmadd.d_4"]  = "02000043FGgH",
++  ["fmsub.d_4"]  = "02000047FGgH",
++  ["fnmsub.d_4"] = "0200004bFGgH",
++  ["fnmadd.d_4"] = "0200004fFGgH",
++  ["fmadd.d_5"]  = "02000043FGgHM",
++  ["fmsub.d_5"]  = "02000047FGgHM",
++  ["fnmsub.d_5"] = "0200004bFGgHM",
++  ["fnmadd.d_5"] = "0200004fFGgHM",
++
++  ["fadd.d_3"]  = "02000053FGg",
++  ["fsub.d_3"]  = "0a000053FGg",
++  ["fmul.d_3"]  = "12000053FGg",
++  ["fdiv.d_3"]  = "1a000053FGg",
++  ["fsqrt.d_2"] = "5a000053FG",
++  ["fadd.d_4"]  = "02000053FGgM",
++  ["fsub.d_4"]  = "0a000053FGgM",
++  ["fmul.d_4"]  = "12000053FGgM",
++  ["fdiv.d_4"]  = "1a000053FGgM",
++  ["fsqrt.d_3"] = "5a000053FGM",
++
++  ["fsgnj.d_3"]  = "22000053FGg",
++  ["fsgnjn.d_3"] = "22001053FGg",
++  ["fsgnjx.d_3"] = "22002053FGg",
++  ["fmin.d_3"] = "2a000053FGg",
++  ["fmax.d_3"] = "2a001053FGg",
++  ["fcvt.s.d_2"] = "40100053FG",
++  ["fcvt.d.s_2"] = "42000053FG",
++  ["feq.d_3"] = "a2002053DGg",
++  ["flt.d_3"] = "a2001053DGg",
++  ["fle.d_3"] = "a2000053DGg",
++  ["fclass.d_2"] = "e2001053DG",
++  ["fcvt.w.d_2"]  = "c2000053DG",
++  ["fcvt.wu.d_2"] = "c2100053DG",
++  ["fcvt.d.w_2"]  = "d2000053FR",
++  ["fcvt.d.wu_2"] = "d2100053FR",
++  ["fcvt.w.d_3"]  = "c2000053DGM",
++  ["fcvt.wu.d_3"] = "c2100053DGM",
++  ["fcvt.d.w_3"]  = "d2000053FRM",
++  ["fcvt.d.wu_3"] = "d2100053FRM",
++
++  ["fmv.d_2"] = "22000053FY",
++  ["fneg.d_2"] = "22001053FY",
++  ["fabs.d_2"] = "22002053FY",
++
++}
++
++local map_op_rv64imafd = {
++
++  -- RV64I
++  lwu_2 = "00006003DL",
++  ld_2  = "00003003DL",
++
++  sd_2 = "00003023rS",
++
++  slli_3 = "00001013DRj",
++  srli_3 = "00005013DRj",
++  srai_3 = "40005013DRj",
++
++  addiw_3 = "0000001bDRI",
++  slliw_3 = "0000101bDRi",
++  srliw_3 = "0000501bDRi",
++  sraiw_3 = "4000501bDRi",
++
++  addw_3 = "0000003bDRr",
++  subw_3 = "4000003bDRr",
++  sllw_3 = "0000103bDRr",
++  srlw_3 = "0000503bDRr",
++  sraw_3 = "4000503bDRr",
++
++  negw_2 = "4000003bDr",
++  ["sext.w_2"] = "0000001bDR",
++
++  -- RV64M
++  mulw_3  = "0200003bDRr",
++  divw_3  = "0200403bDRr",
++  divuw_3 = "0200503bDRr",
++  remw_3  = "0200603bDRr",
++  remuw_3 = "0200703bDRr",
++
++  -- RV64A
++  ["lr.d_2"] = "c2000053FR",
++  ["sc.d_2"] = "c2001053FRr",
++  ["amoswap.d_3"] = "c2002053FRr",
++  ["amoadd.d_3"] = "c2003053FRr",
++  ["amoxor.d_3"] = "c2004053FRr",
++  ["amoor.d_3"] = "c2005053FRr",
++  ["amoand.d_3"] = "c2006053FRr",
++  ["amomin.d_3"] = "c2007053FRr",
++  ["amomax.d_3"] = "c2008053FRr",
++  ["amominu.d_3"] = "c2009053FRr",
++  ["amomaxu.d_3"] = "c200a053FRr",
++
++  -- RV64F
++  ["fcvt.l.s_2"]  = "c0200053DG",
++  ["fcvt.lu.s_2"] = "c0300053DG",
++  ["fcvt.l.s_3"]  = "c0200053DGM",
++  ["fcvt.lu.s_3"] = "c0300053DGM",
++  ["fcvt.s.l_2"]  = "d0200053FR",
++  ["fcvt.s.lu_2"] = "d0300053FR",
++  ["fcvt.s.l_3"]  = "d0200053FRM",
++  ["fcvt.s.lu_3"] = "d0300053FRM",
++
++  -- RV64D
++  ["fcvt.l.d_2"]  = "c2200053DG",
++  ["fcvt.lu.d_2"] = "c2300053DG",
++  ["fcvt.l.d_3"]  = "c2200053DGM",
++  ["fcvt.lu.d_3"] = "c2300053DGM",
++  ["fmv.x.d_2"]   = "e2000053DG",
++  ["fcvt.d.l_2"]  = "d2200053FR",
++  ["fcvt.d.lu_2"] = "d2300053FR",
++  ["fcvt.d.l_3"]  = "d2200053FRM",
++  ["fcvt.d.lu_3"] = "d2300053FRM",
++  ["fmv.d.x_2"]   = "f2000053FR",
++
++}
++
++local map_op_zicsr = {
++  csrrw_3 = "00001073DCR",
++  csrrs_3 = "00002073DCR",
++  csrrc_3 = "00003073DCR",
++  csrrwi_3 = "00005073DCu",
++  csrrsi_3 = "00006073DCu",
++  csrrci_3 = "00007073DCu",
++
++  -- pseudo-ops
++  csrrw_2 = "00001073DC",
++  csrrs_2 = "00002073CR",
++  csrrc_2 = "00003073CR",
++  csrrwi_2 = "00005073Cu",
++  csrrsi_2 = "00006073Cu",
++  csrrci_2 = "00007073Cu",
++
++  rdinstret_1 = "C0202073D",
++  rdcycle_1 = "C0002073D",
++  rdtime_1 = "C0102073D",
++  rdinstreth_1 = "C8202073D",
++  rdcycleh_1 = "C8002073D",
++  rdtimeh_1 = "C8102073D",
++
++  frcsr_1 = "00302073D",
++  fscsr_2 = "00301073DR",
++  fscsr_1 = "00301073R",
++  frrm_1 = "00202073D",
++  fsrm_2 = "00201073DR",
++  fsrm_1 = "00201073R",
++  fsrmi_2 = "00205073Du",
++  fsrmi_1 = "00205073u",
++  frflags_1 = "00102073D",
++  fsflags_2 = "00101073DR",
++  fsflagsi_2 = "00105073Du",
++  fsflagsi_1 = "00105073u",
++}
++
++local map_op_zifencei = {
++  ["fence.i_3"] = "0000100fDRI",
++}
++
++local list_map_op_rv32 = { ['a'] = map_op_rv32imafd, ['b'] = map_op_zifencei, ['c'] = map_op_zicsr }
++local list_map_op_rv64 = { ['a'] = map_op_rv32imafd, ['b'] = map_op_rv64imafd, ['c'] = map_op_zifencei, ['d'] = map_op_zicsr }
++
++if riscv32 then for _, map in opairs(list_map_op_rv32) do
++  for k, v in pairs(map) do map_op[k] = v end
++  end
++end
++if riscv64 then for _, map in opairs(list_map_op_rv64) do
++  for k, v in pairs(map) do map_op[k] = v end
++  end
++end
++
++------------------------------------------------------------------------------
++
++local function parse_gpr(expr)
++  local tname, ovreg = match(expr, "^([%w_]+):(x[1-3]?[0-9])$")
++  local tp = map_type[tname or expr]
++  if tp then
++    local reg = ovreg or tp.reg
++    if not reg then
++      werror("type `"..(tname or expr).."' needs a register override")
++    end
++    expr = reg
++  end
++  local r = match(expr, "^x([1-3]?[0-9])$")
++  if r then
++    r = tonumber(r)
++    if r <= 31 then return r, tp end
++  end
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_fpr(expr)
++  -- Parse a floating-point register operand of the form "fN".
++  -- Returns N for N <= 31; every other spelling is reported as an error.
++  local num = tonumber(match(expr, "^f([1-3]?[0-9])$"))
++  if num and num <= 31 then return num end
++  -- Reached for non-matching names and for f32..f39.
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_imm(imm, bits, shift, scale, signed, action)
++  local n = tonumber(imm)
++  if n then
++    local m = sar(n, scale)
++    if shl(m, scale) == n then
++      if signed then
++          local s = sar(m, bits-1)
++          if s == 0 then return shl(m, shift)
++          elseif s == -1 then return shl(m + shl(1, bits), shift) end
++      else
++          if sar(m, bits) == 0 then return shl(m, shift) end
++      end
++    end
++    werror("out of range immediate `"..imm.."'")
++  elseif match(imm, "^[xf]([1-3]?[0-9])$") or
++           match(imm, "^([%w_]+):([xf][1-3]?[0-9])$") then
++    werror("expected immediate operand, got register")
++  else
++    waction(action or "IMM",
++        (signed and 32768 or 0)+shl(scale, 10)+shl(bits, 5)+shift, imm)
++    return 0
++  end
++end
++
++local function parse_csr(expr)
++  -- Parse a CSR operand written as a plain decimal number.
++  -- Returns the number when it is at most 4095; otherwise reports an error.
++  local csr = tonumber(match(expr, "^([1-4]?[0-9]?[0-9]?[0-9])$"))
++  if csr and csr <= 4095 then return csr end
++  -- Reached for non-numeric input and for values above 4095.
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_imms(imm) -- Encode a 12 bit signed store (S-type) immediate.
++  local n = tonumber(imm)
++  if n then
++    if n % 1 == 0 and n >= -2048 and n < 2048 then -- Whole numbers only.
++      local imm5, imm7 = band(n, 0x1f), shr(band(n, 0xfe0), 5)
++      return shl(imm5, 7) + shl(imm7, 25) -- Low 5 bits at bit 7, high 7 at bit 25.
++    end
++    werror("out of range immediate `"..imm.."'")
++  elseif match(imm, "^[xf]([1-3]?[0-9])$") or
++         match(imm, "^([%w_]+):([xf][1-3]?[0-9])$") then
++    werror("expected immediate operand, got register")
++  else
++    waction("IMMS", 0, imm); return 0 -- Not a literal: emit an IMMS action.
++  end
++end
++
++local function parse_rm(mode) -- Translate a rounding mode name to its number.
++  local codes = { rne = 0, rtz = 1, rdn = 2, rup = 3, rmm = 4, dyn = 7 }
++  local code = codes[mode]
++  if not code then
++    werror("bad rounding mode `"..mode.."'"); return
++  end
++  return code
++end
++
++local function parse_disp(disp, mode) -- Parse imm(reg) or type:reg plus field expr.
++  local imm, reg = match(disp, "^(.*)%(([%w_:]+)%)$") -- Form 1: imm(reg).
++  if imm then
++    local r = shl(parse_gpr(reg), 15) -- Base register goes into bits 15 and up.
++    local extname = match(imm, "^extern%s+(%S+)$")
++    if extname then
++      waction("REL_EXT", map_extern[extname], nil, 1) -- extern name(reg).
++      return r
++    else
++      if mode == "load" then
++        return r + parse_imm(imm, 12, 20, 0, true) -- Signed 12 bit field at bit 20.
++      elseif mode == "store" then
++        return r + parse_imms(imm) -- Store immediate is split by parse_imms.
++      else
++        werror("bad displacement mode '"..mode.."'")
++      end
++    end
++  end
++  local reg, tailr = match(disp, "^([%w_:]+)%s*(.*)$") -- Form 2: reg plus tail.
++  if reg and tailr ~= "" then
++    local r, tp = parse_gpr(reg)
++    if tp then -- Only accepted when parse_gpr reports a type for the register.
++      if mode == "load" then
++          waction("IMM", 32768+12*32+20, format(tp.ctypefmt, tailr)) -- Signed, 12 bits, shift 20.
++      elseif mode == "store" then
++          waction("IMMS", 0, format(tp.ctypefmt, tailr))
++      else
++        werror("bad displacement mode '"..mode.."'")
++      end
++      return shl(r, 15)
++    end
++  end
++  werror("bad displacement `"..disp.."'")
++end
++
++local function parse_label(label, def) -- Returns action mode, number[, expr].
++  local lead, rest = sub(label, 1, 2), sub(label, 3)
++  if lead == "=>" then
++    -- =>expr refers to a PC label.
++    return "PC", 0, rest
++  elseif lead == "->" then
++    -- ->name refers to a global label.
++    return "LG", map_global[rest]
++  elseif def then
++    -- A local label definition is a single digit 1-9, numbered 11-19.
++    if match(label, "^[1-9]$") then
++      return "LG", 10+tonumber(label)
++    end
++  else
++    -- A local label reference is <N or >N. Fwd: 1-9, Bkwd: 11-19.
++    local way, digit = match(label, "^([<>])([1-9])$")
++    if way == ">" then
++      return "LG", digit + 0
++    elseif way then
++      return "LG", digit + 10
++    end
++    -- extern name refers to an external symbol.
++    local sym = match(label, "^extern%s+(%S+)$")
++    if sym then
++      return "EXT", map_extern[sym]
++    end
++  end
++  werror("bad label `"..label.."'")
++end
++
++------------------------------------------------------------------------------
++
++-- Handle opcodes defined with template strings (8 hex digits + operand chars).
++map_op[".template__"] = function(params, template, nparams)
++  if not params then return sub(template, 9) end -- No params: return operand chars.
++  local op = tonumber(sub(template, 1, 8), 16) -- Base opcode from the hex prefix.
++  local n = 1 -- Index of the next operand in params.
++
++  -- Limit number of section buffer positions used by a single dasm_put().
++  -- A single opcode needs a maximum of 2 positions (ins/ext).
++  if secpos+2 > maxsecpos then wflush() end
++  local pos = wpos()
++
++  -- Process each character.
++  for p in gmatch(sub(template, 9), ".") do
++    if p == "D" then  -- gpr rd
++      op = op + shl(parse_gpr(params[n]), 7); n = n + 1
++    elseif p == "R" then  -- gpr rs1
++      op = op + shl(parse_gpr(params[n]), 15); n = n + 1
++    elseif p == "r" then  -- gpr rs2
++      op = op + shl(parse_gpr(params[n]), 20); n = n + 1
++    elseif p == "F" then  -- fpr rd
++      op = op + shl(parse_fpr(params[n]), 7); n = n + 1
++    elseif p == "G" then  -- fpr rs1
++      op = op + shl(parse_fpr(params[n]), 15); n = n + 1
++    elseif p == "g" then  -- fpr rs2
++      op = op + shl(parse_fpr(params[n]), 20); n = n + 1
++    elseif p == "H" then  -- fpr rs3
++      op = op + shl(parse_fpr(params[n]), 27); n = n + 1
++    elseif p == "C" then  -- csr
++      op = op + shl(parse_csr(params[n]), 20); n = n + 1
++    elseif p == "M" then  -- fpr rounding mode
++      op = op + shl(parse_rm(params[n]), 12); n = n + 1
++    elseif p == "Y" then  -- fpr pseudo-op (one register used as rs1 and rs2)
++      local r = parse_fpr(params[n])
++      op = op + shl(r, 15) + shl(r, 20); n = n + 1
++    elseif p == "I" then  -- I-type imm12
++      op = op + parse_imm(params[n], 12, 20, 0, true); n = n + 1
++    elseif p == "i" then  -- I-type shamt5
++      op = op + parse_imm(params[n], 5, 20, 0, false); n = n + 1
++    elseif p == "j" then  -- I-type shamt6
++      op = op + parse_imm(params[n], 6, 20, 0, false); n = n + 1
++    elseif p == "u" then  -- I-type uimm
++      op = op + parse_imm(params[n], 5, 15, 0, false); n = n + 1
++    elseif p == "U" then  -- U-type imm20
++      op = op + parse_imm(params[n], 20, 12, 0, false); n = n + 1
++    elseif p == "L" then  -- load
++      op = op + parse_disp(params[n], "load"); n = n + 1
++    elseif p == "S" then  -- store
++      op = op + parse_disp(params[n], "store"); n = n + 1
++    elseif p == "B" or p == "J" then  -- control flow
++      local mode, m, s = parse_label(params[n], false)
++      if p == "B" then m = m + 2048 end -- NOTE(review): presumably flags a branch for the encoder; confirm.
++      waction("REL_"..mode, m, s, 1); n = n + 1
++    elseif p == "A" then  -- AUIPC
++      local mode, m, s = parse_label(params[n], false)
++      waction("REL_"..mode, m, s, 1); n = n + 1
++    else
++      assert(false) -- Unknown template character.
++    end
++  end
++  wputpos(pos, op) -- Store the assembled opcode at the position taken above.
++end
++
++------------------------------------------------------------------------------
++
++-- Pseudo-opcode: the action list gets written out at this position.
++map_op[".actionlist_1"] = function(params)
++  if not params then return "cvar" end
++  local cvar = params[1] -- Taken verbatim, no syntax check.
++  wline(function(sink) writeactions(sink, cvar) end)
++end
++
++-- Pseudo-opcode: the global enum gets written out at this position.
++map_op[".globals_1"] = function(params)
++  if not params then return "prefix" end
++  local pfx = params[1] -- Taken verbatim, no syntax check.
++  wline(function(sink) writeglobals(sink, pfx) end)
++end
++
++-- Pseudo-opcode: the global names get written out at this position.
++map_op[".globalnames_1"] = function(params)
++  if not params then return "cvar" end
++  local cvar = params[1] -- Taken verbatim, no syntax check.
++  wline(function(sink) writeglobalnames(sink, cvar) end)
++end
++
++-- Pseudo-opcode: the extern names get written out at this position.
++map_op[".externnames_1"] = function(params)
++  if not params then return "cvar" end
++  local cvar = params[1] -- Taken verbatim, no syntax check.
++  wline(function(sink) writeexternnames(sink, cvar) end)
++end
++
++------------------------------------------------------------------------------
++
++-- Label definition pseudo-opcode (converted from trailing colon form).
++map_op[".label_1"] = function(params)
++  if not params then return "[1-9] | ->global | =>pcexpr" end
++  if maxsecpos < secpos+1 then wflush() end
++  local kind, num, expr = parse_label(params[1], true)
++  if kind == "EXT" then werror("bad label definition") end
++  waction("LABEL_"..kind, num, expr, 1)
++end
++
++------------------------------------------------------------------------------
++
++-- Pseudo-opcode for data storage: every immediate is emitted through wputw.
++map_op[".long_*"] = function(params)
++  if not params then return "imm..." end
++  for _,arg in ipairs(params) do
++    local word = tonumber(arg)
++    if not word then werror("bad immediate `"..arg.."'") end
++    -- Negative values are biased by 2^32 before being stored.
++    wputw(word < 0 and word + 2^32 or word)
++    if secpos+2 > maxsecpos then wflush() end
++  end
++end
++
++-- Alignment pseudo-opcode.
++map_op[".align_1"] = function(params)
++  if not params then return "numpow2" end
++  if secpos+1 > maxsecpos then wflush() end
++  local align = tonumber(params[1])
++  -- Must be a power of 2 in the range (2 ... 256):
++  -- halving at most 8 times has to arrive at exactly 1.
++  local rest, steps = align, 0
++  while rest and steps < 8 do
++    rest = rest / 2
++    steps = steps + 1
++    if rest == 1 then
++      waction("ALIGN", align-1, nil, 1) -- Action byte is 2**n-1.
++      return
++    end
++  end
++  werror("bad alignment")
++end
++
++------------------------------------------------------------------------------
++
++-- Pseudo-opcode for (primitive) type definitions (map to C types).
++map_op[".type_3"] = function(params, nparams)
++  if not params then
++    if nparams == 2 then return "name, ctype" end
++    return "name, ctype, reg"
++  end
++  local name, ctype = params[1], params[2]
++  -- The name has to look like an identifier and must not be taken yet.
++  if not match(name, "^[%a_][%w_]*$") then
++    werror("bad type name `"..name.."'")
++  end
++  if map_type[name] then
++    werror("duplicate type `"..name.."'")
++  end
++  -- Add #type to defines. A bit unclean to put it in map_archdef.
++  map_archdef["#"..name] = "sizeof("..ctype..")"
++  -- Register the type under the next free number and emit its Dt macro.
++  local id = ctypenum + 1
++  local entry = { ctype = ctype, reg = params[3] }
++  entry.ctypefmt = format("Dt%X(%%s)", id)
++  map_type[name] = entry
++  wline(format("#define Dt%X(_V) (int)(ptrdiff_t)&(((%s *)0)_V)", id, ctype))
++  ctypenum = id
++end
++-- The two-operand form (no register) shares the same handler.
++map_op[".type_2"] = map_op[".type_3"]
++
++-- Dump type definitions, sorted by name.
++local function dumptypes(out, lvl)
++  local names = {}
++  for key in pairs(map_type) do names[#names+1] = key end
++  sort(names)
++  out:write("Type definitions:\n")
++  for i=1,#names do
++    local key = names[i]
++    local info = map_type[key]
++    out:write(format("  %-20s %-20s %s\n", key, info.ctype, info.reg or ""))
++  end
++  out:write("\n")
++end
++
++------------------------------------------------------------------------------
++
++-- Switch to section `num'.
++_M.section = function(num)
++  waction("SECTION", num)
++  wflush(true) -- SECTION is a terminal action.
++end
++
++------------------------------------------------------------------------------
++
++-- Dump architecture description: a version banner, then the action names.
++_M.dumparch = function(out)
++  local banner = format("DynASM %s version %s, released %s\n\n",
++    _info.arch, _info.version, _info.release)
++  out:write(banner); dumpactions(out)
++end
++
++-- Dump all user defined elements.
++_M.dumpdef = function(out, lvl)
++  -- Types first, then globals, then externs.
++  local dumpers = { dumptypes, dumpglobals, dumpexterns }
++  for i=1,3 do dumpers[i](out, lvl) end
++end
++
++------------------------------------------------------------------------------
++
++-- Pass callbacks from/to the DynASM core.
++_M.passcb = function(wl, we, wf, ww)
++  wline, werror = wl, we; wfatal, wwarn = wf, ww
++  return wflush -- The flush function is the one callback handed back.
++end
++
++-- Setup the arch-specific module: keep both arguments in g_arch/g_opt.
++_M.setup = function(arch, opt)
++  g_arch = arch; g_opt = opt
++end
++
++-- Chain the maps: map_op falls back to core ops, map_def to the arch defines.
++_M.mergemaps = function(map_coreop, map_def)
++  setmetatable(map_def, { __index = map_archdef })
++  setmetatable(map_op, { __index = map_coreop })
++  return map_op, map_def
++end
++
++return _M
++
++------------------------------------------------------------------------------
++
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv32.lua
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv32.lua
+@@ -0,0 +1,12 @@
++------------------------------------------------------------------------------
++-- DynASM RISC-V 32 module.
++--
++-- Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++-- See dynasm.lua for full copyright notice.
++------------------------------------------------------------------------------
++-- This module just sets 32 bit mode for the combined RISC-V module.
++-- All the interesting stuff is there.
++------------------------------------------------------------------------------
++
++riscv32 = true -- Global 32 bit mode flag: ugly, but effective.
++return require("dasm_riscv") -- The combined RISC-V module is the actual result.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv64.lua
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_riscv64.lua
+@@ -0,0 +1,12 @@
++------------------------------------------------------------------------------
++-- DynASM RISC-V 64 module.
++--
++-- Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++-- See dynasm.lua for full copyright notice.
++------------------------------------------------------------------------------
++-- This module just sets 64 bit mode for the combined RISC-V module.
++-- All the interesting stuff is there.
++------------------------------------------------------------------------------
++
++riscv64 = true -- Global 64 bit mode flag: ugly, but effective.
++return require("dasm_riscv") -- The combined RISC-V module is the actual result.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_x64.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_x64.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_x64.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM x64 module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ ------------------------------------------------------------------------------
+ -- This module just sets 64 bit mode for the combined x86/x64 module.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_x86.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_x86.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_x86.h
+@@ -1,6 +1,6 @@
+ /*
+ ** DynASM x86 encoding engine.
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ ** Released under the MIT license. See dynasm.lua for full copyright notice.
+ */
+ 
+@@ -68,7 +68,7 @@ struct dasm_State {
+   size_t lgsize;
+   int *pclabels;		/* PC label chains/pos ptrs. */
+   size_t pcsize;
+-  void **globals;		/* Array of globals (bias -10). */
++  void **globals;		/* Array of globals. */
+   dasm_Section *section;	/* Pointer to active section. */
+   size_t codesize;		/* Total size of all code sections. */
+   int maxsection;		/* 0 <= sectionidx < maxsection. */
+@@ -85,7 +85,6 @@ void dasm_init(Dst_DECL, int maxsection)
+ {
+   dasm_State *D;
+   size_t psz = 0;
+-  int i;
+   Dst_REF = NULL;
+   DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
+   D = Dst_REF;
+@@ -96,12 +95,7 @@ void dasm_init(Dst_DECL, int maxsection)
+   D->pcsize = 0;
+   D->globals = NULL;
+   D->maxsection = maxsection;
+-  for (i = 0; i < maxsection; i++) {
+-    D->sections[i].buf = NULL;  /* Need this for pass3. */
+-    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
+-    D->sections[i].bsize = 0;
+-    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
+-  }
++  memset((void *)D->sections, 0, maxsection * sizeof(dasm_Section));
+ }
+ 
+ /* Free DynASM state. */
+@@ -121,7 +115,7 @@ void dasm_free(Dst_DECL)
+ void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
+ {
+   dasm_State *D = Dst_REF;
+-  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
++  D->globals = gl;
+   DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
+ }
+ 
+@@ -146,6 +140,7 @@ void dasm_setup(Dst_DECL, const void *ac
+   if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
+   for (i = 0; i < D->maxsection; i++) {
+     D->sections[i].pos = DASM_SEC2POS(i);
++    D->sections[i].rbuf = D->sections[i].buf - D->sections[i].pos;
+     D->sections[i].ofs = 0;
+   }
+ }
+@@ -239,8 +234,11 @@ void dasm_put(Dst_DECL, int start, ...)
+ 	}
+ 	pos++;
+ 	ofs += 4;  /* Maximum offset needed. */
+-	if (action == DASM_REL_LG || action == DASM_REL_PC)
++	if (action == DASM_REL_LG || action == DASM_REL_PC) {
+ 	  b[pos++] = ofs;  /* Store pass1 offset estimate. */
++	} else if (sizeof(ptrdiff_t) == 8) {
++	  ofs += 4;
++	}
+ 	break;
+       case DASM_LABEL_LG: pl = D->lglabels + *p++; CKPL(lg, LG); goto putlabel;
+       case DASM_LABEL_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
+@@ -365,10 +363,22 @@ int dasm_link(Dst_DECL, size_t *szp)
+   do { *((unsigned short *)cp) = (unsigned short)(x); cp+=2; } while (0)
+ #define dasmd(x) \
+   do { *((unsigned int *)cp) = (unsigned int)(x); cp+=4; } while (0)
++#define dasmq(x) \
++  do { *((unsigned long long *)cp) = (unsigned long long)(x); cp+=8; } while (0)
+ #else
+ #define dasmw(x)	do { dasmb(x); dasmb((x)>>8); } while (0)
+ #define dasmd(x)	do { dasmw(x); dasmw((x)>>16); } while (0)
++#define dasmq(x)	do { dasmd(x); dasmd((x)>>32); } while (0)
+ #endif
++static unsigned char *dasma_(unsigned char *cp, ptrdiff_t x)  /* Emit address-sized x at cp. */
++{
++  if (sizeof(ptrdiff_t) == 8)  /* Compile-time constant: picks the store width. */
++    dasmq((unsigned long long)x);
++  else
++    dasmd((unsigned int)x);
++  return cp;  /* Hand back the write pointer as advanced by the store macro. */
++}
++#define dasma(x)	(cp = dasma_(cp, (x)))
+ 
+ /* Pass 3: Encode sections. */
+ int dasm_encode(Dst_DECL, void *buffer)
+@@ -430,7 +440,7 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  break;
+ 	}
+ 	case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
+-	  b++; n = (int)(ptrdiff_t)D->globals[-n];
++	  b++; n = (int)(ptrdiff_t)D->globals[-n-10];
+ 	  /* fallthrough */
+ 	case DASM_REL_A: rel_a:
+ 	  n -= (unsigned int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */
+@@ -443,17 +453,18 @@ int dasm_encode(Dst_DECL, void *buffer)
+ 	  goto wb;
+ 	}
+ 	case DASM_IMM_LG:
+-	  p++; if (n < 0) { n = (int)(ptrdiff_t)D->globals[-n]; goto wd; }
++	  p++;
++	  if (n < 0) { dasma((ptrdiff_t)D->globals[-n-10]); break; }
+ 	  /* fallthrough */
+ 	case DASM_IMM_PC: {
+ 	  int *pb = DASM_POS2PTR(D, n);
+-	  n = *pb < 0 ? pb[1] : (*pb + (int)(ptrdiff_t)base);
+-	  goto wd;
++	  dasma(*pb < 0 ? (ptrdiff_t)pb[1] : (*pb + (ptrdiff_t)base));
++	  break;
+ 	}
+ 	case DASM_LABEL_LG: {
+ 	  int idx = *p++;
+ 	  if (idx >= 10)
+-	    D->globals[idx] = (void *)(base + (*p == DASM_SETLABEL ? *b : n));
++	    D->globals[idx-10] = (void *)(base + (*p == DASM_SETLABEL ? *b : n));
+ 	  break;
+ 	}
+ 	case DASM_LABEL_PC: case DASM_SETLABEL: break;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_x86.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dasm_x86.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dasm_x86.lua
+@@ -1,7 +1,7 @@
+ ------------------------------------------------------------------------------
+ -- DynASM x86/x64 module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See dynasm.lua for full copyright notice.
+ ------------------------------------------------------------------------------
+ 
+@@ -11,9 +11,9 @@ local x64 = x64
+ local _info = {
+   arch =	x64 and "x64" or "x86",
+   description =	"DynASM x86/x64 module",
+-  version =	"1.4.0",
+-  vernum =	 10400,
+-  release =	"2015-10-18",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
+   author =	"Mike Pall",
+   license =	"MIT",
+ }
+@@ -484,6 +484,22 @@ local function wputdarg(n)
+   end
+ end
+ 
++-- Emit a qword: literal numbers as 8 single bytes, anything else as two IMM_D args.
++local function wputqarg(n)
++  if type(n) ~= "number" then
++    -- Expression: low dword first, then the upper 32 bits.
++    waction("IMM_D", format("(unsigned int)(%s)", n))
++    waction("IMM_D", format("(unsigned int)((unsigned long long)(%s)>>32)", n))
++    return
++  end
++  -- This is only used for numbers from -2^31..2^32-1.
++  for sh=0,16,8 do wputb(band(shr(n, sh), 255)) end
++  wputb(shr(n, 24))
++  -- The upper dword only carries the sign.
++  local fill = n < 0 and 255 or 0
++  for _=1,4 do wputb(fill) end
++end
++
+ -- Put operand-size dependent number or arg (defaults to dword).
+ local function wputszarg(sz, n)
+   if not sz or sz == "d" or sz == "q" then wputdarg(n)
+@@ -663,10 +679,16 @@ local function opmodestr(op, args)
+ end
+ 
+ -- Convert number to valid integer or nil.
+-local function toint(expr)
++local function toint(expr, isqword)
+   local n = tonumber(expr)
+   if n then
+-    if n % 1 ~= 0 or n < -2147483648 or n > 4294967295 then
++    if n % 1 ~= 0 then
++      werror("not an integer number `"..expr.."'")
++    elseif isqword then
++      if n < -2147483648 or n > 2147483647 then
++	n = nil -- Handle it as an expression to avoid precision loss.
++      end
++    elseif n < -2147483648 or n > 4294967295 then
+       werror("bad integer number `"..expr.."'")
+     end
+     return n
+@@ -749,7 +771,7 @@ local function rtexpr(expr)
+ end
+ 
+ -- Parse operand and return { mode, opsize, reg, xreg, xsc, disp, imm }.
+-local function parseoperand(param)
++local function parseoperand(param, isqword)
+   local t = {}
+ 
+   local expr = param
+@@ -810,7 +832,7 @@ local function parseoperand(param)
+       if t.disp then break end
+ 
+       -- [reg+xreg...]
+-      local xreg, tailx = match(tailr, "^+%s*([@%w_:]+)%s*(.*)$")
++      local xreg, tailx = match(tailr, "^%+%s*([@%w_:]+)%s*(.*)$")
+       xreg, t.xreg, tp = rtexpr(xreg)
+       if not t.xreg then
+ 	-- [reg+-expr]
+@@ -837,7 +859,7 @@ local function parseoperand(param)
+       t.disp = dispexpr(tailx)
+     else
+       -- imm or opsize*imm
+-      local imm = toint(expr)
++      local imm = toint(expr, isqword)
+       if not imm and sub(expr, 1, 1) == "*" and t.opsize then
+ 	imm = toint(sub(expr, 2))
+ 	if imm then
+@@ -1952,7 +1974,7 @@ local function dopattern(pat, args, sz,
+ 	local a = args[narg]
+ 	narg = narg + 1
+ 	local mode, imm = a.mode, a.imm
+-	if mode == "iJ" and not match("iIJ", c) then
++	if mode == "iJ" and not match(x64 and "J" or "iIJ", c) then
+ 	  werror("bad operand size for label")
+ 	end
+ 	if c == "S" then
+@@ -2144,14 +2166,16 @@ end
+ local function op_data(params)
+   if not params then return "imm..." end
+   local sz = sub(params.op, 2, 2)
+-  if sz == "a" then sz = addrsize end
++  if sz == "l" then sz = "d" elseif sz == "a" then sz = addrsize end
+   for _,p in ipairs(params) do
+-    local a = parseoperand(p)
++    local a = parseoperand(p, sz == "q")
+     if sub(a.mode, 1, 1) ~= "i" or (a.opsize and a.opsize ~= sz) then
+       werror("bad mode or size in `"..p.."'")
+     end
+     if a.mode == "iJ" then
+       wputlabel("IMM_", a.imm, 1)
++    elseif sz == "q" then
++      wputqarg(a.imm)
+     else
+       wputszarg(sz, a.imm)
+     end
+@@ -2163,7 +2187,11 @@ map_op[".byte_*"] = op_data
+ map_op[".sbyte_*"] = op_data
+ map_op[".word_*"] = op_data
+ map_op[".dword_*"] = op_data
++map_op[".qword_*"] = op_data
+ map_op[".aword_*"] = op_data
++map_op[".long_*"] = op_data
++map_op[".quad_*"] = op_data
++map_op[".addr_*"] = op_data
+ 
+ ------------------------------------------------------------------------------
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dynasm.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/dynasm/dynasm.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/dynasm/dynasm.lua
+@@ -2,7 +2,7 @@
+ -- DynASM. A dynamic assembler for code generation engines.
+ -- Originally designed and implemented for LuaJIT.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- See below for full copyright notice.
+ ------------------------------------------------------------------------------
+ 
+@@ -10,14 +10,14 @@
+ local _info = {
+   name =	"DynASM",
+   description =	"A dynamic assembler for code generation engines",
+-  version =	"1.4.0",
+-  vernum =	 10400,
+-  release =	"2015-10-18",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
+   author =	"Mike Pall",
+   url =		"https://luajit.org/dynasm.html",
+   license =	"MIT",
+   copyright =	[[
+-Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/etc/luajit.1
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/etc/luajit.1
++++ wrk-4.2.0/obj/LuaJIT-2.1/etc/luajit.1
+@@ -74,7 +74,7 @@ luajit \-jv \-e "for i=1,10 do for j=1,1
+ Runs some nested loops and shows the resulting traces.
+ .SH COPYRIGHT
+ .PP
+-\fBLuaJIT\fR is Copyright \(co 2005-2021 Mike Pall.
++\fBLuaJIT\fR is Copyright \(co 2005-2023 Mike Pall.
+ .br
+ \fBLuaJIT\fR is open source software, released under the MIT license.
+ .SH SEE ALSO
+Index: wrk-4.2.0/obj/LuaJIT-2.1/etc/luajit.pc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/etc/luajit.pc
++++ wrk-4.2.0/obj/LuaJIT-2.1/etc/luajit.pc
+@@ -1,8 +1,8 @@
+ # Package information for LuaJIT to be used by pkg-config.
+ majver=2
+ minver=1
+-relver=0
+-version=${majver}.${minver}.${relver}-beta3
++relver=ROLLING
++version=${majver}.${minver}.${relver}
+ abiver=5.1
+ 
+ prefix=/usr/local
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/.gitignore
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/.gitignore
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/.gitignore
+@@ -1,4 +1,6 @@
+ luajit
++luajit.h
++luajit_relver.txt
+ lj_bcdef.h
+ lj_ffdef.h
+ lj_libdef.h
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/Makefile
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/Makefile
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/Makefile
+@@ -7,12 +7,11 @@
+ # Also works with MinGW and Cygwin on Windows.
+ # Please check msvcbuild.bat for building with MSVC on Windows.
+ #
+-# Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++# Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ ##############################################################################
+ 
+ MAJVER=  2
+ MINVER=  1
+-RELVER=  0
+ ABIVER=  5.1
+ NODOTABIVER= 51
+ 
+@@ -53,6 +52,7 @@ CCOPT_arm=
+ CCOPT_arm64=
+ CCOPT_ppc=
+ CCOPT_mips=
++CCOPT_riscv64=
+ #
+ CCDEBUG=
+ # Uncomment the next line to generate debug information:
+@@ -211,7 +211,7 @@ TARGET_CC= $(STATIC_CC)
+ TARGET_STCC= $(STATIC_CC)
+ TARGET_DYNCC= $(DYNAMIC_CC)
+ TARGET_LD= $(CROSS)$(CC)
+-TARGET_AR= $(CROSS)ar rcus 2>/dev/null
++TARGET_AR= $(CROSS)ar rcus
+ TARGET_STRIP= $(CROSS)strip
+ 
+ TARGET_LIBPATH= $(or $(PREFIX),/usr/local)/$(or $(MULTILIB),lib)
+@@ -234,7 +234,7 @@ TARGET_ALDFLAGS= $(LDOPTIONS) $(TARGET_X
+ TARGET_ASHLDFLAGS= $(LDOPTIONS) $(TARGET_XSHLDFLAGS) $(TARGET_FLAGS) $(TARGET_SHLDFLAGS)
+ TARGET_ALIBS= $(TARGET_XLIBS) $(LIBS) $(TARGET_LIBS)
+ 
+-TARGET_TESTARCH=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM)
++TARGET_TESTARCH:=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM)
+ ifneq (,$(findstring LJ_TARGET_X64 ,$(TARGET_TESTARCH)))
+   TARGET_LJARCH= x64
+ else
+@@ -268,6 +268,12 @@ ifneq (,$(findstring LJ_TARGET_MIPS ,$(T
+     TARGET_LJARCH= mips
+   endif
+ else
++ifneq (,$(findstring LJ_TARGET_RISCV32 ,$(TARGET_TESTARCH)))
++    TARGET_LJARCH= riscv32
++else
++ifneq (,$(findstring LJ_TARGET_RISCV64 ,$(TARGET_TESTARCH)))
++    TARGET_LJARCH= riscv64
++else
+   $(error Unsupported target architecture)
+ endif
+ endif
+@@ -275,6 +281,8 @@ endif
+ endif
+ endif
+ endif
++endif
++endif
+ 
+ ifneq (,$(findstring LJ_TARGET_PS3 1,$(TARGET_TESTARCH)))
+   TARGET_SYS= PS3
+@@ -311,6 +319,7 @@ ifeq (Windows,$(TARGET_SYS))
+   TARGET_XSHLDFLAGS= -shared -Wl,--out-implib,$(TARGET_DLLDOTANAME)
+   TARGET_DYNXLDOPTS=
+ else
++  TARGET_AR+= 2>/dev/null
+ ifeq (,$(shell $(TARGET_CC) -o /dev/null -c -x c /dev/null -fno-stack-protector 2>/dev/null || echo 1))
+   TARGET_XCFLAGS+= -fno-stack-protector
+ endif
+@@ -319,23 +328,27 @@ ifeq (Darwin,$(TARGET_SYS))
+     $(error missing: export MACOSX_DEPLOYMENT_TARGET=XX.YY)
+   endif
+   TARGET_STRIP+= -x
++  TARGET_XCFLAGS+= -DLUAJIT_UNWIND_EXTERNAL
+   TARGET_XSHLDFLAGS= -dynamiclib -single_module -undefined dynamic_lookup -fPIC
+   TARGET_DYNXLDOPTS=
+-  TARGET_XSHLDFLAGS+= -install_name $(TARGET_DYLIBPATH) -compatibility_version $(MAJVER).$(MINVER) -current_version $(MAJVER).$(MINVER).$(RELVER)
+-  ifeq (x64,$(TARGET_LJARCH))
+-    TARGET_XLDFLAGS+= -pagezero_size 10000 -image_base 100000000
+-    TARGET_XSHLDFLAGS+= -image_base 7fff04c4a000
+-  endif
++  TARGET_XSHLDFLAGS+= -install_name $(TARGET_DYLIBPATH) -compatibility_version $(MAJVER).$(MINVER) -current_version $(MAJVER).$(MINVER).255
+ else
+ ifeq (iOS,$(TARGET_SYS))
+   TARGET_STRIP+= -x
+   TARGET_XSHLDFLAGS= -dynamiclib -single_module -undefined dynamic_lookup -fPIC
+   TARGET_DYNXLDOPTS=
+-  TARGET_XSHLDFLAGS+= -install_name $(TARGET_DYLIBPATH) -compatibility_version $(MAJVER).$(MINVER) -current_version $(MAJVER).$(MINVER).$(RELVER)
++  TARGET_XSHLDFLAGS+= -install_name $(TARGET_DYLIBPATH) -compatibility_version $(MAJVER).$(MINVER) -current_version $(MAJVER).$(MINVER).255
+   ifeq (arm64,$(TARGET_LJARCH))
+     TARGET_XCFLAGS+= -fno-omit-frame-pointer
+   endif
+ else
++  ifeq (,$(findstring LJ_NO_UNWIND 1,$(TARGET_TESTARCH)))
++    # Find out whether the target toolchain always generates unwind tables.
++    TARGET_TESTUNWIND=$(shell exec 2>/dev/null; echo 'extern void b(void);int a(void){b();return 0;}' | $(TARGET_CC) -c -x c - -o tmpunwind.o && { grep -qa -e eh_frame -e __unwind_info tmpunwind.o || grep -qU -e eh_frame -e __unwind_info tmpunwind.o; } && echo E; rm -f tmpunwind.o)
++    ifneq (,$(findstring E,$(TARGET_TESTUNWIND)))
++      TARGET_XCFLAGS+= -DLUAJIT_UNWIND_EXTERNAL
++    endif
++  endif
+   ifneq (SunOS,$(TARGET_SYS))
+     ifneq (PS3,$(TARGET_SYS))
+       TARGET_XLDFLAGS+= -Wl,-E
+@@ -383,10 +396,11 @@ MINILUA_O= host/minilua.o
+ MINILUA_LIBS= -lm
+ MINILUA_T= host/minilua
+ MINILUA_X= $(MINILUA_T)
++MINILUA_DEP=
+ 
+ ifeq (,$(HOST_LUA))
+   HOST_LUA= $(MINILUA_X)
+-  DASM_DEP= $(MINILUA_T)
++  MINILUA_DEP= $(MINILUA_T)
+ endif
+ 
+ DASM_DIR= ../dynasm
+@@ -428,6 +442,10 @@ ifneq (,$(findstring LJ_NO_UNWIND 1,$(TA
+   DASM_AFLAGS+= -D NO_UNWIND
+   TARGET_ARCH+= -DLUAJIT_NO_UNWIND
+ endif
++ifneq (,$(findstring LJ_ABI_PAUTH 1,$(TARGET_TESTARCH)))
++  DASM_AFLAGS+= -D PAUTH
++  TARGET_ARCH+= -DLJ_ABI_PAUTH=1
++endif
+ DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH))))
+ ifeq (Windows,$(TARGET_SYS))
+   DASM_AFLAGS+= -D WIN
+@@ -459,12 +477,26 @@ ifeq (ppc,$(TARGET_LJARCH))
+     DASM_AFLAGS+= -D PPE -D TOC
+   endif
+ endif
++ifneq (,$(findstring LJ_TARGET_RISCV32 ,$(TARGET_TESTARCH)))
++  DASM_AFLAGS+= -D RISCV32
++endif
++ifneq (,$(findstring LJ_TARGET_RISCV64 ,$(TARGET_TESTARCH)))
++  DASM_AFLAGS+= -D RISCV64
++endif
+ endif
+ endif
+ 
+ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
+ DASM_DASC= vm_$(DASM_ARCH).dasc
+ 
++GIT= git
++ifeq (Windows,$(HOST_SYS)$(HOST_MSYS))
++  GIT_RELVER= if exist ..\.git ( $(GIT) show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++else
++  GIT_RELVER= [ -e ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || :
++endif
++GIT_DEP= $(wildcard ../.git/HEAD ../.git/refs/heads/*)
++
+ BUILDVM_O= host/buildvm.o host/buildvm_asm.o host/buildvm_peobj.o \
+ 	   host/buildvm_lib.o host/buildvm_fold.o
+ BUILDVM_T= host/buildvm
+@@ -479,13 +511,15 @@ LJVM_BOUT= $(LJVM_S)
+ LJVM_MODE= elfasm
+ 
+ LJLIB_O= lib_base.o lib_math.o lib_bit.o lib_string.o lib_table.o \
+-	 lib_io.o lib_os.o lib_package.o lib_debug.o lib_jit.o lib_ffi.o
++	 lib_io.o lib_os.o lib_package.o lib_debug.o lib_jit.o lib_ffi.o \
++	 lib_buffer.o
+ LJLIB_C= $(LJLIB_O:.o=.c)
+ 
+ LJCORE_O= lj_assert.o lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o lj_buf.o \
+ 	  lj_str.o lj_tab.o lj_func.o lj_udata.o lj_meta.o lj_debug.o \
+ 	  lj_prng.o lj_state.o lj_dispatch.o lj_vmevent.o lj_vmmath.o \
+-	  lj_strscan.o lj_strfmt.o lj_strfmt_num.o lj_api.o lj_profile.o \
++	  lj_strscan.o lj_strfmt.o lj_strfmt_num.o lj_serialize.o \
++	  lj_api.o lj_profile.o \
+ 	  lj_lex.o lj_parse.o lj_bcread.o lj_bcwrite.o lj_load.o \
+ 	  lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \
+ 	  lj_opt_dce.o lj_opt_loop.o lj_opt_split.o lj_opt_sink.o \
+@@ -509,8 +543,8 @@ LUAJIT_T= luajit
+ 
+ ALL_T= $(LUAJIT_T) $(LUAJIT_A) $(LUAJIT_SO) $(HOST_T)
+ ALL_HDRGEN= lj_bcdef.h lj_ffdef.h lj_libdef.h lj_recdef.h lj_folddef.h \
+-	    host/buildvm_arch.h
+-ALL_GEN= $(LJVM_S) $(ALL_HDRGEN) $(LIB_VMDEFP)
++	    host/buildvm_arch.h luajit.h
++ALL_GEN= $(LJVM_S) $(ALL_HDRGEN) luajit_relver.txt $(LIB_VMDEFP)
+ WIN_RM= *.obj *.lib *.exp *.dll *.exe *.manifest *.pdb *.ilk
+ ALL_RM= $(ALL_T) $(ALL_GEN) *.o host/*.o $(WIN_RM)
+ 
+@@ -634,7 +668,12 @@ $(MINILUA_T): $(MINILUA_O)
+ 	$(E) "HOSTLINK  $@"
+ 	$(Q)$(HOST_CC) $(HOST_ALDFLAGS) -o $@ $(MINILUA_O) $(MINILUA_LIBS) $(HOST_ALIBS)
+ 
+-host/buildvm_arch.h: $(DASM_DASC) $(DASM_DEP) $(DASM_DIR)/*.lua lj_arch.h lua.h luaconf.h
++luajit.h: $(MINILUA_DEP) $(GIT_DEP) luajit_rolling.h  # generated header (listed in ALL_HDRGEN); the recipe also rewrites luajit_relver.txt via GIT_RELVER
++	$(E) "VERSION   $@"
++	$(Q)$(GIT_RELVER)
++	$(Q)$(HOST_LUA) host/genversion.lua
++
++host/buildvm_arch.h: $(DASM_DASC) $(MINILUA_DEP) lj_arch.h lua.h luaconf.h
+ 	$(E) "DYNASM    $@"
+ 	$(Q)$(DASM) $(DASM_FLAGS) -o $@ $(DASM_DASC)
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/Makefile.dep
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/Makefile.dep
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/Makefile.dep
+@@ -1,15 +1,19 @@
+ lib_aux.o: lib_aux.c lua.h luaconf.h lauxlib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_err.h lj_errmsg.h lj_state.h lj_trace.h lj_jit.h lj_ir.h \
+- lj_dispatch.h lj_bc.h lj_traceerr.h lj_lib.h
++ lj_dispatch.h lj_bc.h lj_traceerr.h lj_lib.h lj_vmevent.h
+ lib_base.o: lib_base.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+- lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_str.h \
+- lj_tab.h lj_meta.h lj_state.h lj_frame.h lj_bc.h lj_ctype.h lj_cconv.h \
+- lj_ff.h lj_ffdef.h lj_dispatch.h lj_jit.h lj_ir.h lj_char.h lj_strscan.h \
+- lj_strfmt.h lj_lib.h lj_libdef.h
++ lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_buf.h \
++ lj_str.h lj_tab.h lj_meta.h lj_state.h lj_frame.h lj_bc.h lj_ctype.h \
++ lj_cconv.h lj_ff.h lj_ffdef.h lj_dispatch.h lj_jit.h lj_ir.h lj_char.h \
++ lj_strscan.h lj_strfmt.h lj_lib.h lj_libdef.h
+ lib_bit.o: lib_bit.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_strscan.h \
+  lj_strfmt.h lj_ctype.h lj_cdata.h lj_cconv.h lj_carith.h lj_ff.h \
+  lj_ffdef.h lj_lib.h lj_libdef.h
++lib_buffer.o: lib_buffer.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
++ lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_buf.h lj_str.h \
++ lj_tab.h lj_udata.h lj_meta.h lj_ctype.h lj_cdata.h lj_cconv.h \
++ lj_strfmt.h lj_serialize.h lj_lib.h lj_libdef.h
+ lib_debug.o: lib_debug.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_lib.h \
+  lj_libdef.h
+@@ -48,10 +52,10 @@ lj_api.o: lj_api.c lj_obj.h lua.h luacon
+  lj_meta.h lj_state.h lj_bc.h lj_frame.h lj_trace.h lj_jit.h lj_ir.h \
+  lj_dispatch.h lj_traceerr.h lj_vm.h lj_strscan.h lj_strfmt.h
+ lj_asm.o: lj_asm.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+- lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ctype.h lj_ir.h lj_jit.h \
+- lj_ircall.h lj_iropt.h lj_mcode.h lj_trace.h lj_dispatch.h lj_traceerr.h \
+- lj_snap.h lj_asm.h lj_vm.h lj_target.h lj_target_*.h lj_emit_*.h \
+- lj_asm_*.h
++ lj_buf.h lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ctype.h lj_ir.h \
++ lj_jit.h lj_ircall.h lj_iropt.h lj_mcode.h lj_trace.h lj_dispatch.h \
++ lj_traceerr.h lj_snap.h lj_asm.h lj_vm.h lj_target.h lj_target_*.h \
++ lj_prng.h lj_emit_*.h lj_asm_*.h
+ lj_assert.o: lj_assert.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h
+ lj_bc.o: lj_bc.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_bc.h \
+  lj_bcdef.h
+@@ -77,8 +81,8 @@ lj_ccallback.o: lj_ccallback.c lj_obj.h
+  lj_target_*.h lj_mcode.h lj_jit.h lj_ir.h lj_trace.h lj_dispatch.h \
+  lj_traceerr.h lj_vm.h
+ lj_cconv.o: lj_cconv.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+- lj_err.h lj_errmsg.h lj_tab.h lj_ctype.h lj_gc.h lj_cdata.h lj_cconv.h \
+- lj_ccallback.h
++ lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_tab.h lj_ctype.h \
++ lj_cdata.h lj_cconv.h lj_ccallback.h
+ lj_cdata.o: lj_cdata.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_tab.h lj_ctype.h lj_cconv.h lj_cdata.h
+ lj_char.o: lj_char.c lj_char.h lj_def.h lua.h luaconf.h
+@@ -110,32 +114,32 @@ lj_err.o: lj_err.c lj_obj.h lua.h luacon
+  lj_ff.h lj_ffdef.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h \
+  lj_traceerr.h lj_vm.h lj_strfmt.h
+ lj_ffrecord.o: lj_ffrecord.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+- lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ff.h \
+- lj_ffdef.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h lj_trace.h \
+- lj_dispatch.h lj_traceerr.h lj_record.h lj_ffrecord.h lj_crecord.h \
+- lj_vm.h lj_strscan.h lj_strfmt.h lj_recdef.h
++ lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_tab.h lj_frame.h \
++ lj_bc.h lj_ff.h lj_ffdef.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h \
++ lj_trace.h lj_dispatch.h lj_traceerr.h lj_record.h lj_ffrecord.h \
++ lj_crecord.h lj_vm.h lj_strscan.h lj_strfmt.h lj_serialize.h lj_recdef.h
+ lj_func.o: lj_func.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_func.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h lj_bc.h \
+  lj_traceerr.h lj_vm.h
+ lj_gc.o: lj_gc.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_buf.h lj_str.h lj_tab.h lj_func.h lj_udata.h \
+  lj_meta.h lj_state.h lj_frame.h lj_bc.h lj_ctype.h lj_cdata.h lj_trace.h \
+- lj_jit.h lj_ir.h lj_dispatch.h lj_traceerr.h lj_vm.h
++ lj_jit.h lj_ir.h lj_dispatch.h lj_traceerr.h lj_vm.h lj_vmevent.h
+ lj_gdbjit.o: lj_gdbjit.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_frame.h lj_bc.h lj_buf.h \
+  lj_str.h lj_strfmt.h lj_jit.h lj_ir.h lj_dispatch.h
+ lj_ir.o: lj_ir.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_buf.h lj_str.h lj_tab.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h \
+  lj_trace.h lj_dispatch.h lj_bc.h lj_traceerr.h lj_ctype.h lj_cdata.h \
+- lj_carith.h lj_vm.h lj_strscan.h lj_strfmt.h lj_prng.h
++ lj_carith.h lj_vm.h lj_strscan.h lj_serialize.h lj_strfmt.h lj_prng.h
+ lj_lex.o: lj_lex.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_buf.h lj_str.h lj_tab.h lj_ctype.h lj_cdata.h \
+  lualib.h lj_state.h lj_lex.h lj_parse.h lj_char.h lj_strscan.h \
+  lj_strfmt.h
+ lj_lib.o: lj_lib.c lauxlib.h lua.h luaconf.h lj_obj.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_bc.h \
+- lj_dispatch.h lj_jit.h lj_ir.h lj_vm.h lj_strscan.h lj_strfmt.h lj_lex.h \
+- lj_bcdump.h lj_lib.h
++ lj_dispatch.h lj_jit.h lj_ir.h lj_ctype.h lj_vm.h lj_strscan.h \
++ lj_strfmt.h lj_lex.h lj_bcdump.h lj_lib.h
+ lj_load.o: lj_load.c lua.h luaconf.h lauxlib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_buf.h lj_str.h lj_func.h \
+  lj_frame.h lj_bc.h lj_vm.h lj_lex.h lj_bcdump.h lj_parse.h
+@@ -170,15 +174,18 @@ lj_parse.o: lj_parse.c lj_obj.h lua.h lu
+  lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_buf.h lj_str.h lj_tab.h \
+  lj_func.h lj_state.h lj_bc.h lj_ctype.h lj_strfmt.h lj_lex.h lj_parse.h \
+  lj_vm.h lj_vmevent.h
++lj_prng.o: lj_prng.c lj_def.h lua.h luaconf.h lj_arch.h lj_prng.h
+ lj_profile.o: lj_profile.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_buf.h lj_gc.h lj_str.h lj_frame.h lj_bc.h lj_debug.h lj_dispatch.h \
+  lj_jit.h lj_ir.h lj_trace.h lj_traceerr.h lj_profile.h luajit.h
+-lj_prng.o: lj_prng.c lj_def.h lua.h luaconf.h lj_arch.h lj_prng.h
+ lj_record.o: lj_record.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_meta.h lj_frame.h lj_bc.h \
+  lj_ctype.h lj_gc.h lj_ff.h lj_ffdef.h lj_debug.h lj_ir.h lj_jit.h \
+  lj_ircall.h lj_iropt.h lj_trace.h lj_dispatch.h lj_traceerr.h \
+  lj_record.h lj_ffrecord.h lj_snap.h lj_vm.h lj_prng.h
++lj_serialize.o: lj_serialize.c lj_obj.h lua.h luaconf.h lj_def.h \
++ lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_tab.h \
++ lj_udata.h lj_ctype.h lj_cdata.h lj_ir.h lj_serialize.h
+ lj_snap.o: lj_snap.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_tab.h lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h \
+  lj_trace.h lj_dispatch.h lj_traceerr.h lj_snap.h lj_target.h \
+@@ -189,9 +196,10 @@ lj_state.o: lj_state.c lj_obj.h lua.h lu
+  lj_ir.h lj_dispatch.h lj_traceerr.h lj_vm.h lj_prng.h lj_lex.h \
+  lj_alloc.h luajit.h
+ lj_str.o: lj_str.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+- lj_err.h lj_errmsg.h lj_str.h lj_char.h
++ lj_err.h lj_errmsg.h lj_str.h lj_char.h lj_prng.h
+ lj_strfmt.o: lj_strfmt.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+- lj_buf.h lj_gc.h lj_str.h lj_state.h lj_char.h lj_strfmt.h
++ lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_meta.h lj_state.h \
++ lj_char.h lj_strfmt.h lj_ctype.h lj_lib.h
+ lj_strfmt_num.o: lj_strfmt_num.c lj_obj.h lua.h luaconf.h lj_def.h \
+  lj_arch.h lj_buf.h lj_gc.h lj_str.h lj_strfmt.h
+ lj_strscan.o: lj_strscan.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+@@ -204,7 +212,7 @@ lj_trace.o: lj_trace.c lj_obj.h lua.h lu
+  lj_dispatch.h lj_traceerr.h lj_snap.h lj_gdbjit.h lj_record.h lj_asm.h \
+  lj_vm.h lj_vmevent.h lj_target.h lj_target_*.h lj_prng.h
+ lj_udata.o: lj_udata.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+- lj_gc.h lj_udata.h
++ lj_gc.h lj_err.h lj_errmsg.h lj_udata.h
+ lj_vmevent.o: lj_vmevent.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_str.h lj_tab.h lj_state.h lj_dispatch.h lj_bc.h lj_jit.h lj_ir.h \
+  lj_vm.h lj_vmevent.h
+@@ -214,25 +222,25 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lau
+  lj_def.h lj_arch.h lj_gc.c lj_gc.h lj_err.h lj_errmsg.h lj_buf.h \
+  lj_str.h lj_tab.h lj_func.h lj_udata.h lj_meta.h lj_state.h lj_frame.h \
+  lj_bc.h lj_ctype.h lj_cdata.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h \
+- lj_traceerr.h lj_vm.h lj_err.c lj_debug.h lj_ff.h lj_ffdef.h lj_strfmt.h \
+- lj_char.c lj_char.h lj_bc.c lj_bcdef.h lj_obj.c lj_buf.c lj_str.c \
+- lj_tab.c lj_func.c lj_udata.c lj_meta.c lj_strscan.h lj_lib.h lj_debug.c \
+- lj_prng.c lj_prng.h lj_state.c lj_lex.h lj_alloc.h luajit.h \
+- lj_dispatch.c lj_ccallback.h lj_profile.h lj_vmevent.c lj_vmevent.h \
+- lj_vmmath.c lj_strscan.c lj_strfmt.c lj_strfmt_num.c lj_api.c \
+- lj_profile.c lj_lex.c lualib.h lj_parse.h lj_parse.c lj_bcread.c \
+- lj_bcdump.h lj_bcwrite.c lj_load.c lj_ctype.c lj_cdata.c lj_cconv.h \
+- lj_cconv.c lj_ccall.c lj_ccall.h lj_ccallback.c lj_target.h \
+- lj_target_*.h lj_mcode.h lj_carith.c lj_carith.h lj_clib.c lj_clib.h \
+- lj_cparse.c lj_cparse.h lj_lib.c lj_ir.c lj_ircall.h lj_iropt.h \
+- lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c lj_opt_dce.c \
+- lj_opt_loop.c lj_snap.h lj_opt_split.c lj_opt_sink.c lj_mcode.c \
+- lj_snap.c lj_record.c lj_record.h lj_ffrecord.h lj_crecord.c \
+- lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h lj_emit_*.h \
+- lj_asm_*.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c \
+- lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c \
+- lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c \
+- lib_init.c
++ lj_traceerr.h lj_vm.h lj_vmevent.h lj_err.c lj_debug.h lj_ff.h \
++ lj_ffdef.h lj_strfmt.h lj_char.c lj_char.h lj_bc.c lj_bcdef.h lj_obj.c \
++ lj_buf.c lj_str.c lj_prng.h lj_tab.c lj_func.c lj_udata.c lj_meta.c \
++ lj_strscan.h lj_lib.h lj_debug.c lj_prng.c lj_state.c lj_lex.h \
++ lj_alloc.h luajit.h lj_dispatch.c lj_ccallback.h lj_profile.h \
++ lj_vmevent.c lj_vmmath.c lj_strscan.c lj_strfmt.c lj_strfmt_num.c \
++ lj_serialize.c lj_serialize.h lj_api.c lj_profile.c lj_lex.c lualib.h \
++ lj_parse.h lj_parse.c lj_bcread.c lj_bcdump.h lj_bcwrite.c lj_load.c \
++ lj_ctype.c lj_cdata.c lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h \
++ lj_ccallback.c lj_target.h lj_target_*.h lj_mcode.h lj_carith.c \
++ lj_carith.h lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_ir.c \
++ lj_ircall.h lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h \
++ lj_opt_narrow.c lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c \
++ lj_opt_sink.c lj_mcode.c lj_snap.c lj_record.c lj_record.h lj_ffrecord.h \
++ lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h \
++ lj_emit_*.h lj_asm_*.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c \
++ lib_aux.c lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c \
++ lib_io.c lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c \
++ lib_ffi.c lib_buffer.c lib_init.c
+ luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h
+ host/buildvm.o: host/buildvm.c host/buildvm.h lj_def.h lua.h luaconf.h \
+  lj_arch.h lj_obj.h lj_def.h lj_arch.h lj_gc.h lj_obj.h lj_bc.h lj_ir.h \
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT VM builder.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** This is a tool to build the hand-tuned assembler code required for
+ ** LuaJIT's bytecode interpreter. It supports a variety of output formats
+@@ -18,8 +18,10 @@
+ #include "lj_obj.h"
+ #include "lj_gc.h"
+ #include "lj_bc.h"
++#if LJ_HASJIT
+ #include "lj_ir.h"
+ #include "lj_ircall.h"
++#endif
+ #include "lj_frame.h"
+ #include "lj_dispatch.h"
+ #if LJ_HASFFI
+@@ -65,6 +67,8 @@ static int collect_reloc(BuildCtx *ctx,
+ #include "../dynasm/dasm_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "../dynasm/dasm_mips.h"
++#elif LJ_TARGET_RISCV32 || LJ_TARGET_RISCV64
++#include "../dynasm/dasm_riscv.h"
+ #else
+ #error "No support for this architecture (yet)"
+ #endif
+@@ -250,6 +254,7 @@ BCDEF(BCNAME)
+   NULL
+ };
+ 
++#if LJ_HASJIT
+ const char *const ir_names[] = {
+ #define IRNAME(name, m, m1, m2)	#name,
+ IRDEF(IRNAME)
+@@ -290,7 +295,9 @@ static const char *const trace_errors[]
+ #include "lj_traceerr.h"
+   NULL
+ };
++#endif
+ 
++#if LJ_HASJIT
+ static const char *lower(char *buf, const char *s)
+ {
+   char *p = buf;
+@@ -301,6 +308,7 @@ static const char *lower(char *buf, cons
+   *p = '\0';
+   return buf;
+ }
++#endif
+ 
+ /* Emit C source code for bytecode-related definitions. */
+ static void emit_bcdef(BuildCtx *ctx)
+@@ -318,15 +326,19 @@ static void emit_bcdef(BuildCtx *ctx)
+ /* Emit VM definitions as Lua code for debug modules. */
+ static void emit_vmdef(BuildCtx *ctx)
+ {
++#if LJ_HASJIT
+   char buf[80];
++#endif
+   int i;
+   fprintf(ctx->fp, "-- This is a generated file. DO NOT EDIT!\n\n");
++  fprintf(ctx->fp, "assert(require(\"jit\").version == \"%s\", \"LuaJIT core/library version mismatch\")\n\n", LUAJIT_VERSION);
+   fprintf(ctx->fp, "return {\n\n");
+ 
+   fprintf(ctx->fp, "bcnames = \"");
+   for (i = 0; bc_names[i]; i++) fprintf(ctx->fp, "%-6s", bc_names[i]);
+   fprintf(ctx->fp, "\",\n\n");
+ 
++#if LJ_HASJIT
+   fprintf(ctx->fp, "irnames = \"");
+   for (i = 0; ir_names[i]; i++) fprintf(ctx->fp, "%-6s", ir_names[i]);
+   fprintf(ctx->fp, "\",\n\n");
+@@ -355,6 +367,7 @@ static void emit_vmdef(BuildCtx *ctx)
+   for (i = 0; trace_errors[i]; i++)
+     fprintf(ctx->fp, "\"%s\",\n", trace_errors[i]);
+   fprintf(ctx->fp, "},\n\n");
++#endif
+ }
+ 
+ /* -- Argument parsing ---------------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT VM builder.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _BUILDVM_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_asm.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm_asm.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_asm.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT VM builder: Assembler source code emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "buildvm.h"
+@@ -97,9 +97,15 @@ static void emit_asm_words(BuildCtx *ctx
+ #if LJ_TARGET_ARM64 && LJ_BE
+     ins = lj_bswap(ins);  /* ARM64 instructions are always little-endian. */
+ #endif
+-    if ((i & 15) == 0)
++    if ((i & 15) == 0) {
++#if LJ_TARGET_RISCV64
++      while (ins == 0xffffffffu) { i += 4; ins = *(uint32_t *)(p+i); }
++#endif
+       fprintf(ctx->fp, "\t.long 0x%08x", ins);
+-    else
++    } else
++#if LJ_TARGET_RISCV64
++    if (ins != 0xffffffffu)
++#endif
+       fprintf(ctx->fp, ",0x%08x", ins);
+     if ((i & 15) == 12) putc('\n', ctx->fp);
+   }
+@@ -156,6 +162,21 @@ static void emit_asm_wordreloc(BuildCtx
+ 	  "Error: unsupported opcode %08x for %s symbol relocation.\n",
+ 	  ins, sym);
+   exit(1);
++#elif LJ_TARGET_RISCV64
++  if (ins == 0x7fffffffu) {
++    fprintf(ctx->fp, "\tcall %s\n", sym);
++  } else if ((ins & 0x7f) == 0x17u) {
++    fprintf(ctx->fp, "\tauipc x%d, %s\n", (ins >> 7) & 31, sym);
++  } else if ((ins & 0x7f) == 0x67u) {
++    fprintf(ctx->fp, "\tjalr x%d, x%d, %s\n", (ins >> 7) & 31, (ins >> 15) & 31, sym);
++  } else if ((ins & 0x7f) == 0x6fu) {
++    fprintf(ctx->fp, "\tjal x%d, %s\n", (ins >> 7) & 31, sym);
++  } else {
++    fprintf(stderr,
++  	    "Error: unsupported opcode %08x for %s symbol relocation.\n",
++  	    ins, sym);
++    exit(1);
++  }
+ #else
+ #error "missing relocation support for this architecture"
+ #endif
+@@ -243,6 +264,15 @@ void emit_asm(BuildCtx *ctx)
+ 
+   fprintf(ctx->fp, "\t.file \"buildvm_%s.dasc\"\n", ctx->dasm_arch);
+   fprintf(ctx->fp, "\t.text\n");
++#if LJ_TARGET_MIPS32 && !LJ_ABI_SOFTFP
++  fprintf(ctx->fp, "\t.module fp=32\n");
++#endif
++#if LJ_TARGET_MIPS
++  fprintf(ctx->fp, "\t.set nomips16\n\t.abicalls\n\t.set noreorder\n\t.set nomacro\n");
++#endif
++#if LJ_TARGET_RISCV64
++  fprintf(ctx->fp, ".option arch, -c\n.option norelax\n");
++#endif
+   emit_asm_align(ctx, 4);
+ 
+ #if LJ_TARGET_PS3
+@@ -269,9 +299,6 @@ void emit_asm(BuildCtx *ctx)
+ 	  ".pad #28\n");
+ #endif
+ #endif
+-#if LJ_TARGET_MIPS
+-  fprintf(ctx->fp, ".set nomips16\n.abicalls\n.set noreorder\n.set nomacro\n");
+-#endif
+ 
+   for (i = rel = 0; i < ctx->nsym; i++) {
+     int32_t ofs = ctx->sym[i].ofs;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_fold.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm_fold.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_fold.c
+@@ -1,10 +1,11 @@
+ /*
+ ** LuaJIT VM builder: IR folding hash table generator.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "buildvm.h"
+ #include "lj_obj.h"
++#if LJ_HASJIT
+ #include "lj_ir.h"
+ 
+ /* Context for the folding hash table generator. */
+@@ -226,4 +227,10 @@ void emit_fold(BuildCtx *ctx)
+ 
+   makehash(ctx);
+ }
++#else  /* !LJ_HASJIT */
++void emit_fold(BuildCtx *ctx)
++{
++  UNUSED(ctx);  /* Stub: this build has no JIT compiler, so nothing is emitted. */
++}
++#endif  /* LJ_HASJIT */
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_lib.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm_lib.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_lib.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT VM builder: library definition compiler.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "buildvm.h"
+@@ -379,12 +379,21 @@ void emit_lib(BuildCtx *ctx)
+       /* Simplistic pre-processor. Only handles top-level #if/#endif. */
+       if (buf[0] == '#' && buf[1] == 'i' && buf[2] == 'f') {
+ 	int ok = 1;
+-	if (!strcmp(buf, "#if LJ_52\n"))
++	size_t len = strlen(buf);
++	if (buf[len-1] == '\n') {
++	  buf[len-1] = 0;
++	  if (buf[len-2] == '\r') {
++	    buf[len-2] = 0;
++	  }
++	}
++	if (!strcmp(buf, "#if LJ_52"))
+ 	  ok = LJ_52;
+-	else if (!strcmp(buf, "#if LJ_HASJIT\n"))
++	else if (!strcmp(buf, "#if LJ_HASJIT"))
+ 	  ok = LJ_HASJIT;
+-	else if (!strcmp(buf, "#if LJ_HASFFI\n"))
++	else if (!strcmp(buf, "#if LJ_HASFFI"))
+ 	  ok = LJ_HASFFI;
++	else if (!strcmp(buf, "#if LJ_HASBUFFER"))
++	  ok = LJ_HASBUFFER;
+ 	if (!ok) {
+ 	  int lvl = 1;
+ 	  while (fgets(buf, sizeof(buf), fp) != NULL) {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_libbc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm_libbc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_libbc.h
+@@ -4,42 +4,67 @@ static const int libbc_endian = 0;
+ 
+ static const uint8_t libbc_code[] = {
+ #if LJ_FR2
+-0,1,2,0,0,1,2,24,1,0,0,76,1,2,0,241,135,158,166,3,220,203,178,130,4,0,1,2,0,
+-0,1,2,24,1,0,0,76,1,2,0,243,244,148,165,20,198,190,199,252,3,0,1,2,0,0,0,3,
+-16,0,5,0,21,1,0,0,76,1,2,0,0,2,10,0,0,0,15,16,0,12,0,16,1,9,0,41,2,1,0,21,3,
+-0,0,41,4,1,0,77,2,8,128,18,6,1,0,18,8,5,0,59,9,5,0,66,6,3,2,10,6,0,0,88,7,1,
+-128,76,6,2,0,79,2,248,127,75,0,1,0,0,2,11,0,0,0,16,16,0,12,0,16,1,9,0,43,2,
+-0,0,18,3,0,0,41,4,0,0,88,5,7,128,18,7,1,0,18,9,5,0,18,10,6,0,66,7,3,2,10,7,
+-0,0,88,8,1,128,76,7,2,0,70,5,3,3,82,5,247,127,75,0,1,0,0,1,2,0,0,0,3,16,0,12,
+-0,21,1,0,0,76,1,2,0,0,2,10,0,0,2,30,16,0,12,0,21,2,0,0,11,1,0,0,88,3,7,128,
+-8,2,0,0,88,3,23,128,59,3,2,0,43,4,0,0,64,4,2,0,76,3,2,0,88,3,18,128,16,1,14,
+-0,41,3,1,0,3,3,1,0,88,3,14,128,3,1,2,0,88,3,12,128,59,3,1,0,22,4,1,1,18,5,2,
+-0,41,6,1,0,77,4,4,128,23,8,1,7,59,9,7,0,64,9,8,0,79,4,252,127,43,4,0,0,64,4,
+-2,0,76,3,2,0,75,0,1,0,0,2,0,5,12,0,0,0,35,16,0,12,0,16,1,14,0,16,2,14,0,16,
+-3,14,0,11,4,0,0,88,5,1,128,18,4,0,0,16,4,12,0,3,1,2,0,88,5,24,128,33,5,1,3,
+-0,2,3,0,88,6,4,128,2,3,1,0,88,6,2,128,4,4,0,0,88,6,9,128,18,6,1,0,18,7,2,0,
+-41,8,1,0,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,6,252,127,88,6,8,128,
+-18,6,2,0,18,7,1,0,41,8,255,255,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,
+-6,252,127,76,4,2,0,0
++/* math.deg */ 0,1,2,0,0,1,2,BC_MULVN,1,0,0,BC_RET1,1,2,0,241,135,158,166,3,
++220,203,178,130,4,
++/* math.rad */ 0,1,2,0,0,1,2,BC_MULVN,1,0,0,BC_RET1,1,2,0,243,244,148,165,20,
++198,190,199,252,3,
++/* string.len */ 0,1,2,0,0,0,3,BC_ISTYPE,0,5,0,BC_LEN,1,0,0,BC_RET1,1,2,0,
++/* table.foreachi */ 0,2,10,0,0,0,15,BC_ISTYPE,0,12,0,BC_ISTYPE,1,9,0,
++BC_KSHORT,2,1,0,BC_LEN,3,0,0,BC_KSHORT,4,1,0,BC_FORI,2,8,128,BC_MOV,6,1,0,
++BC_MOV,8,5,0,BC_TGETR,9,5,0,BC_CALL,6,3,2,BC_ISEQP,6,0,0,BC_JMP,7,1,128,
++BC_RET1,6,2,0,BC_FORL,2,248,127,BC_RET0,0,1,0,
++/* table.foreach */ 0,2,11,0,0,1,16,BC_ISTYPE,0,12,0,BC_ISTYPE,1,9,0,BC_KPRI,
++2,0,0,BC_MOV,3,0,0,BC_KNUM,4,0,0,BC_JMP,5,7,128,BC_MOV,7,1,0,BC_MOV,9,5,0,
++BC_MOV,10,6,0,BC_CALL,7,3,2,BC_ISEQP,7,0,0,BC_JMP,8,1,128,BC_RET1,7,2,0,
++BC_ITERN,5,3,3,BC_ITERL,5,247,127,BC_RET0,0,1,0,1,255,255,249,255,15,
++/* table.getn */ 0,1,2,0,0,0,3,BC_ISTYPE,0,12,0,BC_LEN,1,0,0,BC_RET1,1,2,0,
++/* table.remove */ 0,2,10,0,0,2,30,BC_ISTYPE,0,12,0,BC_LEN,2,0,0,BC_ISNEP,1,0,
++0,BC_JMP,3,7,128,BC_ISEQN,2,0,0,BC_JMP,3,23,128,BC_TGETR,3,2,0,BC_KPRI,4,0,0,
++BC_TSETR,4,2,0,BC_RET1,3,2,0,BC_JMP,3,18,128,BC_ISTYPE,1,14,0,BC_KSHORT,3,1,0,
++BC_ISGT,3,1,0,BC_JMP,3,14,128,BC_ISGT,1,2,0,BC_JMP,3,12,128,BC_TGETR,3,1,0,
++BC_ADDVN,4,1,1,BC_MOV,5,2,0,BC_KSHORT,6,1,0,BC_FORI,4,4,128,BC_SUBVN,8,1,7,
++BC_TGETR,9,7,0,BC_TSETR,9,8,0,BC_FORL,4,252,127,BC_KPRI,4,0,0,BC_TSETR,4,2,0,
++BC_RET1,3,2,0,BC_RET0,0,1,0,0,2,
++/* table.move */ 0,5,12,0,0,0,35,BC_ISTYPE,0,12,0,BC_ISTYPE,1,14,0,BC_ISTYPE,
++2,14,0,BC_ISTYPE,3,14,0,BC_ISNEP,4,0,0,BC_JMP,5,1,128,BC_MOV,4,0,0,BC_ISTYPE,
++4,12,0,BC_ISGT,1,2,0,BC_JMP,5,24,128,BC_SUBVV,5,1,3,BC_ISLT,2,3,0,BC_JMP,6,4,
++128,BC_ISLE,3,1,0,BC_JMP,6,2,128,BC_ISEQV,4,0,0,BC_JMP,6,9,128,BC_MOV,6,1,0,
++BC_MOV,7,2,0,BC_KSHORT,8,1,0,BC_FORI,6,4,128,BC_ADDVV,10,5,9,BC_TGETR,11,9,0,
++BC_TSETR,11,10,4,BC_FORL,6,252,127,BC_JMP,6,8,128,BC_MOV,6,2,0,BC_MOV,7,1,0,
++BC_KSHORT,8,255,255,BC_FORI,6,4,128,BC_ADDVV,10,5,9,BC_TGETR,11,9,0,BC_TSETR,
++11,10,4,BC_FORL,6,252,127,BC_RET1,4,2,0,
+ #else
+-0,1,2,0,0,1,2,24,1,0,0,76,1,2,0,241,135,158,166,3,220,203,178,130,4,0,1,2,0,
+-0,1,2,24,1,0,0,76,1,2,0,243,244,148,165,20,198,190,199,252,3,0,1,2,0,0,0,3,
+-16,0,5,0,21,1,0,0,76,1,2,0,0,2,9,0,0,0,15,16,0,12,0,16,1,9,0,41,2,1,0,21,3,
+-0,0,41,4,1,0,77,2,8,128,18,6,1,0,18,7,5,0,59,8,5,0,66,6,3,2,10,6,0,0,88,7,1,
+-128,76,6,2,0,79,2,248,127,75,0,1,0,0,2,10,0,0,0,16,16,0,12,0,16,1,9,0,43,2,
+-0,0,18,3,0,0,41,4,0,0,88,5,7,128,18,7,1,0,18,8,5,0,18,9,6,0,66,7,3,2,10,7,0,
+-0,88,8,1,128,76,7,2,0,70,5,3,3,82,5,247,127,75,0,1,0,0,1,2,0,0,0,3,16,0,12,
+-0,21,1,0,0,76,1,2,0,0,2,10,0,0,2,30,16,0,12,0,21,2,0,0,11,1,0,0,88,3,7,128,
+-8,2,0,0,88,3,23,128,59,3,2,0,43,4,0,0,64,4,2,0,76,3,2,0,88,3,18,128,16,1,14,
+-0,41,3,1,0,3,3,1,0,88,3,14,128,3,1,2,0,88,3,12,128,59,3,1,0,22,4,1,1,18,5,2,
+-0,41,6,1,0,77,4,4,128,23,8,1,7,59,9,7,0,64,9,8,0,79,4,252,127,43,4,0,0,64,4,
+-2,0,76,3,2,0,75,0,1,0,0,2,0,5,12,0,0,0,35,16,0,12,0,16,1,14,0,16,2,14,0,16,
+-3,14,0,11,4,0,0,88,5,1,128,18,4,0,0,16,4,12,0,3,1,2,0,88,5,24,128,33,5,1,3,
+-0,2,3,0,88,6,4,128,2,3,1,0,88,6,2,128,4,4,0,0,88,6,9,128,18,6,1,0,18,7,2,0,
+-41,8,1,0,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,6,252,127,88,6,8,128,
+-18,6,2,0,18,7,1,0,41,8,255,255,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,
+-6,252,127,76,4,2,0,0
++/* math.deg */ 0,1,2,0,0,1,2,BC_MULVN,1,0,0,BC_RET1,1,2,0,241,135,158,166,3,
++220,203,178,130,4,
++/* math.rad */ 0,1,2,0,0,1,2,BC_MULVN,1,0,0,BC_RET1,1,2,0,243,244,148,165,20,
++198,190,199,252,3,
++/* string.len */ 0,1,2,0,0,0,3,BC_ISTYPE,0,5,0,BC_LEN,1,0,0,BC_RET1,1,2,0,
++/* table.foreachi */ 0,2,9,0,0,0,15,BC_ISTYPE,0,12,0,BC_ISTYPE,1,9,0,
++BC_KSHORT,2,1,0,BC_LEN,3,0,0,BC_KSHORT,4,1,0,BC_FORI,2,8,128,BC_MOV,6,1,0,
++BC_MOV,7,5,0,BC_TGETR,8,5,0,BC_CALL,6,3,2,BC_ISEQP,6,0,0,BC_JMP,7,1,128,
++BC_RET1,6,2,0,BC_FORL,2,248,127,BC_RET0,0,1,0,
++/* table.foreach */ 0,2,10,0,0,1,16,BC_ISTYPE,0,12,0,BC_ISTYPE,1,9,0,BC_KPRI,
++2,0,0,BC_MOV,3,0,0,BC_KNUM,4,0,0,BC_JMP,5,7,128,BC_MOV,7,1,0,BC_MOV,8,5,0,
++BC_MOV,9,6,0,BC_CALL,7,3,2,BC_ISEQP,7,0,0,BC_JMP,8,1,128,BC_RET1,7,2,0,
++BC_ITERN,5,3,3,BC_ITERL,5,247,127,BC_RET0,0,1,0,1,255,255,249,255,15,
++/* table.getn */ 0,1,2,0,0,0,3,BC_ISTYPE,0,12,0,BC_LEN,1,0,0,BC_RET1,1,2,0,
++/* table.remove */ 0,2,10,0,0,2,30,BC_ISTYPE,0,12,0,BC_LEN,2,0,0,BC_ISNEP,1,0,
++0,BC_JMP,3,7,128,BC_ISEQN,2,0,0,BC_JMP,3,23,128,BC_TGETR,3,2,0,BC_KPRI,4,0,0,
++BC_TSETR,4,2,0,BC_RET1,3,2,0,BC_JMP,3,18,128,BC_ISTYPE,1,14,0,BC_KSHORT,3,1,0,
++BC_ISGT,3,1,0,BC_JMP,3,14,128,BC_ISGT,1,2,0,BC_JMP,3,12,128,BC_TGETR,3,1,0,
++BC_ADDVN,4,1,1,BC_MOV,5,2,0,BC_KSHORT,6,1,0,BC_FORI,4,4,128,BC_SUBVN,8,1,7,
++BC_TGETR,9,7,0,BC_TSETR,9,8,0,BC_FORL,4,252,127,BC_KPRI,4,0,0,BC_TSETR,4,2,0,
++BC_RET1,3,2,0,BC_RET0,0,1,0,0,2,
++/* table.move */ 0,5,12,0,0,0,35,BC_ISTYPE,0,12,0,BC_ISTYPE,1,14,0,BC_ISTYPE,
++2,14,0,BC_ISTYPE,3,14,0,BC_ISNEP,4,0,0,BC_JMP,5,1,128,BC_MOV,4,0,0,BC_ISTYPE,
++4,12,0,BC_ISGT,1,2,0,BC_JMP,5,24,128,BC_SUBVV,5,1,3,BC_ISLT,2,3,0,BC_JMP,6,4,
++128,BC_ISLE,3,1,0,BC_JMP,6,2,128,BC_ISEQV,4,0,0,BC_JMP,6,9,128,BC_MOV,6,1,0,
++BC_MOV,7,2,0,BC_KSHORT,8,1,0,BC_FORI,6,4,128,BC_ADDVV,10,5,9,BC_TGETR,11,9,0,
++BC_TSETR,11,10,4,BC_FORL,6,252,127,BC_JMP,6,8,128,BC_MOV,6,2,0,BC_MOV,7,1,0,
++BC_KSHORT,8,255,255,BC_FORI,6,4,128,BC_ADDVV,10,5,9,BC_TGETR,11,9,0,BC_TSETR,
++11,10,4,BC_FORL,6,252,127,BC_RET1,4,2,0,
+ #endif
++0
+ };
+ 
+ static const struct { const char *name; int ofs; } libbc_map[] = {
+@@ -48,9 +73,9 @@ static const struct { const char *name;
+ {"string_len",50},
+ {"table_foreachi",69},
+ {"table_foreach",136},
+-{"table_getn",207},
+-{"table_remove",226},
+-{"table_move",355},
+-{NULL,502}
++{"table_getn",213},
++{"table_remove",232},
++{"table_move",361},
++{NULL,508}
+ };
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_peobj.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/buildvm_peobj.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/buildvm_peobj.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT VM builder: PE object emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Only used for building on Windows, since we cannot assume the presence
+ ** of a suitable assembler. The host and target byte order must match.
+@@ -9,7 +9,7 @@
+ #include "buildvm.h"
+ #include "lj_bc.h"
+ 
+-#if LJ_TARGET_X86ORX64
++#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN
+ 
+ /* Context for PE object emitter. */
+ static char *strtab;
+@@ -93,6 +93,17 @@ typedef struct PEsymaux {
+ #define PEOBJ_RELOC_ADDR32NB	0x03
+ #define PEOBJ_RELOC_OFS		0
+ #define PEOBJ_TEXT_FLAGS	0x60500020  /* 60=r+x, 50=align16, 20=code. */
++#define PEOBJ_PDATA_NRELOC	6
++#define PEOBJ_XDATA_SIZE	(8*2+4+6*2)
++#elif LJ_TARGET_ARM64
++#define PEOBJ_ARCH_TARGET	0xaa64
++#define PEOBJ_RELOC_REL32	0x03  /* MS: BRANCH26. */
++#define PEOBJ_RELOC_DIR32	0x01
++#define PEOBJ_RELOC_ADDR32NB	0x02
++#define PEOBJ_RELOC_OFS		(-4)
++#define PEOBJ_TEXT_FLAGS	0x60500020  /* 60=r+x, 50=align16, 20=code. */
++#define PEOBJ_PDATA_NRELOC	4
++#define PEOBJ_XDATA_SIZE	(4+24+4 +4+8)
+ #endif
+ 
+ /* Section numbers (0-based). */
+@@ -100,7 +111,7 @@ enum {
+   PEOBJ_SECT_ABS = -2,
+   PEOBJ_SECT_UNDEF = -1,
+   PEOBJ_SECT_TEXT,
+-#if LJ_TARGET_X64
++#ifdef PEOBJ_PDATA_NRELOC
+   PEOBJ_SECT_PDATA,
+   PEOBJ_SECT_XDATA,
+ #elif LJ_TARGET_X86
+@@ -175,6 +186,9 @@ void emit_peobj(BuildCtx *ctx)
+   uint32_t sofs;
+   int i, nrsym;
+   union { uint8_t b; uint32_t u; } host_endian;
++#ifdef PEOBJ_PDATA_NRELOC
++  uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
++#endif
+ 
+   sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection);
+ 
+@@ -188,18 +202,18 @@ void emit_peobj(BuildCtx *ctx)
+   /* Flags: 60 = read+execute, 50 = align16, 20 = code. */
+   pesect[PEOBJ_SECT_TEXT].flags = PEOBJ_TEXT_FLAGS;
+ 
+-#if LJ_TARGET_X64
++#ifdef PEOBJ_PDATA_NRELOC
+   memcpy(pesect[PEOBJ_SECT_PDATA].name, ".pdata", sizeof(".pdata")-1);
+   pesect[PEOBJ_SECT_PDATA].ofs = sofs;
+-  sofs += (pesect[PEOBJ_SECT_PDATA].size = 6*4);
++  sofs += (pesect[PEOBJ_SECT_PDATA].size = PEOBJ_PDATA_NRELOC*4);
+   pesect[PEOBJ_SECT_PDATA].relocofs = sofs;
+-  sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = 6) * PEOBJ_RELOC_SIZE;
++  sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = PEOBJ_PDATA_NRELOC) * PEOBJ_RELOC_SIZE;
+   /* Flags: 40 = read, 30 = align4, 40 = initialized data. */
+   pesect[PEOBJ_SECT_PDATA].flags = 0x40300040;
+ 
+   memcpy(pesect[PEOBJ_SECT_XDATA].name, ".xdata", sizeof(".xdata")-1);
+   pesect[PEOBJ_SECT_XDATA].ofs = sofs;
+-  sofs += (pesect[PEOBJ_SECT_XDATA].size = 8*2+4+6*2);  /* See below. */
++  sofs += (pesect[PEOBJ_SECT_XDATA].size = PEOBJ_XDATA_SIZE);  /* See below. */
+   pesect[PEOBJ_SECT_XDATA].relocofs = sofs;
+   sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE;
+   /* Flags: 40 = read, 30 = align4, 40 = initialized data. */
+@@ -234,7 +248,7 @@ void emit_peobj(BuildCtx *ctx)
+   */
+   nrsym = ctx->nrelocsym;
+   pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym;
+-#if LJ_TARGET_X64
++#ifdef PEOBJ_PDATA_NRELOC
+   pehdr.nsyms += 1;  /* Symbol for lj_err_unwind_win. */
+ #endif
+ 
+@@ -259,7 +273,6 @@ void emit_peobj(BuildCtx *ctx)
+ 
+ #if LJ_TARGET_X64
+   { /* Write .pdata section. */
+-    uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
+     uint32_t pdata[3];  /* Start of .text, end of .text and .xdata. */
+     PEreloc reloc;
+     pdata[0] = 0; pdata[1] = fcofs; pdata[2] = 0;
+@@ -308,6 +321,87 @@ void emit_peobj(BuildCtx *ctx)
+     reloc.type = PEOBJ_RELOC_ADDR32NB;
+     owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+   }
++#elif LJ_TARGET_ARM64
++  /* https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling */
++  { /* Write .pdata section. */
++    uint32_t pdata[4];
++    PEreloc reloc;
++    pdata[0] = 0;
++    pdata[1] = 0;
++    pdata[2] = fcofs;
++    pdata[3] = 4+24+4;
++    owrite(ctx, &pdata, sizeof(pdata));
++    /* Start of .text and start of .xdata. */
++    reloc.vaddr = 0; reloc.symidx = 1+2+nrsym+2+2+1;
++    reloc.type = PEOBJ_RELOC_ADDR32NB;
++    owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
++    reloc.vaddr = 4; reloc.symidx = 1+2+nrsym+2;
++    reloc.type = PEOBJ_RELOC_ADDR32NB;
++    owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
++    /* Start of vm_ffi_call and start of second part of .xdata. */
++    reloc.vaddr = 8; reloc.symidx = 1+2+nrsym+2+2+1;
++    reloc.type = PEOBJ_RELOC_ADDR32NB;
++    owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
++    reloc.vaddr = 12; reloc.symidx = 1+2+nrsym+2;
++    reloc.type = PEOBJ_RELOC_ADDR32NB;
++    owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
++  }
++  { /* Write .xdata section. */
++    uint32_t u32;
++    uint8_t *p, uwc[24];
++    PEreloc reloc;
++
++#define CBE16(x)	(*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2)
++#define CALLOC_S(s)	(*p++ = ((s) >> 4))  /* s < 512 */
++#define CSAVE_FPLR(o)	(*p++ = 0x40 | ((o) >> 3))  /* o <= 504 */
++#define CSAVE_REGP(r,o)	CBE16(0xc800 | (((r) - 19) << 6) | ((o) >> 3))
++#define CSAVE_REGS(r1,r2,o1) do { \
++  int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \
++} while (0)
++#define CSAVE_REGPX(r,o) CBE16(0xcc00 | (((r) - 19) << 6) | (~(o) >> 3))
++#define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3))
++#define CSAVE_FREGS(r1,r2,o1) do { \
++  int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \
++} while (0)
++#define CADD_FP(s)	CBE16(0xe200 | ((s) >> 3))  /* s < 8*256 */
++#define CODE_NOP	0xe3
++#define CODE_END	0xe4
++#define CEND_ALIGN	do { \
++  *p++ = CODE_END; \
++  while ((p - uwc) & 3) *p++ = CODE_NOP; \
++} while (0)
++
++    /* Unwind codes for .text section with handler. */
++    p = uwc;
++    CADD_FP(192);		/* +2 */
++    CSAVE_REGS(19, 28, 176);	/* +5*2 */
++    CSAVE_FREGS(8, 15, 96);	/* +4*2 */
++    CSAVE_FPLR(192);		/* +1 */
++    CALLOC_S(208);		/* +1 */
++    CEND_ALIGN;			/* +1 +1 -> 24 */
++
++    u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2);
++    owrite(ctx, &u32, 4);
++    owrite(ctx, &uwc, 24);
++
++    u32 = 0;  /* Handler RVA to be relocated at 4 + 24. */
++    owrite(ctx, &u32, 4);
++
++    /* Unwind codes for vm_ffi_call without handler. */
++    p = uwc;
++    CADD_FP(16);		/* +2 */
++    CSAVE_FPLR(16);		/* +1 */
++    CSAVE_REGPX(19, -32);	/* +2 */
++    CEND_ALIGN;			/* +1 +2 -> 8 */
++
++    u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2);
++    owrite(ctx, &u32, 4);
++    owrite(ctx, &uwc, 8);
++
++    reloc.vaddr = 4 + 24; reloc.symidx = 1+2+nrsym+2+2;
++    reloc.type = PEOBJ_RELOC_ADDR32NB;
++    owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
++  }
+ #elif LJ_TARGET_X86
+   /* Write .sxdata section. */
+   for (i = 0; i < nrsym; i++) {
+@@ -339,7 +433,7 @@ void emit_peobj(BuildCtx *ctx)
+       emit_peobj_sym(ctx, ctx->relocsym[i], 0,
+ 		     PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN);
+ 
+-#if LJ_TARGET_X64
++#ifdef PEOBJ_PDATA_NRELOC
+     emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA);
+     emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA);
+     emit_peobj_sym(ctx, "lj_err_unwind_win", 0,
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/genlibbc.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/genlibbc.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/genlibbc.lua
+@@ -2,7 +2,7 @@
+ -- Lua script to dump the bytecode of the library functions written in Lua.
+ -- The resulting 'buildvm_libbc.h' is used for the build process of LuaJIT.
+ ----------------------------------------------------------------------------
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ 
+@@ -55,7 +55,7 @@ local function transform_lua(code)
+   end)
+   code = string.gsub(code, "PAIRS%((.-)%)", function(var)
+     fixup.PAIRS = true
+-    return format("nil, %s, 0", var)
++    return format("nil, %s, 0x4dp80", var)
+   end)
+   return "return "..code, fixup
+ end
+@@ -79,9 +79,11 @@ local name2itype = {
+   str = 5, func = 9, tab = 12, int = 14, num = 15
+ }
+ 
+-local BC = {}
++local BC, BCN = {}, {}
+ for i=0,#bcnames/6-1 do
+-  BC[string.gsub(string.sub(bcnames, i*6+1, i*6+6), " ", "")] = i
++  local name = bcnames:sub(i*6+1, i*6+6):gsub(" ", "")
++  BC[name] = i
++  BCN[i] = name
+ end
+ local xop, xra = isbe and 3 or 0, isbe and 2 or 1
+ local xrc, xrb = isbe and 1 or 2, isbe and 0 or 3
+@@ -96,6 +98,7 @@ local function fixup_dump(dump, fixup)
+   p = read_uleb128(p)
+   p = read_uleb128(p)
+   p, sizebc = read_uleb128(p)
++  local startbc = tonumber(p - start)
+   local rawtab = {}
+   for i=0,sizebc-1 do
+     local op = p[xop]
+@@ -129,7 +132,10 @@ local function fixup_dump(dump, fixup)
+     end
+     p = p + 4
+   end
+-  return ffi.string(start, n)
++  local ndump = ffi.string(start, n)
++  -- Fixup hi-part of 0x4dp80 to LJ_KEYINDEX.
++  ndump = ndump:gsub("\x80\x80\xcd\xaa\x04", "\xff\xff\xf9\xff\x0f")
++  return { dump = ndump, startbc = startbc, sizebc = sizebc }
+ end
+ 
+ local function find_defs(src)
+@@ -149,24 +155,46 @@ local function gen_header(defs)
+   local function w(x) t[#t+1] = x end
+   w("/* This is a generated file. DO NOT EDIT! */\n\n")
+   w("static const int libbc_endian = ") w(isbe and 1 or 0) w(";\n\n")
+-  local s = ""
+-  for _,name in ipairs(defs) do
+-    s = s .. defs[name]
++  local s, sb = "", ""
++  for i,name in ipairs(defs) do
++    local d = defs[name]
++    s = s .. d.dump
++    sb = sb .. string.char(i) .. ("\0"):rep(d.startbc - 1)
++	    .. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc)
++	    .. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4)
+   end
+   w("static const uint8_t libbc_code[] = {\n")
+   local n = 0
+   for i=1,#s do
+     local x = string.byte(s, i)
+-    w(x); w(",")
+-    n = n + (x < 10 and 2 or (x < 100 and 3 or 4))
+-    if n >= 75 then n = 0; w("\n") end
++    local xb = string.byte(sb, i)
++    if xb == 255 then
++      local name = BCN[x]
++      local m = #name + 4
++      if n + m > 78 then n = 0; w("\n") end
++      n = n + m
++      w("BC_"); w(name)
++    else
++      local m = x < 10 and 2 or (x < 100 and 3 or 4)
++      if xb == 0 then
++	if n + m > 78 then n = 0; w("\n") end
++      else
++	local name = defs[xb]:gsub("_", ".")
++	if n ~= 0 then w("\n") end
++	w("/* "); w(name); w(" */ ")
++	n = #name + 7
++      end
++      n = n + m
++      w(x)
++    end
++    w(",")
+   end
+-  w("0\n};\n\n")
++  w("\n0\n};\n\n")
+   w("static const struct { const char *name; int ofs; } libbc_map[] = {\n")
+   local m = 0
+   for _,name in ipairs(defs) do
+     w('{"'); w(name); w('",'); w(m) w('},\n')
+-    m = m + #defs[name]
++    m = m + #defs[name].dump
+   end
+   w("{NULL,"); w(m); w("}\n};\n\n")
+   return table.concat(t)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/genminilua.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/genminilua.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/genminilua.lua
+@@ -2,7 +2,7 @@
+ -- Lua script to generate a customized, minified version of Lua.
+ -- The resulting 'minilua' is used for the build process of LuaJIT.
+ ----------------------------------------------------------------------------
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ 
+@@ -327,6 +327,12 @@ local function rename_tokens2(src)
+   return gsub(src, "ZY([%w_]+)", "union %1")
+ end
+ 
++local function fix_bugs_and_warnings(src)  -- Patch the generated minilua C text in `src` with three pattern substitutions.
++ src = gsub(src, "(luaD_checkstack%(L,p%->maxstacksize)%)", "%1+p->numparams)")  -- Stack check becomes maxstacksize+numparams.
++ src = gsub(src, "if%(sep==%-1%)(return'%[';)\nelse (luaX_lexerror%b();)", "if (sep!=-1)%2\n%1")  -- Error branch first; return'[' becomes the fall-through.
++ return gsub(src, "(default:{\nNode%*n=mainposition)", "/*fallthrough*/\n%1")  -- Marks the fall-through into default:; returns gsub's results (text, count).
++end
++
+ local function func_gather(src)
+   local nodes, list = {}, {}
+   local pos, len = 1, #src
+@@ -425,5 +431,6 @@ src = rename_tokens1(src)
+ src = func_collect(src)
+ src = rename_tokens2(src)
+ src = restore_strings(src)
++src = fix_bugs_and_warnings(src)
+ src = merge_header(src, license)
+ io.write(src)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/genversion.lua
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/genversion.lua
+@@ -0,0 +1,45 @@
++----------------------------------------------------------------------------
++-- Lua script to embed the rolling release version in luajit.h.
++----------------------------------------------------------------------------
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
++-- Released under the MIT license. See Copyright Notice in luajit.h
++----------------------------------------------------------------------------
++
++local arg = {...}  -- Arguments passed to this chunk; each path below falls back to a default name.
++local FILE_ROLLING_H = arg[1] or "luajit_rolling.h"  -- Input: template text containing the "ROLLING" placeholder.
++local FILE_RELVER_TXT = arg[2] or "luajit_relver.txt"  -- Input: file searched for the release version digits.
++local FILE_LUAJIT_H = arg[3] or "luajit.h"  -- Output: template with the placeholder substituted.
++
++local function file_read(file)  -- Return the entire contents of `file`, read in binary mode.
++  local fp = assert(io.open(file, "rb"), "run from the wrong directory")  -- Any open failure raises this message.
++  local data = assert(fp:read("*a"))  -- "*a" reads everything up to end of file.
++  fp:close()
++  return data
++end
++
++local function file_write_mod(file, data)  -- Write `data` to `file` unless the file already holds exactly `data`.
++  local fp = io.open(file, "rb")  -- nil if the file cannot be opened; then go straight to writing.
++  if fp then
++    local odata = assert(fp:read("*a"))
++    fp:close()
++    if odata == data then return end  -- Unchanged content: leave the existing file untouched.
++  end
++  fp = assert(io.open(file, "wb"))  -- Raises if the file cannot be opened for writing.
++  assert(fp:write(data))
++  assert(fp:close())  -- The result of close is checked as well.
++end
++
++local text = file_read(FILE_ROLLING_H)  -- Template text for the generated header.
++local relver = file_read(FILE_RELVER_TXT):match("(%d+)")  -- First run of digits in the file, or nil if there is none.
++
++if relver then
++  text = text:gsub("ROLLING", relver)  -- Replace every "ROLLING"; only gsub's first result is kept.
++else
++  io.stderr:write([[
++**** WARNING Cannot determine rolling release version from git log.
++**** WARNING The 'git' command must be available during the build.
++]])
++  file_write_mod(FILE_RELVER_TXT, "ROLLING\n") -- Fallback for install target.
++end
++
++file_write_mod(FILE_LUAJIT_H, text)  -- Written in both cases; without a version the text still contains "ROLLING".
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/host/minilua.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/host/minilua.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/host/minilua.c
+@@ -1134,7 +1134,7 @@ if(!cl->isC){
+ CallInfo*ci;
+ StkId st,base;
+ Proto*p=cl->p;
+-luaD_checkstack(L,p->maxstacksize);
++luaD_checkstack(L,p->maxstacksize+p->numparams);
+ func=restorestack(L,funcr);
+ if(!p->is_vararg){
+ base=func+1;
+@@ -1639,6 +1639,7 @@ lua_number2int(k,n);
+ if(luai_numeq(cast_num(k),nvalue(key)))
+ return luaH_getnum(t,k);
+ }
++/*fallthrough*/
+ default:{
+ Node*n=mainposition(t,key);
+ do{
+@@ -2905,8 +2906,8 @@ if(sep>=0){
+ read_long_string(ls,seminfo,sep);
+ return TK_STRING;
+ }
+-else if(sep==-1)return'[';
+-else luaX_lexerror(ls,"invalid long string delimiter",TK_STRING);
++else if (sep!=-1)luaX_lexerror(ls,"invalid long string delimiter",TK_STRING);
++return'[';
+ }
+ case'=':{
+ next(ls);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/bc.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/bc.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/bc.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT bytecode listing module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ --
+@@ -41,7 +41,6 @@
+ 
+ -- Cache some library functions and objects.
+ local jit = require("jit")
+-assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
+ local jutil = require("jit.util")
+ local vmdef = require("jit.vmdef")
+ local bit = require("bit")
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/bcsave.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/bcsave.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/bcsave.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT module to save/list bytecode.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ --
+@@ -11,7 +11,7 @@
+ ------------------------------------------------------------------------------
+ 
+ local jit = require("jit")
+-assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
++assert(jit.version_num == 20199, "LuaJIT core/library version mismatch")
+ local bit = require("bit")
+ 
+ -- Symbol name prefix for LuaJIT bytecode.
+@@ -33,11 +33,12 @@ Save LuaJIT bytecode: luajit -b[options]
+   -t type   Set output file type (default: auto-detect from output name).
+   -a arch   Override architecture for object files (default: native).
+   -o os     Override OS for object files (default: native).
++  -F name   Override filename (default: input filename).
+   -e chunk  Use chunk string as input.
+   --        Stop handling options.
+   -         Use stdin as input and/or stdout as output.
+ 
+-File types: c h obj o raw (default)
++File types: c cc h obj o raw (default)
+ ]]
+   os.exit(1)
+ end
+@@ -49,10 +50,22 @@ local function check(ok, ...)
+   os.exit(1)
+ end
+ 
+-local function readfile(input)
++local function readfile(ctx, input)
+   if type(input) == "function" then return input end
+-  if input == "-" then input = nil end
+-  return check(loadfile(input))
++  if ctx.filename then
++    local data
++    if input == "-" then
++      data = io.stdin:read("*a")
++    else
++      local fp = assert(io.open(input, "rb"))
++      data = assert(fp:read("*a"))
++      assert(fp:close())
++    end
++    return check(load(data, ctx.filename))
++  else
++    if input == "-" then input = nil end
++    return check(loadfile(input))
++  end
+ end
+ 
+ local function savefile(name, mode)
+@@ -60,10 +73,15 @@ local function savefile(name, mode)
+   return check(io.open(name, mode))
+ end
+ 
++local function set_stdout_binary(ffi)  -- Switch file descriptor 1 to binary mode via _setmode; callers only use it when jit.os == "Windows".
++  ffi.cdef[[int _setmode(int fd, int mode);]]  -- Declare the C function for the FFI.
++  ffi.C._setmode(1, 0x8000)  -- fd 1; 0x8000 is _O_BINARY in the Microsoft C runtime. The return value is ignored.
++end
++
+ ------------------------------------------------------------------------------
+ 
+ local map_type = {
+-  raw = "raw", c = "c", h = "h", o = "obj", obj = "obj",
++  raw = "raw", c = "c", cc = "c", h = "h", o = "obj", obj = "obj",
+ }
+ 
+ local map_arch = {
+@@ -79,6 +97,7 @@ local map_arch = {
+   mips64el =	{ e = "le", b = 64, m = 8, f = 0x80000007, },
+   mips64r6 =	{ e = "be", b = 64, m = 8, f = 0xa0000407, },
+   mips64r6el =	{ e = "le", b = 64, m = 8, f = 0xa0000407, },
++  riscv64 =    { e = "le", b = 64, m = 243, f = 0x00000004, },
+ }
+ 
+ local map_os = {
+@@ -125,6 +144,11 @@ local function bcsave_tail(fp, output, s
+ end
+ 
+ local function bcsave_raw(output, s)
++  if output == "-" and jit.os == "Windows" then
++    local ok, ffi = pcall(require, "ffi")
++    check(ok, "FFI library required to write binary file to stdout")
++    set_stdout_binary(ffi)
++  end
+   local fp = savefile(output, "wb")
+   bcsave_tail(fp, output, s)
+ end
+@@ -446,18 +470,18 @@ typedef struct {
+   uint32_t value;
+ } mach_nlist;
+ typedef struct {
+-  uint32_t strx;
++  int32_t strx;
+   uint8_t type, sect;
+   uint16_t desc;
+   uint64_t value;
+ } mach_nlist_64;
+ typedef struct
+ {
+-  uint32_t magic, nfat_arch;
++  int32_t magic, nfat_arch;
+ } mach_fat_header;
+ typedef struct
+ {
+-  uint32_t cputype, cpusubtype, offset, size, align;
++  int32_t cputype, cpusubtype, offset, size, align;
+ } mach_fat_arch;
+ typedef struct {
+   struct {
+@@ -491,6 +515,18 @@ typedef struct {
+   mach_nlist sym_entry;
+   uint8_t space[4096];
+ } mach_fat_obj;
++typedef struct {
++  mach_fat_header fat;
++  mach_fat_arch fat_arch[2];
++  struct {
++    mach_header_64 hdr;
++    mach_segment_command_64 seg;
++    mach_section_64 sec;
++    mach_symtab_command sym;
++  } arch[2];
++  mach_nlist_64 sym_entry;
++  uint8_t space[4096];
++} mach_fat_obj_64;
+ ]]
+   local symname = '_'..LJBC_PREFIX..ctx.modname
+   local isfat, is64, align, mobj = false, false, 4, "mach_obj"
+@@ -499,7 +535,7 @@ typedef struct {
+   elseif ctx.arch == "arm" then
+     isfat, mobj = true, "mach_fat_obj"
+   elseif ctx.arch == "arm64" then
+-    is64, align, isfat, mobj = true, 8, true, "mach_fat_obj"
++    is64, align, isfat, mobj = true, 8, true, "mach_fat_obj_64"
+   else
+     check(ctx.arch == "x86", "unsupported architecture for OSX")
+   end
+@@ -568,6 +604,9 @@ end
+ local function bcsave_obj(ctx, output, s)
+   local ok, ffi = pcall(require, "ffi")
+   check(ok, "FFI library required to write this file type")
++  if output == "-" and jit.os == "Windows" then
++    set_stdout_binary(ffi)
++  end
+   if ctx.os == "windows" then
+     return bcsave_peobj(ctx, output, s, ffi)
+   elseif ctx.os == "osx" then
+@@ -579,13 +618,13 @@ end
+ 
+ ------------------------------------------------------------------------------
+ 
+-local function bclist(input, output)
+-  local f = readfile(input)
++local function bclist(ctx, input, output)
++  local f = readfile(ctx, input)
+   require("jit.bc").dump(f, savefile(output, "w"), true)
+ end
+ 
+ local function bcsave(ctx, input, output)
+-  local f = readfile(input)
++  local f = readfile(ctx, input)
+   local s = string.dump(f, ctx.strip)
+   local t = ctx.type
+   if not t then
+@@ -638,6 +677,8 @@ local function docmd(...)
+ 	    ctx.arch = checkarg(tremove(arg, n), map_arch, "architecture")
+ 	  elseif opt == "o" then
+ 	    ctx.os = checkarg(tremove(arg, n), map_os, "OS name")
++	  elseif opt == "F" then
++	    ctx.filename = "@"..tremove(arg, n)
+ 	  else
+ 	    usage()
+ 	  end
+@@ -649,7 +690,7 @@ local function docmd(...)
+   end
+   if list then
+     if #arg == 0 or #arg > 2 then usage() end
+-    bclist(arg[1], arg[2] or "-")
++    bclist(ctx, arg[1], arg[2] or "-")
+   else
+     if #arg ~= 2 then usage() end
+     bcsave(ctx, arg[1], arg[2])
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_arm.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_arm.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_arm.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT ARM disassembler module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This is a helper module used by the LuaJIT machine code dumper module.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_arm64.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_arm64.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_arm64.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT ARM64 disassembler module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ --
+ -- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+@@ -107,24 +107,20 @@ local map_logsr = { -- Logical, shifted
+     [0] = {
+       shift = 29, mask = 3,
+       [0] = {
+-	shift = 21, mask = 7,
+-	[0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+-	"andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
++	shift = 21, mask = 1,
++	[0] = "andDNMSg", "bicDNMSg"
+       },
+       {
+-	shift = 21, mask = 7,
+-	[0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+-	     "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
++	shift = 21, mask = 1,
++	[0] = "orr|movDN0MSg", "orn|mvnDN0MSg"
+       },
+       {
+-	shift = 21, mask = 7,
+-	[0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+-	"eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
++	shift = 21, mask = 1,
++	[0] = "eorDNMSg", "eonDNMSg"
+       },
+       {
+-	shift = 21, mask = 7,
+-	[0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+-	"ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
++	shift = 21, mask = 1,
++	[0] = "ands|tstD0NMSg", "bicsDNMSg"
+       }
+     },
+     false -- unallocated
+@@ -132,24 +128,20 @@ local map_logsr = { -- Logical, shifted
+   {
+     shift = 29, mask = 3,
+     [0] = {
+-      shift = 21, mask = 7,
+-      [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+-      "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
++      shift = 21, mask = 1,
++      [0] = "andDNMSg", "bicDNMSg"
+     },
+     {
+-      shift = 21, mask = 7,
+-      [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+-      "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
++      shift = 21, mask = 1,
++      [0] = "orr|movDN0MSg", "orn|mvnDN0MSg"
+     },
+     {
+-      shift = 21, mask = 7,
+-      [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+-      "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
++      shift = 21, mask = 1,
++      [0] = "eorDNMSg", "eonDNMSg"
+     },
+     {
+-      shift = 21, mask = 7,
+-      [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+-      "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
++      shift = 21, mask = 1,
++      [0] = "ands|tstD0NMSg", "bicsDNMSg"
+     }
+   }
+ }
+@@ -735,7 +727,7 @@ local map_cond = {
+   "hi", "ls", "ge", "lt", "gt", "le", "al",
+ }
+ 
+-local map_shift = { [0] = "lsl", "lsr", "asr", }
++local map_shift = { [0] = "lsl", "lsr", "asr", "ror"}
+ 
+ local map_extend = {
+   [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
+@@ -956,7 +948,7 @@ local function disass_ins(ctx)
+     elseif p == "U" then
+       local rn = map_regs.x[band(rshift(op, 5), 31)]
+       local sz = band(rshift(op, 30), 3)
+-      local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
++      local imm12 = lshift(rshift(lshift(op, 10), 20), sz)
+       if imm12 ~= 0 then
+ 	x = "["..rn..", #"..imm12.."]"
+       else
+@@ -993,8 +985,7 @@ local function disass_ins(ctx)
+ 	x = x.."]"
+       end
+     elseif p == "P" then
+-      local opcv, sh = rshift(op, 26), 2
+-      if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
++      local sh = 2 + rshift(op, 31 - band(rshift(op, 26), 1))
+       local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
+       local rn = map_regs.x[band(rshift(op, 5), 31)]
+       local ind = band(rshift(op, 23), 3)
+@@ -1089,7 +1080,7 @@ local function disass_ins(ctx)
+ 	  last = "#"..(sf+32 - immr)
+ 	  operands[#operands] = last
+ 	  x = x + 1
+-	elseif x >= immr then
++	else
+ 	  name = a2
+ 	  x = x - immr + 1
+ 	end
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_arm64be.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_arm64be.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_arm64be.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT ARM64BE disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- ARM64 instructions are always little-endian. So just forward to the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_mips.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT MIPS disassembler module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT/X license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This is a helper module used by the LuaJIT machine code dumper module.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_mips64.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT MIPS64 disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This module just exports the big-endian functions from the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64el.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_mips64el.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64el.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT MIPS64EL disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This module just exports the little-endian functions from the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64r6.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_mips64r6.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64r6.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT MIPS64R6 disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This module just exports the r6 big-endian functions from the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64r6el.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_mips64r6el.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mips64r6el.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT MIPS64R6EL disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This module just exports the r6 little-endian functions from the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mipsel.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_mipsel.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_mipsel.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT MIPSEL disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This module just exports the little-endian functions from the
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_ppc.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_ppc.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_ppc.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT PPC disassembler module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT/X license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This is a helper module used by the LuaJIT machine code dumper module.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_riscv.lua
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_riscv.lua
+@@ -0,0 +1,793 @@
++------------------------------------------------------------------------------
++-- LuaJIT RISC-V disassembler module.
++--
++-- Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++-- Released under the MIT license. See Copyright Notice in luajit.h
++--
++-- Contributed by Milos Poletanovic from Syrmia.com.
++------------------------------------------------------------------------------
++-- This is a helper module used by the LuaJIT machine code dumper module.
++--
++-- It disassembles most standard RISC-V instructions.
++-- Mode is little-endian
++------------------------------------------------------------------------------
++
++local type = type
++local byte, format = string.byte, string.format
++local match, gmatch = string.match, string.gmatch
++local concat = table.concat
++local bit = require("bit")
++local band, bor, tohex = bit.band, bit.bor, bit.tohex
++local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift
++
++------------------------------------------------------------------------------
++-- Opcode maps
++------------------------------------------------------------------------------
++
++--RVC32 extension: opcode maps for the compressed (c.*) instructions.
++
++local map_quad0 = {  -- Eight slots [0]..[7]; slot [4] is false (no instruction listed).
++  shift = 13, mask = 7,  -- NOTE(review): presumably indexed by band(rshift(op, shift), mask); the decoder is outside this excerpt.
++  [0] = "c.addi4spnZW", "c.fldNMh", "c.lwZMn", "c.flwNMn",  -- Letters after each mnemonic look like operand-format codes (TODO confirm).
++  false, "c.fsdNMh", "c.swZMn", "c.fswNMn"
++}
++
++local map_sub2quad1 = {  -- Register-register ops; referenced from slot [3] of map_sub1quad1.
++  shift = 5, mask = 3,  -- 2-bit selector starting at bit 5 (assuming shift/mask indexing -- confirm against the decoder).
++  [0] = "c.subMZ", "c.xorMZ", "c.orMZ", "c.andMZ"
++}
++
++local map_sub1quad1 = {  -- Referenced from slot [4] of map_quad1.
++  shift = 10, mask = 3,  -- 2-bit selector starting at bit 10.
++  [0] = "c.srliM1", "c.sraiM1", "c.andiMx", map_sub2quad1  -- Slot [3] descends into the nested table.
++}
++
++local map_quad1 = {  -- Second entry of map_compr; slots [0] and [3] are nested sub-tables, [4] is map_sub1quad1.
++  shift = 13, mask = 7,  -- 3-bit selector starting at bit 13 (assuming shift/mask indexing -- confirm against the decoder).
++  [0] = {
++    shift = 7, mask = 31,  -- 5-bit field starting at bit 7.
++    [0] = "c.nop", _ = "c.addiDx"  -- NOTE(review): `_` looks like the default for values without an explicit key -- confirm.
++  },
++  [1] = "c.jalT", [2] = "c.liDx",
++  [3] = {
++    shift = 7, mask = 31,  -- 5-bit field starting at bit 7; only value 2 maps to c.addi16sp.
++    [0] = "c.luiDK", [1] = "c.luiDK", [2] = "c.addi16spX",
++    _ = "c.luiDK"
++  },
++  [4] = map_sub1quad1, [5] = "c.jT", [6] = "c.beqzMq", [7] = "c.bnezMq"
++}
++
++local map_sub1quad2 = {  -- Referenced from slot [4] of map_quad2; up to three nested selection levels.
++  shift = 12, mask = 1,  -- 1-bit selector at bit 12 (assuming shift/mask indexing -- confirm against the decoder).
++  [0] = {
++    shift = 2, mask = 31,  -- 5-bit field starting at bit 2.
++    [0] = "c.jrD", _ = "c.mvDE"  -- NOTE(review): `_` looks like the default for non-zero field values -- confirm.
++  },
++  [1] = {
++    shift = 2, mask = 31,  -- Same 5-bit field starting at bit 2.
++    [0] = {
++      shift = 7, mask = 31,  -- 5-bit field starting at bit 7.
++      [0] = "c.ebreak", _ = "c.jalrD"
++    },
++   _ = "c.addDE"
++  }
++}
++
++local map_quad2 = {  -- Third entry of map_compr; all eight slots are populated.
++  shift = 13, mask = 7,  -- 3-bit selector starting at bit 13 (assuming shift/mask indexing -- confirm against the decoder).
++  [0] = "c.slliD1", [1] = "c.fldspFQ",[2] = "c.lwspDY", [3] = "c.flwspFY",
++  [4] = map_sub1quad2, [5] = "c.fsdspVt", [6] = "c.swspEu", [7] = "c.fswspVu"  -- Slot [4] descends into map_sub1quad2.
++}
++
++local map_compr = {  -- Top-level table for compressed instructions: indices 0, 1, 2 hold the three quadrant maps.
++  [0] = map_quad0, map_quad1, map_quad2  -- NOTE(review): how the index is derived from the opcode is not visible here -- confirm.
++}
++
++--RV32M: multiply/divide/remainder mnemonics.
++local map_mext = {  -- All eight slots [0]..[7] are populated.
++  shift = 12, mask = 7,  -- 3-bit selector starting at bit 12 (assuming shift/mask indexing -- confirm against the decoder).
++  [0] = "mulDRr", "mulhDRr", "mulhsuDRr", "mulhuDRr",
++  "divDRr", "divuDRr", "remDRr", "remuDRr"
++}
++
++--RV64M
++local map_mext64 = {
++  shift = 12, mask = 7,
++  [0] = "mulwDRr", [4] = "divwDRr", [5] = "divuwDRr", [6] = "remwDRr",
++  [7] = "remuwDRr"
++}
++
++--RV32F, RV64F, RV32D, RV64D
++local map_fload = {
++  shift = 12, mask = 7,
++  [2] = "flwFL", [3] = "fldFL"
++}
++
++local map_fstore = {
++  shift = 12, mask = 7,
++  [2] = "fswSg", [3] = "fsdSg"
++}
++
++local map_fmadd = {
++  shift = 25, mask = 3,
++  [0] = "fmadd.sFGgH", "fmadd.dFGgH"
++}
++
++local map_fmsub = {
++  shift = 25, mask = 3,
++  [0] = "fmsub.sFGgH", "fmsub.dFGgH"
++}
++
++local map_fnmsub = {
++  shift = 25, mask = 3,
++  [0] = "fnmsub.sFGgH", "fnmsub.dFGgH"
++}
++
++local map_fnmadd = {
++  shift = 25, mask = 3,
++  [0] = "fnmadd.sFGgH", "fnmadd.dFGgH"
++}
++
++local map_fsgnjs = {
++  shift = 12, mask = 7,
++  [0] = "fsgnj.s|fmv.sFGg6", "fsgnjn.s|fneg.sFGg6", "fsgnjx.s|fabs.sFGg6"
++}
++
++local map_fsgnjd = {
++  shift = 12, mask = 7,
++  [0] = "fsgnj.d|fmv.dFGg6", "fsgnjn.d|fneg.dFGg6", "fsgnjx.d|fabs.dFGg6"
++}
++
++local map_fms = {
++  shift = 12, mask = 7,
++  [0] = "fmin.sFGg", "fmax.sFGg"
++}
++
++local map_fmd = {
++  shift = 12, mask = 7,
++  [0] = "fmin.dFGg", "fmax.dFGg"
++}
++
++local map_fcomps = {
++  shift = 12, mask = 7,
++  [0] = "fle.sDGg", "flt.sDGg", "feq.sDGg"
++}
++
++local map_fcompd = {
++  shift = 12, mask = 7,
++  [0] = "fle.dDGg", "flt.dDGg", "feq.dDGg"
++}
++
++local map_fcvtwls = {
++  shift = 20, mask = 31,
++  [0] = "fcvt.w.sDG", "fcvt.wu.sDG", "fcvt.l.sDG", "fcvt.lu.sDG"
++}
++
++local map_fcvtwld = {
++  shift = 20, mask = 31,
++  [0] = "fcvt.w.dDG", "fcvt.wu.dDG", "fcvt.l.dDG", "fcvt.lu.dDG"
++}
++
++local map_fcvts = {
++  shift = 20, mask = 31,
++  [0] = "fcvt.s.wFR", "fcvt.s.wuFR", "fcvt.s.lFR", "fcvt.s.luFR"
++}
++
++local map_fcvtd = {
++  shift = 20, mask = 31,
++  [0] = "fcvt.d.wFR", "fcvt.d.wuFR", "fcvt.d.lFR", "fcvt.d.luFR"
++}
++
++local map_fext = {
++  shift = 25, mask = 127,
++  [0] = "fadd.sFGg", [1] = "fadd.dFGg", [4] = "fsub.sFGg", [5] = "fsub.dFGg",
++  [8] = "fmul.sFGg", [9] = "fmul.dFGg", [12] = "fdiv.sFGg", [13] = "fdiv.dFGg",
++  [16] = map_fsgnjs, [17] = map_fsgnjd, [20] = map_fms, [21] = map_fmd,
++  [32] = "fcvt.s.dFG", [33] = "fcvt.d.sFG",[44] = "fsqrt.sFG", [45] = "fsqrt.dFG",
++  [80] = map_fcomps, [81] = map_fcompd, [96] = map_fcvtwls, [97] = map_fcvtwld,
++  [104] = map_fcvts, [105] = map_fcvtd,
++  [112] = {
++    shift = 12, mask = 7,
++    [0] = "fmv.x.wDG", "fclass.sDG"
++  },
++  [113] = {
++  shift = 12, mask = 7,
++    [0] = "fmv.x.dDG", "fclass.dDG"
++  },
++  [120] = "fmv.w.xFR", [121] = "fmv.d.xFR"
++}
++
++--RV32A, RV64A
++local map_aext = {
++  shift = 27, mask = 31,
++  [0] = {
++    shift = 12, mask = 7,
++    [2] = "amoadd.wDrO", [3] = "amoadd.dDrO"
++  },
++  {
++    shift = 12, mask = 7,
++    [2] = "amoswap.wDrO", [3] = "amoswap.dDrO"
++  },
++  {
++    shift = 12, mask = 7,
++    [2] = "lr.wDO", [3] = "lr.dDO"
++  },
++  {
++    shift = 12, mask = 7,
++    [2] = "sc.wDrO", [3] = "sc.dDrO"
++  },
++  {
++    shift = 12, mask = 7,
++    [2] = "amoxor.wDrO", [3] = "amoxor.dDrO"
++  },
++  [8] = {
++    shift = 12, mask = 7,
++    [2] = "amoor.wDrO", [3] = "amoor.dDrO"
++  },
++  [12] = {
++    shift = 12, mask = 7,
++    [2] = "amoand.wDrO", [3] = "amoand.dDrO"
++  },
++  [16] = {
++    shift = 12, mask = 7,
++    [2] = "amomin.wDrO", [3] = "amomin.dDrO"
++  },
++  [20] = {
++    shift = 12, mask = 7,
++    [2] = "amomax.wDrO", [3] = "amomax.dDrO"
++  },
++  [24] = {
++    shift = 12, mask = 7,
++    [2] = "amominu.wDrO", [3] = "amominu.dDrO"
++  },
++  [28] = {
++   shift = 12, mask = 7,
++   [2] = "amomaxu.wDrO", [3] = "amomaxu.dDrO"
++  },
++}
++
++-- RV32I, RV64I
++local map_load = {
++  shift = 12, mask = 7,
++  [0] = "lbDL", "lhDL", "lwDL", "ldDL",
++  "lbuDL", "lhuDL", "lwuDL"
++}
++
++local map_ali = {
++  shift = 12, mask = 7,
++  [0] = {
++    shift = 7, mask = 0x1ffffff,
++    [0] = "nop", _ = "addi|li|mvDR0I2"
++  }
++  ,"slliDRi", "sltiDRI", "sltiu|seqzDRI5",
++  "xori|notDRI4",
++  {
++    shift = 26, mask = 63,
++    [0] = "srliDRi", [16] = "sraiDRi"
++  },
++  "oriDRI", "andiDRI"
++}
++
++local map_branch = {
++  shift = 12, mask = 7,
++  [0] = "beq|beqzRr0B", "bne|bnezRr0B" , false, false,
++  "blt|bgtz|bltzR0r2B", "bge|blez|bgezR0r2B", "bltuRrB", "bgeuRrB"
++}
++
++local map_store = {
++  shift = 12, mask = 7,
++  [0] = "sbSr", "shSr", "swSr", "sdSr"
++}
++
++local map_al = {
++  shift = 25, mask = 127,
++  [0] = {
++    shift = 12, mask = 7,
++    [0] = "addDRr", "sllDRr", "slt|sgtz|sltzDR0r2", "sltu|snezDR0r",
++    "xorDRr", "srlDRr", "orDRr", "andDRr"
++  },
++  map_mext,
++  [32] = {
++    shift = 12, mask = 7,
++    [0] = "sub|negDR0r", [5] = "sraDRr"
++  }
++}
++
++--64I
++local map_addi_shift = {
++  shift = 12, mask = 7,
++  [0] = "addiw|sext.wDRI0", "slliwDRi",
++  [5] = {
++    shift = 25, mask = 127,
++    [0] = "srliwDRi", [32] = "sraiwDRi"
++  }
++}
++
++local map_arithw_shiftw = {
++  shift = 25, mask = 127,
++  [0] = {
++    shift = 12, mask = 7,
++    [0] = "addwDRr", [1] = "sllwDRr", [5] = "srlwDRr"
++  },
++  [1] = map_mext64,
++  [32] = {
++    shift = 12, mask = 7,
++    [0] = "subw|negwDR0r", [5] = "srawDRr"
++  }
++}
++
++local map_ecabre = {
++  shift = 12, mask = 7,
++  [0] = {
++   shift = 20, mask = 4095,
++   [0] = "ecall", "ebreak"
++  }
++}
++
++local map_fence = {
++  shift = 12, mask = 1,
++  [0] = "fence", --"fence.i" ZIFENCEI EXTENSION
++}
++
++local map_jalr = {
++  shift = 7, mask = 0x1ffffff,
++  _ = "jalr|jrDRI7", [256] = "ret"
++}
++
++local map_pri = {
++  [3] = map_load, [7] = map_fload, [15] = map_fence, [19] = map_ali,
++  [23] = "auipcDA", [27] = map_addi_shift,
++  [35] = map_store, [39] = map_fstore, [47] = map_aext, [51] = map_al,
++  [55] = "luiDU", [59] = map_arithw_shiftw, [67] = map_fmadd, [71] = map_fmsub,
++  [75] = map_fnmsub, [99] = map_branch, [79] = map_fnmadd, [83] = map_fext,
++  [103] = map_jalr, [111] = "jal|j|D0J", [115] = map_ecabre
++}
++
++------------------------------------------------------------------------------
++
++local map_gpr = {
++  [0] = "zero", "ra", "sp", "gp", "tp", "x5", "x6", "x7",
++  "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
++  "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
++  "x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31",
++}
++
++local map_fgpr = {
++  [0] = "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
++  "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
++  "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
++  "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
++}
++
++------------------------------------------------------------------------------
++
++-- Output a nicely formatted line with an opcode and operands.
++local function putop(ctx, text, operands)
++  local pos = ctx.pos
++  local extra = ""
++  if ctx.rel then
++    local sym = ctx.symtab[ctx.rel]
++    if sym then extra = "\t->"..sym end
++  end
++  -- ctx.out is a plain function (io.write by default), so call it directly.
++  if ctx.hexdump > 0 then
++    ctx.out(format("%08x  %s  %-7s %s%s\n",
++    ctx.addr+pos, tohex(ctx.op), text, concat(operands, ","), extra))
++  else
++    ctx.out(format("%08x  %-7s %s%s\n",
++    ctx.addr+pos, text, concat(operands, ", "), extra))
++  end
++  -- Advance past the instruction just printed: its two low opcode bits are
++  -- 11 for a 32-bit encoding; anything else is a 16-bit compressed one.
++  if(band(byte(ctx.code, pos+1), 3) < 3) then
++    ctx.pos = pos + 2
++  else
++    ctx.pos = pos + 4
++  end
++end
++
++-- Fallback for unknown opcodes: print the raw opcode as a ".long" directive.
++local function unknown(ctx)
++  return putop(ctx, ".long", { "0x"..tohex(ctx.op) })
++end
++
++local function get_le(ctx)
++  -- Fetch the little-endian opcode at the current position. The two low
++  -- bits of the first byte are 11 for 32-bit encodings, else it is 16-bit.
++  local code, first = ctx.code, ctx.pos + 1
++  if band(byte(code, first), 3) == 3 then
++    local b0, b1, b2, b3 = byte(code, first, first + 3)
++    return bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0)
++  else
++    local b0, b1 = byte(code, first, first + 1)
++    return bor(lshift(b1, 8), b0)
++  end
++end
++
++local function parse_W(opcode)
++  -- Zero-extended immediate of the "W" operand, gathered from bits 12:5.
++  local hi = lshift(band(rshift(opcode, 7), 15), 6)  -- imm[9:6]
++  local mid = lshift(band(rshift(opcode, 11), 3), 4) -- imm[5:4]
++  local b3 = lshift(band(rshift(opcode, 5), 1), 3)   -- imm[3]
++  local b2 = lshift(band(rshift(opcode, 6), 1), 2)   -- imm[2]
++  return bor(hi, mid, b3, b2)
++end
++
++local function parse_x(opcode)
++  -- 6-bit signed immediate: bit 12 is imm[5] (sign), bits 6:2 are imm[4:0].
++  local sign = band(rshift(opcode, 12), 1)
++  local low = band(rshift(opcode, 2), 31)
++  if sign == 0 then
++    return low
++  end
++  return bor(lshift(0x7ffffff, 5), low) -- negative: set bits 31:5
++end
++
++local function parse_X(opcode)
++  -- Signed immediate of the "X" operand: bit 12 is the sign and the
++  -- remaining bits are scattered as imm[4|6|8:7|5] over bits 6:2.
++  local sign = band(rshift(opcode, 12), 1)
++  local imm = bor(
++    lshift(band(rshift(opcode, 3), 3), 7), -- imm[8:7]
++    lshift(band(rshift(opcode, 5), 1), 6), -- imm[6]
++    lshift(band(rshift(opcode, 2), 1), 5), -- imm[5]
++    lshift(band(rshift(opcode, 6), 1), 4)) -- imm[4]
++  if sign == 1 then
++    imm = bor(lshift(0x7fffff, 9), imm) -- sign-extend from bit 9
++  end
++  return imm
++end
++
++local function parse_S(opcode)
++  -- Store offset: imm[11:5] sits in bits 31:25, imm[4:0] in bits 11:7.
++  local upper = band(rshift(opcode, 25), 127)
++  local lower = band(rshift(opcode, 7), 31)
++  local imm = bor(lshift(upper, 5), lower)
++  if band(upper, 64) ~= 0 then -- imm[11] set: negative offset
++    imm = bor(lshift(0xfffff, 12), imm)
++  end
++  return imm
++end
++
++local function parse_B(opcode)
++  -- B-type branch offset: imm[12|10:5] in bits 31:25, imm[4:1|11] in 11:7.
++  local sign = band(rshift(opcode, 31), 1) --12 (the real sign bit)
++  local part1 = band(rshift(opcode, 7), 1) --11
++  local part2 = band(rshift(opcode, 25), 63) --10:5
++  local part3 = band(rshift(opcode, 8), 15) -- 4 : 1
++  local imm = bor(lshift(part1, 11), lshift(part2, 5), lshift(part3, 1))
++  if (sign == 1) then
++    imm = bor(lshift(0xfffff, 12), imm) -- sign-extend from bit 12
++  end
++  return imm
++end
++
++local function parse_q(opcode)
++  -- Signed branch offset of c.beqz/c.bnez: bit 12 is the sign (imm[8]),
++  -- bits 11:10 hold imm[4:3] and bits 6:2 hold imm[7:6|2:1|5].
++  local sign = band(rshift(opcode, 12), 1)
++  local imm = bor(
++    lshift(band(rshift(opcode, 5), 3), 6),  -- imm[7:6]
++    lshift(band(rshift(opcode, 2), 1), 5),  -- imm[5]
++    lshift(band(rshift(opcode, 10), 3), 3), -- imm[4:3]
++    lshift(band(rshift(opcode, 3), 3), 1))  -- imm[2:1]
++  if sign == 1 then
++    imm = bor(lshift(0xffffff, 8), imm) -- sign-extend from bit 8
++  end
++  return imm
++end
++
++local function parse_J(opcode)
++  -- Jump offset of the "J" operand, stored in bits 31:12 as
++  -- imm[20|10:1|11|19:12]. Bit 31 (imm[20]) is the sign bit.
++  local sign = band(rshift(opcode, 31), 1)
++  local imm = bor(
++    lshift(band(rshift(opcode, 12), 255), 12), -- imm[19:12]
++    lshift(band(rshift(opcode, 20), 1), 11),   -- imm[11]
++    lshift(band(rshift(opcode, 21), 1023), 1)) -- imm[10:1]
++  if sign == 1 then
++    imm = bor(lshift(0xfff, 20), imm) -- sign-extend from bit 20
++  end
++  return imm
++end
++
++local function parse_T(opcode)
++  -- Signed jump offset of c.j/c.jal, stored in bits 12:2 in the scrambled
++  -- order imm[11|4|9:8|10|6|7|3:1|5]; imm[11] (bit 12) is the sign bit.
++  local function field(shift, mask, pos)
++    return lshift(band(rshift(opcode, shift), mask), pos)
++  end
++  local sign = band(rshift(opcode, 12), 1)
++  local imm = bor(
++    field(8, 1, 10), -- imm[10]
++    field(9, 3, 8),  -- imm[9:8]
++    field(6, 1, 7),  -- imm[7]
++    field(7, 1, 6),  -- imm[6]
++    field(2, 1, 5),  -- imm[5]
++    field(11, 1, 4), -- imm[4]
++    field(3, 7, 1))  -- imm[3:1]
++  if sign == 1 then
++    imm = bor(lshift(0x1fffff, 11), imm) -- sign-extend from bit 11
++  end
++  return imm
++end
++
++local function parse_K(opcode)
++  -- c.lui immediate: bit 12 is the sign, bits 6:2 are the low five bits.
++  local sign = band(rshift(opcode, 12), 1)
++  local low = band(rshift(opcode, 2), 31)
++  if sign == 0 then
++    return low
++  end
++  return bor(lshift(0x7fff, 5), low) -- negative: extend to a 20-bit value
++end
++
++-- Disassemble a single instruction.
++local function disass_ins(ctx)
++  local op = ctx:get()
++  local operands = {}
++  local last = nil
++  ctx.op = op
++  ctx.rel = nil
++  -- Decode the opcode at ctx.pos, build its operand list and print it via putop.
++  -- Pick the top-level map: opcodes whose two low bits are not 11 are 16-bit
++  -- compressed instructions, everything else is a 32-bit instruction.
++  local opat
++  if(band(op, 3) < 3) then
++    opat = ctx.map_compr[band(op, 3)]
++  else
++    opat = ctx.map_pri[band(op, 127)]
++  end
++  -- Walk the nested maps down to a pattern string. Each level selects a bit
++  -- field via shift/mask, "_" is the default entry, and a missing or false
++  -- entry means the encoding is not known.
++  while type(opat) ~= "string" do
++    if not opat then return unknown(ctx) end
++    opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._
++  end
++  -- Split the pattern into mnemonic, optional "|"-aliases and operand codes.
++  local name, pat = match(opat, "^([a-z0-9_.]*)(.*)")
++  local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)")
++  -- With aliases present, the operand codes are what follows the alias list.
++  if altname then
++    pat = pat2
++  end
++
++  local alias_done = false --variable for the case of 2 pseudoinstructions, if both parameters are x0, 0
++
++  for p in gmatch(pat, ".") do
++    local x = nil
++    if p == "D" then
++      x = map_gpr[band(rshift(op, 7), 31)]
++    elseif p == "F" then
++      x = map_fgpr[band(rshift(op, 7), 31)]
++    elseif p == "R" then
++      x = map_gpr[band(rshift(op, 15), 31)]
++    elseif p == "G" then
++      x = map_fgpr[band(rshift(op, 15), 31)]
++    elseif p == "r" then
++      x = map_gpr[band(rshift(op, 20), 31)]
++      if(name == "sb" or name == "sh" or name == "sw" or name == "sd") then
++        local temp = last --because of the different order of the characters
++        operands[#operands] = x
++        x = temp
++      end
++    elseif p == "g" then
++      x = map_fgpr[band(rshift(op, 20), 31)]
++      if(name == "fsw" or name == "fsd") then
++        local temp = last
++        operands[#operands] = x
++        x = temp
++      end
++    elseif p == "Z" then
++      x = map_gpr[8 + band(rshift(op, 2), 7)]
++    elseif p == "N" then
++      x = map_fgpr[8 + band(rshift(op, 2), 7)]
++    elseif p == "M" then
++      x = map_gpr[8 + band(rshift(op, 7), 7)]
++    elseif p == "E" then
++      x = map_gpr[band(rshift(op, 2), 31)]
++    elseif p == "W" then
++      local uimm = parse_W(op)
++      x = format("%s,%d", "sp", uimm)
++    elseif p == "x" then
++      x = parse_x(op)
++    elseif p == "h" then
++      local part1 = band(rshift(op, 5), 3) --7:6
++      local part2 = band(rshift(op, 10), 7) --5:3
++      local uimm = bor(lshift(0, 31), lshift(part1, 6) , lshift(part2, 3))
++      operands[#operands] = format("%d(%s)", uimm, last)
++    elseif p == "X" then
++      local imm = parse_X(op)
++      x = format("%s,%d", "sp", imm)
++    elseif p == "O" then
++      x = format("(%s)", map_gpr[band(rshift(op, 15), 31)])
++    elseif p == "H" then
++      x = map_fgpr[band(rshift(op, 27), 31)]
++    elseif p == "L" then
++      local register = map_gpr[band(rshift(op, 15), 31)]
++      local disp = arshift(op, 20)
++      x = format("%d(%s)", disp, register)
++    elseif p == "I" then
++      x = arshift(op, 20)
++      --different for jalr
++      if(name == "jalr") then
++        local reg = map_gpr[band(rshift(op, 15), 31)]
++        if(ctx.reltab[reg] == nil) then
++          operands[#operands] = format("%d(%s)", x, last)
++        else
++          local target = ctx.reltab[reg] + x
++          operands[#operands] = format("%d(%s) #0x%08x", x, last, target)
++          ctx.rel = target
++          ctx.reltab[reg] = nil --assume no reuses of the register
++        end
++        x = nil --not to add additional operand
++      end
++    elseif p == "i" then
++      --both for RV32I AND RV64I
++      local value = band(arshift(op, 20), 63)
++      x = string.format("0x%x", value)
++    elseif p == "S" then
++      local register = map_gpr[band(rshift(op, 15), 31)] --register
++      local imm = parse_S(op)
++      x = format("%d(%s)", imm, register)
++    elseif p == "n" then
++      local part1 = band(rshift(op, 5), 1) --6
++      local part2 = band(rshift(op, 10), 7) --5:3
++      local part3 = band(rshift(op, 6), 1) --2
++      local uimm = bor(lshift(0, 31), lshift(part1, 6), lshift(part2, 3),
++                       lshift(part3, 2))
++      operands[#operands] = format("%d(%s)", uimm, last)
++    elseif p == "A" then
++      local value, dest = band(rshift(op, 12), 0xfffff), map_gpr[band(rshift(op, 7), 31)]
++      ctx.reltab[dest] = ctx.addr + ctx.pos + lshift(value, 12)
++      x = format("0x%x", value)
++    elseif p == "B" then
++      x = ctx.addr + ctx.pos + parse_B(op)
++      ctx.rel = x
++      x = format("0x%08x", x)
++    elseif p == "U" then
++      local value = band(rshift(op, 12), 0xfffff)
++      x = string.format("0x%x", value)
++    elseif p == "Q" then
++      local part1 = band(rshift(op, 2), 7) --8:6
++      local part2 = band(rshift(op, 12), 1) --5
++      local part3 = band(rshift(op, 5), 3) --4:3
++      local uimm = bor(lshift(0, 31), lshift(part1, 6), lshift(part2, 5),
++                       lshift(part3, 3))
++      x = format("%d(%s)", uimm, "sp")
++    elseif p == "q" then
++      x = ctx.addr + ctx.pos + parse_q(op)
++      ctx.rel = x
++      x = format("0x%08x", x)
++    elseif p == "J" then
++      x = ctx.addr + ctx.pos + parse_J(op)
++      ctx.rel = x
++      x = format("0x%08x", x)
++    elseif p == "K" then
++      local value = parse_K(op)
++      x = string.format("0x%x", value)
++    elseif p == "Y" then
++      local part1 = band(rshift(op, 2), 3) --7:6
++      local part2 = band(rshift(op, 12), 1) --5
++      local part3 = band(rshift(op, 4), 7) --4:2
++      local uimm = bor(lshift(0, 31), lshift(part1, 6), lshift(part2, 5),
++                       lshift(part3, 2))
++      x = format("%d(%s)", uimm, "sp")
++    elseif p == "1" then
++      local part1 = band(rshift(op, 12), 1) --5
++      local part2 = band(rshift(op, 2), 31) --4:0
++      local uimm = bor(lshift(0, 31), lshift(part1, 5), part2)
++      x = string.format("0x%x", uimm)
++    elseif p == "T" then
++      x = ctx.addr + ctx.pos + parse_T(op)
++      ctx.rel = x
++      x = format("0x%08x", x)
++    elseif p == "t" then
++      local part1 = band(rshift(op, 7), 7) --8:6
++      local part2 = band(rshift(op, 10), 7) --5:3
++      local uimm = bor(lshift(0, 31), lshift(part1, 6), lshift(part2, 3))
++      x = format("%d(%s)", uimm, "sp")
++    elseif p == "u" then
++      local part1 = band(rshift(op, 7), 3) --7:6
++      local part2 = band(rshift(op, 9), 15) --5:2
++      local uimm = bor(lshift(0, 31), lshift(part1, 6), lshift(part2, 2))
++      x = format("%d(%s)", uimm, "sp")
++    elseif p == "V" then
++      x = map_fgpr[band(rshift(op, 2), 31)]
++    elseif p == "0" then --PSEUDOINSTRUCTIONS
++      if (last == "zero" or last == 0) then
++        local n = #operands
++        operands[n] = nil
++        last = operands[n-1]
++        local a1, a2 = match(altname, "([^|]*)|(.*)")
++        if a1 then name, altname = a1, a2
++        else name = altname end
++        alias_done = true
++      end
++    elseif (p == "4") then
++      if(last == -1) then
++        name = altname
++        operands[#operands] = nil
++      end
++    elseif (p == "5") then
++      if(last == 1) then
++        name = altname
++        operands[#operands] = nil
++      end
++    elseif (p == "6") then
++      if(last == operands[#operands - 1]) then
++        name = altname
++        operands[#operands] = nil
++      end
++    elseif (p == "7") then --jalr rs
++      -- The operand is "imm(reg)", optionally followed by " #0x<target>".
++      local value, reg = match(operands[#operands], "^(%-?%d+)%((%w+)%)")
++      if(value == "0" and
++         (operands[#operands - 1] == "ra" or operands[#operands - 1] == "zero")) then
++        if(operands[#operands - 1] == "zero") then
++          name = altname
++        end
++        operands[#operands] = nil
++        operands[#operands] = reg
++      end
++    elseif (p == "2" and alias_done == false) then
++      if (last == "zero" or last == 0) then
++        local a1, a2 = match(altname, "([^|]*)|(.*)")
++        name = a2
++        operands[#operands] = nil
++      end
++    end
++    if x then operands[#operands+1] = x; last = x end
++  end
++  return putop(ctx, name, operands)
++end
++
++------------------------------------------------------------------------------
++
++-- Disassemble a block of code.
++local function disass_block(ctx, ofs, len)
++  ofs = ofs or 0
++  local limit = len and ofs+len or #ctx.code
++  -- Instructions are 16 or 32 bits wide: align both bounds down to 2 bytes.
++  limit = limit - limit % 2
++  ctx.rel = nil
++  ctx.pos = ofs - ofs % 2
++  while ctx.pos < limit do
++    disass_ins(ctx)
++  end
++end
++
++-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len).
++local function create(code, addr, out)
++  return {
++    code = code,
++    addr = addr or 0,          -- base address added to printed positions
++    out = out or io.write,     -- output function, called with each line
++    symtab = {},               -- address -> symbol name, shown as "->sym"
++    disass = disass_block,
++    hexdump = 8,               -- > 0 adds the raw opcode to each line
++    get = get_le,
++    map_pri = map_pri,
++    map_compr = map_compr,
++    reltab = {},               -- register name -> address set by auipc
++  }
++end
++
++-- Simple API: disassemble code (a string) at address and output via out.
++local function disass(code, addr, out)
++  create(code, addr, out):disass() -- start at offset 0; addr is not an offset
++end
++
++-- Return register name for RID: below 32 is a GPR name, else "f"..(r-32).
++local function regname(r)
++  if r < 32 then return map_gpr[r] end
++  return "f"..(r-32)
++end
++
++-- Public module functions.
++return {
++  create = create,
++  disass = disass,
++  regname = regname
++}
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_riscv64.lua
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_riscv64.lua
+@@ -0,0 +1,16 @@
++----------------------------------------------------------------------------
++-- LuaJIT RISC-V 64 disassembler wrapper module.
++--
++-- Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++-- Released under the MIT license. See Copyright Notice in luajit.h
++----------------------------------------------------------------------------
++-- This module just exports the default riscv little-endian functions from the
++-- RISC-V disassembler module. All the interesting stuff is there.
++------------------------------------------------------------------------------
++
++local dis_riscv = require((string.match(..., ".*%.") or "").."dis_riscv")
++return {
++  create = dis_riscv.create,
++  disass = dis_riscv.disass,
++  regname = dis_riscv.regname
++}
+\ No newline at end of file
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_x64.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_x64.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_x64.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT x64 disassembler wrapper module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This module just exports the 64 bit functions from the combined
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_x86.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dis_x86.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dis_x86.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT x86/x64 disassembler module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ -- This is a helper module used by the LuaJIT machine code dumper module.
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dump.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/dump.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/dump.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT compiler dump module.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ --
+@@ -55,7 +55,6 @@
+ 
+ -- Cache some library functions and objects.
+ local jit = require("jit")
+-assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
+ local jutil = require("jit.util")
+ local vmdef = require("jit.vmdef")
+ local funcinfo, funcbc = jutil.funcinfo, jutil.funcbc
+@@ -102,10 +101,12 @@ end
+ local function fillsymtab(tr, nexit)
+   local t = symtab
+   if nexitsym == 0 then
++    local maskaddr = jit.arch == "arm" and -2
+     local ircall = vmdef.ircall
+     for i=0,#ircall do
+       local addr = ircalladdr(i)
+       if addr ~= 0 then
++	if maskaddr then addr = band(addr, maskaddr) end
+ 	if addr < 0 then addr = addr + 2^32 end
+ 	t[addr] = ircall[i]
+       end
+@@ -217,8 +218,10 @@ local function colorize_text(s)
+   return s
+ end
+ 
+-local function colorize_ansi(s, t)
+-  return format(colortype_ansi[t], s)
++local function colorize_ansi(s, t, extra)
++  local out = format(colortype_ansi[t], s)
++  if extra then out = "\027[3m"..out end
++  return out
+ end
+ 
+ local irtype_ansi = setmetatable({},
+@@ -227,9 +230,10 @@ local irtype_ansi = setmetatable({},
+ 
+ local html_escape = { ["<"] = "&lt;", [">"] = "&gt;", ["&"] = "&amp;", }
+ 
+-local function colorize_html(s, t)
++local function colorize_html(s, t, extra)
+   s = gsub(s, "[<>&]", html_escape)
+-  return format('<span class="irt_%s">%s</span>', irtype_text[t], s)
++  return format('<span class="irt_%s%s">%s</span>',
++		irtype_text[t], extra and " irt_extra" or "", s)
+ end
+ 
+ local irtype_html = setmetatable({},
+@@ -254,6 +258,7 @@ span.irt_tab { color: #c00000; }
+ span.irt_udt, span.irt_lud { color: #00c0c0; }
+ span.irt_num { color: #4040c0; }
+ span.irt_int, span.irt_i8, span.irt_u8, span.irt_i16, span.irt_u16 { color: #b040b0; }
++span.irt_extra { font-style: italic; }
+ </style>
+ ]]
+ 
+@@ -269,6 +274,7 @@ local litname = {
+     if band(mode, 8) ~= 0 then s = s.."C" end
+     if band(mode, 16) ~= 0 then s = s.."R" end
+     if band(mode, 32) ~= 0 then s = s.."I" end
++    if band(mode, 64) ~= 0 then s = s.."K" end
+     t[mode] = s
+     return s
+   end}),
+@@ -277,15 +283,18 @@ local litname = {
+     local s = irtype[band(mode, 31)]
+     s = irtype[band(shr(mode, 5), 31)].."."..s
+     if band(mode, 0x800) ~= 0 then s = s.." sext" end
+-    local c = shr(mode, 14)
+-    if c == 2 then s = s.." index" elseif c == 3 then s = s.." check" end
++    local c = shr(mode, 12)
++    if c == 1 then s = s.." none"
++    elseif c == 2 then s = s.." index"
++    elseif c == 3 then s = s.." check" end
+     t[mode] = s
+     return s
+   end}),
+   ["FLOAD "] = vmdef.irfield,
+   ["FREF  "] = vmdef.irfield,
+   ["FPMATH"] = vmdef.irfpm,
+-  ["BUFHDR"] = { [0] = "RESET", "APPEND" },
++  ["TMPREF"] = { [0] = "", "IN", "OUT", "INOUT", "", "", "OUT2", "INOUT2" },
++  ["BUFHDR"] = { [0] = "RESET", "APPEND", "WRITE" },
+   ["TOSTR "] = { [0] = "INT", "NUM", "CHAR" },
+ }
+ 
+@@ -345,7 +354,7 @@ local function formatk(tr, idx, sn)
+   else
+     s = tostring(k) -- For primitives.
+   end
+-  s = colorize(format("%-4s", s), t)
++  s = colorize(format("%-4s", s), t, band(sn or 0, 0x100000) ~= 0)
+   if slot then
+     s = format("%s @%d", s, slot)
+   end
+@@ -365,7 +374,7 @@ local function printsnap(tr, snap)
+ 	out:write(colorize(format("%04d/%04d", ref, ref+1), 14))
+       else
+ 	local m, ot, op1, op2 = traceir(tr, ref)
+-	out:write(colorize(format("%04d", ref), band(ot, 31)))
++	out:write(colorize(format("%04d", ref), band(ot, 31), band(sn, 0x100000) ~= 0))
+       end
+       out:write(band(sn, 0x10000) == 0 and " " or "|") -- SNAP_FRAME
+     else
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/p.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/p.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/p.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT profiler.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ --
+@@ -41,7 +41,6 @@
+ 
+ -- Cache some library functions and objects.
+ local jit = require("jit")
+-assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
+ local profile = require("jit.profile")
+ local vmdef = require("jit.vmdef")
+ local math = math
+@@ -238,6 +237,7 @@ local function prof_finish()
+     prof_count1 = nil
+     prof_count2 = nil
+     prof_ud = nil
++    if out ~= stdout then out:close() end
+   end
+ end
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/v.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/v.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/v.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- Verbose mode of the LuaJIT compiler.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ --
+@@ -59,7 +59,6 @@
+ 
+ -- Cache some library functions and objects.
+ local jit = require("jit")
+-assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
+ local jutil = require("jit.util")
+ local vmdef = require("jit.vmdef")
+ local funcinfo, traceinfo = jutil.funcinfo, jutil.traceinfo
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/jit/zone.lua
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/jit/zone.lua
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/jit/zone.lua
+@@ -1,7 +1,7 @@
+ ----------------------------------------------------------------------------
+ -- LuaJIT profiler zones.
+ --
+--- Copyright (C) 2005-2021 Mike Pall. All rights reserved.
++-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
+ -- Released under the MIT license. See Copyright Notice in luajit.h
+ ----------------------------------------------------------------------------
+ --
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_aux.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_aux.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_aux.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Auxiliary library for the Lua/C API.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major parts taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -21,6 +21,7 @@
+ #include "lj_state.h"
+ #include "lj_trace.h"
+ #include "lj_lib.h"
++#include "lj_vmevent.h"
+ 
+ #if LJ_TARGET_POSIX
+ #include <sys/wait.h>
+@@ -318,6 +319,18 @@ static int panic(lua_State *L)
+   return 0;
+ }
+ 
++#ifndef LUAJIT_DISABLE_VMEVENT
++static int error_finalizer(lua_State *L)
++{
++  const char *s = lua_tostring(L, -1);  /* Error value on the stack top. */
++  fputs("ERROR in finalizer: ", stderr);
++  fputs(s ? s : "?", stderr);  /* "?" if there is no string form. */
++  fputc('\n', stderr);
++  fflush(stderr);
++  return 0;  /* No Lua results. */
++}
++#endif
++
+ #ifdef LUAJIT_USE_SYSMALLOC
+ 
+ #if LJ_64 && !LJ_GC64 && !defined(LUAJIT_USE_VALGRIND)
+@@ -339,7 +352,16 @@ static void *mem_alloc(void *ud, void *p
+ LUALIB_API lua_State *luaL_newstate(void)
+ {
+   lua_State *L = lua_newstate(mem_alloc, NULL);
+-  if (L) G(L)->panic = panic;
++  if (L) {
++    G(L)->panic = panic;
++#ifndef LUAJIT_DISABLE_VMEVENT
++    luaL_findtable(L, LUA_REGISTRYINDEX, LJ_VMEVENTS_REGKEY, LJ_VMEVENTS_HSIZE);
++    lua_pushcfunction(L, error_finalizer);
++    lua_rawseti(L, -2, VMEVENT_HASH(LJ_VMEVENT_ERRFIN));
++    G(L)->vmevmask = VMEVENT_MASK(LJ_VMEVENT_ERRFIN);
++    L->top--;
++#endif
++  }
+   return L;
+ }
+ 
+@@ -353,7 +375,16 @@ LUALIB_API lua_State *luaL_newstate(void
+ #else
+   L = lua_newstate(LJ_ALLOCF_INTERNAL, NULL);
+ #endif
+-  if (L) G(L)->panic = panic;
++  if (L) {
++    G(L)->panic = panic;
++#ifndef LUAJIT_DISABLE_VMEVENT
++    luaL_findtable(L, LUA_REGISTRYINDEX, LJ_VMEVENTS_REGKEY, LJ_VMEVENTS_HSIZE);
++    lua_pushcfunction(L, error_finalizer);
++    lua_rawseti(L, -2, VMEVENT_HASH(LJ_VMEVENT_ERRFIN));
++    G(L)->vmevmask = VMEVENT_MASK(LJ_VMEVENT_ERRFIN);
++    L->top--;
++#endif
++  }
+   return L;
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_base.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_base.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_base.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Base and coroutine library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2011 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -19,6 +19,7 @@
+ #include "lj_gc.h"
+ #include "lj_err.h"
+ #include "lj_debug.h"
++#include "lj_buf.h"
+ #include "lj_str.h"
+ #include "lj_tab.h"
+ #include "lj_meta.h"
+@@ -75,9 +76,10 @@ LJLIB_ASM_(type)		LJLIB_REC(.)
+ /* This solves a circular dependency problem -- change FF_next_N as needed. */
+ LJ_STATIC_ASSERT((int)FF_next == FF_next_N);
+ 
+-LJLIB_ASM(next)
++LJLIB_ASM(next)			LJLIB_REC(.)
+ {
+   lj_lib_checktab(L, 1);
++  lj_err_msg(L, LJ_ERR_NEXTIDX);
+   return FFH_UNREACHABLE;
+ }
+ 
+@@ -301,7 +303,7 @@ LJLIB_ASM(tonumber)		LJLIB_REC(.)
+ 	while (lj_char_isspace((unsigned char)(*ep))) ep++;
+ 	if (*ep == '\0') {
+ 	  if (LJ_DUALNUM && LJ_LIKELY(ul < 0x80000000u+neg)) {
+-	    if (neg) ul = (unsigned long)-(long)ul;
++	    if (neg) ul = ~ul+1u;
+ 	    setintV(L->base-1-LJ_FR2, (int32_t)ul);
+ 	  } else {
+ 	    lua_Number n = (lua_Number)ul;
+@@ -406,10 +408,22 @@ LJLIB_CF(load)
+   GCstr *name = lj_lib_optstr(L, 2);
+   GCstr *mode = lj_lib_optstr(L, 3);
+   int status;
+-  if (L->base < L->top && (tvisstr(L->base) || tvisnumber(L->base))) {
+-    GCstr *s = lj_lib_checkstr(L, 1);
++  if (L->base < L->top &&
++      (tvisstr(L->base) || tvisnumber(L->base) || tvisbuf(L->base))) {
++    const char *s;
++    MSize len;
++    if (tvisbuf(L->base)) {
++      SBufExt *sbx = bufV(L->base);
++      s = sbx->r;
++      len = sbufxlen(sbx);
++      if (!name) name = &G(L)->strempty;  /* Buffers are not NUL-terminated. */
++    } else {
++      GCstr *str = lj_lib_checkstr(L, 1);
++      s = strdata(str);
++      len = str->len;
++    }
+     lua_settop(L, 4);  /* Ensure env arg exists. */
+-    status = luaL_loadbufferx(L, strdata(s), s->len, strdata(name ? name : s),
++    status = luaL_loadbufferx(L, s, len, name ? strdata(name) : s,
+ 			      mode ? strdata(mode) : NULL);
+   } else {
+     lj_lib_checkfunc(L, 1);
+@@ -602,7 +616,10 @@ static int ffh_resume(lua_State *L, lua_
+     setstrV(L, L->base-LJ_FR2, lj_err_str(L, em));
+     return FFH_RES(2);
+   }
+-  lj_state_growstack(co, (MSize)(L->top - L->base));
++  if (lj_state_cpgrowstack(co, (MSize)(L->top - L->base)) != LUA_OK) {
++    cTValue *msg = --co->top;
++    lj_err_callermsg(L, strVdata(msg));
++  }
+   return FFH_RETRY;
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_bit.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_bit.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_bit.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Bit manipulation library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lib_bit_c
+@@ -155,7 +155,8 @@ LJLIB_CF(bit_tohex)		LJLIB_REC(.)
+ #endif
+   SBuf *sb = lj_buf_tmp_(L);
+   SFormat sf = (STRFMT_UINT|STRFMT_T_HEX);
+-  if (n < 0) { n = -n; sf |= STRFMT_F_UPPER; }
++  if (n < 0) { n = (int32_t)(~(uint32_t)n+1u); sf |= STRFMT_F_UPPER; }
++  if ((uint32_t)n > 254) n = 254;
+   sf |= ((SFormat)((n+1)&255) << STRFMT_SH_PREC);
+ #if LJ_HASFFI
+   if (n < 16) b &= ((uint64_t)1 << 4*n)-1;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_buffer.c
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_buffer.c
+@@ -0,0 +1,360 @@
++/*
++** Buffer library.
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++#define lib_buffer_c
++#define LUA_LIB
++
++#include "lua.h"
++#include "lauxlib.h"
++#include "lualib.h"
++
++#include "lj_obj.h"
++
++#if LJ_HASBUFFER
++#include "lj_gc.h"
++#include "lj_err.h"
++#include "lj_buf.h"
++#include "lj_str.h"
++#include "lj_tab.h"
++#include "lj_udata.h"
++#include "lj_meta.h"
++#if LJ_HASFFI
++#include "lj_ctype.h"
++#include "lj_cdata.h"
++#include "lj_cconv.h"
++#endif
++#include "lj_strfmt.h"
++#include "lj_serialize.h"
++#include "lj_lib.h"
++
++/* -- Helper functions ---------------------------------------------------- */
++
++/* Return the string buffer in argument #1 or raise an argument type error. */
++static SBufExt *buffer_tobuf(lua_State *L)
++{
++  if (L->base >= L->top || !tvisbuf(L->base))
++    lj_err_argtype(L, 1, "buffer");
++  return bufV(L->base);
++}
++
++/* Like buffer_tobuf(), for writers: also applies setsbufXL_() with L. */
++static LJ_AINLINE SBufExt *buffer_tobufw(lua_State *L)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  setsbufXL_(sbx, L);
++  return sbx;
++}
++
++#define buffer_toudata(sbx)	((GCudata *)(sbx)-1)
++
++/* -- Buffer methods ------------------------------------------------------ */
++
++#define LJLIB_MODULE_buffer_method
++
++LJLIB_CF(buffer_method_free)
++{
++  SBufExt *sbx = buffer_tobuf(L);  /* Errors unless arg #1 is a buffer. */
++  lj_bufx_free(L, sbx);
++  L->top = L->base+1;  /* Chain buffer object: return arg #1 itself. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_reset)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);  /* Errors unless arg #1 is a buffer. */
++  lj_bufx_reset(sbx);
++  L->top = L->base+1;  /* Chain buffer object: return arg #1 itself. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_skip)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  MSize n = (MSize)lj_lib_checkintrange(L, 2, 0, LJ_MAX_BUF);  /* Arg #2. */
++  MSize len = sbufxlen(sbx);
++  if (n < len) {
++    sbx->r += n;  /* Partial skip: advance r by n. */
++  } else if (sbufiscow(sbx)) {
++    sbx->r = sbx->w;  /* Skip all; w is left alone when sbufiscow() holds. */
++  } else {
++    sbx->r = sbx->w = sbx->b;  /* Skip all: rewind both r and w to b. */
++  }
++  L->top = L->base+1;  /* Chain buffer object. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_set)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  GCobj *ref;
++  const char *p;
++  MSize len;
++#if LJ_HASFFI
++  if (tviscdata(L->base+1)) {  /* cdata in arg #2: pointer plus length. */
++    CTState *cts = ctype_cts(L);
++    lj_cconv_ct_tv(cts, ctype_get(cts, CTID_P_CVOID), (uint8_t *)&p,
++		   L->base+1, CCF_ARG(2));
++    len = (MSize)lj_lib_checkintrange(L, 3, 0, LJ_MAX_BUF);  /* Arg #3. */
++  } else
++#endif
++  {
++    GCstr *str = lj_lib_checkstrx(L, 2);  /* Otherwise arg #2 is a string. */
++    p = strdata(str);
++    len = str->len;
++  }
++  lj_bufx_free(L, sbx);
++  lj_bufx_set_cow(L, sbx, p, len);
++  ref = gcV(L->base+1);  /* Arg #2 object is stored in cowref below. */
++  setgcref(sbx->cowref, ref);
++  lj_gc_objbarrier(L, buffer_toudata(sbx), ref);
++  L->top = L->base+1;  /* Chain buffer object. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_put)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobufw(L);
++  ptrdiff_t arg, narg = L->top - L->base;
++  for (arg = 1; arg < narg; arg++) {  /* Append every arg after the buffer. */
++    cTValue *o = &L->base[arg], *mo = NULL;  /* mo set: lookup already done. */
++  retry:
++    if (tvisstr(o)) {
++      lj_buf_putstr((SBuf *)sbx, strV(o));
++    } else if (tvisint(o)) {
++      lj_strfmt_putint((SBuf *)sbx, intV(o));
++    } else if (tvisnum(o)) {
++      lj_strfmt_putfnum((SBuf *)sbx, STRFMT_G14, numV(o));
++    } else if (tvisbuf(o)) {
++      SBufExt *sbx2 = bufV(o);
++      if (sbx2 == sbx) lj_err_arg(L, (int)(arg+1), LJ_ERR_BUFFER_SELF);
++      lj_buf_putmem((SBuf *)sbx, sbx2->r, sbufxlen(sbx2));
++    } else if (!mo && !tvisnil(mo = lj_meta_lookup(L, o, MM_tostring))) {
++      /* Call __tostring metamethod inline. */
++      copyTV(L, L->top++, mo);
++      copyTV(L, L->top++, o);
++      lua_call(L, 1, 1);
++      o = &L->base[arg];  /* The stack may have been reallocated. */
++      copyTV(L, &L->base[arg], L->top-1);  /* Result overwrites the arg. */
++      L->top = L->base + narg;  /* Restore the original argument count. */
++      goto retry;  /* Retry with the result. */
++    } else {
++      lj_err_argtype(L, (int)(arg+1), "string/number/__tostring");
++    }
++    /* Probably not useful to inline other __tostring MMs, e.g. FFI numbers. */
++  }
++  L->top = L->base+1;  /* Chain buffer object. */
++  lj_gc_check(L);
++  return 1;
++}
++
++LJLIB_CF(buffer_method_putf)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobufw(L);
++  lj_strfmt_putarg(L, (SBuf *)sbx, 2, 2);  /* Format starting at arg #2. */
++  L->top = L->base+1;  /* Chain buffer object. */
++  lj_gc_check(L);
++  return 1;
++}
++
++LJLIB_CF(buffer_method_get)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  ptrdiff_t arg, narg = L->top - L->base;
++  if (narg == 1) {
++    narg++;
++    setnilV(L->top++);  /* get() is the same as get(nil). */
++  }
++  for (arg = 1; arg < narg; arg++) {  /* One result string per length arg. */
++    TValue *o = &L->base[arg];
++    MSize n = tvisnil(o) ? LJ_MAX_BUF :
++	      (MSize) lj_lib_checkintrange(L, (int)(arg+1), 0, LJ_MAX_BUF);
++    MSize len = sbufxlen(sbx);
++    if (n > len) n = len;  /* Clamp the request to sbufxlen(). */
++    setstrV(L, o, lj_str_new(L, sbx->r, n));  /* Result replaces the arg. */
++    sbx->r += n;
++  }
++  if (sbx->r == sbx->w && !sbufiscow(sbx)) sbx->r = sbx->w = sbx->b;
++  lj_gc_check(L);
++  return (int)(narg-1);  /* All slots after the buffer are results. */
++}
++
++#if LJ_HASFFI
++LJLIB_CF(buffer_method_putcdata)	LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobufw(L);
++  const char *p;
++  MSize len;
++  if (tviscdata(L->base+1)) {  /* Arg #2 must be cdata; converted into p. */
++    CTState *cts = ctype_cts(L);
++    lj_cconv_ct_tv(cts, ctype_get(cts, CTID_P_CVOID), (uint8_t *)&p,
++		   L->base+1, CCF_ARG(2));
++  } else {
++    lj_err_argtype(L, 2, "cdata");
++  }
++  len = (MSize)lj_lib_checkintrange(L, 3, 0, LJ_MAX_BUF);  /* Arg #3. */
++  lj_buf_putmem((SBuf *)sbx, p, len);
++  L->top = L->base+1;  /* Chain buffer object. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_reserve)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobufw(L);
++  MSize sz = (MSize)lj_lib_checkintrange(L, 2, 0, LJ_MAX_BUF);  /* Arg #2. */
++  GCcdata *cd;
++  lj_buf_more((SBuf *)sbx, sz);
++  ctype_loadffi(L);
++  cd = lj_cdata_new_(L, CTID_P_UINT8, CTSIZE_PTR);
++  *(void **)cdataptr(cd) = sbx->w;  /* The pointer cdata holds sbx->w. */
++  setcdataV(L, L->top++, cd);
++  setintV(L->top++, sbufleft(sbx));
++  return 2;  /* Results: the pointer cdata and sbufleft(). */
++}
++
++LJLIB_CF(buffer_method_commit)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  MSize len = (MSize)lj_lib_checkintrange(L, 2, 0, LJ_MAX_BUF);  /* Arg #2. */
++  if (len > sbufleft(sbx)) lj_err_arg(L, 2, LJ_ERR_NUMRNG);
++  sbx->w += len;  /* Only reached when len <= sbufleft(). */
++  L->top = L->base+1;  /* Chain buffer object. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_ref)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  GCcdata *cd;
++  ctype_loadffi(L);
++  cd = lj_cdata_new_(L, CTID_P_UINT8, CTSIZE_PTR);
++  *(void **)cdataptr(cd) = sbx->r;  /* The pointer cdata holds sbx->r. */
++  setcdataV(L, L->top++, cd);
++  setintV(L->top++, sbufxlen(sbx));
++  return 2;  /* Results: the pointer cdata and sbufxlen(). */
++}
++#endif
++
++LJLIB_CF(buffer_method_encode)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobufw(L);
++  cTValue *o = lj_lib_checkany(L, 2);  /* Arg #2 is required. */
++  lj_serialize_put(sbx, o);
++  lj_gc_check(L);
++  L->top = L->base+1;  /* Chain buffer object. */
++  return 1;
++}
++
++LJLIB_CF(buffer_method_decode)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobufw(L);
++  setnilV(L->top++);  /* Result slot, filled in by lj_serialize_get(). */
++  sbx->r = lj_serialize_get(sbx, L->top-1);  /* r takes the returned pointer. */
++  lj_gc_check(L);
++  return 1;
++}
++
++LJLIB_CF(buffer_method___gc)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  lj_bufx_free(L, sbx);  /* Same call as the free method. */
++  return 0;  /* No results. */
++}
++
++LJLIB_CF(buffer_method___tostring)	LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  setstrV(L, L->top-1, lj_str_new(L, sbx->r, sbufxlen(sbx)));  /* r not moved. */
++  lj_gc_check(L);
++  return 1;
++}
++
++LJLIB_CF(buffer_method___len)		LJLIB_REC(.)
++{
++  SBufExt *sbx = buffer_tobuf(L);
++  setintV(L->top-1, (int32_t)sbufxlen(sbx));  /* Result overwrites top slot. */
++  return 1;
++}
++
++LJLIB_PUSH("buffer") LJLIB_SET(__metatable)
++LJLIB_PUSH(top-1) LJLIB_SET(__index)
++
++/* -- Buffer library functions -------------------------------------------- */
++
++#define LJLIB_MODULE_buffer
++
++LJLIB_PUSH(top-2) LJLIB_SET(!)  /* Set environment. */
++
++LJLIB_CF(buffer_new)
++{
++  MSize sz = 0;
++  int targ = 1;  /* Argument index of the options table. */
++  GCtab *env, *dict_str = NULL, *dict_mt = NULL;
++  GCudata *ud;
++  SBufExt *sbx;
++  if (L->base < L->top && !tvistab(L->base)) {  /* Arg #1 is size or nil. */
++    targ = 2;
++    if (!tvisnil(L->base))
++      sz = (MSize)lj_lib_checkintrange(L, 1, 0, LJ_MAX_BUF);
++  }
++  if (L->base+targ-1 < L->top) {  /* Options argument is present. */
++    GCtab *options = lj_lib_checktab(L, targ);
++    cTValue *opt_dict, *opt_mt;
++    opt_dict = lj_tab_getstr(options, lj_str_newlit(L, "dict"));
++    if (opt_dict && tvistab(opt_dict)) {  /* Non-table values are ignored. */
++      dict_str = tabV(opt_dict);
++      lj_serialize_dict_prep_str(L, dict_str);
++    }
++    opt_mt = lj_tab_getstr(options, lj_str_newlit(L, "metatable"));
++    if (opt_mt && tvistab(opt_mt)) {  /* Non-table values are ignored. */
++      dict_mt = tabV(opt_mt);
++      lj_serialize_dict_prep_mt(L, dict_mt);
++    }
++  }
++  env = tabref(curr_func(L)->c.env);  /* Also used as the metatable below. */
++  ud = lj_udata_new(L, sizeof(SBufExt), env);
++  ud->udtype = UDTYPE_BUFFER;
++  /* NOBARRIER: The GCudata is new (marked white). */
++  setgcref(ud->metatable, obj2gco(env));
++  setudataV(L, L->top++, ud);
++  sbx = (SBufExt *)uddata(ud);  /* The SBufExt is the userdata payload. */
++  lj_bufx_init(L, sbx);
++  setgcref(sbx->dict_str, obj2gco(dict_str));
++  setgcref(sbx->dict_mt, obj2gco(dict_mt));
++  if (sz > 0) lj_buf_need2((SBuf *)sbx, sz);  /* Only if a size was given. */
++  lj_gc_check(L);
++  return 1;
++}
++
++LJLIB_CF(buffer_encode)			LJLIB_REC(.)
++{
++  cTValue *o = lj_lib_checkany(L, 1);  /* Arg #1 is required. */
++  setstrV(L, L->top++, lj_serialize_encode(L, o));  /* Push encoded string. */
++  lj_gc_check(L);
++  return 1;
++}
++
++LJLIB_CF(buffer_decode)			LJLIB_REC(.)
++{
++  GCstr *str = lj_lib_checkstrx(L, 1);  /* Arg #1: the string to decode. */
++  setnilV(L->top++);  /* Result slot, filled in by lj_serialize_decode(). */
++  lj_serialize_decode(L, L->top-1, str);
++  lj_gc_check(L);
++  return 1;
++}
++
++/* ------------------------------------------------------------------------ */
++
++#include "lj_libdef.h"
++
++int luaopen_string_buffer(lua_State *L)
++{
++  LJ_LIB_REG(L, NULL, buffer_method);  /* Register the method table first. */
++  lua_getfield(L, -1, "__tostring");
++  lua_setfield(L, -2, "tostring");  /* Alias: tostring is __tostring. */
++  LJ_LIB_REG(L, NULL, buffer);  /* Then the library functions. */
++  return 1;
++}
++
++#endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_debug.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_debug.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_debug.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Debug library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_ffi.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_ffi.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_ffi.c
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lib_ffi_c
+@@ -573,6 +573,7 @@ LJLIB_CF(ffi_typeinfo)
+       setintV(lj_tab_setstr(L, t, lj_str_newlit(L, "sib")), (int32_t)ct->sib);
+     if (gcref(ct->name)) {
+       GCstr *s = gco2str(gcref(ct->name));
++      if (isdead(G(L), obj2gco(s))) flipwhite(obj2gco(s));
+       setstrV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "name")), s);
+     }
+     lj_gc_check(L);
+@@ -638,7 +639,7 @@ LJLIB_CF(ffi_alignof)	LJLIB_REC(ffi_xof
+   CTState *cts = ctype_cts(L);
+   CTypeID id = ffi_checkctype(L, cts, NULL);
+   CTSize sz = 0;
+-  CTInfo info = lj_ctype_info(cts, id, &sz);
++  CTInfo info = lj_ctype_info_raw(cts, id, &sz);
+   setintV(L->top-1, 1 << ctype_align(info));
+   return 1;
+ }
+@@ -744,6 +745,9 @@ LJLIB_CF(ffi_abi)	LJLIB_REC(.)
+ #if LJ_ABI_WIN
+     "\003win"
+ #endif
++#if LJ_ABI_PAUTH
++    "\005pauth"
++#endif
+ #if LJ_TARGET_UWP
+     "\003uwp"
+ #endif
+@@ -769,13 +773,13 @@ LJLIB_CF(ffi_metatype)
+   CTypeID id = ffi_checkctype(L, cts, NULL);
+   GCtab *mt = lj_lib_checktab(L, 2);
+   GCtab *t = cts->miscmap;
+-  CType *ct = ctype_get(cts, id);  /* Only allow raw types. */
++  CType *ct = ctype_raw(cts, id);
+   TValue *tv;
+   GCcdata *cd;
+   if (!(ctype_isstruct(ct->info) || ctype_iscomplex(ct->info) ||
+ 	ctype_isvector(ct->info)))
+     lj_err_arg(L, 1, LJ_ERR_FFI_INVTYPE);
+-  tv = lj_tab_setinth(L, t, -(int32_t)id);
++  tv = lj_tab_setinth(L, t, -(int32_t)ctype_typeid(cts, ct));
+   if (!tvisnil(tv))
+     lj_err_caller(L, LJ_ERR_PROTMT);
+   settabV(L, tv, mt);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_init.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_init.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_init.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Library initialization.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major parts taken verbatim from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_io.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_io.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_io.c
+@@ -1,6 +1,6 @@
+ /*
+ ** I/O library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2011 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -60,12 +60,12 @@ static IOFileUD *io_tofile(lua_State *L)
+   return iof;
+ }
+ 
+-static FILE *io_stdfile(lua_State *L, ptrdiff_t id)
++static IOFileUD *io_stdfile(lua_State *L, ptrdiff_t id)
+ {
+   IOFileUD *iof = IOSTDF_IOF(L, id);
+   if (iof->fp == NULL)
+     lj_err_caller(L, LJ_ERR_IOSTDCL);
+-  return iof->fp;
++  return iof;
+ }
+ 
+ static IOFileUD *io_file_new(lua_State *L)
+@@ -178,7 +178,7 @@ static int io_file_readlen(lua_State *L,
+     MSize n = (MSize)fread(buf, 1, m, fp);
+     setstrV(L, L->top++, lj_str_new(L, buf, (size_t)n));
+     lj_gc_check(L);
+-    return (n > 0 || m == 0);
++    return n > 0;
+   } else {
+     int c = getc(fp);
+     ungetc(c, fp);
+@@ -187,8 +187,9 @@ static int io_file_readlen(lua_State *L,
+   }
+ }
+ 
+-static int io_file_read(lua_State *L, FILE *fp, int start)
++static int io_file_read(lua_State *L, IOFileUD *iof, int start)
+ {
++  FILE *fp = iof->fp;
+   int ok, n, nargs = (int)(L->top - L->base) - start;
+   clearerr(fp);
+   if (nargs == 0) {
+@@ -224,8 +225,9 @@ static int io_file_read(lua_State *L, FI
+   return n - start;
+ }
+ 
+-static int io_file_write(lua_State *L, FILE *fp, int start)
++static int io_file_write(lua_State *L, IOFileUD *iof, int start)
+ {
++  FILE *fp = iof->fp;
+   cTValue *tv;
+   int status = 1;
+   for (tv = L->base+start; tv < L->top; tv++) {
+@@ -253,13 +255,11 @@ static int io_file_iter(lua_State *L)
+     lj_err_caller(L, LJ_ERR_IOCLFL);
+   L->top = L->base;
+   if (n) {  /* Copy upvalues with options to stack. */
+-    if (n > LUAI_MAXCSTACK)
+-      lj_err_caller(L, LJ_ERR_STKOV);
+     lj_state_checkstack(L, (MSize)n);
+     memcpy(L->top, &fn->c.upvalue[1], n*sizeof(TValue));
+     L->top += n;
+   }
+-  n = io_file_read(L, iof->fp, 0);
++  n = io_file_read(L, iof, 0);
+   if (ferror(iof->fp))
+     lj_err_callermsg(L, strVdata(L->top-2));
+   if (tvisnil(L->base) && (iof->type & IOFILE_FLAG_CLOSE)) {
+@@ -284,19 +284,25 @@ static int io_file_lines(lua_State *L)
+ 
+ LJLIB_CF(io_method_close)
+ {
+-  IOFileUD *iof = L->base < L->top ? io_tofile(L) :
+-		  IOSTDF_IOF(L, GCROOT_IO_OUTPUT);
++  IOFileUD *iof;
++  if (L->base < L->top) {
++    iof = io_tofile(L);
++  } else {
++    iof = IOSTDF_IOF(L, GCROOT_IO_OUTPUT);
++    if (iof->fp == NULL)
++      lj_err_caller(L, LJ_ERR_IOCLFL);
++  }
+   return io_file_close(L, iof);
+ }
+ 
+ LJLIB_CF(io_method_read)
+ {
+-  return io_file_read(L, io_tofile(L)->fp, 1);
++  return io_file_read(L, io_tofile(L), 1);
+ }
+ 
+ LJLIB_CF(io_method_write)		LJLIB_REC(io_write 0)
+ {
+-  return io_file_write(L, io_tofile(L)->fp, 1);
++  return io_file_write(L, io_tofile(L), 1);
+ }
+ 
+ LJLIB_CF(io_method_flush)		LJLIB_REC(io_flush 0)
+@@ -433,7 +439,7 @@ LJLIB_CF(io_popen)
+ LJLIB_CF(io_tmpfile)
+ {
+   IOFileUD *iof = io_file_new(L);
+-#if LJ_TARGET_PS3 || LJ_TARGET_PS4 || LJ_TARGET_PSVITA
++#if LJ_TARGET_PS3 || LJ_TARGET_PS4 || LJ_TARGET_PS5 || LJ_TARGET_PSVITA || LJ_TARGET_NX
+   iof->fp = NULL; errno = ENOSYS;
+ #else
+   iof->fp = tmpfile();
+@@ -458,7 +464,7 @@ LJLIB_CF(io_write)		LJLIB_REC(io_write G
+ 
+ LJLIB_CF(io_flush)		LJLIB_REC(io_flush GCROOT_IO_OUTPUT)
+ {
+-  return luaL_fileresult(L, fflush(io_stdfile(L, GCROOT_IO_OUTPUT)) == 0, NULL);
++  return luaL_fileresult(L, fflush(io_stdfile(L, GCROOT_IO_OUTPUT)->fp) == 0, NULL);
+ }
+ 
+ static int io_std_getset(lua_State *L, ptrdiff_t id, const char *mode)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_jit.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_jit.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_jit.c
+@@ -1,6 +1,6 @@
+ /*
+ ** JIT library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lib_jit_c
+@@ -346,11 +346,7 @@ LJLIB_CF(jit_util_tracek)
+       ir = &T->ir[ir->op1];
+     }
+ #if LJ_HASFFI
+-    if (ir->o == IR_KINT64 && !ctype_ctsG(G(L))) {
+-      ptrdiff_t oldtop = savestack(L, L->top);
+-      luaopen_ffi(L);  /* Load FFI library on-demand. */
+-      L->top = restorestack(L, oldtop);
+-    }
++    if (ir->o == IR_KINT64) ctype_loadffi(L);
+ #endif
+     lj_ir_kvalue(L, L->top-2, ir);
+     setintV(L->top-1, (int32_t)irt_type(ir->t));
+@@ -426,7 +422,8 @@ LJLIB_CF(jit_util_ircalladdr)
+ {
+   uint32_t idx = (uint32_t)lj_lib_checkint(L, 1);
+   if (idx < IRCALL__MAX) {
+-    setintptrV(L->top-1, (intptr_t)(void *)lj_ir_callinfo[idx].func);
++    ASMFunction func = lj_ir_callinfo[idx].func;
++    setintptrV(L->top-1, (intptr_t)(void *)lj_ptr_strip(func));
+     return 1;
+   }
+   return 0;
+@@ -652,6 +649,81 @@ JIT_PARAMDEF(JIT_PARAMINIT)
+ #include <sys/utsname.h>
+ #endif
+ 
++#if LJ_TARGET_RISCV64 && LJ_TARGET_POSIX
++#include <setjmp.h>
++#include <signal.h>
++static sigjmp_buf sigbuf = {0};
++static void detect_sigill(int sig)
++{
++  siglongjmp(sigbuf, 1);  /* Unwind to the sigsetjmp() in riscv_probe(). */
++}
++
++static int riscv_compressed(void)
++{
++#if defined(__riscv_c) || defined(__riscv_compressed)
++  /* Don't bother checking for RVC -- would crash before getting here. */
++  return 1;
++#elif defined(__GNUC__)
++  /* c.nop; c.nop; -- expected to raise SIGILL (caught by caller) without RVC. */
++  __asm__(".4byte 0x00010001");
++  return 1;
++#else
++  return 0;
++#endif
++}
++
++static int riscv_zba(void)
++{
++#if defined(__riscv_b) || defined(__riscv_zba)
++  /* Don't bother checking for Zba -- would crash before getting here. */
++  return 1;
++#elif defined(__GNUC__)
++  /* Don't bother verifying the result, just check if the instruction exists. */
++  /* add.uw zero, zero, zero -- expected to raise SIGILL without Zba. */
++  __asm__(".4byte 0x0800003b");
++  return 1;
++#else
++  return 0;
++#endif
++}
++
++static int riscv_zbb(void)
++{
++#if defined(__riscv_b) || defined(__riscv_zbb)
++  /* Don't bother checking for Zbb -- would crash before getting here. */
++  return 1;
++#elif defined(__GNUC__)
++  register int t __asm__("a0");  /* Pin to a0: the raw encoding hardcodes it. */
++  /* addi a0, zero, 255; sext.b a0, a0; -- t is an output so it is defined. */
++  __asm__ __volatile__("addi a0, zero, 255\n\t.4byte 0x60451513" : "=r"(t));
++  return t < 0;
++#else
++  return 0;
++#endif
++}
++
++static int riscv_xthead(void)
++{
++#if defined(__GNUC__)
++  register int t __asm__("a0");  /* Pin to a0: the raw encoding hardcodes it. */
++  /* C906, C910 and C908 all have "xtheadc"; XTheadBb is a subset of it. */
++  /* Therefore assume XThead* are present if XTheadBb is present. */
++  /* addi a0, zero, 255; th.ext a0, a0, 7, 0; -- t is an output operand. */
++  __asm__ __volatile__("addi a0, zero, 255\n\t.4byte 0x1c05250b" : "=r"(t));
++  return t == -1;		/* In case of collision with other vendor extensions. */
++#else
++  return 0;
++#endif
++}
++
++static uint32_t riscv_probe(int (*func)(void), uint32_t flag)
++{
++  /* Nonzero means we got back here via siglongjmp() from the SIGILL handler. */
++  if (sigsetjmp(sigbuf, 1) != 0) return 0;
++  return func() ? flag : 0;
++}
++#endif
++
+ /* Arch-dependent CPU feature detection. */
+ static uint32_t jit_cpudetect(void)
+ {
+@@ -723,6 +795,22 @@ static uint32_t jit_cpudetect(void)
+   }
+ #endif
+ 
++#elif LJ_TARGET_RISCV64
++#if LJ_HASJIT
++  /* SIGILL-based detection of RVC, Zba, Zbb and XThead. Welcome to the future. */
++  struct sigaction old = {0}, act = {0};
++  act.sa_handler = detect_sigill;
++  sigaction(SIGILL, &act, &old);
++  flags |= riscv_probe(riscv_compressed, JIT_F_RVC);
++  flags |= riscv_probe(riscv_zba, JIT_F_RVZba);
++  flags |= riscv_probe(riscv_zbb, JIT_F_RVZbb);
++  flags |= riscv_probe(riscv_xthead, JIT_F_RVXThead);
++  sigaction(SIGILL, &old, NULL);
++
++  /* Detect V/P? */
++  /* V have no hardware available, P not ratified yet. */
++#endif
++
+ #else
+ #error "Missing CPU detection for this architecture"
+ #endif
+@@ -746,7 +834,7 @@ LUALIB_API int luaopen_jit(lua_State *L)
+ #endif
+   lua_pushliteral(L, LJ_OS_NAME);
+   lua_pushliteral(L, LJ_ARCH_NAME);
+-  lua_pushinteger(L, LUAJIT_VERSION_NUM);
++  lua_pushinteger(L, LUAJIT_VERSION_NUM);  /* Deprecated. */
+   lua_pushliteral(L, LUAJIT_VERSION);
+   LJ_LIB_REG(L, LUA_JITLIBNAME, jit);
+ #if LJ_HASPROFILE
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_math.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_math.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_math.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Math library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include <math.h>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_os.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_os.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_os.c
+@@ -1,6 +1,6 @@
+ /*
+ ** OS library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -76,7 +76,7 @@ LJLIB_CF(os_rename)
+ 
+ LJLIB_CF(os_tmpname)
+ {
+-#if LJ_TARGET_PS3 || LJ_TARGET_PS4 || LJ_TARGET_PSVITA
++#if LJ_TARGET_PS3 || LJ_TARGET_PS4 || LJ_TARGET_PS5 || LJ_TARGET_PSVITA || LJ_TARGET_NX
+   lj_err_caller(L, LJ_ERR_OSUNIQF);
+   return 0;
+ #else
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_package.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_package.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_package.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Package library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2012 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -57,7 +57,7 @@ static lua_CFunction ll_sym(lua_State *L
+ 
+ static const char *ll_bcsym(void *lib, const char *sym)
+ {
+-#if defined(RTLD_DEFAULT)
++#if defined(RTLD_DEFAULT) && !defined(NO_RTLD_DEFAULT)
+   if (lib == NULL) lib = RTLD_DEFAULT;
+ #elif LJ_TARGET_OSX || LJ_TARGET_BSD
+   if (lib == NULL) lib = (void *)(intptr_t)-2;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_string.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_string.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_string.c
+@@ -1,6 +1,6 @@
+ /*
+ ** String library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -640,89 +640,14 @@ LJLIB_CF(string_gsub)
+ 
+ /* ------------------------------------------------------------------------ */
+ 
+-/* Emulate tostring() inline. */
+-static GCstr *string_fmt_tostring(lua_State *L, int arg, int retry)
+-{
+-  TValue *o = L->base+arg-1;
+-  cTValue *mo;
+-  lj_assertL(o < L->top, "bad usage");  /* Caller already checks for existence. */
+-  if (LJ_LIKELY(tvisstr(o)))
+-    return strV(o);
+-  if (retry != 2 && !tvisnil(mo = lj_meta_lookup(L, o, MM_tostring))) {
+-    copyTV(L, L->top++, mo);
+-    copyTV(L, L->top++, o);
+-    lua_call(L, 1, 1);
+-    copyTV(L, L->base+arg-1, --L->top);
+-    return NULL;  /* Buffer may be overwritten, retry. */
+-  }
+-  return lj_strfmt_obj(L, o);
+-}
+-
+ LJLIB_CF(string_format)		LJLIB_REC(.)
+ {
+-  int arg, top = (int)(L->top - L->base);
+-  GCstr *fmt;
+-  SBuf *sb;
+-  FormatState fs;
+-  SFormat sf;
+   int retry = 0;
+-again:
+-  arg = 1;
+-  sb = lj_buf_tmp_(L);
+-  fmt = lj_lib_checkstr(L, arg);
+-  lj_strfmt_init(&fs, strdata(fmt), fmt->len);
+-  while ((sf = lj_strfmt_parse(&fs)) != STRFMT_EOF) {
+-    if (sf == STRFMT_LIT) {
+-      lj_buf_putmem(sb, fs.str, fs.len);
+-    } else if (sf == STRFMT_ERR) {
+-      lj_err_callerv(L, LJ_ERR_STRFMT, strdata(lj_str_new(L, fs.str, fs.len)));
+-    } else {
+-      if (++arg > top)
+-	luaL_argerror(L, arg, lj_obj_typename[0]);
+-      switch (STRFMT_TYPE(sf)) {
+-      case STRFMT_INT:
+-	if (tvisint(L->base+arg-1)) {
+-	  int32_t k = intV(L->base+arg-1);
+-	  if (sf == STRFMT_INT)
+-	    lj_strfmt_putint(sb, k);  /* Shortcut for plain %d. */
+-	  else
+-	    lj_strfmt_putfxint(sb, sf, k);
+-	} else {
+-	  lj_strfmt_putfnum_int(sb, sf, lj_lib_checknum(L, arg));
+-	}
+-	break;
+-      case STRFMT_UINT:
+-	if (tvisint(L->base+arg-1))
+-	  lj_strfmt_putfxint(sb, sf, intV(L->base+arg-1));
+-	else
+-	  lj_strfmt_putfnum_uint(sb, sf, lj_lib_checknum(L, arg));
+-	break;
+-      case STRFMT_NUM:
+-	lj_strfmt_putfnum(sb, sf, lj_lib_checknum(L, arg));
+-	break;
+-      case STRFMT_STR: {
+-	GCstr *str = string_fmt_tostring(L, arg, retry);
+-	if (str == NULL)
+-	  retry = 1;
+-	else if ((sf & STRFMT_T_QUOTED))
+-	  lj_strfmt_putquoted(sb, str);  /* No formatting. */
+-	else
+-	  lj_strfmt_putfstr(sb, sf, str);
+-	break;
+-	}
+-      case STRFMT_CHAR:
+-	lj_strfmt_putfchar(sb, sf, lj_lib_checkint(L, arg));
+-	break;
+-      case STRFMT_PTR:  /* No formatting. */
+-	lj_strfmt_putptr(sb, lj_obj_ptr(G(L), L->base+arg-1));
+-	break;
+-      default:
+-	lj_assertL(0, "bad string format type");
+-	break;
+-      }
+-    }
+-  }
+-  if (retry++ == 1) goto again;
++  SBuf *sb;
++  do {
++    sb = lj_buf_tmp_(L);
++    retry = lj_strfmt_putarg(L, sb, 1, -retry);
++  } while (retry > 0);
+   setstrV(L, L->top-1, lj_buf_str(L, sb));
+   lj_gc_check(L);
+   return 1;
+@@ -743,6 +668,9 @@ LUALIB_API int luaopen_string(lua_State
+   setgcref(basemt_it(g, LJ_TSTR), obj2gco(mt));
+   settabV(L, lj_tab_setstr(L, mt, mmname_str(g, MM_index)), tabV(L->top-1));
+   mt->nomm = (uint8_t)(~(1u<<MM_index));
++#if LJ_HASBUFFER
++  lj_lib_prereg(L, LUA_STRLIBNAME ".buffer", luaopen_string_buffer, tabV(L->top-1));
++#endif
+   return 1;
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lib_table.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lib_table.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lib_table.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Table library.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -159,7 +159,7 @@ LJLIB_CF(table_concat)		LJLIB_REC(.)
+   SBuf *sb = lj_buf_tmp_(L);
+   SBuf *sbx = lj_buf_puttab(sb, t, sep, i, e);
+   if (LJ_UNLIKELY(!sbx)) {  /* Error: bad element type. */
+-    int32_t idx = (int32_t)(intptr_t)sbufP(sb);
++    int32_t idx = (int32_t)(intptr_t)sb->w;
+     cTValue *o = lj_tab_getint(t, idx);
+     lj_err_callerv(L, LJ_ERR_TABCAT,
+ 		   lj_obj_itypename[o ? itypemap(o) : ~LJ_TNIL], idx);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_alloc.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_alloc.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_alloc.c
+@@ -330,7 +330,7 @@ static void *mmap_plain(size_t size)
+ #define CALL_MMAP(prng, size)	mmap_plain(size)
+ #endif
+ 
+-#if LJ_64 && !LJ_GC64 && ((defined(__FreeBSD__) && __FreeBSD__ < 10) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4
++#if LJ_64 && !LJ_GC64 && ((defined(__FreeBSD__) && __FreeBSD__ < 10) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4 && !LJ_TARGET_PS5
+ 
+ #include <sys/resource.h>
+ 
+@@ -365,7 +365,7 @@ static void *CALL_MREMAP_(void *ptr, siz
+ #define CALL_MREMAP(addr, osz, nsz, mv) CALL_MREMAP_((addr), (osz), (nsz), (mv))
+ #define CALL_MREMAP_NOMOVE	0
+ #define CALL_MREMAP_MAYMOVE	1
+-#if LJ_64 && !LJ_GC64
++#if LJ_64 && (!LJ_GC64 || LJ_TARGET_ARM64 || LJ_TARGET_RISCV64)
+ #define CALL_MREMAP_MV		CALL_MREMAP_NOMOVE
+ #else
+ #define CALL_MREMAP_MV		CALL_MREMAP_MAYMOVE
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_api.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_api.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_api.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Public Lua/C API.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -104,7 +104,12 @@ LUA_API int lua_checkstack(lua_State *L,
+   if (size > LUAI_MAXCSTACK || (L->top - L->base + size) > LUAI_MAXCSTACK) {
+     return 0;  /* Stack overflow. */
+   } else if (size > 0) {
+-    lj_state_checkstack(L, (MSize)size);
++    int avail = (int)(mref(L->maxstack, TValue) - L->top);
++    if (size > avail &&
++	lj_state_cpgrowstack(L, (MSize)(size - avail)) != LUA_OK) {
++      L->top--;
++      return 0;  /* Out of memory. */
++    }
+   }
+   return 1;
+ }
+@@ -707,36 +712,10 @@ LUA_API void lua_pushboolean(lua_State *
+   incr_top(L);
+ }
+ 
+-#if LJ_64
+-static void *lightud_intern(lua_State *L, void *p)
+-{
+-  global_State *g = G(L);
+-  uint64_t u = (uint64_t)p;
+-  uint32_t up = lightudup(u);
+-  uint32_t *segmap = mref(g->gc.lightudseg, uint32_t);
+-  MSize segnum = g->gc.lightudnum;
+-  if (segmap) {
+-    MSize seg;
+-    for (seg = 0; seg <= segnum; seg++)
+-      if (segmap[seg] == up)  /* Fast path. */
+-	return (void *)(((uint64_t)seg << LJ_LIGHTUD_BITS_LO) | lightudlo(u));
+-    segnum++;
+-  }
+-  if (!((segnum-1) & segnum) && segnum != 1) {
+-    if (segnum >= (1 << LJ_LIGHTUD_BITS_SEG)) lj_err_msg(L, LJ_ERR_BADLU);
+-    lj_mem_reallocvec(L, segmap, segnum, segnum ? 2*segnum : 2u, uint32_t);
+-    setmref(g->gc.lightudseg, segmap);
+-  }
+-  g->gc.lightudnum = segnum;
+-  segmap[segnum] = up;
+-  return (void *)(((uint64_t)segnum << LJ_LIGHTUD_BITS_LO) | lightudlo(u));
+-}
+-#endif
+-
+ LUA_API void lua_pushlightuserdata(lua_State *L, void *p)
+ {
+ #if LJ_64
+-  p = lightud_intern(L, p);
++  p = lj_lightud_intern(L, p);
+ #endif
+   setrawlightudV(L->top, p);
+   incr_top(L);
+@@ -805,7 +784,7 @@ LUA_API void lua_concat(lua_State *L, in
+ 	L->top -= n;
+ 	break;
+       }
+-      n -= (int)(L->top - top);
++      n -= (int)(L->top - (top - 2*LJ_FR2));
+       L->top = top+2;
+       lj_vm_call(L, top, 1+1);
+       L->top -= 1+LJ_FR2;
+@@ -919,11 +898,13 @@ LUA_API int lua_next(lua_State *L, int i
+   cTValue *t = index2adr(L, idx);
+   int more;
+   lj_checkapi(tvistab(t), "stack slot %d is not a table", idx);
+-  more = lj_tab_next(L, tabV(t), L->top-1);
+-  if (more) {
++  more = lj_tab_next(tabV(t), L->top-1, L->top-1);
++  if (more > 0) {
+     incr_top(L);  /* Return new key and value slot. */
+-  } else {  /* End of traversal. */
++  } else if (!more) {  /* End of traversal. */
+     L->top--;  /* Remove key slot. */
++  } else {
++    lj_err_msg(L, LJ_ERR_NEXTIDX);
+   }
+   return more;
+ }
+@@ -1179,7 +1160,7 @@ static TValue *cpcall(lua_State *L, lua_
+   setfuncV(L, top++, fn);
+   if (LJ_FR2) setnilV(top++);
+ #if LJ_64
+-  ud = lightud_intern(L, ud);
++  ud = lj_lightud_intern(L, ud);
+ #endif
+   setrawlightudV(top++, ud);
+   cframe_nres(L->cframe) = 1+0;  /* Zero results. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_arch.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_arch.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_arch.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Target architecture selection.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_ARCH_H
+@@ -31,6 +31,10 @@
+ #define LUAJIT_ARCH_mips32	6
+ #define LUAJIT_ARCH_MIPS64	7
+ #define LUAJIT_ARCH_mips64	7
++#define LUAJIT_ARCH_RISCV32	8
++#define LUAJIT_ARCH_riscv32	8
++#define LUAJIT_ARCH_RISCV64	9
++#define LUAJIT_ARCH_riscv64	9
+ 
+ /* Target OS. */
+ #define LUAJIT_OS_OTHER		0
+@@ -57,7 +61,7 @@
+ #define LUAJIT_TARGET	LUAJIT_ARCH_X64
+ #elif defined(__arm__) || defined(__arm) || defined(__ARM__) || defined(__ARM)
+ #define LUAJIT_TARGET	LUAJIT_ARCH_ARM
+-#elif defined(__aarch64__)
++#elif defined(__aarch64__) || defined(_M_ARM64)
+ #define LUAJIT_TARGET	LUAJIT_ARCH_ARM64
+ #elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC)
+ #define LUAJIT_TARGET	LUAJIT_ARCH_PPC
+@@ -65,8 +69,12 @@
+ #define LUAJIT_TARGET	LUAJIT_ARCH_MIPS64
+ #elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS)
+ #define LUAJIT_TARGET	LUAJIT_ARCH_MIPS32
++#elif defined(__riscv) && __riscv_xlen == 32
++#define LUAJIT_TARGET LUAJIT_ARCH_RISCV32
++#elif defined(__riscv) && __riscv_xlen == 64
++#define LUAJIT_TARGET LUAJIT_ARCH_RISCV64
+ #else
+-#error "No support for this architecture (yet)"
++#error "Architecture not supported (in this version), see: https://luajit.org/status.html#architectures"
+ #endif
+ 
+ #endif
+@@ -83,7 +91,7 @@
+ #define LUAJIT_OS	LUAJIT_OS_OSX
+ #elif (defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || \
+        defined(__NetBSD__) || defined(__OpenBSD__) || \
+-       defined(__DragonFly__)) && !defined(__ORBIS__)
++       defined(__DragonFly__)) && !defined(__ORBIS__) && !defined(__PROSPERO__)
+ #define LUAJIT_OS	LUAJIT_OS_BSD
+ #elif (defined(__sun__) && defined(__svr4__))
+ #define LJ_TARGET_SOLARIS	1
+@@ -93,6 +101,9 @@
+ #elif defined(__CYGWIN__)
+ #define LJ_TARGET_CYGWIN	1
+ #define LUAJIT_OS	LUAJIT_OS_POSIX
++#elif defined(__QNX__)
++#define LJ_TARGET_QNX		1
++#define LUAJIT_OS	LUAJIT_OS_POSIX
+ #else
+ #define LUAJIT_OS	LUAJIT_OS_OTHER
+ #endif
+@@ -139,6 +150,13 @@
+ #define NULL ((void*)0)
+ #endif
+ 
++#ifdef __PROSPERO__
++#define LJ_TARGET_PS5		1
++#define LJ_TARGET_CONSOLE	1
++#undef NULL
++#define NULL ((void*)0)
++#endif
++
+ #ifdef __psp2__
+ #define LJ_TARGET_PSVITA	1
+ #define LJ_TARGET_CONSOLE	1
+@@ -155,6 +173,13 @@
+ #define LJ_TARGET_GC64		1
+ #endif
+ 
++#ifdef __NX__
++#define LJ_TARGET_NX		1
++#define LJ_TARGET_CONSOLE	1
++#undef NULL
++#define NULL ((void*)0)
++#endif
++
+ #ifdef _UWP
+ #define LJ_TARGET_UWP		1
+ #if LUAJIT_TARGET == LUAJIT_ARCH_X64
+@@ -170,14 +195,10 @@
+ #define LJ_ARCH_NAME		"x86"
+ #define LJ_ARCH_BITS		32
+ #define LJ_ARCH_ENDIAN		LUAJIT_LE
+-#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN
+-#define LJ_ABI_WIN		1
+-#else
+-#define LJ_ABI_WIN		0
+-#endif
+ #define LJ_TARGET_X86		1
+ #define LJ_TARGET_X86ORX64	1
+ #define LJ_TARGET_EHRETREG	0
++#define LJ_TARGET_EHRAREG	8
+ #define LJ_TARGET_MASKSHIFT	1
+ #define LJ_TARGET_MASKROT	1
+ #define LJ_TARGET_UNALIGNED	1
+@@ -188,14 +209,10 @@
+ #define LJ_ARCH_NAME		"x64"
+ #define LJ_ARCH_BITS		64
+ #define LJ_ARCH_ENDIAN		LUAJIT_LE
+-#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN
+-#define LJ_ABI_WIN		1
+-#else
+-#define LJ_ABI_WIN		0
+-#endif
+ #define LJ_TARGET_X64		1
+ #define LJ_TARGET_X86ORX64	1
+ #define LJ_TARGET_EHRETREG	0
++#define LJ_TARGET_EHRAREG	16
+ #define LJ_TARGET_JUMPRANGE	31	/* +-2^31 = +-2GB */
+ #define LJ_TARGET_MASKSHIFT	1
+ #define LJ_TARGET_MASKROT	1
+@@ -203,6 +220,8 @@
+ #define LJ_ARCH_NUMMODE		LJ_NUMMODE_SINGLE_DUAL
+ #ifndef LUAJIT_DISABLE_GC64
+ #define LJ_TARGET_GC64		1
++#elif LJ_TARGET_OSX
++#error "macOS requires GC64 -- don't disable it"
+ #endif
+ 
+ #elif LUAJIT_TARGET == LUAJIT_ARCH_ARM
+@@ -219,13 +238,14 @@
+ #define LJ_ABI_EABI		1
+ #define LJ_TARGET_ARM		1
+ #define LJ_TARGET_EHRETREG	0
++#define LJ_TARGET_EHRAREG	14
+ #define LJ_TARGET_JUMPRANGE	25	/* +-2^25 = +-32MB */
+ #define LJ_TARGET_MASKSHIFT	0
+ #define LJ_TARGET_MASKROT	1
+ #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
+ #define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
+ 
+-#if __ARM_ARCH == 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
++#if __ARM_ARCH >= 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
+ #define LJ_ARCH_VERSION		80
+ #elif __ARM_ARCH == 7 || __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__
+ #define LJ_ARCH_VERSION		70
+@@ -247,8 +267,12 @@
+ #define LJ_ARCH_NAME		"arm64"
+ #define LJ_ARCH_ENDIAN		LUAJIT_LE
+ #endif
++#if !defined(LJ_ABI_PAUTH) && defined(__arm64e__)
++#define LJ_ABI_PAUTH		1
++#endif
+ #define LJ_TARGET_ARM64		1
+ #define LJ_TARGET_EHRETREG	0
++#define LJ_TARGET_EHRAREG	30
+ #define LJ_TARGET_JUMPRANGE	27	/* +-2^27 = +-128MB */
+ #define LJ_TARGET_MASKSHIFT	1
+ #define LJ_TARGET_MASKROT	1
+@@ -304,6 +328,7 @@
+ 
+ #define LJ_TARGET_PPC		1
+ #define LJ_TARGET_EHRETREG	3
++#define LJ_TARGET_EHRAREG	65
+ #define LJ_TARGET_JUMPRANGE	25	/* +-2^25 = +-32MB */
+ #define LJ_TARGET_MASKSHIFT	0
+ #define LJ_TARGET_MASKROT	1
+@@ -314,6 +339,7 @@
+ #define LJ_ARCH_NOFFI		1
+ #elif LJ_ARCH_BITS == 64
+ #error "No support for PPC64"
++#undef LJ_TARGET_PPC
+ #endif
+ 
+ #if _ARCH_PWR7
+@@ -406,6 +432,7 @@
+ #endif
+ #define LJ_TARGET_MIPS		1
+ #define LJ_TARGET_EHRETREG	4
++#define LJ_TARGET_EHRAREG	31
+ #define LJ_TARGET_JUMPRANGE	27	/* 2*2^27 = 256MB-aligned region */
+ #define LJ_TARGET_MASKSHIFT	1
+ #define LJ_TARGET_MASKROT	1
+@@ -420,6 +447,30 @@
+ #define LJ_ARCH_VERSION		10
+ #endif
+ 
++#elif LUAJIT_TARGET == LUAJIT_ARCH_RISCV32
++#error "No support for RISC-V 32"
++
++#elif LUAJIT_TARGET == LUAJIT_ARCH_RISCV64
++#if defined(__riscv_float_abi_double)
++
++#define LJ_ARCH_NAME		"riscv64"
++#define LJ_ARCH_BITS		64
++#define LJ_ARCH_ENDIAN		LUAJIT_LE	/* Forget about BE for now */
++#define LJ_TARGET_RISCV64	1
++#define LJ_TARGET_GC64		1
++#define LJ_TARGET_EHRETREG	10
++#define LJ_TARGET_EHRAREG	1
++#define LJ_TARGET_JUMPRANGE	30	/* JAL +-2^20 = +-1MB,\
++        AUIPC+JALR +-2^31 = +-2GB, leave 1 bit to avoid AUIPC corner case */
++#define LJ_TARGET_MASKSHIFT	1
++#define LJ_TARGET_MASKROT	1
++#define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR, no ROLI */
++#define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
++
++#else
++#error "No support for RISC-V 64 Soft-float/Single-float"
++#endif
++
+ #else
+ #error "No target architecture defined"
+ #endif
+@@ -451,11 +502,17 @@
+ #endif
+ #endif
+ #elif !LJ_TARGET_PS3
++#if __clang__
++#if ((__clang_major__ < 3) || ((__clang_major__ == 3) && __clang_minor__ < 5))
++#error "Need at least Clang 3.5 or newer"
++#endif
++#else
+ #if (__GNUC__ < 4) || ((__GNUC__ == 4) && __GNUC_MINOR__ < 3)
+ #error "Need at least GCC 4.3 or newer"
+ #endif
+ #endif
+ #endif
++#endif
+ 
+ /* Check target-specific constraints. */
+ #ifndef _BUILDVM_H
+@@ -466,36 +523,52 @@
+ #elif LJ_TARGET_ARM
+ #if defined(__ARMEB__)
+ #error "No support for big-endian ARM"
++#undef LJ_TARGET_ARM
+ #endif
+ #if __ARM_ARCH_6M__ || __ARM_ARCH_7M__ || __ARM_ARCH_7EM__
+ #error "No support for Cortex-M CPUs"
++#undef LJ_TARGET_ARM
+ #endif
+ #if !(__ARM_EABI__ || LJ_TARGET_IOS)
+ #error "Only ARM EABI or iOS 3.0+ ABI is supported"
++#undef LJ_TARGET_ARM
+ #endif
+ #elif LJ_TARGET_ARM64
+ #if defined(_ILP32)
+ #error "No support for ILP32 model on ARM64"
++#undef LJ_TARGET_ARM64
+ #endif
+ #elif LJ_TARGET_PPC
+ #if defined(_LITTLE_ENDIAN) && (!defined(_BYTE_ORDER) || (_BYTE_ORDER == _LITTLE_ENDIAN))
+ #error "No support for little-endian PPC32"
++#undef LJ_TARGET_PPC
+ #endif
+ #if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT)
+-#error "No support for PPC/e500 anymore (use LuaJIT 2.0)"
++#error "No support for PPC/e500, use LuaJIT 2.0"
++#undef LJ_TARGET_PPC
+ #endif
+ #elif LJ_TARGET_MIPS32
+ #if !((defined(_MIPS_SIM_ABI32) && _MIPS_SIM == _MIPS_SIM_ABI32) || (defined(_ABIO32) && _MIPS_SIM == _ABIO32))
+ #error "Only o32 ABI supported for MIPS32"
++#undef LJ_TARGET_MIPS
+ #endif
+ #if LJ_TARGET_MIPSR6
+ /* Not that useful, since most available r6 CPUs are 64 bit. */
+ #error "No support for MIPS32R6"
++#undef LJ_TARGET_MIPS
+ #endif
+ #elif LJ_TARGET_MIPS64
+ #if !((defined(_MIPS_SIM_ABI64) && _MIPS_SIM == _MIPS_SIM_ABI64) || (defined(_ABI64) && _MIPS_SIM == _ABI64))
+ /* MIPS32ON64 aka n32 ABI support might be desirable, but difficult. */
+ #error "Only n64 ABI supported for MIPS64"
++#undef LJ_TARGET_MIPS
++#endif
++#elif LJ_TARGET_RISCV
++#if !defined(__riscv_float_abi_double)
++#error "Only RISC-V 64 double float supported for now"
++#endif
++#if defined(__riscv_compressed)
++#error "Compressed instructions not supported for now"
+ #endif
+ #endif
+ #endif
+@@ -551,6 +624,13 @@
+ #define LJ_HASFFI		1
+ #endif
+ 
++/* Disable or enable the string buffer extension. */
++#if defined(LUAJIT_DISABLE_BUFFER)
++#define LJ_HASBUFFER		0
++#else
++#define LJ_HASBUFFER		1
++#endif
++
+ #if defined(LUAJIT_DISABLE_PROFILE)
+ #define LJ_HASPROFILE		0
+ #elif LJ_TARGET_POSIX
+@@ -575,6 +655,10 @@
+ #define LJ_SOFTFP		(!LJ_ARCH_HASFPU)
+ #define LJ_SOFTFP32		(LJ_SOFTFP && LJ_32)
+ 
++#ifndef LJ_ABI_PAUTH
++#define LJ_ABI_PAUTH		0
++#endif
++
+ #if LJ_ARCH_ENDIAN == LUAJIT_BE
+ #define LJ_LE			0
+ #define LJ_BE			1
+@@ -611,13 +695,10 @@
+ #define LJ_NO_SYSTEM		1
+ #endif
+ 
+-#if !defined(LUAJIT_NO_UNWIND) && __GNU_COMPACT_EH__
+-/* NYI: no support for compact unwind specification, yet. */
+-#define LUAJIT_NO_UNWIND	1
+-#endif
+-
+-#if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4
+-#define LJ_NO_UNWIND		1
++#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN
++#define LJ_ABI_WIN		1
++#else
++#define LJ_ABI_WIN		0
+ #endif
+ 
+ #if LJ_TARGET_WINDOWS
+@@ -632,6 +713,22 @@ extern void *LJ_WIN_LOADLIBA(const char
+ #endif
+ #endif
+ 
++#if defined(LUAJIT_NO_UNWIND) || __GNU_COMPACT_EH__ || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4 || LJ_TARGET_PS5
++#define LJ_NO_UNWIND		1
++#endif
++
++#if !LJ_NO_UNWIND && !defined(LUAJIT_UNWIND_INTERNAL) && (LJ_ABI_WIN || (defined(LUAJIT_UNWIND_EXTERNAL) && (defined(__GNUC__) || defined(__clang__))))
++#define LJ_UNWIND_EXT		1
++#else
++#define LJ_UNWIND_EXT		0
++#endif
++
++#if LJ_UNWIND_EXT && LJ_HASJIT && !LJ_TARGET_ARM && !(LJ_ABI_WIN && LJ_TARGET_X86)
++#define LJ_UNWIND_JIT		1
++#else
++#define LJ_UNWIND_JIT		0
++#endif
++
+ /* Compatibility with Lua 5.1 vs. 5.2. */
+ #ifdef LUAJIT_ENABLE_LUA52COMPAT
+ #define LJ_52			1
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm.c
+@@ -1,6 +1,6 @@
+ /*
+ ** IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_asm_c
+@@ -11,6 +11,7 @@
+ #if LJ_HASJIT
+ 
+ #include "lj_gc.h"
++#include "lj_buf.h"
+ #include "lj_str.h"
+ #include "lj_tab.h"
+ #include "lj_frame.h"
+@@ -28,6 +29,7 @@
+ #include "lj_dispatch.h"
+ #include "lj_vm.h"
+ #include "lj_target.h"
++#include "lj_prng.h"
+ 
+ #ifdef LUA_USE_ASSERT
+ #include <stdio.h>
+@@ -71,6 +73,8 @@ typedef struct ASMState {
+   IRRef snaprename;	/* Rename highwater mark for snapshot check. */
+   SnapNo snapno;	/* Current snapshot number. */
+   SnapNo loopsnapno;	/* Loop snapshot number. */
++  int snapalloc;	/* Current snapshot needs allocation. */
++  BloomFilter snapfilt1, snapfilt2;	/* Filled with snapshot refs. */
+ 
+   IRRef fuseref;	/* Fusion limit (loopref, 0 or FUSE_DISABLED). */
+   IRRef sectref;	/* Section base reference (loopref or 0). */
+@@ -84,11 +88,18 @@ typedef struct ASMState {
+ 
+   MCode *mcbot;		/* Bottom of reserved MCode. */
+   MCode *mctop;		/* Top of generated MCode. */
++  MCode *mctoporig;	/* Original top of generated MCode. */
+   MCode *mcloop;	/* Pointer to loop MCode (or NULL). */
+   MCode *invmcp;	/* Points to invertible loop branch (or NULL). */
+   MCode *flagmcp;	/* Pending opportunity to merge flag setting ins. */
+   MCode *realign;	/* Realign loop if not NULL. */
+ 
++#ifdef LUAJIT_RANDOM_RA
++  /* Randomize register allocation. OK for fuzz testing, not for production. */
++  uint64_t prngbits;
++  PRNGState prngstate;
++#endif
++
+ #ifdef RID_NUM_KREF
+   intptr_t krefk[RID_NUM_KREF];
+ #endif
+@@ -169,6 +180,41 @@ IRFLDEF(FLOFS)
+   0
+ };
+ 
++#ifdef LUAJIT_RANDOM_RA
++/* Return a fixed number of random bits from the local PRNG state. */
++static uint32_t ra_random_bits(ASMState *as, uint32_t nbits) {
++  uint64_t b = as->prngbits;
++  uint32_t res = (1u << nbits) - 1u;
++  if (b <= res) b = lj_prng_u64(&as->prngstate) | (1ull << 63);
++  res &= (uint32_t)b;
++  as->prngbits = b >> nbits;
++  return res;
++}
++
++/* Pick a random register from a register set. */
++static Reg rset_pickrandom(ASMState *as, RegSet rs)
++{
++  Reg r = rset_pickbot_(rs);
++  rs >>= r;
++  if (rs > 1) {  /* More than one bit set? */
++    while (1) {
++      /* We need to sample max. the GPR or FPR half of the set. */
++      uint32_t d = ra_random_bits(as, RSET_BITS-1);
++      if ((rs >> d) & 1) {
++	r += d;
++	break;
++      }
++    }
++  }
++  return r;
++}
++#define rset_picktop(rs)	rset_pickrandom(as, rs)
++#define rset_pickbot(rs)	rset_pickrandom(as, rs)
++#else
++#define rset_picktop(rs)	rset_picktop_(rs)
++#define rset_pickbot(rs)	rset_pickbot_(rs)
++#endif
++
+ /* -- Target-specific instruction emitter --------------------------------- */
+ 
+ #if LJ_TARGET_X86ORX64
+@@ -181,6 +227,8 @@ IRFLDEF(FLOFS)
+ #include "lj_emit_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "lj_emit_mips.h"
++#elif LJ_TARGET_RISCV64
++#include "lj_emit_riscv.h"
+ #else
+ #error "Missing instruction emitter for target CPU"
+ #endif
+@@ -560,7 +608,11 @@ static Reg ra_allock(ASMState *as, intpt
+ 	IRIns *ir = IR(ref);
+ 	if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) ||
+ #if LJ_GC64
++#if LJ_TARGET_ARM64
++	    (ir->o == IR_KINT && (uint64_t)k == (uint32_t)ir->i) ||
++#else
+ 	    (ir->o == IR_KINT && k == ir->i) ||
++#endif
+ 	    (ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) ||
+ 	    ((ir->o == IR_KPTR || ir->o == IR_KKPTR) &&
+ 	     k == (intptr_t)ir_kptr(ir))
+@@ -694,7 +746,14 @@ static void ra_rename(ASMState *as, Reg
+   RA_DBGX((as, "rename    $f $r $r", regcost_ref(as->cost[up]), down, up));
+   emit_movrr(as, ir, down, up);  /* Backwards codegen needs inverse move. */
+   if (!ra_hasspill(IR(ref)->s)) {  /* Add the rename to the IR. */
+-    ra_addrename(as, down, ref, as->snapno);
++    /*
++    ** The rename is effective at the subsequent (already emitted) exit
++    ** branch. This is for the current snapshot (as->snapno). Except if we
++    ** haven't yet allocated any refs for the snapshot (as->snapalloc == 1),
++    ** then it belongs to the next snapshot.
++    ** See also the discussion at asm_snap_checkrename().
++    */
++    ra_addrename(as, down, ref, as->snapno + as->snapalloc);
+   }
+ }
+ 
+@@ -807,11 +866,11 @@ static void ra_leftov(ASMState *as, Reg
+ }
+ #endif
+ 
+-#if !LJ_64
+ /* Force a RID_RETLO/RID_RETHI destination register pair (marked as free). */
+ static void ra_destpair(ASMState *as, IRIns *ir)
+ {
+   Reg destlo = ir->r, desthi = (ir+1)->r;
++  IRIns *irx = (LJ_64 && !irt_is64(ir->t)) ? ir+1 : ir;
+   /* First spill unrelated refs blocking the destination registers. */
+   if (!rset_test(as->freeset, RID_RETLO) &&
+       destlo != RID_RETLO && desthi != RID_RETLO)
+@@ -835,29 +894,29 @@ static void ra_destpair(ASMState *as, IR
+   /* Check for conflicts and shuffle the registers as needed. */
+   if (destlo == RID_RETHI) {
+     if (desthi == RID_RETLO) {
+-#if LJ_TARGET_X86
++#if LJ_TARGET_X86ORX64
+       *--as->mcp = XI_XCHGa + RID_RETHI;
++      if (LJ_64 && irt_is64(irx->t)) *--as->mcp = 0x48;
+ #else
+-      emit_movrr(as, ir, RID_RETHI, RID_TMP);
+-      emit_movrr(as, ir, RID_RETLO, RID_RETHI);
+-      emit_movrr(as, ir, RID_TMP, RID_RETLO);
++      emit_movrr(as, irx, RID_RETHI, RID_TMP);
++      emit_movrr(as, irx, RID_RETLO, RID_RETHI);
++      emit_movrr(as, irx, RID_TMP, RID_RETLO);
+ #endif
+     } else {
+-      emit_movrr(as, ir, RID_RETHI, RID_RETLO);
+-      if (desthi != RID_RETHI) emit_movrr(as, ir, desthi, RID_RETHI);
++      emit_movrr(as, irx, RID_RETHI, RID_RETLO);
++      if (desthi != RID_RETHI) emit_movrr(as, irx, desthi, RID_RETHI);
+     }
+   } else if (desthi == RID_RETLO) {
+-    emit_movrr(as, ir, RID_RETLO, RID_RETHI);
+-    if (destlo != RID_RETLO) emit_movrr(as, ir, destlo, RID_RETLO);
++    emit_movrr(as, irx, RID_RETLO, RID_RETHI);
++    if (destlo != RID_RETLO) emit_movrr(as, irx, destlo, RID_RETLO);
+   } else {
+-    if (desthi != RID_RETHI) emit_movrr(as, ir, desthi, RID_RETHI);
+-    if (destlo != RID_RETLO) emit_movrr(as, ir, destlo, RID_RETLO);
++    if (desthi != RID_RETHI) emit_movrr(as, irx, desthi, RID_RETHI);
++    if (destlo != RID_RETLO) emit_movrr(as, irx, destlo, RID_RETLO);
+   }
+   /* Restore spill slots (if any). */
+   if (ra_hasspill((ir+1)->s)) ra_save(as, ir+1, RID_RETHI);
+   if (ra_hasspill(ir->s)) ra_save(as, ir, RID_RETLO);
+ }
+-#endif
+ 
+ /* -- Snapshot handling --------- ----------------------------------------- */
+ 
+@@ -892,7 +951,10 @@ static int asm_sunk_store(ASMState *as,
+ static void asm_snap_alloc1(ASMState *as, IRRef ref)
+ {
+   IRIns *ir = IR(ref);
+-  if (!irref_isk(ref) && (!(ra_used(ir) || ir->r == RID_SUNK))) {
++  if (!irref_isk(ref) && ir->r != RID_SUNK) {
++    bloomset(as->snapfilt1, ref);
++    bloomset(as->snapfilt2, hashrot(ref, ref + HASH_BIAS));
++    if (ra_used(ir)) return;
+     if (ir->r == RID_SINK) {
+       ir->r = RID_SUNK;
+ #if LJ_HASFFI
+@@ -947,11 +1009,12 @@ static void asm_snap_alloc1(ASMState *as
+ }
+ 
+ /* Allocate refs escaping to a snapshot. */
+-static void asm_snap_alloc(ASMState *as)
++static void asm_snap_alloc(ASMState *as, int snapno)
+ {
+-  SnapShot *snap = &as->T->snap[as->snapno];
++  SnapShot *snap = &as->T->snap[snapno];
+   SnapEntry *map = &as->T->snapmap[snap->mapofs];
+   MSize n, nent = snap->nent;
++  as->snapfilt1 = as->snapfilt2 = 0;
+   for (n = 0; n < nent; n++) {
+     SnapEntry sn = map[n];
+     IRRef ref = snap_ref(sn);
+@@ -960,7 +1023,7 @@ static void asm_snap_alloc(ASMState *as)
+       if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM)) {
+ 	lj_assertA(irt_type(IR(ref+1)->t) == IRT_SOFTFP,
+ 		   "snap %d[%d] points to bad SOFTFP IR %04d",
+-		   as->snapno, n, ref - REF_BIAS);
++		   snapno, n, ref - REF_BIAS);
+ 	asm_snap_alloc1(as, ref+1);
+       }
+     }
+@@ -976,35 +1039,26 @@ static void asm_snap_alloc(ASMState *as)
+ */
+ static int asm_snap_checkrename(ASMState *as, IRRef ren)
+ {
+-  SnapShot *snap = &as->T->snap[as->snapno];
+-  SnapEntry *map = &as->T->snapmap[snap->mapofs];
+-  MSize n, nent = snap->nent;
+-  for (n = 0; n < nent; n++) {
+-    SnapEntry sn = map[n];
+-    IRRef ref = snap_ref(sn);
+-    if (ref == ren || (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM) && ++ref == ren)) {
+-      IRIns *ir = IR(ref);
+-      ra_spill(as, ir);  /* Register renamed, so force a spill slot. */
+-      RA_DBGX((as, "snaprensp $f $s", ref, ir->s));
+-      return 1;  /* Found. */
+-    }
++  if (bloomtest(as->snapfilt1, ren) &&
++      bloomtest(as->snapfilt2, hashrot(ren, ren + HASH_BIAS))) {
++    IRIns *ir = IR(ren);
++    ra_spill(as, ir);  /* Register renamed, so force a spill slot. */
++    RA_DBGX((as, "snaprensp $f $s", ren, ir->s));
++    return 1;  /* Found. */
+   }
+   return 0;  /* Not found. */
+ }
+ 
+-/* Prepare snapshot for next guard instruction. */
++/* Prepare snapshot for next guard or throwing instruction. */
+ static void asm_snap_prep(ASMState *as)
+ {
+-  if (as->curins < as->snapref) {
+-    do {
+-      if (as->snapno == 0) return;  /* Called by sunk stores before snap #0. */
+-      as->snapno--;
+-      as->snapref = as->T->snap[as->snapno].ref;
+-    } while (as->curins < as->snapref);
+-    asm_snap_alloc(as);
++  if (as->snapalloc) {
++    /* Alloc on first invocation for each snapshot. */
++    as->snapalloc = 0;
++    asm_snap_alloc(as, as->snapno);
+     as->snaprename = as->T->nins;
+   } else {
+-    /* Process any renames above the highwater mark. */
++    /* Check any renames above the highwater mark. */
+     for (; as->snaprename < as->T->nins; as->snaprename++) {
+       IRIns *ir = &as->T->ir[as->snaprename];
+       if (asm_snap_checkrename(as, ir->op1))
+@@ -1013,6 +1067,35 @@ static void asm_snap_prep(ASMState *as)
+   }
+ }
+ 
++/* Move to previous snapshot when we cross the current snapshot ref. */
++static void asm_snap_prev(ASMState *as)
++{
++  if (as->curins < as->snapref) {
++    uintptr_t ofs = (uintptr_t)(as->mctoporig - as->mcp);
++    if (ofs >= 0x10000) lj_trace_err(as->J, LJ_TRERR_MCODEOV);
++    do {
++      if (as->snapno == 0) return;
++      as->snapno--;
++      as->snapref = as->T->snap[as->snapno].ref;
++      as->T->snap[as->snapno].mcofs = (uint16_t)ofs;  /* Remember mcode ofs. */
++    } while (as->curins < as->snapref);  /* May have no ins inbetween. */
++    as->snapalloc = 1;
++  }
++}
++
++/* Fixup snapshot mcode offsets. */
++static void asm_snap_fixup_mcofs(ASMState *as)
++{
++  uint32_t sz = (uint32_t)(as->mctoporig - as->mcp);
++  SnapShot *snap = as->T->snap;
++  SnapNo i;
++  for (i = as->T->nsnap-1; i > 0; i--) {
++    /* Compute offset from mcode start and store in correct snapshot. */
++    snap[i].mcofs = (uint16_t)(sz - snap[i-1].mcofs);
++  }
++  snap[0].mcofs = 0;
++}
++
+ /* -- Miscellaneous helpers ----------------------------------------------- */
+ 
+ /* Calculate stack adjustment. */
+@@ -1057,6 +1140,7 @@ static void asm_snew(ASMState *as, IRIns
+ {
+   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_new];
+   IRRef args[3];
++  asm_snap_prep(as);
+   args[0] = ASMREF_L;  /* lua_State *L    */
+   args[1] = ir->op1;   /* const char *str */
+   args[2] = ir->op2;   /* size_t len      */
+@@ -1069,6 +1153,7 @@ static void asm_tnew(ASMState *as, IRIns
+ {
+   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_new1];
+   IRRef args[2];
++  asm_snap_prep(as);
+   args[0] = ASMREF_L;     /* lua_State *L    */
+   args[1] = ASMREF_TMP1;  /* uint32_t ahsize */
+   as->gcsteps++;
+@@ -1081,6 +1166,7 @@ static void asm_tdup(ASMState *as, IRIns
+ {
+   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_dup];
+   IRRef args[2];
++  asm_snap_prep(as);
+   args[0] = ASMREF_L;  /* lua_State *L    */
+   args[1] = ir->op1;   /* const GCtab *kt */
+   as->gcsteps++;
+@@ -1106,28 +1192,43 @@ static void asm_gcstep(ASMState *as, IRI
+ 
+ /* -- Buffer operations --------------------------------------------------- */
+ 
+-static void asm_tvptr(ASMState *as, Reg dest, IRRef ref);
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode);
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb);
++#endif
+ 
+ static void asm_bufhdr(ASMState *as, IRIns *ir)
+ {
+   Reg sb = ra_dest(as, ir, RSET_GPR);
+-  if ((ir->op2 & IRBUFHDR_APPEND)) {
++  switch (ir->op2) {
++  case IRBUFHDR_RESET: {
++    Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
++    IRIns irbp;
++    irbp.ot = IRT(0, IRT_PTR);  /* Buffer data pointer type. */
++    emit_storeofs(as, &irbp, tmp, sb, offsetof(SBuf, w));
++    emit_loadofs(as, &irbp, tmp, sb, offsetof(SBuf, b));
++    break;
++    }
++  case IRBUFHDR_APPEND: {
+     /* Rematerialize const buffer pointer instead of likely spill. */
+     IRIns *irp = IR(ir->op1);
+     if (!(ra_hasreg(irp->r) || irp == ir-1 ||
+ 	  (irp == ir-2 && !ra_used(ir-1)))) {
+-      while (!(irp->o == IR_BUFHDR && !(irp->op2 & IRBUFHDR_APPEND)))
++      while (!(irp->o == IR_BUFHDR && irp->op2 == IRBUFHDR_RESET))
+ 	irp = IR(irp->op1);
+       if (irref_isk(irp->op1)) {
+ 	ra_weak(as, ra_allocref(as, ir->op1, RSET_GPR));
+ 	ir = irp;
+       }
+     }
+-  } else {
+-    Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
+-    /* Passing ir isn't strictly correct, but it's an IRT_PGC, too. */
+-    emit_storeofs(as, ir, tmp, sb, offsetof(SBuf, p));
+-    emit_loadofs(as, ir, tmp, sb, offsetof(SBuf, b));
++    break;
++    }
++#if LJ_HASBUFFER
++  case IRBUFHDR_WRITE:
++    asm_bufhdr_write(as, sb);
++    break;
++#endif
++  default: lj_assertA(0, "bad BUFHDR op2 %d", ir->op2); break;
+   }
+ #if LJ_TARGET_X86ORX64
+   ra_left(as, sb, ir->op1);
+@@ -1179,7 +1280,7 @@ static void asm_bufput(ASMState *as, IRI
+   if (args[1] == ASMREF_TMP1) {
+     Reg tmp = ra_releasetmp(as, ASMREF_TMP1);
+     if (kchar == -129)
+-      asm_tvptr(as, tmp, irs->op1);
++      asm_tvptr(as, tmp, irs->op1, IRTMPREF_IN1);
+     else
+       ra_allockreg(as, kchar, tmp);
+   }
+@@ -1201,6 +1302,7 @@ static void asm_tostr(ASMState *as, IRIn
+ {
+   const CCallInfo *ci;
+   IRRef args[2];
++  asm_snap_prep(as);
+   args[0] = ASMREF_L;
+   as->gcsteps++;
+   if (ir->op2 == IRTOSTR_NUM) {
+@@ -1216,7 +1318,7 @@ static void asm_tostr(ASMState *as, IRIn
+   asm_setupresult(as, ir, ci);  /* GCstr * */
+   asm_gencall(as, ci, args);
+   if (ir->op2 == IRTOSTR_NUM)
+-    asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op1);
++    asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op1, IRTMPREF_IN1);
+ }
+ 
+ #if LJ_32 && LJ_HASFFI && !LJ_SOFTFP && !LJ_TARGET_X86
+@@ -1257,12 +1359,19 @@ static void asm_newref(ASMState *as, IRI
+   IRRef args[3];
+   if (ir->r == RID_SINK)
+     return;
++  asm_snap_prep(as);
+   args[0] = ASMREF_L;     /* lua_State *L */
+   args[1] = ir->op1;      /* GCtab *t     */
+   args[2] = ASMREF_TMP1;  /* cTValue *key */
+   asm_setupresult(as, ir, ci);  /* TValue * */
+   asm_gencall(as, ci, args);
+-  asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op2);
++  asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op2, IRTMPREF_IN1);
++}
++
++static void asm_tmpref(ASMState *as, IRIns *ir)
++{
++  Reg r = ra_dest(as, ir, RSET_GPR);  /* Receives the TValue pointer. */
++  asm_tvptr(as, r, ir->op1, ir->op2);  /* op1: value ref, op2: mode flags. */
+ }
+ 
+ static void asm_lref(ASMState *as, IRIns *ir)
+@@ -1601,6 +1710,8 @@ static void asm_loop(ASMState *as)
+ #include "lj_asm_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "lj_asm_mips.h"
++#elif LJ_TARGET_RISCV64
++#include "lj_asm_riscv64.h"
+ #else
+ #error "Missing assembler for target CPU"
+ #endif
+@@ -1610,7 +1721,6 @@ static void asm_loop(ASMState *as)
+ #if !LJ_SOFTFP32
+ #if !LJ_TARGET_X86ORX64
+ #define asm_ldexp(as, ir)	asm_callid(as, ir, IRCALL_ldexp)
+-#define asm_fppowi(as, ir)	asm_callid(as, ir, IRCALL_lj_vm_powi)
+ #endif
+ 
+ static void asm_pow(ASMState *as, IRIns *ir)
+@@ -1621,10 +1731,7 @@ static void asm_pow(ASMState *as, IRIns
+ 					  IRCALL_lj_carith_powu64);
+   else
+ #endif
+-  if (irt_isnum(IR(ir->op2)->t))
+-    asm_callid(as, ir, IRCALL_pow);
+-  else
+-    asm_fppowi(as, ir);
++  asm_callid(as, ir, IRCALL_pow);
+ }
+ 
+ static void asm_div(ASMState *as, IRIns *ir)
+@@ -1744,6 +1851,7 @@ static void asm_ir(ASMState *as, IRIns *
+   case IR_NEWREF: asm_newref(as, ir); break;
+   case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
+   case IR_FREF: asm_fref(as, ir); break;
++  case IR_TMPREF: asm_tmpref(as, ir); break;
+   case IR_STRREF: asm_strref(as, ir); break;
+   case IR_LREF: asm_lref(as, ir); break;
+ 
+@@ -1830,6 +1938,8 @@ static void asm_head_side(ASMState *as)
+   IRRef1 sloadins[RID_MAX];
+   RegSet allow = RSET_ALL;  /* Inverse of all coalesced registers. */
+   RegSet live = RSET_EMPTY;  /* Live parent registers. */
++  RegSet pallow = RSET_GPR;  /* Registers needed by the parent stack check. */
++  Reg pbase;
+   IRIns *irp = &as->parent->ir[REF_BASE];  /* Parent base. */
+   int32_t spadj, spdelta;
+   int pass2 = 0;
+@@ -1838,10 +1948,13 @@ static void asm_head_side(ASMState *as)
+ 
+   if (as->snapno && as->topslot > as->parent->topslot) {
+     /* Force snap #0 alloc to prevent register overwrite in stack check. */
+-    as->snapno = 0;
+-    asm_snap_alloc(as);
++    asm_snap_alloc(as, 0);
++  }
++  pbase = asm_head_side_base(as, irp);
++  if (pbase != RID_NONE) {
++    rset_clear(allow, pbase);
++    rset_clear(pallow, pbase);
+   }
+-  allow = asm_head_side_base(as, irp, allow);
+ 
+   /* Scan all parent SLOADs and collect register dependencies. */
+   for (i = as->stopins; i > REF_BASE; i--) {
+@@ -1871,6 +1984,7 @@ static void asm_head_side(ASMState *as)
+       sloadins[rs] = (IRRef1)i;
+       rset_set(live, rs);  /* Block live parent register. */
+     }
++    if (!ra_hasspill(regsp_spill(rs))) rset_clear(pallow, regsp_reg(rs));
+   }
+ 
+   /* Calculate stack frame adjustment. */
+@@ -1987,7 +2101,7 @@ static void asm_head_side(ASMState *as)
+     ExitNo exitno = as->J->exitno;
+ #endif
+     as->T->topslot = (uint8_t)as->topslot;  /* Remember for child traces. */
+-    asm_stack_check(as, as->topslot, irp, allow & RSET_GPR, exitno);
++    asm_stack_check(as, as->topslot, irp, pallow, exitno);
+   }
+ }
+ 
+@@ -2078,6 +2192,9 @@ static void asm_setup_regsp(ASMState *as
+ #endif
+ 
+   ra_setup(as);
++#if LJ_TARGET_ARM64
++  ra_setkref(as, RID_GL, (intptr_t)J2G(as->J));
++#endif
+ 
+   /* Clear reg/sp for constants. */
+   for (ir = IR(T->nk), lastir = IR(REF_BASE); ir < lastir; ir++) {
+@@ -2100,6 +2217,7 @@ static void asm_setup_regsp(ASMState *as
+   as->snaprename = nins;
+   as->snapref = nins;
+   as->snapno = T->nsnap;
++  as->snapalloc = 0;
+ 
+   as->stopins = REF_BASE;
+   as->orignins = nins;
+@@ -2148,6 +2266,10 @@ static void asm_setup_regsp(ASMState *as
+       ir->prev = (uint16_t)REGSP_HINT((rload & 15));
+       rload = lj_ror(rload, 4);
+       continue;
++    case IR_TMPREF:
++      if ((ir->op2 & IRTMPREF_OUT2) && as->evenspill < 4)
++	as->evenspill = 4;  /* TMPREF OUT2 needs two TValues on the stack. */
++      break;
+ #endif
+     case IR_CALLXS: {
+       CCallInfo ci;
+@@ -2157,7 +2279,17 @@ static void asm_setup_regsp(ASMState *as
+ 	as->modset |= RSET_SCRATCH;
+       continue;
+       }
+-    case IR_CALLN: case IR_CALLA: case IR_CALLL: case IR_CALLS: {
++    case IR_CALLL:
++      /* lj_vm_next needs two TValues on the stack. */
++#if LJ_TARGET_X64 && LJ_ABI_WIN
++      if (ir->op2 == IRCALL_lj_vm_next && as->evenspill < SPS_FIRST + 4)
++	as->evenspill = SPS_FIRST + 4;
++#else
++      if (SPS_FIRST < 4 && ir->op2 == IRCALL_lj_vm_next && as->evenspill < 4)
++	as->evenspill = 4;
++#endif
++      /* fallthrough */
++    case IR_CALLN: case IR_CALLA: case IR_CALLS: {
+       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
+       ir->prev = asm_setup_call_slots(as, ir, ci);
+       if (inloop)
+@@ -2165,7 +2297,6 @@ static void asm_setup_regsp(ASMState *as
+ 		      (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
+       continue;
+       }
+-#if LJ_SOFTFP || (LJ_32 && LJ_HASFFI)
+     case IR_HIOP:
+       switch ((ir-1)->o) {
+ #if LJ_SOFTFP && LJ_TARGET_ARM
+@@ -2176,7 +2307,7 @@ static void asm_setup_regsp(ASMState *as
+ 	}
+ 	break;
+ #endif
+-#if !LJ_SOFTFP && LJ_NEED_FP64
++#if !LJ_SOFTFP && LJ_NEED_FP64 && LJ_32 && LJ_HASFFI
+       case IR_CONV:
+ 	if (irt_isfp((ir-1)->t)) {
+ 	  ir->prev = REGSP_HINT(RID_FPRET);
+@@ -2184,7 +2315,7 @@ static void asm_setup_regsp(ASMState *as
+ 	}
+ #endif
+       /* fallthrough */
+-      case IR_CALLN: case IR_CALLXS:
++      case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
+ #if LJ_SOFTFP
+       case IR_MIN: case IR_MAX:
+ #endif
+@@ -2195,7 +2326,6 @@ static void asm_setup_regsp(ASMState *as
+ 	break;
+       }
+       break;
+-#endif
+ #if LJ_SOFTFP
+     case IR_MIN: case IR_MAX:
+       if ((ir+1)->o != IR_HIOP) break;
+@@ -2250,13 +2380,23 @@ static void asm_setup_regsp(ASMState *as
+       }
+       /* fallthrough */ /* for integer POW */
+     case IR_DIV: case IR_MOD:
+-      if (!irt_isnum(ir->t)) {
++      if ((LJ_64 && LJ_SOFTFP) || !irt_isnum(ir->t)) {
++	ir->prev = REGSP_HINT(RID_RET);
++	if (inloop)
++	  as->modset |= (RSET_SCRATCH & RSET_GPR);
++	continue;
++      }
++      break;
++#if LJ_64 && LJ_SOFTFP
++    case IR_ADD: case IR_SUB: case IR_MUL:
++      if (irt_isnum(ir->t)) {
+ 	ir->prev = REGSP_HINT(RID_RET);
+ 	if (inloop)
+ 	  as->modset |= (RSET_SCRATCH & RSET_GPR);
+ 	continue;
+       }
+       break;
++#endif
+     case IR_FPMATH:
+ #if LJ_TARGET_X86ORX64
+       if (ir->op2 <= IRFPM_TRUNC) {
+@@ -2327,7 +2467,6 @@ void lj_asm_trace(jit_State *J, GCtrace
+ {
+   ASMState as_;
+   ASMState *as = &as_;
+-  MCode *origtop;
+ 
+   /* Remove nops/renames left over from ASM restart due to LJ_TRERR_MCODELM. */
+   {
+@@ -2353,9 +2492,12 @@ void lj_asm_trace(jit_State *J, GCtrace
+   as->realign = NULL;
+   as->loopinv = 0;
+   as->parent = J->parent ? traceref(J, J->parent) : NULL;
++#ifdef LUAJIT_RANDOM_RA
++  (void)lj_prng_u64(&J2G(J)->prng);  /* Ensure PRNG step between traces. */
++#endif
+ 
+   /* Reserve MCode memory. */
+-  as->mctop = origtop = lj_mcode_reserve(J, &as->mcbot);
++  as->mctop = as->mctoporig = lj_mcode_reserve(J, &as->mcbot);
+   as->mcp = as->mctop;
+   as->mclim = as->mcbot + MCLIM_REDZONE;
+   asm_setup_target(as);
+@@ -2394,6 +2536,10 @@ void lj_asm_trace(jit_State *J, GCtrace
+ #endif
+     as->ir = J->curfinal->ir;  /* Use the copied IR. */
+     as->curins = J->cur.nins = as->orignins;
++#ifdef LUAJIT_RANDOM_RA
++    as->prngstate = J2G(J)->prng;  /* Must (re)start from identical state. */
++    as->prngbits = 0;
++#endif
+ 
+     RA_DBG_START();
+     RA_DBGX((as, "===== STOP ====="));
+@@ -2417,6 +2563,7 @@ void lj_asm_trace(jit_State *J, GCtrace
+       lj_assertA(!(LJ_32 && irt_isint64(ir->t)),
+ 		 "IR %04d has unsplit 64 bit type",
+ 		 (int)(ir - as->ir) - REF_BIAS);
++      asm_snap_prev(as);
+       if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
+ 	continue;  /* Dead-code elimination can be soooo easy. */
+       if (irt_isguard(ir->t))
+@@ -2450,6 +2597,9 @@ void lj_asm_trace(jit_State *J, GCtrace
+       memcpy(J->curfinal->ir + as->orignins, T->ir + as->orignins,
+ 	     (T->nins - as->orignins) * sizeof(IRIns));  /* Copy RENAMEs. */
+       T->nins = J->curfinal->nins;
++      /* Fill mcofs of any unprocessed snapshots. */
++      as->curins = REF_FIRST;
++      asm_snap_prev(as);
+       break;  /* Done. */
+     }
+ 
+@@ -2468,13 +2618,16 @@ void lj_asm_trace(jit_State *J, GCtrace
+   /* Set trace entry point before fixing up tail to allow link to self. */
+   T->mcode = as->mcp;
+   T->mcloop = as->mcloop ? (MSize)((char *)as->mcloop - (char *)as->mcp) : 0;
+-  if (!as->loopref)
++  if (as->loopref)
++    asm_loop_tail_fixup(as);
++  else
+     asm_tail_fixup(as, T->link);  /* Note: this may change as->mctop! */
+   T->szmcode = (MSize)((char *)as->mctop - (char *)as->mcp);
++  asm_snap_fixup_mcofs(as);
+ #if LJ_TARGET_MCODE_FIXUP
+   asm_mcode_fixup(T->mcode, T->szmcode);
+ #endif
+-  lj_mcode_sync(T->mcode, origtop);
++  lj_mcode_sync(T->mcode, as->mctoporig);
+ }
+ 
+ #undef IR
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_ASM_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_arm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm_arm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_arm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** ARM IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Register allocator extensions --------------------------------------- */
+@@ -185,6 +185,9 @@ static Reg asm_fuseahuref(ASMState *as,
+ 	*ofsp = (ofs & 255);  /* Mask out less bits to allow LDRD. */
+ 	return ra_allock(as, (ofs & ~255), allow);
+       }
++    } else if (ir->o == IR_TMPREF) {
++      *ofsp = 0;
++      return RID_SP;
+     }
+   }
+   *ofsp = 0;
+@@ -310,7 +313,11 @@ static void asm_fusexref(ASMState *as, A
+ }
+ 
+ #if !LJ_SOFTFP
+-/* Fuse to multiply-add/sub instruction. */
++/*
++** Fuse to multiply-add/sub instruction.
++** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA.
++** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets.
++*/
+ static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
+ {
+   IRRef lref = ir->op1, rref = ir->op2;
+@@ -498,6 +505,30 @@ static void asm_retf(ASMState *as, IRIns
+   emit_lso(as, ARMI_LDR, RID_TMP, base, -4);
+ }
+ 
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{  /* Emitted bottom-up: load L, load cur_L, merge flag bits, store L. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));  /* Scratch != sb. */
++  IRIns irgc;  /* Dummy ins: only its type is set, for the load/store. */
++  int32_t addr = i32ptr((void *)&J2G(as->J)->cur_L);  /* Address of cur_L. */
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, RID_TMP, sb, offsetof(SBuf, L));  /* Store SBuf.L. */
++  if ((as->flags & JIT_F_ARMV6T2)) {  /* BFI replaces AND+ORR below. */
++    emit_dnm(as, ARMI_BFI, RID_TMP, lj_fls(SBUF_MASK_FLAG), tmp);
++  } else {  /* RID_TMP |= (tmp & SBUF_MASK_FLAG) */
++    emit_dnm(as, ARMI_ORR, RID_TMP, RID_TMP, tmp);
++    emit_dn(as, ARMI_AND|ARMI_K12|SBUF_MASK_FLAG, tmp, tmp);
++  }
++  emit_lso(as, ARMI_LDR, RID_TMP,
++	   ra_allock(as, (addr & ~4095),
++		     rset_exclude(rset_exclude(RSET_GPR, sb), tmp)),
++	   (addr & 4095));  /* Load cur_L into RID_TMP. */
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));  /* tmp = SBuf.L. */
++}
++#endif
++
+ /* -- Type conversions ---------------------------------------------------- */
+ 
+ #if !LJ_SOFTFP
+@@ -666,35 +697,55 @@ static void asm_strto(ASMState *as, IRIn
+ /* -- Memory references --------------------------------------------------- */
+ 
+ /* Get pointer to TValue. */
+-static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)
+ {
+-  IRIns *ir = IR(ref);
+-  if (irt_isnum(ir->t)) {
+-    if (irref_isk(ref)) {
+-      /* Use the number constant itself as a TValue. */
+-      ra_allockreg(as, i32ptr(ir_knum(ir)), dest);
+-    } else {
++  if ((mode & IRTMPREF_IN1)) {
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if ((mode & IRTMPREF_OUT1)) {
+ #if LJ_SOFTFP
+-      lj_assertA(0, "unsplit FP op");
++	lj_assertA(irref_isk(ref), "unsplit FP op");
++	emit_dm(as, ARMI_MOV, dest, RID_SP);
++	emit_lso(as, ARMI_STR,
++		 ra_allock(as, (int32_t)ir_knum(ir)->u32.lo, RSET_GPR),
++		 RID_SP, 0);
++	emit_lso(as, ARMI_STR,
++		 ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, RSET_GPR),
++		 RID_SP, 4);
+ #else
+-      /* Otherwise force a spill and use the spill slot. */
+-      emit_opk(as, ARMI_ADD, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
++	Reg src = ra_alloc1(as, ref, RSET_FPR);
++	emit_dm(as, ARMI_MOV, dest, RID_SP);
++	emit_vlso(as, ARMI_VSTR_D, src, RID_SP, 0);
++#endif
++      } else if (irref_isk(ref)) {
++	/* Use the number constant itself as a TValue. */
++	ra_allockreg(as, i32ptr(ir_knum(ir)), dest);
++      } else {
++#if LJ_SOFTFP
++	lj_assertA(0, "unsplit FP op");
++#else
++	/* Otherwise force a spill and use the spill slot. */
++	emit_opk(as, ARMI_ADD, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
+ #endif
++      }
++    } else {
++      /* Otherwise use [sp] and [sp+4] to hold the TValue.
++      ** This assumes the following call has max. 4 args.
++      */
++      Reg type;
++      emit_dm(as, ARMI_MOV, dest, RID_SP);
++      if (!irt_ispri(ir->t)) {
++	Reg src = ra_alloc1(as, ref, RSET_GPR);
++	emit_lso(as, ARMI_STR, src, RID_SP, 0);
++      }
++      if (LJ_SOFTFP && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t))
++	type = ra_alloc1(as, ref+1, RSET_GPR);
++      else
++	type = ra_allock(as, irt_toitype(ir->t), RSET_GPR);
++      emit_lso(as, ARMI_STR, type, RID_SP, 4);
+     }
+   } else {
+-    /* Otherwise use [sp] and [sp+4] to hold the TValue. */
+-    RegSet allow = rset_exclude(RSET_GPR, dest);
+-    Reg type;
+     emit_dm(as, ARMI_MOV, dest, RID_SP);
+-    if (!irt_ispri(ir->t)) {
+-      Reg src = ra_alloc1(as, ref, allow);
+-      emit_lso(as, ARMI_STR, src, RID_SP, 0);
+-    }
+-    if (LJ_SOFTFP && (ir+1)->o == IR_HIOP)
+-      type = ra_alloc1(as, ref+1, allow);
+-    else
+-      type = ra_allock(as, irt_toitype(ir->t), allow);
+-    emit_lso(as, ARMI_STR, type, RID_SP, 4);
+   }
+ }
+ 
+@@ -918,24 +969,32 @@ static void asm_hrefk(ASMState *as, IRIn
+ static void asm_uref(ASMState *as, IRIns *ir)
+ {
+   Reg dest = ra_dest(as, ir, RSET_GPR);
+-  if (irref_isk(ir->op1)) {
++  int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
++  if (irref_isk(ir->op1) && !guarded) {
+     GCfunc *fn = ir_kfunc(IR(ir->op1));
+     MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+     emit_lsptr(as, ARMI_LDR, dest, v);
+   } else {
+-    Reg uv = ra_scratch(as, RSET_GPR);
+-    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+-    if (ir->o == IR_UREFC) {
+-      asm_guardcc(as, CC_NE);
++    if (guarded) {
++      asm_guardcc(as, ir->o == IR_UREFC ? CC_NE : CC_EQ);
+       emit_n(as, ARMI_CMP|ARMI_K12|1, RID_TMP);
+-      emit_opk(as, ARMI_ADD, dest, uv,
++    }
++    if (ir->o == IR_UREFC)
++      emit_opk(as, ARMI_ADD, dest, dest,
+ 	       (int32_t)offsetof(GCupval, tv), RSET_GPR);
+-      emit_lso(as, ARMI_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
++    else
++      emit_lso(as, ARMI_LDR, dest, dest, (int32_t)offsetof(GCupval, v));
++    if (guarded)
++      emit_lso(as, ARMI_LDRB, RID_TMP, dest,
++	       (int32_t)offsetof(GCupval, closed));
++    if (irref_isk(ir->op1)) {
++      GCfunc *fn = ir_kfunc(IR(ir->op1));
++      int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
++      emit_loadi(as, dest, k);
+     } else {
+-      emit_lso(as, ARMI_LDR, dest, uv, (int32_t)offsetof(GCupval, v));
++      emit_lso(as, ARMI_LDR, dest, ra_alloc1(as, ir->op1, RSET_GPR),
++	       (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+     }
+-    emit_lso(as, ARMI_LDR, uv, func,
+-	     (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+   }
+ }
+ 
+@@ -1086,6 +1145,7 @@ static void asm_ahuvload(ASMState *as, I
+   }
+   idx = asm_fuseahuref(as, ir->op1, &ofs, allow,
+ 		       (!LJ_SOFTFP && t == IRT_NUM) ? 1024 : 4096);
++  if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;
+   if (!hiop || type == RID_NONE) {
+     rset_clear(allow, idx);
+     if (ofs < 256 && ra_hasreg(dest) && (dest & 1) == 0 &&
+@@ -1202,7 +1262,12 @@ dotypecheck:
+       }
+     }
+     asm_guardcc(as, t == IRT_NUM ? CC_HS : CC_NE);
+-    emit_n(as, ARMI_CMN|ARMI_K12|-irt_toitype_(t), type);
++    if ((ir->op2 & IRSLOAD_KEYINDEX)) {
++      emit_n(as, ARMI_CMN|ARMI_K12|1, type);
++      emit_dn(as, ARMI_EOR^emit_isk12(ARMI_EOR, ~LJ_KEYINDEX), type, type);
++    } else {
++      emit_n(as, ARMI_CMN|ARMI_K12|-irt_toitype_(t), type);
++    }
+   }
+   if (ra_hasreg(dest)) {
+ #if !LJ_SOFTFP
+@@ -1837,15 +1902,15 @@ static void asm_int64comp(ASMState *as,
+ }
+ #endif
+ 
+-/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
++/* -- Split register ops -------------------------------------------------- */
+ 
+-/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++/* Hiword op of a split 32/32 bit op. Previous op is the loword op. */
+ static void asm_hiop(ASMState *as, IRIns *ir)
+ {
+-#if LJ_HASFFI || LJ_SOFTFP
+   /* HIOP is marked as a store because it needs its own DCE logic. */
+   int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
+   if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
++#if LJ_HASFFI || LJ_SOFTFP
+   if ((ir-1)->o <= IR_NE) {  /* 64 bit integer or FP comparisons. ORDER IR. */
+     as->curins--;  /* Always skip the loword comparison. */
+ #if LJ_SOFTFP
+@@ -1876,6 +1941,7 @@ static void asm_hiop(ASMState *as, IRIns
+       asm_xstore_(as, ir, 4);
+     return;
+   }
++#endif
+   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
+   switch ((ir-1)->o) {
+ #if LJ_HASFFI
+@@ -1894,6 +1960,9 @@ static void asm_hiop(ASMState *as, IRIns
+     asm_intneg(as, ir, ARMI_RSC);
+     asm_intneg(as, ir-1, ARMI_RSB|ARMI_S);
+     break;
++  case IR_CNEWI:
++    /* Nothing to do here. Handled by lo op itself. */
++    break;
+ #endif
+ #if LJ_SOFTFP
+   case IR_SLOAD: case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
+@@ -1901,25 +1970,16 @@ static void asm_hiop(ASMState *as, IRIns
+     if (!uselo)
+       ra_allocref(as, ir->op1, RSET_GPR);  /* Mark lo op as used. */
+     break;
++  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR: case IR_TMPREF:
++    /* Nothing to do here. Handled by lo op itself. */
++    break;
+ #endif
+-  case IR_CALLN:
+-  case IR_CALLS:
+-  case IR_CALLXS:
++  case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
+     if (!uselo)
+       ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
+     break;
+-#if LJ_SOFTFP
+-  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR:
+-#endif
+-  case IR_CNEWI:
+-    /* Nothing to do here. Handled by lo op itself. */
+-    break;
+   default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
+   }
+-#else
+-  /* Unused without SOFTFP or FFI. */
+-  UNUSED(as); UNUSED(ir); lj_assertA(0, "unexpected HIOP");
+-#endif
+ }
+ 
+ /* -- Profiling ----------------------------------------------------------- */
+@@ -1938,6 +1998,7 @@ static void asm_prof(ASMState *as, IRIns
+ static void asm_stack_check(ASMState *as, BCReg topslot,
+ 			    IRIns *irp, RegSet allow, ExitNo exitno)
+ {
++  int savereg = 0;
+   Reg pbase;
+   uint32_t k;
+   if (irp) {
+@@ -1948,12 +2009,14 @@ static void asm_stack_check(ASMState *as
+       pbase = rset_pickbot(allow);
+     } else {
+       pbase = RID_RET;
+-      emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0);  /* Restore temp. register. */
++      savereg = 1;
+     }
+   } else {
+     pbase = RID_BASE;
+   }
+   emit_branch(as, ARMF_CC(ARMI_BL, CC_LS), exitstub_addr(as->J, exitno));
++  if (savereg)
++    emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0);  /* Restore temp. register. */
+   k = emit_isk12(0, (int32_t)(8*topslot));
+   lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot);
+   emit_n(as, ARMI_CMP^k, RID_TMP);
+@@ -1965,7 +2028,7 @@ static void asm_stack_check(ASMState *as
+     if (ra_hasspill(irp->s))
+       emit_lso(as, ARMI_LDR, pbase, RID_SP, sps_scale(irp->s));
+     emit_lso(as, ARMI_LDR, RID_TMP, RID_TMP, (i & 4095));
+-    if (ra_hasspill(irp->s) && !allow)
++    if (savereg)
+       emit_lso(as, ARMI_STR, RID_RET, RID_SP, 0);  /* Save temp. register. */
+     emit_loadi(as, RID_TMP, (i & ~4095));
+   } else {
+@@ -2021,6 +2084,8 @@ static void asm_stack_restore(ASMState *
+       } else if ((sn & SNAP_SOFTFPNUM)) {
+ 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPRODD, RID_BASE));
+ #endif
++      } else if ((sn & SNAP_KEYINDEX)) {
++	type = ra_allock(as, (int32_t)LJ_KEYINDEX, odd);
+       } else {
+ 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), odd);
+       }
+@@ -2082,6 +2147,12 @@ static void asm_loop_fixup(ASMState *as)
+   }
+ }
+ 
++/* Fixup the tail of the loop. Called from lj_asm_trace() if as->loopref. */
++static void asm_loop_tail_fixup(ASMState *as)
++{
++  UNUSED(as);  /* Nothing to do on ARM. */
++}
++
+ /* -- Head of trace ------------------------------------------------------- */
+ 
+ /* Reload L register from g->cur_L. */
+@@ -2107,7 +2178,7 @@ static void asm_head_root_base(ASMState
+ }
+ 
+ /* Coalesce BASE register for a side trace. */
+-static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
++static Reg asm_head_side_base(ASMState *as, IRIns *irp)
+ {
+   IRIns *ir;
+   asm_head_lreg(as);
+@@ -2115,16 +2186,15 @@ static RegSet asm_head_side_base(ASMStat
+   if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+     ra_spill(as, ir);
+   if (ra_hasspill(irp->s)) {
+-    rset_clear(allow, ra_dest(as, ir, allow));
++    return ra_dest(as, ir, RSET_GPR);
+   } else {
+     Reg r = irp->r;
+     lj_assertA(ra_hasreg(r), "base reg lost");
+-    rset_clear(allow, r);
+     if (r != ir->r && !rset_test(as->freeset, r))
+       ra_restore(as, regcost_ref(as->cost[r]));
+     ra_destreg(as, ir, r);
++    return r;
+   }
+-  return allow;
+ }
+ 
+ /* -- Tail of trace ------------------------------------------------------- */
+@@ -2193,7 +2263,7 @@ static Reg asm_setup_call_slots(ASMState
+   }
+   if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
+     as->evenspill = nslots;
+-  return REGSP_HINT(RID_RET);
++  return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);
+ }
+ 
+ static void asm_setup_target(ASMState *as)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_arm64.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm_arm64.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_arm64.h
+@@ -1,6 +1,6 @@
+ /*
+ ** ARM64 IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+ ** Sponsored by Cisco Systems, Inc.
+@@ -84,18 +84,23 @@ static void asm_guardcc(ASMState *as, A6
+   emit_cond_branch(as, cc, target);
+ }
+ 
+-/* Emit test and branch instruction to exit for guard. */
+-static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
++/* Emit test and branch instruction to exit for guard, if in range. */
++static int asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
+ {
+   MCode *target = asm_exitstub_addr(as, as->snapno);
+   MCode *p = as->mcp;
++  ptrdiff_t delta = target - p;
+   if (LJ_UNLIKELY(p == as->invmcp)) {
++    if (as->orignins > 1023) return 0;  /* Delta might end up too large. */
+     as->loopinv = 1;
+-    *p = A64I_B | A64F_S26(target-p);
+-    emit_tnb(as, ai^0x01000000u, r, bit, p-1);
+-    return;
++    *p = A64I_B | A64F_S26(delta);
++    ai ^= 0x01000000u;
++    target = p-1;
++  } else if (LJ_UNLIKELY(delta >= 0x1fff)) {
++    return 0;
+   }
+   emit_tnb(as, ai, r, bit, target);
++  return 1;
+ }
+ 
+ /* Emit compare and branch instruction to exit for guard. */
+@@ -198,6 +203,9 @@ static Reg asm_fuseahuref(ASMState *as,
+ 	  return RID_GL;
+ 	}
+       }
++    } else if (ir->o == IR_TMPREF) {
++      *ofsp = (int32_t)glofs(as, &J2G(as->J)->tmptv);
++      return RID_GL;
+     }
+   }
+   *ofsp = 0;
+@@ -208,16 +216,14 @@ static Reg asm_fuseahuref(ASMState *as,
+ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
+ {
+   IRIns *ir = IR(ref);
++  int logical = (ai & 0x1f000000) == 0x0a000000;
+   if (ra_hasreg(ir->r)) {
+     ra_noweak(as, ir->r);
+     return A64F_M(ir->r);
+   } else if (irref_isk(ref)) {
+-    uint32_t m;
+     int64_t k = get_k64val(as, ref);
+-    if ((ai & 0x1f000000) == 0x0a000000)
+-      m = emit_isk13(k, irt_is64(ir->t));
+-    else
+-      m = emit_isk12(k);
++    uint32_t m = logical ? emit_isk13(k, irt_is64(ir->t)) :
++			   emit_isk12(irt_is64(ir->t) ? k : (int32_t)k);
+     if (m)
+       return m;
+   } else if (mayfuse(as, ref)) {
+@@ -229,7 +235,7 @@ static uint32_t asm_fuseopm(ASMState *as
+ 		    (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
+       IRIns *irl = IR(ir->op1);
+       if (sh == A64SH_LSL &&
+-	  irl->o == IR_CONV &&
++	  irl->o == IR_CONV && !logical &&
+ 	  irl->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT) &&
+ 	  shift <= 4 &&
+ 	  canfuse(as, irl)) {
+@@ -239,7 +245,11 @@ static uint32_t asm_fuseopm(ASMState *as
+ 	Reg m = ra_alloc1(as, ir->op1, allow);
+ 	return A64F_M(m) | A64F_SH(sh, shift);
+       }
+-    } else if (ir->o == IR_CONV &&
++    } else if (ir->o == IR_BROR && logical && irref_isk(ir->op2)) {
++      Reg m = ra_alloc1(as, ir->op1, allow);
++      int shift = (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
++      return A64F_M(m) | A64F_SH(A64SH_ROR, shift);
++    } else if (ir->o == IR_CONV && !logical &&
+ 	       ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)) {
+       Reg m = ra_alloc1(as, ir->op1, allow);
+       return A64F_M(m) | A64F_EX(A64EX_SXTW);
+@@ -334,7 +344,8 @@ static int asm_fusemadd(ASMState *as, IR
+ {
+   IRRef lref = ir->op1, rref = ir->op2;
+   IRIns *irm;
+-  if (lref != rref &&
++  if ((as->flags & JIT_F_OPT_FMA) &&
++      lref != rref &&
+       ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
+        ra_noreg(irm->r)) ||
+        (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
+@@ -415,13 +426,18 @@ static int asm_fuseorshift(ASMState *as,
+ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
+ {
+   uint32_t n, nargs = CCI_XNARGS(ci);
+-  int32_t ofs = 0;
++  int32_t spofs = 0, spalign = LJ_HASFFI && LJ_TARGET_OSX ? 0 : 7;
+   Reg gpr, fpr = REGARG_FIRSTFPR;
+-  if ((void *)ci->func)
+-    emit_call(as, (void *)ci->func);
++  if (ci->func)
++    emit_call(as, ci->func);
+   for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
+     as->cost[gpr] = REGCOST(~0u, ASMREF_L);
+   gpr = REGARG_FIRSTGPR;
++#if LJ_HASFFI && LJ_ABI_WIN
++  if ((ci->flags & CCI_VARARG)) {
++    fpr = REGARG_LASTFPR+1;
++  }
++#endif
+   for (n = 0; n < nargs; n++) { /* Setup args. */
+     IRRef ref = args[n];
+     IRIns *ir = IR(ref);
+@@ -432,10 +448,21 @@ static void asm_gencall(ASMState *as, co
+ 		     "reg %d not free", fpr);  /* Must have been evicted. */
+ 	  ra_leftov(as, fpr, ref);
+ 	  fpr++;
++#if LJ_HASFFI && LJ_ABI_WIN
++	} else if ((ci->flags & CCI_VARARG) && (gpr <= REGARG_LASTGPR)) {
++	  Reg rf = ra_alloc1(as, ref, RSET_FPR);
++	  emit_dn(as, A64I_FMOV_R_D, gpr++, rf & 31);
++#endif
+ 	} else {
+ 	  Reg r = ra_alloc1(as, ref, RSET_FPR);
+-	  emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_isnum(ir->t)) ? 4 : 0));
+-	  ofs += 8;
++	  int32_t al = spalign;
++#if LJ_HASFFI && LJ_TARGET_OSX
++	  al |= irt_isnum(ir->t) ? 7 : 3;
++#endif
++	  spofs = (spofs + al) & ~al;
++	  if (LJ_BE && al >= 7 && !irt_isnum(ir->t)) spofs += 4, al -= 4;
++	  emit_spstore(as, ir, r, spofs);
++	  spofs += al + 1;
+ 	}
+       } else {
+ 	if (gpr <= REGARG_LASTGPR) {
+@@ -445,10 +472,27 @@ static void asm_gencall(ASMState *as, co
+ 	  gpr++;
+ 	} else {
+ 	  Reg r = ra_alloc1(as, ref, RSET_GPR);
+-	  emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_is64(ir->t)) ? 4 : 0));
+-	  ofs += 8;
++	  int32_t al = spalign;
++#if LJ_HASFFI && LJ_TARGET_OSX
++	  al |= irt_size(ir->t) - 1;
++#endif
++	  spofs = (spofs + al) & ~al;
++	  if (al >= 3) {
++	    if (LJ_BE && al >= 7 && !irt_is64(ir->t)) spofs += 4, al -= 4;
++	    emit_spstore(as, ir, r, spofs);
++	  } else {
++	    lj_assertA(al == 0 || al == 1, "size %d unexpected", al + 1);
++	    emit_lso(as, al ? A64I_STRH : A64I_STRB, r, RID_SP, spofs);
++	  }
++	  spofs += al + 1;
+ 	}
+       }
++#if LJ_HASFFI && LJ_TARGET_OSX
++    } else {  /* Marker for start of varargs. */
++      gpr = REGARG_LASTGPR+1;
++      fpr = REGARG_LASTFPR+1;
++      spalign = 7;
++#endif
+     }
+   }
+ }
+@@ -457,8 +501,11 @@ static void asm_gencall(ASMState *as, co
+ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
+ {
+   RegSet drop = RSET_SCRATCH;
++  int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
+   if (ra_hasreg(ir->r))
+     rset_clear(drop, ir->r); /* Dest reg handled below. */
++  if (hiop && ra_hasreg((ir+1)->r))
++    rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
+   ra_evictset(as, drop); /* Evictions must be performed first. */
+   if (ra_used(ir)) {
+     lj_assertA(!irt_ispri(ir->t), "PRI dest");
+@@ -470,6 +517,8 @@ static void asm_setupresult(ASMState *as
+       } else {
+ 	ra_destreg(as, ir, RID_FPRET);
+       }
++    } else if (hiop) {
++      ra_destpair(as, ir);
+     } else {
+       ra_destreg(as, ir, RID_RET);
+     }
+@@ -492,7 +541,7 @@ static void asm_callx(ASMState *as, IRIn
+     ci.func = (ASMFunction)(ir_k64(irf)->u64);
+   } else {  /* Need a non-argument register for indirect calls. */
+     Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+-    emit_n(as, A64I_BLR, freg);
++    emit_n(as, A64I_BLR_AUTH, freg);
+     ci.func = (ASMFunction)(void *)0;
+   }
+   asm_gencall(as, &ci, args);
+@@ -509,8 +558,6 @@ static void asm_retf(ASMState *as, IRIns
+   as->topslot -= (BCReg)delta;
+   if ((int32_t)as->topslot < 0) as->topslot = 0;
+   irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
+-  /* Need to force a spill on REF_BASE now to update the stack slot. */
+-  emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
+   emit_setgl(as, base, jit_base);
+   emit_addptr(as, base, -8*delta);
+   asm_guardcc(as, CC_NE);
+@@ -519,6 +566,21 @@ static void asm_retf(ASMState *as, IRIns
+   emit_lso(as, A64I_LDRx, RID_TMP, base, -8);
+ }
+ 
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{  /* Emitted bottom-up: load L, load cur_L, merge flag bits, store L. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));  /* Scratch != sb. */
++  IRIns irgc;  /* Dummy ins: only its type is set, for the load/store. */
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, RID_TMP, sb, offsetof(SBuf, L));  /* Store SBuf.L. */
++  emit_dn(as, A64I_BFMx | A64F_IMMS(lj_fls(SBUF_MASK_FLAG)) | A64F_IMMR(0), RID_TMP, tmp);
++  emit_getgl(as, RID_TMP, cur_L);  /* Load cur_L into RID_TMP. */
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));  /* tmp = SBuf.L. */
++}
++#endif
++
+ /* -- Type conversions ---------------------------------------------------- */
+ 
+ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+@@ -602,7 +664,7 @@ static void asm_conv(ASMState *as, IRIns
+ 	emit_dn(as, A64I_SXTW, dest, left);
+       }
+     } else {
+-      if (st64) {
++      if (st64 && !(ir->op2 & IRCONV_NONE)) {
+ 	/* This is either a 32 bit reg/reg mov which zeroes the hiword
+ 	** or a load of the loword from a 64 bit address.
+ 	*/
+@@ -619,25 +681,22 @@ static void asm_strto(ASMState *as, IRIn
+ {
+   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
+   IRRef args[2];
+-  Reg dest = 0, tmp;
+-  int destused = ra_used(ir);
++  Reg tmp;
+   int32_t ofs = 0;
+   ra_evictset(as, RSET_SCRATCH);
+-  if (destused) {
++  if (ra_used(ir)) {
+     if (ra_hasspill(ir->s)) {
+       ofs = sps_scale(ir->s);
+-      destused = 0;
+       if (ra_hasreg(ir->r)) {
+ 	ra_free(as, ir->r);
+ 	ra_modified(as, ir->r);
+ 	emit_spload(as, ir, ir->r, ofs);
+       }
+     } else {
+-      dest = ra_dest(as, ir, RSET_FPR);
++      Reg dest = ra_dest(as, ir, RSET_FPR);
++      emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
+     }
+   }
+-  if (destused)
+-    emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
+   asm_guardcnb(as, A64I_CBZ, RID_RET);
+   args[0] = ir->op1; /* GCstr *str */
+   args[1] = ASMREF_TMP1; /* TValue *n  */
+@@ -675,22 +734,23 @@ static void asm_tvstore64(ASMState *as,
+ }
+ 
+ /* Get pointer to TValue. */
+-static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)
+ {
+-  IRIns *ir = IR(ref);
+-  if (irt_isnum(ir->t)) {
+-    if (irref_isk(ref)) {
+-      /* Use the number constant itself as a TValue. */
+-      ra_allockreg(as, i64ptr(ir_knum(ir)), dest);
++  if ((mode & IRTMPREF_IN1)) {
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if (irref_isk(ref) && !(mode & IRTMPREF_OUT1)) {
++	/* Use the number constant itself as a TValue. */
++	ra_allockreg(as, i64ptr(ir_knum(ir)), dest);
++	return;
++      }
++      emit_lso(as, A64I_STRd, (ra_alloc1(as, ref, RSET_FPR) & 31), dest, 0);
+     } else {
+-      /* Otherwise force a spill and use the spill slot. */
+-      emit_opk(as, A64I_ADDx, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
++      asm_tvstore64(as, dest, 0, ref);
+     }
+-  } else {
+-    /* Otherwise use g->tmptv to hold the TValue. */
+-    asm_tvstore64(as, dest, 0, ref);
+-    ra_allockreg(as, i64ptr(&J2G(as->J)->tmptv), dest);
+   }
++  /* g->tmptv holds the TValue(s). */
++  emit_dn(as, A64I_ADDx^emit_isk12(glofs(as, &J2G(as->J)->tmptv)), dest, RID_GL);
+ }
+ 
+ static void asm_aref(ASMState *as, IRIns *ir)
+@@ -727,113 +787,75 @@ static void asm_href(ASMState *as, IRIns
+   int destused = ra_used(ir);
+   Reg dest = ra_dest(as, ir, allow);
+   Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
+-  Reg key = 0, tmp = RID_TMP;
+-  Reg ftmp = RID_NONE, type = RID_NONE, scr = RID_NONE, tisnum = RID_NONE;
++  Reg tmp = RID_TMP, type = RID_NONE, key, tkey;
+   IRRef refkey = ir->op2;
+   IRIns *irkey = IR(refkey);
+-  int isk = irref_isk(ir->op2);
++  int isk = irref_isk(refkey);
+   IRType1 kt = irkey->t;
+   uint32_t k = 0;
+   uint32_t khash;
+-  MCLabel l_end, l_loop, l_next;
++  MCLabel l_end, l_loop;
+   rset_clear(allow, tab);
+ 
+-  if (!isk) {
+-    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
+-    rset_clear(allow, key);
+-    if (!irt_isstr(kt)) {
+-      tmp = ra_scratch(as, allow);
+-      rset_clear(allow, tmp);
+-    }
+-  } else if (irt_isnum(kt)) {
+-    int64_t val = (int64_t)ir_knum(irkey)->u64;
+-    if (!(k = emit_isk12(val))) {
+-      key = ra_allock(as, val, allow);
+-      rset_clear(allow, key);
+-    }
+-  } else if (!irt_ispri(kt)) {
+-    if (!(k = emit_isk12(irkey->i))) {
+-      key = ra_alloc1(as, refkey, allow);
+-      rset_clear(allow, key);
+-    }
+-  }
+-
+-  /* Allocate constants early. */
+-  if (irt_isnum(kt)) {
+-    if (!isk) {
+-      tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
+-      ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
+-      rset_clear(allow, tisnum);
+-    }
+-  } else if (irt_isaddr(kt)) {
+-    if (isk) {
+-      int64_t kk = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
+-      scr = ra_allock(as, kk, allow);
++  /* Allocate register for tkey outside of the loop. */
++  if (isk) {
++    int64_t kk;
++    if (irt_isaddr(kt)) {
++      kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
++    } else if (irt_isnum(kt)) {
++      kk = (int64_t)ir_knum(irkey)->u64;
++      /* Assumes -0.0 is already canonicalized to +0.0. */
+     } else {
+-      scr = ra_scratch(as, allow);
++      lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
++      kk = ~((int64_t)~irt_toitype(kt) << 47);
+     }
+-    rset_clear(allow, scr);
++    k = emit_isk12(kk);
++    tkey = k ? 0 : ra_allock(as, kk, allow);
+   } else {
+-    lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
+-    type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+-    scr = ra_scratch(as, rset_clear(allow, type));
+-    rset_clear(allow, scr);
++    tkey = ra_scratch(as, allow);
+   }
+ 
+   /* Key not found in chain: jump to exit (if merged) or load niltv. */
+   l_end = emit_label(as);
+   as->invmcp = NULL;
+-  if (merge == IR_NE)
++  if (merge == IR_NE) {
+     asm_guardcc(as, CC_AL);
+-  else if (destused)
+-    emit_loada(as, dest, niltvg(J2G(as->J)));
++  } else if (destused) {
++    uint32_t k12 = emit_isk12(offsetof(global_State, nilnode.val));
++    lj_assertA(k12 != 0, "Cannot k12 encode niltv(L)");
++    emit_dn(as, A64I_ADDx^k12, dest, RID_GL);
++  }
+ 
+   /* Follow hash chain until the end. */
+   l_loop = --as->mcp;
+-  emit_n(as, A64I_CMPx^A64I_K12^0, dest);
+-  emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
+-  l_next = emit_label(as);
++  if (destused)
++    emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
+ 
+   /* Type and value comparison. */
+   if (merge == IR_EQ)
+     asm_guardcc(as, CC_EQ);
+   else
+     emit_cond_branch(as, CC_EQ, l_end);
++  emit_nm(as, A64I_CMPx^k, tmp, tkey);
++  if (!destused)
++    emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
++  emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key));
++  *l_loop = A64I_X | A64I_CBNZ | A64F_S19(as->mcp - l_loop) | dest;
+ 
+-  if (irt_isnum(kt)) {
+-    if (isk) {
+-      /* Assumes -0.0 is already canonicalized to +0.0. */
+-      if (k)
+-	emit_n(as, A64I_CMPx^k, tmp);
+-      else
+-	emit_nm(as, A64I_CMPx, key, tmp);
+-      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+-    } else {
+-      emit_nm(as, A64I_FCMPd, key, ftmp);
+-      emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
+-      emit_cond_branch(as, CC_LO, l_next);
+-      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
+-      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
+-    }
+-  } else if (irt_isaddr(kt)) {
+-    if (isk) {
+-      emit_nm(as, A64I_CMPx, scr, tmp);
+-      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
++  /* Construct tkey as canonicalized or tagged key. */
++  if (!isk) {
++    if (irt_isnum(kt)) {
++      key = ra_alloc1(as, refkey, RSET_FPR);
++      emit_dnm(as, A64I_CSELx | A64F_CC(CC_EQ), tkey, RID_ZERO, tkey);
++      /* A64I_FMOV_R_D from key to tkey done below. */
+     } else {
+-      emit_nm(as, A64I_CMPx, tmp, scr);
+-      emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
++      lj_assertA(irt_isaddr(kt), "bad HREF key type");
++      key = ra_alloc1(as, refkey, allow);
++      type = ra_allock(as, irt_toitype(kt) << 15, rset_clear(allow, key));
++      emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 32), tkey, key, type);
+     }
+-  } else {
+-    emit_nm(as, A64I_CMPw, scr, type);
+-    emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
+   }
+ 
+-  *l_loop = A64I_BCC | A64F_S19(as->mcp - l_loop) | CC_NE;
+-  if (!isk && irt_isaddr(kt)) {
+-    type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
+-    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
+-    rset_clear(allow, type);
+-  }
+   /* Load main position relative to tab->node into dest. */
+   khash = isk ? ir_khash(as, irkey) : 1;
+   if (khash == 0) {
+@@ -847,7 +869,6 @@ static void asm_href(ASMState *as, IRIns
+       emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
+       emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+     } else if (irt_isstr(kt)) {
+-      /* Fetch of str->sid is cheaper than ra_allock. */
+       emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+       emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, sid));
+       emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+@@ -856,23 +877,18 @@ static void asm_href(ASMState *as, IRIns
+       emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
+       emit_dnm(as, A64I_SUBw, dest, dest, tmp);
+       emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
+-      emit_dnm(as, A64I_EORw, dest, dest, tmp);
+-      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
++      emit_dnm(as, A64I_EORw | A64F_SH(A64SH_ROR, 32-HASH_ROT2), dest, tmp, dest);
+       emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
+       emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
+-      emit_dnm(as, A64I_EORw, tmp, tmp, dest);
+       if (irt_isnum(kt)) {
++	emit_dnm(as, A64I_EORw, tmp, tkey, dest);
+ 	emit_dnm(as, A64I_ADDw, dest, dest, dest);
+-	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+-	emit_dm(as, A64I_MOVw, tmp, dest);
+-	emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
++	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, tkey);
++	emit_nm(as, A64I_FCMPZd, (key & 31), 0);
++	emit_dn(as, A64I_FMOV_R_D, tkey, (key & 31));
+       } else {
+-	checkmclim(as);
+-	emit_dm(as, A64I_MOVw, tmp, key);
+-	emit_dnm(as, A64I_EORw, dest, dest,
+-		 ra_allock(as, irt_toitype(kt) << 15, allow));
+-	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+-	emit_dm(as, A64I_MOVx, dest, key);
++	emit_dnm(as, A64I_EORw, tmp, key, dest);
++	emit_dnm(as, A64I_EORx | A64F_SH(A64SH_LSR, 32), dest, type, key);
+       }
+     }
+   }
+@@ -884,10 +900,10 @@ static void asm_hrefk(ASMState *as, IRIn
+   IRIns *irkey = IR(kslot->op1);
+   int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
+   int32_t kofs = ofs + (int32_t)offsetof(Node, key);
+-  int bigofs = !emit_checkofs(A64I_LDRx, ofs);
++  int bigofs = !emit_checkofs(A64I_LDRx, kofs);
+   Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
+   Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
+-  Reg key, idx = node;
++  Reg idx = node;
+   RegSet allow = rset_exclude(RSET_GPR, node);
+   uint64_t k;
+   lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
+@@ -906,34 +922,39 @@ static void asm_hrefk(ASMState *as, IRIn
+   } else {
+     k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
+   }
+-  key = ra_scratch(as, allow);
+-  emit_nm(as, A64I_CMPx, key, ra_allock(as, k, rset_exclude(allow, key)));
+-  emit_lso(as, A64I_LDRx, key, idx, kofs);
++  emit_nm(as, A64I_CMPx, RID_TMP, ra_allock(as, k, allow));
++  emit_lso(as, A64I_LDRx, RID_TMP, idx, kofs);
+   if (bigofs)
+-    emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR);
++    emit_opk(as, A64I_ADDx, dest, node, ofs, rset_exclude(RSET_GPR, node));
+ }
+ 
+ static void asm_uref(ASMState *as, IRIns *ir)
+ {
+   Reg dest = ra_dest(as, ir, RSET_GPR);
+-  if (irref_isk(ir->op1)) {
++  int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
++  if (irref_isk(ir->op1) && !guarded) {
+     GCfunc *fn = ir_kfunc(IR(ir->op1));
+     MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+     emit_lsptr(as, A64I_LDRx, dest, v);
+   } else {
+-    Reg uv = ra_scratch(as, RSET_GPR);
+-    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+-    if (ir->o == IR_UREFC) {
+-      asm_guardcc(as, CC_NE);
+-      emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
+-      emit_opk(as, A64I_ADDx, dest, uv,
++    if (guarded)
++      asm_guardcnb(as, ir->o == IR_UREFC ? A64I_CBZ : A64I_CBNZ, RID_TMP);
++    if (ir->o == IR_UREFC)
++      emit_opk(as, A64I_ADDx, dest, dest,
+ 	       (int32_t)offsetof(GCupval, tv), RSET_GPR);
+-      emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
++    else
++      emit_lso(as, A64I_LDRx, dest, dest, (int32_t)offsetof(GCupval, v));
++    if (guarded)
++      emit_lso(as, A64I_LDRB, RID_TMP, dest,
++	       (int32_t)offsetof(GCupval, closed));
++    if (irref_isk(ir->op1)) {
++      GCfunc *fn = ir_kfunc(IR(ir->op1));
++      uint64_t k = gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
++      emit_loadu64(as, dest, k);
+     } else {
+-      emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
++      emit_lso(as, A64I_LDRx, dest, ra_alloc1(as, ir->op1, RSET_GPR),
++	       (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
+     }
+-    emit_lso(as, A64I_LDRx, uv, func,
+-	     (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
+   }
+ }
+ 
+@@ -1038,7 +1059,7 @@ static void asm_xstore(ASMState *as, IRI
+ 
+ static void asm_ahuvload(ASMState *as, IRIns *ir)
+ {
+-  Reg idx, tmp, type;
++  Reg idx, tmp;
+   int32_t ofs = 0;
+   RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+   lj_assertA(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
+@@ -1057,18 +1078,20 @@ static void asm_ahuvload(ASMState *as, I
+   } else {
+     tmp = ra_scratch(as, gpr);
+   }
+-  type = ra_scratch(as, rset_clear(gpr, tmp));
+-  idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
++  idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, tmp), A64I_LDRx);
++  rset_clear(gpr, idx);
++  if (ofs & FUSE_REG) rset_clear(gpr, ofs & 31);
++  if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;
+   /* Always do the type check, even if the load result is unused. */
+   asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE);
+   if (irt_type(ir->t) >= IRT_NUM) {
+     lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
+ 	       "bad load type %d", irt_type(ir->t));
+     emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+-	    ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp);
++	    ra_allock(as, LJ_TISNUM << 15, gpr), tmp);
+   } else if (irt_isaddr(ir->t)) {
+-    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
+-    emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
++    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), RID_TMP);
++    emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp);
+   } else if (irt_isnil(ir->t)) {
+     emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+   } else {
+@@ -1176,7 +1199,7 @@ dotypecheck:
+       tmp = ra_scratch(as, allow);
+       rset_clear(allow, tmp);
+     }
+-    if (irt_isnum(t) && !(ir->op2 & IRSLOAD_CONVERT))
++    if (ra_hasreg(dest) && tmp != dest)
+       emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+     /* Need type check, even if the load result is unused. */
+     asm_guardcc(as, irt_isnum(t) ? CC_LS : CC_NE);
+@@ -1184,16 +1207,15 @@ dotypecheck:
+       lj_assertA(irt_isinteger(t) || irt_isnum(t),
+ 		 "bad SLOAD type %d", irt_type(t));
+       emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+-	      ra_allock(as, LJ_TISNUM << 15, allow), tmp);
++	      ra_allock(as, (ir->op2 & IRSLOAD_KEYINDEX) ? LJ_KEYINDEX : (LJ_TISNUM << 15), allow), tmp);
+     } else if (irt_isnil(t)) {
+       emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+     } else if (irt_ispri(t)) {
+       emit_nm(as, A64I_CMPx,
+ 	      ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp);
+     } else {
+-      Reg type = ra_scratch(as, allow);
+-      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
+-      emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
++      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), RID_TMP);
++      emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp);
+     }
+     emit_lso(as, A64I_LDRx, tmp, base, ofs);
+     return;
+@@ -1261,17 +1283,14 @@ static void asm_tbar(ASMState *as, IRIns
+ {
+   Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
+   Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab));
+-  Reg gr = ra_allock(as, i64ptr(J2G(as->J)),
+-		     rset_exclude(rset_exclude(RSET_GPR, tab), link));
+   Reg mark = RID_TMP;
+   MCLabel l_end = emit_label(as);
+-  emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
+   emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+-  emit_lso(as, A64I_STRx, tab, gr,
+-	   (int32_t)offsetof(global_State, gc.grayagain));
++  /* Keep STRx in the middle to avoid LDP/STP fusion with surrounding code. */
++  emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
++  emit_setgl(as, tab, gc.grayagain);
+   emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark);
+-  emit_lso(as, A64I_LDRx, link, gr,
+-	   (int32_t)offsetof(global_State, gc.grayagain));
++  emit_getgl(as, link, gc.grayagain);
+   emit_cond_branch(as, CC_EQ, l_end);
+   emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), mark);
+   emit_lso(as, A64I_LDRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+@@ -1282,7 +1301,6 @@ static void asm_obar(ASMState *as, IRIns
+   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
+   IRRef args[2];
+   MCLabel l_end;
+-  RegSet allow = RSET_GPR;
+   Reg obj, val, tmp;
+   /* No need for other object barriers (yet). */
+   lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");
+@@ -1291,16 +1309,15 @@ static void asm_obar(ASMState *as, IRIns
+   args[0] = ASMREF_TMP1;  /* global_State *g */
+   args[1] = ir->op1;      /* TValue *tv      */
+   asm_gencall(as, ci, args);
+-  ra_allockreg(as, i64ptr(J2G(as->J)), ra_releasetmp(as, ASMREF_TMP1) );
++  emit_dm(as, A64I_MOVx, ra_releasetmp(as, ASMREF_TMP1), RID_GL);
+   obj = IR(ir->op1)->r;
+-  tmp = ra_scratch(as, rset_exclude(allow, obj));
+-  emit_cond_branch(as, CC_EQ, l_end);
+-  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
++  tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj));
++  emit_tnb(as, A64I_TBZ, tmp, lj_ffs(LJ_GC_BLACK), l_end);
+   emit_cond_branch(as, CC_EQ, l_end);
+   emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
+   val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
+   emit_lso(as, A64I_LDRB, tmp, obj,
+-     (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
++	   (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
+   emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
+ }
+ 
+@@ -1342,12 +1359,12 @@ static int asm_swapops(ASMState *as, IRR
+   if (irref_isk(lref))
+     return 1;  /* But swap constants to the right. */
+   ir = IR(rref);
+-  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
++  if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) ||
+       (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+       (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+     return 0;  /* Don't swap fusable operands to the left. */
+   ir = IR(lref);
+-  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
++  if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) ||
+       (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+       (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+     return 1;  /* But swap fusable operands to the right. */
+@@ -1393,13 +1410,12 @@ static void asm_intneg(ASMState *as, IRI
+ static void asm_intmul(ASMState *as, IRIns *ir)
+ {
+   Reg dest = ra_dest(as, ir, RSET_GPR);
+-  Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest));
++  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+   Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+   if (irt_isguard(ir->t)) {  /* IR_MULOV */
+     asm_guardcc(as, CC_NE);
+     emit_dm(as, A64I_MOVw, dest, dest);  /* Zero-extend. */
+-    emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
+-    emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
++    emit_nm(as, A64I_CMPx | A64F_EX(A64EX_SXTW), dest, dest);
+     emit_dnm(as, A64I_SMULL, dest, right, left);
+   } else {
+     emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
+@@ -1659,16 +1675,15 @@ static void asm_intcomp(ASMState *as, IR
+       if (asm_swapops(as, blref, brref)) {
+ 	Reg tmp = blref; blref = brref; brref = tmp;
+       }
++      bleft = ra_alloc1(as, blref, RSET_GPR);
+       if (irref_isk(brref)) {
+ 	uint64_t k = get_k64val(as, brref);
+-	if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) {
+-	  asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ,
+-		       ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k));
++	if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE) &&
++	    asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, bleft,
++			 emit_ctz64(k)))
+ 	  return;
+-	}
+ 	m2 = emit_isk13(k, irt_is64(irl->t));
+       }
+-      bleft = ra_alloc1(as, blref, RSET_GPR);
+       ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
+       if (!m2)
+ 	m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
+@@ -1704,13 +1719,25 @@ static void asm_comp(ASMState *as, IRIns
+ 
+ #define asm_equal(as, ir)	asm_comp(as, ir)
+ 
+-/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
++/* -- Split register ops -------------------------------------------------- */
+ 
+-/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++/* Hiword op of a split 64/64 bit op. Previous op is the loword op. */
+ static void asm_hiop(ASMState *as, IRIns *ir)
+ {
+-  UNUSED(as); UNUSED(ir);
+-  lj_assertA(0, "unexpected HIOP");  /* Unused on 64 bit. */
++  /* HIOP is marked as a store because it needs its own DCE logic. */
++  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
++  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
++  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
++  switch ((ir-1)->o) {
++  case IR_CALLN:
++  case IR_CALLL:
++  case IR_CALLS:
++  case IR_CALLXS:
++    if (!uselo)
++      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
++    break;
++  default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
++  }
+ }
+ 
+ /* -- Profiling ----------------------------------------------------------- */
+@@ -1731,37 +1758,28 @@ static void asm_prof(ASMState *as, IRIns
+ static void asm_stack_check(ASMState *as, BCReg topslot,
+ 			    IRIns *irp, RegSet allow, ExitNo exitno)
+ {
+-  Reg pbase;
+   uint32_t k;
++  Reg pbase = RID_BASE;
+   if (irp) {
+-    if (!ra_hasspill(irp->s)) {
+-      pbase = irp->r;
+-      lj_assertA(ra_hasreg(pbase), "base reg lost");
+-    } else if (allow) {
+-      pbase = rset_pickbot(allow);
+-    } else {
+-      pbase = RID_RET;
+-      emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0);  /* Restore temp register. */
+-    }
+-  } else {
+-    pbase = RID_BASE;
++    pbase = irp->r;
++    if (!ra_hasreg(pbase))
++      pbase = allow ? (0x40 | rset_pickbot(allow)) : (0xC0 | RID_RET);
+   }
+   emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno));
++  if (pbase & 0x80)  /* Restore temp. register. */
++    emit_lso(as, A64I_LDRx, (pbase & 31), RID_SP, 0);
+   k = emit_isk12((8*topslot));
+   lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot);
+   emit_n(as, A64I_CMPx^k, RID_TMP);
+-  emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
++  emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, (pbase & 31));
+   emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
+ 	   (int32_t)offsetof(lua_State, maxstack));
+-  if (irp) {  /* Must not spill arbitrary registers in head of side trace. */
+-    if (ra_hasspill(irp->s))
+-      emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
+-    emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
+-    if (ra_hasspill(irp->s) && !allow)
+-      emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0);  /* Save temp register. */
+-  } else {
+-    emit_getgl(as, RID_TMP, cur_L);
++  if (pbase & 0x40) {
++    emit_getgl(as, (pbase & 31), jit_base);
++    if (pbase & 0x80)  /* Save temp register. */
++      emit_lso(as, A64I_STRx, (pbase & 31), RID_SP, 0);
+   }
++  emit_getgl(as, RID_TMP, cur_L);
+ }
+ 
+ /* Restore Lua stack from on-trace state. */
+@@ -1781,7 +1799,14 @@ static void asm_stack_restore(ASMState *
+     IRIns *ir = IR(ref);
+     if ((sn & SNAP_NORESTORE))
+       continue;
+-    if (irt_isnum(ir->t)) {
++    if ((sn & SNAP_KEYINDEX)) {
++      RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
++      Reg r = irref_isk(ref) ? ra_allock(as, ir->i, allow) :
++			       ra_alloc1(as, ref, allow);
++      rset_clear(allow, r);
++      emit_lso(as, A64I_STRw, r, RID_BASE, ofs);
++      emit_lso(as, A64I_STRw, ra_allock(as, LJ_KEYINDEX, allow), RID_BASE, ofs+4);
++    } else if (irt_isnum(ir->t)) {
+       Reg src = ra_alloc1(as, ref, RSET_FPR);
+       emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
+     } else {
+@@ -1796,7 +1821,7 @@ static void asm_stack_restore(ASMState *
+ 
+ /* Marker to prevent patching the GC check exit. */
+ #define ARM64_NOPATCH_GC_CHECK \
+-  (A64I_ORRx|A64F_D(RID_TMP)|A64F_M(RID_TMP)|A64F_N(RID_TMP))
++  (A64I_ORRx|A64F_D(RID_ZERO)|A64F_M(RID_ZERO)|A64F_N(RID_ZERO))
+ 
+ /* Check GC threshold and do one or more GC steps. */
+ static void asm_gc_check(ASMState *as)
+@@ -1804,7 +1829,7 @@ static void asm_gc_check(ASMState *as)
+   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
+   IRRef args[2];
+   MCLabel l_end;
+-  Reg tmp1, tmp2;
++  Reg tmp2;
+   ra_evictset(as, RSET_SCRATCH);
+   l_end = emit_label(as);
+   /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
+@@ -1813,17 +1838,14 @@ static void asm_gc_check(ASMState *as)
+   args[0] = ASMREF_TMP1;  /* global_State *g */
+   args[1] = ASMREF_TMP2;  /* MSize steps     */
+   asm_gencall(as, ci, args);
+-  tmp1 = ra_releasetmp(as, ASMREF_TMP1);
++  emit_dm(as, A64I_MOVx, ra_releasetmp(as, ASMREF_TMP1), RID_GL);
+   tmp2 = ra_releasetmp(as, ASMREF_TMP2);
+   emit_loadi(as, tmp2, as->gcsteps);
+   /* Jump around GC step if GC total < GC threshold. */
+   emit_cond_branch(as, CC_LS, l_end);
+   emit_nm(as, A64I_CMPx, RID_TMP, tmp2);
+-  emit_lso(as, A64I_LDRx, tmp2, tmp1,
+-	   (int32_t)offsetof(global_State, gc.threshold));
+-  emit_lso(as, A64I_LDRx, RID_TMP, tmp1,
+-	   (int32_t)offsetof(global_State, gc.total));
+-  ra_allockreg(as, i64ptr(J2G(as->J)), tmp1);
++  emit_getgl(as, tmp2, gc.threshold);
++  emit_getgl(as, RID_TMP, gc.total);
+   as->gcsteps = 0;
+   checkmclim(as);
+ }
+@@ -1846,49 +1868,48 @@ static void asm_loop_fixup(ASMState *as)
+   }
+ }
+ 
+-/* -- Head of trace ------------------------------------------------------- */
+-
+-/* Reload L register from g->cur_L. */
+-static void asm_head_lreg(ASMState *as)
++/* Fixup the tail of the loop. */
++static void asm_loop_tail_fixup(ASMState *as)
+ {
+-  IRIns *ir = IR(ASMREF_L);
+-  if (ra_used(ir)) {
+-    Reg r = ra_dest(as, ir, RSET_GPR);
+-    emit_getgl(as, r, cur_L);
+-    ra_evictk(as);
+-  }
++  UNUSED(as);  /* Nothing to do. */
+ }
+ 
++/* -- Head of trace ------------------------------------------------------- */
++
+ /* Coalesce BASE register for a root trace. */
+ static void asm_head_root_base(ASMState *as)
+ {
+-  IRIns *ir;
+-  asm_head_lreg(as);
+-  ir = IR(REF_BASE);
+-  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+-    ra_spill(as, ir);
+-  ra_destreg(as, ir, RID_BASE);
++  IRIns *ir = IR(REF_BASE);
++  Reg r = ir->r;
++  if (ra_hasreg(r)) {
++    ra_free(as, r);
++    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
++      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
++    if (r != RID_BASE)
++      emit_movrr(as, ir, r, RID_BASE);
++  }
+ }
+ 
+ /* Coalesce BASE register for a side trace. */
+-static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
++static Reg asm_head_side_base(ASMState *as, IRIns *irp)
+ {
+-  IRIns *ir;
+-  asm_head_lreg(as);
+-  ir = IR(REF_BASE);
+-  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+-    ra_spill(as, ir);
+-  if (ra_hasspill(irp->s)) {
+-    rset_clear(allow, ra_dest(as, ir, allow));
+-  } else {
+-    Reg r = irp->r;
+-    lj_assertA(ra_hasreg(r), "base reg lost");
+-    rset_clear(allow, r);
+-    if (r != ir->r && !rset_test(as->freeset, r))
+-      ra_restore(as, regcost_ref(as->cost[r]));
+-    ra_destreg(as, ir, r);
++  IRIns *ir = IR(REF_BASE);
++  Reg r = ir->r;
++  if (ra_hasreg(r)) {
++    ra_free(as, r);
++    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
++      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
++    if (irp->r == r) {
++      return r;  /* Same BASE register already coalesced. */
++    } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
++      /* Move from coalesced parent reg. */
++      emit_movrr(as, ir, r, irp->r);
++      return irp->r;
++    } else {
++      emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
++    }
+   }
+-  return allow;
++  return RID_NONE;
+ }
+ 
+ /* -- Tail of trace ------------------------------------------------------- */
+@@ -1932,20 +1953,47 @@ static void asm_tail_prep(ASMState *as)
+ /* Ensure there are enough stack slots for call arguments. */
+ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
+ {
+-  IRRef args[CCI_NARGS_MAX*2];
++#if LJ_HASFFI
+   uint32_t i, nargs = CCI_XNARGS(ci);
+-  int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
+-  asm_collectargs(as, ir, ci, args);
+-  for (i = 0; i < nargs; i++) {
+-    if (args[i] && irt_isfp(IR(args[i])->t)) {
+-      if (nfpr > 0) nfpr--; else nslots += 2;
+-    } else {
+-      if (ngpr > 0) ngpr--; else nslots += 2;
++  if (nargs > (REGARG_NUMGPR < REGARG_NUMFPR ? REGARG_NUMGPR : REGARG_NUMFPR) ||
++      (LJ_TARGET_OSX && (ci->flags & CCI_VARARG))) {
++    IRRef args[CCI_NARGS_MAX*2];
++    int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
++    int spofs = 0, spalign = LJ_TARGET_OSX ? 0 : 7, nslots;
++    asm_collectargs(as, ir, ci, args);
++#if LJ_ABI_WIN
++    if ((ci->flags & CCI_VARARG)) nfpr = 0;
++#endif
++    for (i = 0; i < nargs; i++) {
++      int al = spalign;
++      if (!args[i]) {
++#if LJ_TARGET_OSX
++	/* Marker for start of varargs. */
++	nfpr = 0;
++	ngpr = 0;
++	spalign = 7;
++#endif
++      } else if (irt_isfp(IR(args[i])->t)) {
++	if (nfpr > 0) { nfpr--; continue; }
++#if LJ_ABI_WIN
++	if ((ci->flags & CCI_VARARG) && ngpr > 0) { ngpr--; continue; }
++#elif LJ_TARGET_OSX
++	al |= irt_isnum(IR(args[i])->t) ? 7 : 3;
++#endif
++      } else {
++	if (ngpr > 0) { ngpr--; continue; }
++#if LJ_TARGET_OSX
++	al |= irt_size(IR(args[i])->t) - 1;
++#endif
++      }
++      spofs = (spofs + 2*al+1) & ~al;  /* Align and bump stack pointer. */
+     }
++    nslots = (spofs + 3) >> 2;
++    if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
++      as->evenspill = nslots;
+   }
+-  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
+-    as->evenspill = nslots;
+-  return REGSP_HINT(RID_RET);
++#endif
++  return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);
+ }
+ 
+ static void asm_setup_target(ASMState *as)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_mips.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm_mips.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_mips.h
+@@ -1,6 +1,6 @@
+ /*
+ ** MIPS IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Register allocator extensions --------------------------------------- */
+@@ -64,17 +64,29 @@ static Reg ra_alloc2(ASMState *as, IRIns
+ /* Setup spare long-range jump slots per mcarea. */
+ static void asm_sparejump_setup(ASMState *as)
+ {
+-  MCode *mxp = as->mcbot;
+-  if (((uintptr_t)mxp & (LJ_PAGESIZE-1)) == sizeof(MCLink)) {
++  MCode *mxp = as->mctop;
++  if ((char *)mxp == (char *)as->J->mcarea + as->J->szmcarea) {
++    mxp -= MIPS_SPAREJUMP*2;
+     lj_assertA(MIPSI_NOP == 0, "bad NOP");
+     memset(mxp, 0, MIPS_SPAREJUMP*2*sizeof(MCode));
+-    mxp += MIPS_SPAREJUMP*2;
+-    lj_assertA(mxp < as->mctop, "MIPS_SPAREJUMP too big");
+-    lj_mcode_sync(as->mcbot, mxp);
+-    lj_mcode_commitbot(as->J, mxp);
+-    as->mcbot = mxp;
+-    as->mclim = as->mcbot + MCLIM_REDZONE;
++    as->mctop = mxp;
++  }
++}
++
++static MCode *asm_sparejump_use(MCode *mcarea, MCode tjump)
++{
++  MCode *mxp = (MCode *)((char *)mcarea + ((MCLink *)mcarea)->size);  /* End. */
++  int slot = MIPS_SPAREJUMP;
++  while (slot--) {
++    mxp -= 2;  /* Slots are two MCode words each, scanned from the top down. */
++    if (*mxp == tjump) {  /* Reuse a slot that already holds this jump. */
++      return mxp;
++    } else if (*mxp == MIPSI_NOP) {  /* Claim an unused (NOP) slot. */
++      *mxp = tjump;
++      return mxp;
++    }
+   }
++  return NULL;  /* All slots are taken by other jumps. */
+ }
+ 
+ /* Setup exit stub after the end of each trace. */
+@@ -181,6 +193,9 @@ static Reg asm_fuseahuref(ASMState *as,
+ 	  return ra_allock(as, ofs-(int16_t)ofs, allow);
+ 	}
+       }
++    } else if (ir->o == IR_TMPREF) {
++      *ofsp = (int32_t)(offsetof(global_State, tmptv)-32768);
++      return RID_JGL;
+     }
+   }
+   *ofsp = 0;
+@@ -336,19 +351,15 @@ static void asm_gencall(ASMState *as, co
+ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
+ {
+   RegSet drop = RSET_SCRATCH;
+-#if LJ_32
+   int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
+-#endif
+ #if !LJ_SOFTFP
+   if ((ci->flags & CCI_NOFPRCLOBBER))
+     drop &= ~RSET_FPR;
+ #endif
+   if (ra_hasreg(ir->r))
+     rset_clear(drop, ir->r);  /* Dest reg handled below. */
+-#if LJ_32
+   if (hiop && ra_hasreg((ir+1)->r))
+     rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
+-#endif
+   ra_evictset(as, drop);  /* Evictions must be performed first. */
+   if (ra_used(ir)) {
+     lj_assertA(!irt_ispri(ir->t), "PRI dest");
+@@ -377,10 +388,8 @@ static void asm_setupresult(ASMState *as
+       } else {
+ 	ra_destreg(as, ir, RID_FPRET);
+       }
+-#if LJ_32
+     } else if (hiop) {
+       ra_destpair(as, ir);
+-#endif
+     } else {
+       ra_destreg(as, ir, RID_RET);
+     }
+@@ -450,6 +459,27 @@ static void asm_retf(ASMState *as, IRIns
+   emit_tsi(as, MIPSI_AL, RID_TMP, base, -8);
+ }
+ 
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
++  IRIns irgc;
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, RID_TMP, sb, offsetof(SBuf, L));
++  if ((as->flags & JIT_F_MIPSXXR2)) {
++    emit_tsml(as, LJ_64 ? MIPSI_DINS : MIPSI_INS, RID_TMP, tmp,
++	      lj_fls(SBUF_MASK_FLAG), 0);
++  } else {
++    emit_dst(as, MIPSI_OR, RID_TMP, RID_TMP, tmp);
++    emit_tsi(as, MIPSI_ANDI, tmp, tmp, SBUF_MASK_FLAG);
++  }
++  emit_getgl(as, RID_TMP, cur_L);
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));
++}
++#endif
++
+ /* -- Type conversions ---------------------------------------------------- */
+ 
+ #if !LJ_SOFTFP
+@@ -739,7 +769,7 @@ static void asm_conv(ASMState *as, IRIns
+ 	  }
+ 	}
+       } else {
+-	if (st64) {
++	if (st64 && !(ir->op2 & IRCONV_NONE)) {
+ 	  /* This is either a 32 bit reg/reg mov which zeroes the hiword
+ 	  ** or a load of the loword from a 64 bit address.
+ 	  */
+@@ -827,34 +857,63 @@ static void asm_tvstore64(ASMState *as,
+ #endif
+ 
+ /* Get pointer to TValue. */
+-static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)
+ {
+-  IRIns *ir = IR(ref);
+-  if (irt_isnum(ir->t)) {
+-    if (irref_isk(ref))  /* Use the number constant itself as a TValue. */
+-      ra_allockreg(as, igcptr(ir_knum(ir)), dest);
+-    else  /* Otherwise force a spill and use the spill slot. */
+-      emit_tsi(as, MIPSI_AADDIU, dest, RID_SP, ra_spill(as, ir));
+-  } else {
+-    /* Otherwise use g->tmptv to hold the TValue. */
+-#if LJ_32
+-    RegSet allow = rset_exclude(RSET_GPR, dest);
+-    Reg type;
+-    emit_tsi(as, MIPSI_ADDIU, dest, RID_JGL, (int32_t)(offsetof(global_State, tmptv)-32768));
+-    if (!irt_ispri(ir->t)) {
+-      Reg src = ra_alloc1(as, ref, allow);
+-      emit_setgl(as, src, tmptv.gcr);
+-    }
+-    if (LJ_SOFTFP && (ir+1)->o == IR_HIOP)
+-      type = ra_alloc1(as, ref+1, allow);
+-    else
+-      type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+-    emit_setgl(as, type, tmptv.it);
++  int32_t tmpofs = (int32_t)(offsetof(global_State, tmptv)-32768);
++  if ((mode & IRTMPREF_IN1)) {
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if ((mode & IRTMPREF_OUT1)) {
++#if LJ_SOFTFP
++	emit_tsi(as, MIPSI_AADDIU, dest, RID_JGL, tmpofs);
++#if LJ_64
++	emit_setgl(as, ra_alloc1(as, ref, RSET_GPR), tmptv.u64);
++#else
++	lj_assertA(irref_isk(ref), "unsplit FP op");
++	emit_setgl(as,
++		   ra_allock(as, (int32_t)ir_knum(ir)->u32.lo, RSET_GPR),
++		   tmptv.u32.lo);
++	emit_setgl(as,
++		   ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, RSET_GPR),
++		   tmptv.u32.hi);
++#endif
++#else
++	Reg src = ra_alloc1(as, ref, RSET_FPR);
++	emit_tsi(as, MIPSI_AADDIU, dest, RID_JGL, tmpofs);
++	emit_tsi(as, MIPSI_SDC1, (src & 31),  RID_JGL, tmpofs);
++#endif
++      } else if (irref_isk(ref)) {
++	/* Use the number constant itself as a TValue. */
++	ra_allockreg(as, igcptr(ir_knum(ir)), dest);
++      } else {
++#if LJ_SOFTFP32
++	lj_assertA(0, "unsplit FP op");
+ #else
+-    asm_tvstore64(as, dest, 0, ref);
+-    emit_tsi(as, MIPSI_DADDIU, dest, RID_JGL,
+-	     (int32_t)(offsetof(global_State, tmptv)-32768));
++	/* Otherwise force a spill and use the spill slot. */
++	emit_tsi(as, MIPSI_AADDIU, dest, RID_SP, ra_spill(as, ir));
+ #endif
++      }
++    } else {
++      /* Otherwise use g->tmptv to hold the TValue. */
++#if LJ_32
++      Reg type;
++      emit_tsi(as, MIPSI_ADDIU, dest, RID_JGL, tmpofs);
++      if (!irt_ispri(ir->t)) {
++	Reg src = ra_alloc1(as, ref, RSET_GPR);
++	emit_setgl(as, src, tmptv.gcr);
++      }
++      if (LJ_SOFTFP && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t))
++	type = ra_alloc1(as, ref+1, RSET_GPR);
++      else
++	type = ra_allock(as, (int32_t)irt_toitype(ir->t), RSET_GPR);
++      emit_setgl(as, type, tmptv.it);
++#else
++      asm_tvstore64(as, dest, 0, ref);
++      emit_tsi(as, MIPSI_DADDIU, dest, RID_JGL, tmpofs);
++#endif
++    }
++  } else {
++    emit_tsi(as, MIPSI_AADDIU, dest, RID_JGL, tmpofs);
+   }
+ }
+ 
+@@ -909,11 +968,16 @@ static void asm_href(ASMState *as, IRIns
+   MCLabel l_end, l_loop, l_next;
+ 
+   rset_clear(allow, tab);
+-#if LJ_SOFTFP32
+-  if (!isk) {
+-    key = ra_alloc1(as, refkey, allow);
+-    rset_clear(allow, key);
+-    if (irkey[1].o == IR_HIOP) {
++  if (!LJ_SOFTFP && irt_isnum(kt)) {
++    key = ra_alloc1(as, refkey, RSET_FPR);
++    tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key));
++  } else {
++    if (!irt_ispri(kt)) {
++      key = ra_alloc1(as, refkey, allow);
++      rset_clear(allow, key);
++    }
++#if LJ_32
++    if (LJ_SOFTFP && irkey[1].o == IR_HIOP) {
+       if (ra_hasreg((irkey+1)->r)) {
+ 	type = tmpnum = (irkey+1)->r;
+ 	tmp1 = ra_scratch(as, allow);
+@@ -924,23 +988,11 @@ static void asm_href(ASMState *as, IRIns
+       }
+       rset_clear(allow, tmpnum);
+     } else {
+-      type = ra_allock(as, (int32_t)irt_toitype(irkey->t), allow);
++      type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
+       rset_clear(allow, type);
+     }
+-  }
+-#else
+-  if (!LJ_SOFTFP && irt_isnum(kt)) {
+-    key = ra_alloc1(as, refkey, RSET_FPR);
+-    tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key));
+-  } else if (!irt_ispri(kt)) {
+-    key = ra_alloc1(as, refkey, allow);
+-    rset_clear(allow, key);
+-#if LJ_32
+-    type = ra_allock(as, (int32_t)irt_toitype(irkey->t), allow);
+-    rset_clear(allow, type);
+ #endif
+   }
+-#endif
+   tmp2 = ra_scratch(as, allow);
+   rset_clear(allow, tmp2);
+ #if LJ_64
+@@ -953,10 +1005,10 @@ static void asm_href(ASMState *as, IRIns
+     } else {
+       int64_t k;
+       if (isk && irt_isaddr(kt)) {
+-	k = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
++	k = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
+       } else {
+ 	lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
+-	k = ~((int64_t)~irt_toitype(ir->t) << 47);
++	k = ~((int64_t)~irt_toitype(kt) << 47);
+       }
+       cmp64 = ra_allock(as, k, allow);
+       rset_clear(allow, cmp64);
+@@ -1155,22 +1207,29 @@ nolo:
+ static void asm_uref(ASMState *as, IRIns *ir)
+ {
+   Reg dest = ra_dest(as, ir, RSET_GPR);
+-  if (irref_isk(ir->op1)) {
++  int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
++  if (irref_isk(ir->op1) && !guarded) {
+     GCfunc *fn = ir_kfunc(IR(ir->op1));
+     MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+     emit_lsptr(as, MIPSI_AL, dest, v, RSET_GPR);
+   } else {
+-    Reg uv = ra_scratch(as, RSET_GPR);
+-    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+-    if (ir->o == IR_UREFC) {
+-      asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO);
+-      emit_tsi(as, MIPSI_AADDIU, dest, uv, (int32_t)offsetof(GCupval, tv));
+-      emit_tsi(as, MIPSI_LBU, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
+-    } else {
+-      emit_tsi(as, MIPSI_AL, dest, uv, (int32_t)offsetof(GCupval, v));
++    if (guarded)
++      asm_guard(as, ir->o == IR_UREFC ? MIPSI_BEQ : MIPSI_BNE, RID_TMP, RID_ZERO);
++    if (ir->o == IR_UREFC)
++      emit_tsi(as, MIPSI_AADDIU, dest, dest, (int32_t)offsetof(GCupval, tv));
++    else
++      emit_tsi(as, MIPSI_AL, dest, dest, (int32_t)offsetof(GCupval, v));
++    if (guarded)
++      emit_tsi(as, MIPSI_LBU, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
++    if (irref_isk(ir->op1)) {
++      GCfunc *fn = ir_kfunc(IR(ir->op1));
++      GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
++      emit_loada(as, dest, o);
++    } else {
++      emit_tsi(as, MIPSI_AL, dest, ra_alloc1(as, ir->op1, RSET_GPR),
++	       (int32_t)offsetof(GCfuncL, uvptr) +
++	       (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
+     }
+-    emit_tsi(as, MIPSI_AL, uv, func, (int32_t)offsetof(GCfuncL, uvptr) +
+-	     (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
+   }
+ }
+ 
+@@ -1285,8 +1344,8 @@ static void asm_fload(ASMState *as, IRIn
+       }
+     }
+     ofs = field_ofs[ir->op2];
++    lj_assertA(!irt_isfp(ir->t), "bad FP FLOAD");
+   }
+-  lj_assertA(!irt_isfp(ir->t), "bad FP FLOAD");
+   emit_tsi(as, mi, dest, idx, ofs);
+ }
+ 
+@@ -1352,6 +1411,7 @@ static void asm_ahuvload(ASMState *as, I
+ #endif
+   }
+   idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++  if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;
+   rset_clear(allow, idx);
+   if (irt_isnum(t)) {
+     asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO);
+@@ -1524,7 +1584,7 @@ dotypecheck:
+       asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO);
+       emit_tsi(as, MIPSI_SLTIU, RID_TMP, type, (int32_t)LJ_TISNUM);
+     } else {
+-      Reg ktype = ra_allock(as, irt_toitype(t), allow);
++      Reg ktype = ra_allock(as, (ir->op2 & IRSLOAD_KEYINDEX) ? LJ_KEYINDEX : irt_toitype(t), allow);
+       asm_guard(as, MIPSI_BNE, type, ktype);
+     }
+   }
+@@ -1542,6 +1602,10 @@ dotypecheck:
+     if (irt_ispri(t)) {
+       asm_guard(as, MIPSI_BNE, type,
+ 		ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow));
++    } else if ((ir->op2 & IRSLOAD_KEYINDEX)) {
++      asm_guard(as, MIPSI_BNE, RID_TMP,
++		ra_allock(as, (int32_t)LJ_KEYINDEX, allow));
++      emit_dta(as, MIPSI_DSRA32, RID_TMP, type, 0);
+     } else {
+       if (irt_isnum(t)) {
+ 	asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO);
+@@ -1837,7 +1901,7 @@ static void asm_arithov(ASMState *as, IR
+   lj_assertA(!irt_is64(ir->t), "bad usage");
+   if (irref_isk(ir->op2)) {
+     int k = IR(ir->op2)->i;
+-    if (ir->o == IR_SUBOV) k = -k;
++    if (ir->o == IR_SUBOV) k = (int)(~(unsigned int)k+1u);
+     if (checki16(k)) {  /* (dest < left) == (k >= 0 ? 1 : 0) */
+       left = ra_alloc1(as, ir->op1, RSET_GPR);
+       asm_guard(as, k >= 0 ? MIPSI_BNE : MIPSI_BEQ, RID_TMP, RID_ZERO);
+@@ -2327,15 +2391,15 @@ static void asm_comp64eq(ASMState *as, I
+ }
+ #endif
+ 
+-/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
++/* -- Split register ops -------------------------------------------------- */
+ 
+-/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++/* Hiword op of a split 32/32 or 64/64 bit op. Previous op is the loword op. */
+ static void asm_hiop(ASMState *as, IRIns *ir)
+ {
+-#if LJ_32 && (LJ_HASFFI || LJ_SOFTFP)
+   /* HIOP is marked as a store because it needs its own DCE logic. */
+   int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
+   if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
++#if LJ_32 && (LJ_HASFFI || LJ_SOFTFP)
+   if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
+     as->curins--;  /* Always skip the CONV. */
+ #if LJ_HASFFI && !LJ_SOFTFP
+@@ -2382,38 +2446,33 @@ static void asm_hiop(ASMState *as, IRIns
+     }
+     return;
+   }
++#endif
+   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
+   switch ((ir-1)->o) {
+-#if LJ_HASFFI
++#if LJ_32 && LJ_HASFFI
+   case IR_ADD: as->curins--; asm_add64(as, ir); break;
+   case IR_SUB: as->curins--; asm_sub64(as, ir); break;
+   case IR_NEG: as->curins--; asm_neg64(as, ir); break;
++  case IR_CNEWI:
++    /* Nothing to do here. Handled by lo op itself. */
++    break;
+ #endif
+-#if LJ_SOFTFP
++#if LJ_32 && LJ_SOFTFP
+   case IR_SLOAD: case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
+   case IR_STRTO:
+     if (!uselo)
+       ra_allocref(as, ir->op1, RSET_GPR);  /* Mark lo op as used. */
+     break;
++  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR: case IR_TMPREF:
++    /* Nothing to do here. Handled by lo op itself. */
++    break;
+ #endif
+-  case IR_CALLN:
+-  case IR_CALLS:
+-  case IR_CALLXS:
++  case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
+     if (!uselo)
+       ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
+     break;
+-#if LJ_SOFTFP
+-  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR:
+-#endif
+-  case IR_CNEWI:
+-    /* Nothing to do here. Handled by lo op itself. */
+-    break;
+   default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
+   }
+-#else
+-  /* Unused on MIPS64 or without SOFTFP or FFI. */
+-  UNUSED(as); UNUSED(ir); lj_assertA(0, "unexpected HIOP");
+-#endif
+ }
+ 
+ /* -- Profiling ----------------------------------------------------------- */
+@@ -2513,12 +2572,29 @@ static void asm_stack_restore(ASMState *
+       } else if ((sn & SNAP_SOFTFPNUM)) {
+ 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
+ #endif
++      } else if ((sn & SNAP_KEYINDEX)) {
++	type = ra_allock(as, (int32_t)LJ_KEYINDEX, allow);
+       } else {
+ 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+       }
+       emit_tsi(as, MIPSI_SW, type, RID_BASE, ofs+(LJ_BE?0:4));
+ #else
+-      asm_tvstore64(as, RID_BASE, ofs, ref);
++      if ((sn & SNAP_KEYINDEX)) {
++	RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
++	int64_t kki = (int64_t)LJ_KEYINDEX << 32;
++	if (irref_isk(ref)) {
++	  emit_tsi(as, MIPSI_SD,
++		   ra_allock(as, kki | (int64_t)(uint32_t)ir->i, allow),
++		   RID_BASE, ofs);
++	} else {
++	  Reg src = ra_alloc1(as, ref, allow);
++	  Reg rki = ra_allock(as, kki, rset_exclude(allow, src));
++	  emit_tsi(as, MIPSI_SD, RID_TMP, RID_BASE, ofs);
++	  emit_dst(as, MIPSI_DADDU, RID_TMP, src, rki);
++	}
++      } else {
++	asm_tvstore64(as, RID_BASE, ofs, ref);
++      }
+ #endif
+     }
+     checkmclim(as);
+@@ -2575,6 +2651,12 @@ static void asm_loop_fixup(ASMState *as)
+   }
+ }
+ 
++/* Fixup the tail of the loop: step mctop back one MCode if loopinv is set. */
++static void asm_loop_tail_fixup(ASMState *as)
++{
++  if (as->loopinv) as->mctop--;
++}
++
+ /* -- Head of trace ------------------------------------------------------- */
+ 
+ /* Coalesce BASE register for a root trace. */
+@@ -2582,7 +2664,6 @@ static void asm_head_root_base(ASMState
+ {
+   IRIns *ir = IR(REF_BASE);
+   Reg r = ir->r;
+-  if (as->loopinv) as->mctop--;
+   if (ra_hasreg(r)) {
+     ra_free(as, r);
+     if (rset_test(as->modset, r) || irt_ismarked(ir->t))
+@@ -2593,25 +2674,24 @@ static void asm_head_root_base(ASMState
+ }
+ 
+ /* Coalesce BASE register for a side trace. */
+-static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
++static Reg asm_head_side_base(ASMState *as, IRIns *irp)
+ {
+   IRIns *ir = IR(REF_BASE);
+   Reg r = ir->r;
+-  if (as->loopinv) as->mctop--;
+   if (ra_hasreg(r)) {
+     ra_free(as, r);
+     if (rset_test(as->modset, r) || irt_ismarked(ir->t))
+       ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
+     if (irp->r == r) {
+-      rset_clear(allow, r);  /* Mark same BASE register as coalesced. */
++      return r;  /* Same BASE register already coalesced. */
+     } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
+-      rset_clear(allow, irp->r);
+       emit_move(as, r, irp->r);  /* Move from coalesced parent reg. */
++      return irp->r;
+     } else {
+       emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
+     }
+   }
+-  return allow;
++  return RID_NONE;
+ }
+ 
+ /* -- Tail of trace ------------------------------------------------------- */
+@@ -2704,21 +2784,17 @@ void lj_asm_patchexit(jit_State *J, GCtr
+ 	patchbranch:
+ 	  p[-1] = (p[-1] & 0xffff0000u) | (delta & 0xffffu);
+ 	  *p = MIPSI_NOP;  /* Replace the load of the exit number. */
+-	  cstop = p;
++	  cstop = p+1;
+ 	  if (!cstart) cstart = p-1;
+ 	} else {  /* Branch out of range. Use spare jump slot in mcarea. */
+-	  int i;
+-	  for (i = (int)(sizeof(MCLink)/sizeof(MCode));
+-	       i < (int)(sizeof(MCLink)/sizeof(MCode)+MIPS_SPAREJUMP*2);
+-	       i += 2) {
+-	    if (mcarea[i] == tjump) {
+-	      delta = mcarea+i - p;
+-	      goto patchbranch;
+-	    } else if (mcarea[i] == MIPSI_NOP) {
+-	      mcarea[i] = tjump;
+-	      cstart = mcarea+i;
+-	      delta = mcarea+i - p;
++	  MCode *mcjump = asm_sparejump_use(mcarea, tjump);
++	  if (mcjump) {
++	    lj_mcode_sync(mcjump, mcjump+1);
++	    delta = mcjump - p;
++	    if (((delta + 0x8000) >> 16) == 0) {
+ 	      goto patchbranch;
++	    } else {
++	      lj_assertJ(0, "spare jump out of range: -Osizemcode too big");
+ 	    }
+ 	  }
+ 	  /* Ignore jump slot overflow. Child trace is simply not attached. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_ppc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm_ppc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_ppc.h
+@@ -1,6 +1,6 @@
+ /*
+ ** PPC IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Register allocator extensions --------------------------------------- */
+@@ -156,6 +156,9 @@ static Reg asm_fuseahuref(ASMState *as,
+ 	  return ra_allock(as, ofs-(int16_t)ofs, allow);
+ 	}
+       }
++    } else if (ir->o == IR_TMPREF) {
++      *ofsp = (int32_t)(offsetof(global_State, tmptv)-32768);
++      return RID_JGL;
+     }
+   }
+   *ofsp = 0;
+@@ -232,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IR
+ {
+   IRRef lref = ir->op1, rref = ir->op2;
+   IRIns *irm;
+-  if (lref != rref &&
++  if ((as->flags & JIT_F_OPT_FMA) &&
++      lref != rref &&
+       ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
+ 	ra_noreg(irm->r)) ||
+        (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
+@@ -337,10 +341,8 @@ static void asm_setupresult(ASMState *as
+       } else {
+ 	ra_destreg(as, ir, RID_FPRET);
+       }
+-#if LJ_32
+     } else if (hiop) {
+       ra_destpair(as, ir);
+-#endif
+     } else {
+       ra_destreg(as, ir, RID_RET);
+     }
+@@ -389,6 +391,21 @@ static void asm_retf(ASMState *as, IRIns
+   emit_tai(as, PPCI_LWZ, RID_TMP, base, -8);
+ }
+ 
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
++  IRIns irgc;
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, RID_TMP, sb, offsetof(SBuf, L));
++  emit_rot(as, PPCI_RLWIMI, RID_TMP, tmp, 0, 31-lj_fls(SBUF_MASK_FLAG), 31);
++  emit_getgl(as, RID_TMP, cur_L);
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));
++}
++#endif
++
+ /* -- Type conversions ---------------------------------------------------- */
+ 
+ #if !LJ_SOFTFP
+@@ -567,28 +584,54 @@ static void asm_strto(ASMState *as, IRIn
+ /* -- Memory references --------------------------------------------------- */
+ 
+ /* Get pointer to TValue. */
+-static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)
+ {
+-  IRIns *ir = IR(ref);
+-  if (irt_isnum(ir->t)) {
+-    if (irref_isk(ref))  /* Use the number constant itself as a TValue. */
+-      ra_allockreg(as, i32ptr(ir_knum(ir)), dest);
+-    else  /* Otherwise force a spill and use the spill slot. */
+-      emit_tai(as, PPCI_ADDI, dest, RID_SP, ra_spill(as, ir));
+-  } else {
+-    /* Otherwise use g->tmptv to hold the TValue. */
+-    RegSet allow = rset_exclude(RSET_GPR, dest);
+-    Reg type;
+-    emit_tai(as, PPCI_ADDI, dest, RID_JGL, (int32_t)offsetof(global_State, tmptv)-32768);
+-    if (!irt_ispri(ir->t)) {
+-      Reg src = ra_alloc1(as, ref, allow);
+-      emit_setgl(as, src, tmptv.gcr);
++  int32_t tmpofs = (int32_t)(offsetof(global_State, tmptv)-32768);
++  if ((mode & IRTMPREF_IN1)) {
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if ((mode & IRTMPREF_OUT1)) {
++#if LJ_SOFTFP
++	lj_assertA(irref_isk(ref), "unsplit FP op");
++	emit_tai(as, PPCI_ADDI, dest, RID_JGL, tmpofs);
++	emit_setgl(as,
++		   ra_allock(as, (int32_t)ir_knum(ir)->u32.lo, RSET_GPR),
++		   tmptv.u32.lo);
++	emit_setgl(as,
++		   ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, RSET_GPR),
++		   tmptv.u32.hi);
++#else
++	Reg src = ra_alloc1(as, ref, RSET_FPR);
++	emit_tai(as, PPCI_ADDI, dest, RID_JGL, tmpofs);
++	emit_fai(as, PPCI_STFD, src, RID_JGL, tmpofs);
++#endif
++      } else if (irref_isk(ref)) {
++	/* Use the number constant itself as a TValue. */
++	ra_allockreg(as, i32ptr(ir_knum(ir)), dest);
++      } else {
++#if LJ_SOFTFP
++	lj_assertA(0, "unsplit FP op");
++#else
++	/* Otherwise force a spill and use the spill slot. */
++	emit_tai(as, PPCI_ADDI, dest, RID_SP, ra_spill(as, ir));
++#endif
++      }
++    } else {
++      /* Otherwise use g->tmptv to hold the TValue. */
++      Reg type;
++      emit_tai(as, PPCI_ADDI, dest, RID_JGL, tmpofs);
++      if (!irt_ispri(ir->t)) {
++	Reg src = ra_alloc1(as, ref, RSET_GPR);
++	emit_setgl(as, src, tmptv.gcr);
++      }
++      if (LJ_SOFTFP && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t))
++	type = ra_alloc1(as, ref+1, RSET_GPR);
++      else
++	type = ra_allock(as, irt_toitype(ir->t), RSET_GPR);
++      emit_setgl(as, type, tmptv.it);
+     }
+-    if (LJ_SOFTFP && (ir+1)->o == IR_HIOP)
+-      type = ra_alloc1(as, ref+1, allow);
+-    else
+-      type = ra_allock(as, irt_toitype(ir->t), allow);
+-    emit_setgl(as, type, tmptv.it);
++  } else {
++    emit_tai(as, PPCI_ADDI, dest, RID_JGL, tmpofs);
+   }
+ }
+ 
+@@ -797,23 +840,30 @@ static void asm_hrefk(ASMState *as, IRIn
+ static void asm_uref(ASMState *as, IRIns *ir)
+ {
+   Reg dest = ra_dest(as, ir, RSET_GPR);
+-  if (irref_isk(ir->op1)) {
++  int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
++  if (irref_isk(ir->op1) && !guarded) {
+     GCfunc *fn = ir_kfunc(IR(ir->op1));
+     MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+     emit_lsptr(as, PPCI_LWZ, dest, v, RSET_GPR);
+   } else {
+-    Reg uv = ra_scratch(as, RSET_GPR);
+-    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+-    if (ir->o == IR_UREFC) {
+-      asm_guardcc(as, CC_NE);
++    if (guarded) {
++      asm_guardcc(as, ir->o == IR_UREFC ? CC_NE : CC_EQ);
+       emit_ai(as, PPCI_CMPWI, RID_TMP, 1);
+-      emit_tai(as, PPCI_ADDI, dest, uv, (int32_t)offsetof(GCupval, tv));
+-      emit_tai(as, PPCI_LBZ, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
++    }
++    if (ir->o == IR_UREFC)
++      emit_tai(as, PPCI_ADDI, dest, dest, (int32_t)offsetof(GCupval, tv));
++    else
++      emit_tai(as, PPCI_LWZ, dest, dest, (int32_t)offsetof(GCupval, v));
++    if (guarded)
++      emit_tai(as, PPCI_LBZ, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
++    if (irref_isk(ir->op1)) {
++      GCfunc *fn = ir_kfunc(IR(ir->op1));
++      int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
++      emit_loadi(as, dest, k);
+     } else {
+-      emit_tai(as, PPCI_LWZ, dest, uv, (int32_t)offsetof(GCupval, v));
++      emit_tai(as, PPCI_LWZ, dest, ra_alloc1(as, ir->op1, RSET_GPR),
++	       (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+     }
+-    emit_tai(as, PPCI_LWZ, uv, func,
+-	     (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+   }
+ }
+ 
+@@ -894,7 +944,7 @@ static void asm_fload(ASMState *as, IRIn
+   int32_t ofs;
+   if (ir->op1 == REF_NIL) {  /* FLOAD from GG_State with offset. */
+     idx = RID_JGL;
+-    ofs = (ir->op2 << 2) - 32768;
++    ofs = (ir->op2 << 2) - 32768 - GG_OFS(g);
+   } else {
+     idx = ra_alloc1(as, ir->op1, RSET_GPR);
+     if (ir->op2 == IRFL_TAB_ARRAY) {
+@@ -975,6 +1025,10 @@ static void asm_ahuvload(ASMState *as, I
+     rset_clear(allow, dest);
+   }
+   idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++  if (ir->o == IR_VLOAD) {
++    ofs = ofs != AHUREF_LSX ? ofs + 8 * ir->op2 :
++	  ir->op2 ? 8 * ir->op2 : AHUREF_LSX;
++  }
+   if (irt_isnum(t)) {
+     Reg tisnum = ra_allock(as, (int32_t)LJ_TISNUM, rset_exclude(allow, idx));
+     asm_guardcc(as, CC_GE);
+@@ -1057,7 +1111,8 @@ static void asm_sload(ASMState *as, IRIn
+   lj_assertA(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK),
+ 	     "inconsistent SLOAD variant");
+   lj_assertA(LJ_DUALNUM ||
+-	     !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)),
++	     !irt_isint(t) ||
++	     (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME|IRSLOAD_KEYINDEX)),
+ 	     "bad SLOAD type");
+ #if LJ_SOFTFP
+   lj_assertA(!(ir->op2 & IRSLOAD_CONVERT),
+@@ -1122,7 +1177,12 @@ dotypecheck:
+   } else {
+     if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+       asm_guardcc(as, CC_NE);
+-      emit_ai(as, PPCI_CMPWI, RID_TMP, irt_toitype(t));
++      if ((ir->op2 & IRSLOAD_KEYINDEX)) {
++	emit_ai(as, PPCI_CMPWI, RID_TMP, (LJ_KEYINDEX & 0xffff));
++	emit_asi(as, PPCI_XORIS, RID_TMP, RID_TMP, (LJ_KEYINDEX >> 16));
++      } else {
++	emit_ai(as, PPCI_CMPWI, RID_TMP, irt_toitype(t));
++      }
+       type = RID_TMP;
+     }
+     if (ra_hasreg(dest)) emit_tai(as, PPCI_LWZ, dest, base, ofs);
+@@ -1894,15 +1954,15 @@ static void asm_comp64(ASMState *as, IRI
+ }
+ #endif
+ 
+-/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
++/* -- Split register ops -------------------------------------------------- */
+ 
+-/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++/* Hiword op of a split 32/32 bit op. Previous op is the loword op. */
+ static void asm_hiop(ASMState *as, IRIns *ir)
+ {
+-#if LJ_HASFFI || LJ_SOFTFP
+   /* HIOP is marked as a store because it needs its own DCE logic. */
+   int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
+   if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
++#if LJ_HASFFI || LJ_SOFTFP
+   if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
+     as->curins--;  /* Always skip the CONV. */
+ #if LJ_HASFFI && !LJ_SOFTFP
+@@ -1937,12 +1997,16 @@ static void asm_hiop(ASMState *as, IRIns
+     }
+     return;
+   }
++#endif
+   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
+   switch ((ir-1)->o) {
+ #if LJ_HASFFI
+   case IR_ADD: as->curins--; asm_add64(as, ir); break;
+   case IR_SUB: as->curins--; asm_sub64(as, ir); break;
+   case IR_NEG: as->curins--; asm_neg64(as, ir); break;
++  case IR_CNEWI:
++    /* Nothing to do here. Handled by lo op itself. */
++    break;
+ #endif
+ #if LJ_SOFTFP
+   case IR_SLOAD: case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
+@@ -1950,25 +2014,16 @@ static void asm_hiop(ASMState *as, IRIns
+     if (!uselo)
+       ra_allocref(as, ir->op1, RSET_GPR);  /* Mark lo op as used. */
+     break;
++  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR: case IR_TMPREF:
++    /* Nothing to do here. Handled by lo op itself. */
++    break;
+ #endif
+-  case IR_CALLN:
+-  case IR_CALLS:
+-  case IR_CALLXS:
++  case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
+     if (!uselo)
+       ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
+     break;
+-#if LJ_SOFTFP
+-  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR:
+-#endif
+-  case IR_CNEWI:
+-    /* Nothing to do here. Handled by lo op itself. */
+-    break;
+   default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
+   }
+-#else
+-  /* Unused without SOFTFP or FFI. */
+-  UNUSED(as); UNUSED(ir); lj_assertA(0, "unexpected HIOP");
+-#endif
+ }
+ 
+ /* -- Profiling ----------------------------------------------------------- */
+@@ -2055,6 +2110,8 @@ static void asm_stack_restore(ASMState *
+       } else if ((sn & SNAP_SOFTFPNUM)) {
+ 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
+ #endif
++      } else if ((sn & SNAP_KEYINDEX)) {
++	type = ra_allock(as, (int32_t)LJ_KEYINDEX, allow);
+       } else {
+ 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+       }
+@@ -2113,6 +2170,12 @@ static void asm_loop_fixup(ASMState *as)
+   }
+ }
+ 
++/* Fixup the tail of the loop. */
++static void asm_loop_tail_fixup(ASMState *as)
++{
++  UNUSED(as);  /* Nothing to do. */
++}
++
+ /* -- Head of trace ------------------------------------------------------- */
+ 
+ /* Coalesce BASE register for a root trace. */
+@@ -2130,7 +2193,7 @@ static void asm_head_root_base(ASMState
+ }
+ 
+ /* Coalesce BASE register for a side trace. */
+-static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
++static Reg asm_head_side_base(ASMState *as, IRIns *irp)
+ {
+   IRIns *ir = IR(REF_BASE);
+   Reg r = ir->r;
+@@ -2139,15 +2202,15 @@ static RegSet asm_head_side_base(ASMStat
+     if (rset_test(as->modset, r) || irt_ismarked(ir->t))
+       ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
+     if (irp->r == r) {
+-      rset_clear(allow, r);  /* Mark same BASE register as coalesced. */
++      return r;  /* Same BASE register already coalesced. */
+     } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
+-      rset_clear(allow, irp->r);
+       emit_mr(as, r, irp->r);  /* Move from coalesced parent reg. */
++      return irp->r;
+     } else {
+       emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
+     }
+   }
+-  return allow;
++  return RID_NONE;
+ }
+ 
+ /* -- Tail of trace ------------------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_riscv64.h
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_riscv64.h
+@@ -0,0 +1,1976 @@
++/*
++** RISC-V IR assembler (SSA IR -> machine code).
++** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++/* -- Register allocator extensions --------------------------------------- */
++
++/* Allocate a register for ref; a missing hint is set first (not for crossrefs). */
++static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow)
++{
++  Reg r = IR(ref)->r;
++  if (ra_noreg(r)) {  /* No register assigned yet. */
++    if (!ra_hashint(r) && !iscrossref(as, ref))
++      ra_sethint(IR(ref)->r, hint);  /* Propagate register hint. */
++    r = ra_allocref(as, ref, allow);
++  }
++  ra_noweak(as, r);
++  return r;
++}
++
++/* Allocate a register or RID_ZERO. RID_ZERO is only returned for a constant 0 when allow contains no FPR. */
++static Reg ra_alloc1z(ASMState *as, IRRef ref, RegSet allow)
++{
++  Reg r = IR(ref)->r;
++  if (ra_noreg(r)) {
++    if (!(allow & RSET_FPR) && irref_isk(ref) && get_kval(as, ref) == 0)
++      return RID_ZERO;  /* Constant zero: no register is allocated. */
++    r = ra_allocref(as, ref, allow);
++  } else {
++    ra_noweak(as, r);
++  }
++  return r;
++}
++
++/* Allocate two source registers for three-operand instructions. Returns left | (right << 8). */
++static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow)
++{
++  IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
++  Reg left = irl->r, right = irr->r;
++  if (ra_hasreg(left)) {  /* Left already has a register. */
++    ra_noweak(as, left);
++    if (ra_noreg(right))
++      right = ra_alloc1z(as, ir->op2, rset_exclude(allow, left));
++    else
++      ra_noweak(as, right);
++  } else if (ra_hasreg(right)) {  /* Only right has a register. */
++    ra_noweak(as, right);
++    left = ra_alloc1z(as, ir->op1, rset_exclude(allow, right));
++  } else if (ra_hashint(right)) {  /* Neither has one: allocate the hinted right operand first. */
++    right = ra_alloc1z(as, ir->op2, allow);
++    left = ra_alloc1z(as, ir->op1, rset_exclude(allow, right));
++  } else {  /* Neither has one: allocate left first. */
++    left = ra_alloc1z(as, ir->op1, allow);
++    right = ra_alloc1z(as, ir->op2, rset_exclude(allow, left));
++  }
++  return left | (right << 8);  /* Callers unpack with (x >> 8) and (x & 255). */
++}
++
++/* -- Guard handling ------------------------------------------------------ */
++
++/* Copied from MIPS, AUIPC+JALR is expensive to setup in-place */
++#define RISCV_SPAREJUMP		4
++
++/* Setup spare long-range jump (trampoline?) slots per mcarea. Reserves RISCV_SPAREJUMP two-word slots below mctop. */
++
++static void asm_sparejump_setup(ASMState *as)
++{
++  MCode *mxp = as->mctop;
++  if ((char *)mxp == (char *)as->J->mcarea + as->J->szmcarea) {  /* Only when mctop is at the very end of the mcarea. */
++    for (int i = RISCV_SPAREJUMP*2; i--; )
++      *--mxp = RISCVI_EBREAK;  /* Fill both words of every slot with EBREAK. */
++    as->mctop = mxp;
++  }
++}
++
++static MCode *asm_sparejump_use(MCode *mcarea, MCode *target)
++{  /* Return a two-word AUIPC+JALR slot in mcarea that jumps to target, or NULL if none is available. */
++  MCode *mxp = (MCode *)((char *)mcarea + ((MCLink *)mcarea)->size);  /* End of the mcarea. */
++  int slot = RISCV_SPAREJUMP;
++  RISCVIns tslot = RISCVI_EBREAK, tauipc, tjalr;  /* tslot: contents of a free slot. */
++  while (slot--) {
++    mxp -= 2;  /* Step down to the next two-word slot. */
++    ptrdiff_t delta = (char *)target - (char *)mxp;  /* Offset is relative to this slot, so it differs per slot. */
++    tauipc = RISCVI_AUIPC | RISCVF_D(RID_TMP) | RISCVF_IMMU(RISCVF_HI(delta)),
++    tjalr = RISCVI_JALR | RISCVF_S1(RID_TMP) | RISCVF_IMMI(RISCVF_LO(delta));
++    if (mxp[0] == tauipc && mxp[1] == tjalr) {
++      return mxp;  /* Slot already jumps to target: reuse it. */
++    } else if (mxp[0] == tslot) {
++      mxp[0] = tauipc, mxp[1] = tjalr;  /* Free slot: claim it. */
++      return mxp;
++    }
++  }
++  return NULL;  /* All slots are taken by other targets. */
++}
++
++/* Setup exit stub after the end of each trace. Writes nexits JALs plus a 4-instruction header below mctop. */
++static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
++{
++  ExitNo i;
++  MCode *mxp = as->mctop;
++  if (mxp - (nexits + 4 + MCLIM_REDZONE) < as->mclim)
++    asm_mclimit(as);
++  for (i = nexits-1; (int32_t)i >= 0; i--)  /* One JAL per exit, each jumping back to the start of the header. */
++    *--mxp = RISCVI_JAL | RISCVF_D(RID_RA) | RISCVF_IMMJ((uintptr_t)(4*(-4-i)));
++  ptrdiff_t delta = (char *)lj_vm_exit_handler - (char *)(mxp-3);  /* mxp-3 is where the AUIPC is written below. */
++  /* 1: sd ra, 0(sp); auipc+jalr ->vm_exit_handler; lui x0, traceno; jal <1; jal <1; ... */
++  *--mxp = RISCVI_LUI | RISCVF_IMMU(as->T->traceno);
++  *--mxp = RISCVI_JALR | RISCVF_D(RID_RA) | RISCVF_S1(RID_TMP)
++         | RISCVF_IMMI(RISCVF_LO((uintptr_t)(void *)delta));
++  *--mxp = RISCVI_AUIPC | RISCVF_D(RID_TMP)
++         | RISCVF_IMMU(RISCVF_HI((uintptr_t)(void *)delta));
++  *--mxp = RISCVI_SD | RISCVF_S2(RID_RA) | RISCVF_S1(RID_SP);
++  as->mctop = mxp;  /* mctop now points at the SD, the first instruction of the stub. */
++}
++
++static MCode *asm_exitstub_addr(ASMState *as, ExitNo exitno)
++{  /* Address of the JAL for exit exitno: mctop plus the 4 header instructions plus exitno. */
++  /* Keep this in-sync with exitstub_trace_addr(). */
++  return as->mctop + exitno + 4;
++}
++
++/* Emit conditional branch to exit for guard. The exit is taken when condition riscvi holds for rs1, rs2. */
++static void asm_guard(ASMState *as, RISCVIns riscvi, Reg rs1, Reg rs2)
++{
++  MCode *target = asm_exitstub_addr(as, as->snapno);
++  MCode *p = as->mcp;
++  if (LJ_UNLIKELY(p == as->invmcp)) {  /* Guard at the loop-inversion point. */
++    as->loopinv = 1;
++    as->mcp = ++p;
++    *p = RISCVI_JAL | RISCVF_IMMJ((char *)target - (char *)p);  /* Unconditional jump to the exit stub. */
++    riscvi = riscvi^RISCVF_FUNCT3(1);  /* Invert cond. */
++    target = p - 1;  /* Patch target later in asm_loop_fixup. */
++  }
++    ptrdiff_t delta = (char *)target - (char *)(p - 1);  /* Relative to the JAL written next. */
++    *--p = RISCVI_JAL | RISCVF_IMMJ(delta);
++    *--p = (riscvi^RISCVF_FUNCT3(1)) | RISCVF_S1(rs1) | RISCVF_S2(rs2) | RISCVF_IMMB(8);  /* Inverted branch, offset 8, skips the JAL. */
++    as->mcp = p;
++}
++
++/* -- Operand fusion ------------------------------------------------------ */
++
++/* Limit linear search to this distance. Avoids O(n^2) behavior. */
++#define CONFLICT_SEARCH_LIM	31
++
++/* Check if there's no conflicting instruction between curins and ref. Returns 1 if none, 0 otherwise. */
++static int noconflict(ASMState *as, IRRef ref, IROp conflict)
++{
++  IRIns *ir = as->ir;
++  IRRef i = as->curins;
++  if (i > ref + CONFLICT_SEARCH_LIM)
++    return 0;  /* Give up, ref is too far away. */
++  while (--i > ref)  /* Scan the instructions strictly between ref and curins. */
++    if (ir[i].o == conflict)
++      return 0;  /* Conflict found. */
++  return 1;  /* Ok, no conflict. */
++}
++
++/* Fuse the array base of colocated arrays. Returns sizeof(GCtab) as the offset when fusable, else 0. */
++static int32_t asm_fuseabase(ASMState *as, IRRef ref)
++{
++  IRIns *ir = IR(ref);
++  if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE &&  /* Only a TNEW with op1 within LJ_MAX_COLOSIZE... */
++      !neverfuse(as) && noconflict(as, ref, IR_NEWREF))  /* ...and no NEWREF between ref and curins. */
++    return (int32_t)sizeof(GCtab);
++  return 0;
++}
++
++/* Fuse array/hash/upvalue reference into register+offset operand. Returns the base reg, stores the offset in *ofsp. */
++static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow)
++{
++  IRIns *ir = IR(ref);
++  if (ra_noreg(ir->r)) {  /* Fusion is only tried when ref has no register yet. */
++    if (ir->o == IR_AREF) {
++      if (mayfuse(as, ref)) {
++	if (irref_isk(ir->op2)) {  /* Constant array index. */
++	  IRRef tab = IR(ir->op1)->op1;
++	  int32_t ofs = asm_fuseabase(as, tab);
++	  IRRef refa = ofs ? tab : ir->op1;  /* Base is the table itself if the array is colocated. */
++	  ofs += 8*IR(ir->op2)->i;
++	  if (checki12(ofs)) {
++	    *ofsp = ofs;
++	    return ra_alloc1(as, refa, allow);
++	  }
++	}
++      }
++    } else if (ir->o == IR_HREFK) {
++      if (mayfuse(as, ref)) {
++	int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));  /* Slot index times node size. */
++	if (checki12(ofs)) {
++	  *ofsp = ofs;
++	  return ra_alloc1(as, ir->op1, allow);
++	}
++      }
++    } else if (ir->o == IR_UREFC) {
++      if (irref_isk(ir->op1)) {
++	GCfunc *fn = ir_kfunc(IR(ir->op1));
++	GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
++  intptr_t ofs = ((intptr_t)((uintptr_t)(&uv->tv) - (uintptr_t)&J2GG(as->J)->g));  /* Offset of uv->tv from the global state. */
++	if (checki12(ofs)) {
++	  *ofsp = (int32_t)ofs;
++	  return RID_GL;
++	}
++      }
++    } else if (ir->o == IR_TMPREF) {
++      *ofsp = (int32_t)offsetof(global_State, tmptv);
++      return RID_GL;
++    }
++  }
++  *ofsp = 0;  /* Not fused: use the reference's own register with offset 0. */
++  return ra_alloc1(as, ref, allow);
++}
++
++/* Fuse XLOAD/XSTORE reference into load/store operand. Emits riscvi for rd with the fused base+offset. */
++static void asm_fusexref(ASMState *as, RISCVIns riscvi, Reg rd, IRRef ref,
++			 RegSet allow, int32_t ofs)
++{
++  IRIns *ir = IR(ref);
++  Reg base;
++  if (ra_noreg(ir->r) && canfuse(as, ir)) {
++    intptr_t ofs2;
++    if (ir->o == IR_ADD) {
++      if (irref_isk(ir->op2) && (ofs2 = ofs + get_kval(as, ir->op2),
++				 checki12(ofs2))) {
++	ref = ir->op1;  /* Fold the constant addend into the offset. */
++	ofs = (int32_t)ofs2;
++      }
++    } else if (ir->o == IR_STRREF) {
++      ofs2 = 4096;  /* Fails checki12() below unless one operand is a constant. */
++      lj_assertA(ofs == 0, "bad usage");
++      ofs = (int32_t)sizeof(GCstr);
++      if (irref_isk(ir->op2)) {
++	ofs2 = ofs + get_kval(as, ir->op2);
++	ref = ir->op1;
++      } else if (irref_isk(ir->op1)) {
++	ofs2 = ofs + get_kval(as, ir->op1);
++	ref = ir->op2;
++      }
++      if (!checki12(ofs2)) {  /* No usable constant: add both operands into RID_TMP. */
++        /* NYI: Fuse ADD with constant. */
++        Reg right, left = ra_alloc2(as, ir, allow);
++        right = (left >> 8); left &= 255;
++        emit_lso(as, riscvi, rd, RID_TMP, ofs);
++        emit_ds1s2(as, RISCVI_ADD, RID_TMP, left, right);
++        return;
++      }
++      ofs = ofs2;
++    }
++  }
++  base = ra_alloc1(as, ref, allow);
++  emit_lso(as, riscvi, rd, base, ofs);
++}
++
++/* Fuse Integer multiply-accumulate. Returns 1 if ir was fused with an unallocated MUL operand, else 0. */
++
++static int asm_fusemac(ASMState *as, IRIns *ir, RISCVIns riscvi)
++{
++  IRRef lref = ir->op1, rref = ir->op2;
++  IRIns *irm;
++  if (lref != rref &&
++      ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
++       ra_noreg(irm->r)) ||
++       (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
++       (rref = lref, ra_noreg(irm->r))))) {  /* After this, irm is the MUL and rref the other operand. */
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg add = ra_hintalloc(as, rref, dest, RSET_GPR);
++    Reg left = ra_alloc2(as, irm,
++       rset_exclude(rset_exclude(RSET_GPR, dest), add));  /* MUL operands must not share dest or add. */
++    Reg right = (left >> 8); left &= 255;
++    emit_ds1s2(as, riscvi, dest, left, right);
++    if (dest != add) emit_mv(as, dest, add);  /* Copy the addend into dest if it lives elsewhere. */
++    return 1;
++  }
++  return 0;
++}
++
++/* Fuse FP multiply-add/sub. Needs JIT_F_OPT_FMA. riscvir is used instead of riscvi when the MUL is the right operand. */
++
++static int asm_fusemadd(ASMState *as, IRIns *ir, RISCVIns riscvi, RISCVIns riscvir)
++{
++  IRRef lref = ir->op1, rref = ir->op2;
++  IRIns *irm;
++  if ((as->flags & JIT_F_OPT_FMA) &&
++      lref != rref &&
++      ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
++       ra_noreg(irm->r)) ||
++       (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
++       (rref = lref, riscvi = riscvir, ra_noreg(irm->r))))) {  /* After this, irm is the MUL and rref the other operand. */
++    Reg dest = ra_dest(as, ir, RSET_FPR);
++    Reg add = ra_hintalloc(as, rref, dest, RSET_FPR);
++    Reg left = ra_alloc2(as, irm,
++       rset_exclude(rset_exclude(RSET_FPR, dest), add));
++    Reg right = (left >> 8); left &= 255;
++    emit_ds1s2s3(as, riscvi, dest, left, right, add);
++    return 1;  /* Fused. */
++  }
++  return 0;  /* Not fused. */
++}
++/* -- Calls --------------------------------------------------------------- */
++
++/* Generate a call to a C function. Args go to argument registers first, then to the stack in 8-byte slots. */
++static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
++{
++  uint32_t n, nargs = CCI_XNARGS(ci);
++  int32_t ofs = 0;
++  Reg gpr, fpr = REGARG_FIRSTFPR;
++  if ((void *)ci->func)  /* A NULL func means the caller emits the call itself (see asm_callx). */
++    emit_call(as, (void *)ci->func, 1);
++  for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
++    as->cost[gpr] = REGCOST(~0u, ASMREF_L);
++  gpr = REGARG_FIRSTGPR;
++  for (n = 0; n < nargs; n++) { /* Setup args. */
++    IRRef ref = args[n];
++    IRIns *ir = IR(ref);
++    if (ref) {  /* A zero ref is skipped. */
++      if (irt_isfp(ir->t)) {
++        if (fpr <= REGARG_LASTFPR) {
++	  lj_assertA(rset_test(as->freeset, fpr),
++	             "reg %d not free", fpr);  /* Must have been evicted. */
++          ra_leftov(as, fpr, ref);
++	  fpr++; if(ci->flags & CCI_VARARG) gpr++;  /* For vararg calls both counters advance together. */
++	} else if (!(ci->flags & CCI_VARARG) && gpr <= REGARG_LASTGPR) {  /* FP arg regs used up: fall back to a GPR. */
++	  lj_assertA(rset_test(as->freeset, gpr),
++	             "reg %d not free", gpr);  /* Must have been evicted. */
++          ra_leftov(as, gpr, ref);
++	  gpr++;
++	} else {  /* Pass the FP arg on the stack. */
++	  Reg r = ra_alloc1(as, ref, RSET_FPR);
++	  emit_spstore(as, ir, r, ofs);
++	  ofs += 8;
++	}
++      } else {
++        if (gpr <= REGARG_LASTGPR) {
++	  lj_assertA(rset_test(as->freeset, gpr),
++	             "reg %d not free", gpr);  /* Must have been evicted. */
++          ra_leftov(as, gpr, ref);
++	  gpr++; if(ci->flags & CCI_VARARG) fpr++;  /* For vararg calls both counters advance together. */
++	} else {  /* Pass the integer arg on the stack. */
++	  Reg r = ra_alloc1z(as, ref, RSET_GPR);
++	  emit_spstore(as, ir, r, ofs);
++	  ofs += 8;
++	}
++      }
++    }
++  }
++}
++
++/* Setup result reg/sp for call. Evict scratch regs. */
++static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
++{
++  RegSet drop = RSET_SCRATCH;
++  int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));  /* Result continues in a following HIOP. */
++  if (ra_hasreg(ir->r))
++    rset_clear(drop, ir->r);  /* Dest reg handled below. */
++  if (hiop && ra_hasreg((ir+1)->r))
++    rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
++  ra_evictset(as, drop);  /* Evictions must be performed first. */
++  if (ra_used(ir)) {
++    lj_assertA(!irt_ispri(ir->t), "PRI dest");
++    if (irt_isfp(ir->t)) {
++      if ((ci->flags & CCI_CASTU64)) {  /* FP result comes back in RID_RET: move it to an FPR. */
++        Reg dest = ra_dest(as, ir, RSET_FPR);
++  emit_ds(as, irt_isnum(ir->t) ? RISCVI_FMV_D_X : RISCVI_FMV_W_X,
++	        dest, RID_RET);
++      } else {
++	ra_destreg(as, ir, RID_FPRET);
++      }
++    } else if (hiop) {
++      ra_destpair(as, ir);
++    } else {
++      ra_destreg(as, ir, RID_RET);
++    }
++  }
++}
++
++static void asm_callx(ASMState *as, IRIns *ir)
++{  /* Assemble a call whose target is ir->op2: either a constant address or a value in a register. */
++  IRRef args[CCI_NARGS_MAX*2];
++  CCallInfo ci;
++  IRRef func;
++  IRIns *irf;
++  ci.flags = asm_callx_flags(as, ir);
++  asm_collectargs(as, ir, &ci, args);
++  asm_setupresult(as, ir, &ci);
++  func = ir->op2; irf = IR(func);
++  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }  /* Look through a CARG to the function ref. */
++  if (irref_isk(func)) {  /* Call to constant address. */
++    ci.func = (ASMFunction)(void *)get_kval(as, func);
++  } else {  /* Need specific register for indirect calls. */
++    Reg r = ra_alloc1(as, func, RID2RSET(RID_CFUNCADDR));
++    MCode *p = as->mcp;
++    *--p = RISCVI_JALR | RISCVF_D(RID_RA) | RISCVF_S1(r);
++    if (r == RID_CFUNCADDR)
++      *--p = RISCVI_ADDI | RISCVF_D(RID_CFUNCADDR) | RISCVF_S1(r);
++    else
++      *--p = RISCVI_MV | RISCVF_D(RID_CFUNCADDR) | RISCVF_S1(r);
++    as->mcp = p;
++    ci.func = (ASMFunction)(void *)0;  /* NULL func: asm_gencall() emits no call of its own. */
++  }
++  asm_gencall(as, &ci, args);
++}
++
++static void asm_callround(ASMState *as, IRIns *ir, IRCallID id)
++{  /* Call rounding helper id with ir->op1 in REGARG_FIRSTFPR; the result is taken from RID_FPRET. */
++  /* The modified regs must match with the *.dasc implementation. */
++  RegSet drop = RID2RSET(RID_X6)|RID2RSET(RID_X7)|RID2RSET(RID_F10)|
++                RID2RSET(RID_F14)|RID2RSET(RID_F1)|RID2RSET(RID_F3)|
++                RID2RSET(RID_F4);
++  if (ra_hasreg(ir->r)) rset_clear(drop, ir->r);  /* Do not evict the destination register. */
++  ra_evictset(as, drop);
++  ra_destreg(as, ir, RID_FPRET);
++  emit_call(as, (void *)lj_ir_callinfo[id].func, 0);
++  ra_leftov(as, REGARG_FIRSTFPR, ir->op1);
++}
++
++/* -- Returns ------------------------------------------------------------- */
++
++/* Return to lower frame. Guard that it goes to the right spot. */
++static void asm_retf(ASMState *as, IRIns *ir)
++{
++  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
++  void *pc = ir_kptr(IR(ir->op2));
++  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));  /* Slot count taken from operand A of the bytecode before pc. */
++  as->topslot -= (BCReg)delta;
++  if ((int32_t)as->topslot < 0) as->topslot = 0;  /* Clamp at zero. */
++  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
++  emit_setgl(as, base, jit_base);
++  emit_addptr(as, base, -8*delta);
++  asm_guard(as, RISCVI_BNE, RID_TMP,
++	    ra_allock(as, igcptr(pc), rset_exclude(RSET_GPR, base)));  /* Exit if the word loaded from base-8 differs from pc. */
++  emit_lso(as, RISCVI_LD, RID_TMP, base, -8);
++}
++
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{  /* Rewrite sb->L: cur_L combined with the SBUF_MASK_FLAG bits of the old value. Emitted in reverse, read bottom-up. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
++  IRIns irgc;
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, RID_TMP, sb, offsetof(SBuf, L));  /* Store the combined value back to sb->L. */
++  emit_ds1s2(as, RISCVI_OR, RID_TMP, RID_TMP, tmp);
++  emit_dsi(as, RISCVI_ANDI, tmp, tmp, SBUF_MASK_FLAG);  /* Keep only the flag bits of the old value. */
++  emit_getgl(as, RID_TMP, cur_L);
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));  /* Load the old sb->L. */
++}
++#endif
++
++/* -- Type conversions ---------------------------------------------------- */
++
++static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
++{  /* Checked FP to int conversion: convert, convert back and guard on equality. Emitted in reverse, read bottom-up. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
++  Reg dest = ra_dest(as, ir, RSET_GPR), cmp = ra_scratch(as, rset_exclude(RSET_GPR, dest));
++  asm_guard(as, RISCVI_BEQ, cmp, RID_ZERO);  /* Exit if the FEQ result is 0 (round trip differs). */
++  emit_ds1s2(as, RISCVI_FEQ_D, cmp, tmp, left);
++  emit_ds(as, RISCVI_FCVT_D_W, tmp, dest);  /* Convert the integer back to a double. */
++  emit_ds(as, RISCVI_FCVT_W_D, dest, left);  /* Convert the double to an integer. */
++}
++
++static void asm_tobit(ASMState *as, IRIns *ir)
++{  /* TOBIT: FP-add op1 and op2, then move the 32-bit word of the sum to the GPR dest. */
++  RegSet allow = RSET_FPR;
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left = ra_alloc1(as, ir->op1, allow);
++  Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left));  /* rset_clear() updates allow in place. */
++  Reg tmp = ra_scratch(as, rset_clear(allow, right));
++  emit_ds(as, RISCVI_FMV_X_W, dest, tmp);
++  emit_ds1s2(as, RISCVI_FADD_D, tmp, left, right);
++}
++
++static void asm_conv(ASMState *as, IRIns *ir)
++{  /* Assemble CONV: converts ir->op1 from source type st (from ir->op2) to the type of ir. */
++  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
++  int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64);
++  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
++  IRRef lref = ir->op1;
++  lj_assertA(irt_type(ir->t) != st, "inconsistent types for CONV");
++  /* Use GPR to pass floating-point arguments */
++  if (irt_isfp(ir->t) && ir->r >= RID_X10 && ir->r <= RID_X17) {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg ftmp = ra_scratch(as, RSET_FPR);  /* Convert in an FPR, then move the bits to the GPR dest. */
++    if (stfp) {  /* FP to FP conversion. */
++      emit_ds(as, st == IRT_NUM ? RISCVI_FMV_X_W : RISCVI_FMV_X_D, dest, ftmp);  /* Source NUM means the result is a float. */
++      emit_ds(as, st == IRT_NUM ? RISCVI_FCVT_S_D : RISCVI_FCVT_D_S,
++        ftmp, ra_alloc1(as, lref, RSET_FPR));
++    } else {  /* Integer to FP conversion. */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      RISCVIns riscvi = irt_isfloat(ir->t) ?
++  (((IRT_IS64 >> st) & 1) ?
++   (st == IRT_I64 ? RISCVI_FCVT_S_L : RISCVI_FCVT_S_LU) :
++   (st == IRT_INT ? RISCVI_FCVT_S_W : RISCVI_FCVT_S_WU)) :
++  (((IRT_IS64 >> st) & 1) ?
++   (st == IRT_I64 ? RISCVI_FCVT_D_L : RISCVI_FCVT_D_LU) :
++   (st == IRT_INT ? RISCVI_FCVT_D_W : RISCVI_FCVT_D_WU));
++      emit_ds(as, irt_isfloat(ir->t) ? RISCVI_FMV_X_W : RISCVI_FMV_X_D, dest, ftmp);  /* Move width follows the FP result type, not the integer source width. */
++      emit_ds(as, riscvi, ftmp, left);
++    }
++  } else if (irt_isfp(ir->t)) {
++    Reg dest = ra_dest(as, ir, RSET_FPR);
++    if (stfp) {  /* FP to FP conversion. */
++      emit_ds(as, st == IRT_NUM ? RISCVI_FCVT_S_D : RISCVI_FCVT_D_S,
++	      dest, ra_alloc1(as, lref, RSET_FPR));
++    } else {  /* Integer to FP conversion. */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      RISCVIns riscvi = irt_isfloat(ir->t) ?
++  (((IRT_IS64 >> st) & 1) ?
++   (st == IRT_I64 ? RISCVI_FCVT_S_L : RISCVI_FCVT_S_LU) :
++   (st == IRT_INT ? RISCVI_FCVT_S_W : RISCVI_FCVT_S_WU)) :
++  (((IRT_IS64 >> st) & 1) ?
++   (st == IRT_I64 ? RISCVI_FCVT_D_L : RISCVI_FCVT_D_LU) :
++   (st == IRT_INT ? RISCVI_FCVT_D_W : RISCVI_FCVT_D_WU));
++      emit_ds(as, riscvi, dest, left);
++    }
++  } else if (stfp) {  /* FP to integer conversion. */
++    if (irt_isguard(ir->t)) {
++      /* Checked conversions are only supported from number to int. */
++      lj_assertA(irt_isint(ir->t) && st == IRT_NUM,
++		 "bad type for checked CONV");
++      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
++    } else {
++      Reg left = ra_alloc1(as, lref, RSET_FPR);
++      Reg dest = ra_dest(as, ir, RSET_GPR);
++      RISCVIns riscvi = irt_is64(ir->t) ?
++  (st == IRT_NUM ?
++   (irt_isi64(ir->t) ? RISCVI_FCVT_L_D : RISCVI_FCVT_LU_D) :
++   (irt_isi64(ir->t) ? RISCVI_FCVT_L_S : RISCVI_FCVT_LU_S)) :
++  (st == IRT_NUM ?
++   (irt_isint(ir->t) ? RISCVI_FCVT_W_D : RISCVI_FCVT_WU_D) :
++   (irt_isint(ir->t) ? RISCVI_FCVT_W_S : RISCVI_FCVT_WU_S));
++      emit_ds(as, riscvi|RISCVF_RM(RISCVRM_RTZ), dest, left);  /* Unchecked conversion uses rounding mode RTZ. */
++    }
++  } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_alloc1(as, lref, RSET_GPR);
++    RISCVIns riscvi = st == IRT_I8 ? RISCVI_SEXT_B :
++    st == IRT_U8 ? RISCVI_ZEXT_B :
++    st == IRT_I16 ? RISCVI_SEXT_H : RISCVI_ZEXT_H;
++    lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t), "bad type for CONV EXT");
++    emit_ext(as, riscvi, dest, left);
++  } else {  /* 32/64 bit integer conversions. */
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    if (irt_is64(ir->t)) {
++	    if (st64) {
++	/* 64/64 bit no-op (cast)*/
++	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
++      } else {  /* 32 to 64 bit extension (sign or zero, per IRCONV_SEXT). */
++	Reg left = ra_alloc1(as, lref, RSET_GPR);
++	  if ((ir->op2 & IRCONV_SEXT)) {  /* 32 to 64 bit sign extension. */
++	    emit_ext(as, RISCVI_SEXT_W, dest, left);
++	  } else {  /* 32 to 64 bit zero extension. */
++	    emit_ext(as, RISCVI_ZEXT_W, dest, left);
++	  }
++	    }
++    } else {
++	    if (st64 && !(ir->op2 & IRCONV_NONE)) {
++	/* This is either a 32 bit reg/reg mov which zeroes the hiword
++	** or a load of the loword from a 64 bit address.
++	*/
++	Reg left = ra_alloc1(as, lref, RSET_GPR);
++	emit_ext(as, RISCVI_ZEXT_W, dest, left);
++	    } else {  /* 32/32 bit no-op (cast). */
++	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
++    	}
++    }
++  }
++}
++
++static void asm_strto(ASMState *as, IRIns *ir)
++{  /* STRTO: call lj_strscan_num(str, TValue *n); the result lands in a stack slot; guard on the return status. */
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
++  IRRef args[2];
++  int32_t ofs = SPOFS_TMP;
++  RegSet drop = RSET_SCRATCH;
++  if (ra_hasreg(ir->r)) rset_set(drop, ir->r);  /* Spill dest reg (if any). */
++  ra_evictset(as, drop);
++  if (ir->s) ofs = sps_scale(ir->s);  /* Use the spill slot of ir if it has one, else SPOFS_TMP. */
++  asm_guard(as, RISCVI_BEQ, RID_RET, RID_ZERO);  /* Test return status. */
++  args[0] = ir->op1;      /* GCstr *str */
++  args[1] = ASMREF_TMP1;  /* TValue *n  */
++  asm_gencall(as, ci, args);
++  /* Store the result to the spill slot or temp slots. */
++  Reg tmp = ra_releasetmp(as, ASMREF_TMP1);
++  emit_opk(as, RISCVI_ADDI, tmp, RID_SP, tmp, ofs);  /* Second argument: address RID_SP + ofs. */
++}
++
++/* -- Memory references --------------------------------------------------- */
++
++/* Store tagged value for ref at base+ofs. The type tag is irt_toitype() shifted left by 47. */
++static void asm_tvstore64(ASMState *as, Reg base, int32_t ofs, IRRef ref)
++{
++  RegSet allow = rset_exclude(RSET_GPR, base);
++  IRIns *ir = IR(ref);
++  lj_assertA(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t),
++	     "store of IR type %d", irt_type(ir->t));
++  if (irref_isk(ref)) {  /* Constant: store the precomputed 64-bit TValue. */
++    TValue k;
++    lj_ir_kvalue(as->J->L, &k, ir);
++    emit_lso(as, RISCVI_SD, ra_allock(as, (int64_t)k.u64, allow), base, ofs);
++  } else {
++    Reg src = ra_alloc1(as, ref, allow);
++    rset_clear(allow, src);
++    Reg type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);
++    emit_lso(as, RISCVI_SD, RID_TMP, base, ofs);  /* Store the tagged value built in RID_TMP. */
++    if (irt_isinteger(ir->t)) {
++      emit_ds1s2(as, RISCVI_ADD, RID_TMP, RID_TMP, type);
++      emit_ext(as, RISCVI_ZEXT_W, RID_TMP, src);  /* Integers are zero-extended before the tag is added. */
++    } else {
++      emit_ds1s2(as, RISCVI_ADD, RID_TMP, src, type);
++    }
++  }
++}
++
++/* Get pointer to TValue. dest receives either the address of a number constant or of g->tmptv. */
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)	// todo-new
++{
++  if ((mode & IRTMPREF_IN1)) {  /* An input value has to be stored into the TValue. */
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if (irref_isk(ref) && !(mode & IRTMPREF_OUT1)) {
++  /* Use the number constant itself as a TValue. */
++  ra_allockreg(as, igcptr(ir_knum(ir)), dest);
++  return;
++      }
++      emit_lso(as, RISCVI_FSD, ra_alloc1(as, ref, RSET_FPR), dest, 0);
++    } else {
++      asm_tvstore64(as, dest, 0, ref);
++    }
++  }
++  /* g->tmptv holds the TValue(s). */
++  emit_opk(as, RISCVI_ADDI, dest, RID_GL, dest, offsetof(global_State, tmptv));
++}
++
++static void asm_aref(ASMState *as, IRIns *ir)
++{  /* AREF: compute the address of an array slot (8 bytes per slot) into dest. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg idx, base;
++  if (irref_isk(ir->op2)) {  /* Constant index: try a single ADDI. */
++    IRRef tab = IR(ir->op1)->op1;
++    int32_t ofs = asm_fuseabase(as, tab);
++    IRRef refa = ofs ? tab : ir->op1;  /* Base is the table itself if the array is colocated. */
++    ofs += 8*IR(ir->op2)->i;
++    if (checki12(ofs)) {
++      base = ra_alloc1(as, refa, RSET_GPR);
++      emit_dsi(as, RISCVI_ADDI, dest, base, ofs);
++      return;
++    }
++  }
++  base = ra_alloc1(as, ir->op1, RSET_GPR);
++  idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
++  emit_sh3add(as, dest, base, idx, RID_TMP);  /* Presumably dest = base + (idx << 3), RID_TMP as scratch; confirm in emit_sh3add(). */
++}
++
++/* Inlined hash lookup. Specialized for key type and for const keys.
++** The equivalent C code is:
++**   Node *n = hashkey(t, key);
++**   do {
++**     if (lj_obj_equal(&n->key, key)) return &n->val;
++**   } while ((n = nextnode(n)));
++**   return niltv(L);
++*/
++static void asm_href(ASMState *as, IRIns *ir, IROp merge)
++{  /* merge == IR_NE: exit when the key is not found. merge == IR_EQ: exit when the key is found. */
++  RegSet allow = RSET_GPR;
++  int destused = ra_used(ir);
++  Reg dest = ra_dest(as, ir, allow);
++  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
++  Reg key = RID_NONE, type = RID_NONE, tmpnum = RID_NONE, tmp1, tmp2;
++  Reg cmp64 = RID_NONE;
++  IRRef refkey = ir->op2;
++  IRIns *irkey = IR(refkey);
++  int isk = irref_isk(refkey);
++  IRType1 kt = irkey->t;
++  uint32_t khash;
++  MCLabel l_end, l_loop, l_next;
++  rset_clear(allow, tab);
++  tmp1 = ra_scratch(as, allow);
++  rset_clear(allow, tmp1);
++  tmp2 = ra_scratch(as, allow);
++  rset_clear(allow, tmp2);
++
++  if (irt_isnum(kt)) {
++    key = ra_alloc1(as, refkey, RSET_FPR);
++    tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key));
++  } else {
++    /* Allocate cmp64 register used for 64-bit comparisons */
++    if (!isk && irt_isaddr(kt)) {
++      cmp64 = tmp2;  /* Filled with key + type tag further below. */
++    } else {
++      int64_t k;
++      if (isk && irt_isaddr(kt)) {
++	k = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
++      } else {
++	lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
++	k = ~((int64_t)~irt_toitype(kt) << 47);
++      }
++      cmp64 = ra_allock(as, k, allow);
++      rset_clear(allow, cmp64);
++    }
++    if (!irt_ispri(kt)) {
++      key = ra_alloc1(as, refkey, allow);
++      rset_clear(allow, key);
++    }
++  } 
++
++  /* Key not found in chain: jump to exit (if merged) or load niltv. */
++  l_end = emit_label(as);
++  int is_lend_exit = 0;
++  as->invmcp = NULL;
++  if (merge == IR_NE)
++    asm_guard(as, RISCVI_BEQ, RID_ZERO, RID_ZERO);  /* Condition always holds: always exits. */
++  else if (destused)
++    emit_loada(as, dest, niltvg(J2G(as->J)));
++
++  /* Follow hash chain until the end. */
++  l_loop = --as->mcp;  /* Reserve one instruction slot; the loop branch is filled in below. */
++  emit_mv(as, dest, tmp1);
++  emit_lso(as, RISCVI_LD, tmp1, dest, (int32_t)offsetof(Node, next));
++  l_next = emit_label(as);
++
++  /* Type and value comparison. */
++  if (merge == IR_EQ) {  /* Must match asm_guard(). */
++    l_end = asm_exitstub_addr(as, as->snapno);
++    is_lend_exit = 1;
++  }
++  if (irt_isnum(kt)) {
++    emit_branch(as, RISCVI_BNE, tmp1, RID_ZERO, l_end, is_lend_exit);
++    emit_ds1s2(as, RISCVI_FEQ_D, tmp1, tmpnum, key);
++    emit_branch(as, RISCVI_BEQ, tmp1, RID_ZERO, l_next, 0);
++    emit_dsi(as, RISCVI_SLTIU, tmp1, tmp1, ((int32_t)LJ_TISNUM));
++    emit_dsshamt(as, RISCVI_SRAI, tmp1, tmp1, 47);
++    emit_ds(as, RISCVI_FMV_D_X, tmpnum, tmp1);
++  } else {
++    emit_branch(as, RISCVI_BEQ, tmp1, cmp64, l_end, is_lend_exit);
++  }
++  emit_lso(as, RISCVI_LD, tmp1, dest, (int32_t)offsetof(Node, key.u64));
++  *l_loop = RISCVI_BNE | RISCVF_S1(tmp1) | RISCVF_S2(RID_ZERO)
++          | RISCVF_IMMB((char *)as->mcp-(char *)l_loop);  /* Fill the reserved slot: branch to the current mcp while tmp1 != 0. */
++  if (!isk && irt_isaddr(kt)) {
++    type = ra_allock(as, (int64_t)irt_toitype(kt) << 47, allow);
++    emit_ds1s2(as, RISCVI_ADD, tmp2, key, type);  /* tmp2 is cmp64 here: key plus type tag. */
++    rset_clear(allow, type);
++  }
++
++  /* Load main position relative to tab->node into dest. */
++  khash = isk ? ir_khash(as, irkey) : 1;
++  if (khash == 0) {
++    emit_lso(as, RISCVI_LD, dest, tab, (int32_t)offsetof(GCtab, node));
++  } else {
++    Reg tmphash = tmp1;
++    if (isk)
++      tmphash = ra_allock(as, khash, allow);
++    /* node = tab->node + (idx*32-idx*8) */
++    emit_ds1s2(as, RISCVI_ADD, dest, dest, tmp1);
++    lj_assertA(sizeof(Node) == 24, "bad Node size");
++    emit_ds1s2(as, RISCVI_SUBW, tmp1, tmp2, tmp1);
++    emit_dsshamt(as, RISCVI_SLLIW, tmp1, tmp1, 3);
++    emit_dsshamt(as, RISCVI_SLLIW, tmp2, tmp1, 5);
++    emit_ds1s2(as, RISCVI_AND, tmp1, tmp2, tmphash);	// idx = hi & tab->hmask
++    emit_lso(as, RISCVI_LD, dest, tab, (int32_t)offsetof(GCtab, node));
++    emit_lso(as, RISCVI_LW, tmp2, tab, (int32_t)offsetof(GCtab, hmask));
++    if (isk) {
++      /* Nothing to do. */
++    } else if (irt_isstr(kt)) {
++      emit_lso(as, RISCVI_LW, tmp1, key, (int32_t)offsetof(GCstr, sid));
++    } else {  /* Must match with hash*() in lj_tab.c. */
++      emit_ds1s2(as, RISCVI_SUBW, tmp1, tmp1, tmp2);
++      emit_roti(as, RISCVI_RORIW, tmp2, tmp2, dest, (-HASH_ROT3)&0x1f);
++      emit_ds1s2(as, RISCVI_XOR, tmp1, tmp1, tmp2);
++      emit_roti(as, RISCVI_RORIW, tmp1, tmp1, dest, (-HASH_ROT2-HASH_ROT1)&0x1f);
++      emit_ds1s2(as, RISCVI_SUBW, tmp2, tmp2, dest);
++      emit_ds1s2(as, RISCVI_XOR, tmp2, tmp2, tmp1);
++      emit_roti(as, RISCVI_RORIW, dest, tmp1, RID_TMP, (-HASH_ROT1)&0x1f);
++      if (irt_isnum(kt)) {
++	emit_dsshamt(as, RISCVI_SLLIW, tmp1, tmp1, 1);
++	emit_dsshamt(as, RISCVI_SRAI, tmp1, tmp1, 32);	// hi
++	emit_ext(as, RISCVI_SEXT_W, tmp2, tmp1);	// lo
++	emit_ds(as, RISCVI_FMV_X_D, tmp1, key);
++      } else {
++	checkmclim(as);
++	emit_dsshamt(as, RISCVI_SRAI, tmp1, tmp1, 32);	// hi
++	emit_ext(as, RISCVI_SEXT_W, tmp2, key);	// lo
++	emit_ds1s2(as, RISCVI_ADD, tmp1, key, type);
++      }
++    }
++  }
++}
++
++static void asm_hrefk(ASMState *as, IRIns *ir)
++{  /* HREFK: guard that the key stored in a fixed hash slot equals the expected constant key. */
++  IRIns *kslot = IR(ir->op2);
++  IRIns *irkey = IR(kslot->op1);
++  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
++  int32_t kofs = ofs + (int32_t)offsetof(Node, key);
++  int bigofs = !checki12(kofs);  /* Key offset rejected by checki12(): the slot address is built in dest. */
++  Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
++  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
++  RegSet allow = rset_exclude(RSET_GPR, node);
++  Reg idx = node;
++  int64_t k;
++  lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
++  if (bigofs) {
++    idx = dest;  /* Load the key relative to the computed slot address. */
++    rset_clear(allow, dest);
++    kofs = (int32_t)offsetof(Node, key);
++  } else if (ra_hasreg(dest)) {
++    emit_dsi(as, RISCVI_ADDI, dest, node, ofs);
++  }
++  if (irt_ispri(irkey->t)) {  /* Build the expected 64-bit key value k. */
++    lj_assertA(!irt_isnil(irkey->t), "bad HREFK key type");
++    k = ~((int64_t)~irt_toitype(irkey->t) << 47);
++  } else if (irt_isnum(irkey->t)) {
++    k = (int64_t)ir_knum(irkey)->u64;
++  } else {
++    k = ((int64_t)irt_toitype(irkey->t) << 47) | (int64_t)ir_kgc(irkey);
++  }
++  asm_guard(as, RISCVI_BNE, RID_TMP, ra_allock(as, k, allow));  /* Exit if the loaded key differs from k. */
++  emit_lso(as, RISCVI_LD, RID_TMP, idx, kofs);
++  if (bigofs)
++    emit_ds1s2(as, RISCVI_ADD, dest, node, ra_allock(as, ofs, allow));
++}
++
++static void asm_uref(ASMState *as, IRIns *ir)
++{  /* UREFO/UREFC: load the upvalue pointer into dest, optionally guarding on the upvalue's closed flag. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
++  if (irref_isk(ir->op1) && !guarded) {  /* Constant function, no guard: load uv.v from its fixed address. */
++    GCfunc *fn = ir_kfunc(IR(ir->op1));
++    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
++    emit_lsptr(as, RISCVI_LD, dest, v, RSET_GPR);
++  } else {
++    if (guarded)
++      asm_guard(as, ir->o == IR_UREFC ? RISCVI_BEQ : RISCVI_BNE, RID_TMP, RID_ZERO);  /* UREFC exits if closed == 0, otherwise exits if closed != 0. */
++    if (ir->o == IR_UREFC)
++      emit_dsi(as, RISCVI_ADDI, dest, dest, (int32_t)offsetof(GCupval, tv));
++    else
++      emit_lso(as, RISCVI_LD, dest, dest, (int32_t)offsetof(GCupval, v));
++    if (guarded)
++      emit_lso(as, RISCVI_LBU, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
++    if (irref_isk(ir->op1)) {
++      GCfunc *fn = ir_kfunc(IR(ir->op1));
++      GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
++      emit_loada(as, dest, o);
++    } else {  /* Load the upvalue object from the function's uvptr array. */
++      emit_lso(as, RISCVI_LD, dest, ra_alloc1(as, ir->op1, RSET_GPR),
++         (int32_t)offsetof(GCfuncL, uvptr) +
++         (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
++    }
++  }
++}
++
++static void asm_fref(ASMState *as, IRIns *ir)
++{  /* FREF emits no code; it only asserts that the result is unused. */
++  UNUSED(as); UNUSED(ir);
++  lj_assertA(!ra_used(ir), "unfused FREF");
++}
++
++static void asm_strref(ASMState *as, IRIns *ir)
++{  /* STRREF: dest = op1 + op2 + sizeof(GCstr). */
++  RegSet allow = RSET_GPR;
++  Reg dest = ra_dest(as, ir, allow);
++  Reg base = ra_alloc1(as, ir->op1, allow);
++  IRIns *irr = IR(ir->op2);
++  int32_t ofs = sizeof(GCstr);
++  rset_clear(allow, base);
++  if (irref_isk(ir->op2) && checki12(ofs + irr->i)) {  /* Constant op2 accepted by checki12(): a single ADDI. */
++    emit_dsi(as, RISCVI_ADDI, dest, base, ofs + irr->i);
++  } else {  /* Otherwise: ADD the operands, then ADDI the header size. */
++    emit_dsi(as, RISCVI_ADDI, dest, dest, ofs);
++    emit_ds1s2(as, RISCVI_ADD, dest, base, ra_alloc1(as, ir->op2, allow));
++  }
++}
++
++/* -- Loads and stores ---------------------------------------------------- */
++
++static RISCVIns asm_fxloadins(IRIns *ir)
++{  /* Select the load instruction for the result type of ir. */
++  switch (irt_type(ir->t)) {
++  case IRT_I8: return RISCVI_LB;
++  case IRT_U8: return RISCVI_LBU;
++  case IRT_I16: return RISCVI_LH;
++  case IRT_U16: return RISCVI_LHU;
++  case IRT_NUM: return RISCVI_FLD;
++  case IRT_FLOAT: return RISCVI_FLW;
++  default: return irt_is64(ir->t) ? RISCVI_LD : RISCVI_LW;  /* Every other type: LD if 64 bit, else LW. */
++  }
++}
++
++static RISCVIns asm_fxstoreins(IRIns *ir)
++{  /* Select the store instruction for the type of ir. */
++  switch (irt_type(ir->t)) {
++  case IRT_I8: case IRT_U8: return RISCVI_SB;
++  case IRT_I16: case IRT_U16: return RISCVI_SH;
++  case IRT_NUM: return RISCVI_FSD;
++  case IRT_FLOAT: return RISCVI_FSW;
++  default: return irt_is64(ir->t) ? RISCVI_SD : RISCVI_SW;  /* Every other type: SD if 64 bit, else SW. */
++  }
++}
++
++static void asm_fload(ASMState *as, IRIns *ir)
++{  /* FLOAD: load field op2 of object op1, or from GG_State if op1 is REF_NIL. */
++  RegSet allow = RSET_GPR;
++  Reg idx, dest = ra_dest(as, ir, allow);
++  rset_clear(allow, dest);
++  RISCVIns riscvi = asm_fxloadins(ir);  /* Load opcode chosen by result type. */
++  int32_t ofs;
++  if (ir->op1 == REF_NIL) {  /* FLOAD from GG_State with offset. */
++    idx = RID_GL;
++    ofs = (ir->op2 << 2) - GG_OFS(g);  /* op2*4, rebased by GG_OFS(g) for RID_GL. */
++  } else {
++    idx = ra_alloc1(as, ir->op1, allow);
++    if (ir->op2 == IRFL_TAB_ARRAY) {
++      ofs = asm_fuseabase(as, ir->op1);  /* Non-zero: array offset is known at compile time. */
++      if (ofs) {  /* Turn the t->array load into an add for colocated arrays. */
++	emit_dsi(as, RISCVI_ADDI, dest, idx, ofs);
++	return;
++      }
++    }
++    ofs = field_ofs[ir->op2];  /* Field id -> byte offset. */
++    lj_assertA(!irt_isfp(ir->t), "bad FP FLOAD");
++  }
++  rset_clear(allow, idx);
++  emit_lso(as, riscvi, dest, idx, ofs);
++}
++
++static void asm_fstore(ASMState *as, IRIns *ir)
++{  /* FSTORE: store op2 into the object field described by the IR ins at op1. */
++  if (ir->r != RID_SINK) {  /* Sunk stores emit nothing. */
++    Reg src = ra_alloc1z(as, ir->op2, RSET_GPR);
++    IRIns *irf = IR(ir->op1);  /* Field reference: op1 = object, op2 = field id. */
++    Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
++    int32_t ofs = field_ofs[irf->op2];
++    lj_assertA(!irt_isfp(ir->t), "bad FP FSTORE");
++    emit_lso(as, asm_fxstoreins(ir), src, idx, ofs);
++  }
++}
++
++static void asm_xload(ASMState *as, IRIns *ir)
++{  /* XLOAD: typed load from the address ref in op1; FP types get an FPR. */
++  Reg dest = ra_dest(as, ir, (irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR);
++  lj_assertA(LJ_TARGET_UNALIGNED || !(ir->op2 & IRXLOAD_UNALIGNED),
++	     "unaligned XLOAD");
++  asm_fusexref(as, asm_fxloadins(ir), dest, ir->op1, RSET_GPR, 0);  /* Opcode, data reg, address ref, offset 0. */
++}
++
++static void asm_xstore_(ASMState *as, IRIns *ir, int32_t ofs)
++{  /* XSTORE: typed store of op2 to the address ref in op1, plus byte offset ofs. */
++  if (ir->r != RID_SINK) {  /* Sunk stores emit nothing. */
++    Reg src = ra_alloc1z(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
++    asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
++	  	 rset_exclude(RSET_GPR, src), ofs);
++  }
++}
++
++#define asm_xstore(as, ir)	asm_xstore_(as, ir, 0)  /* Plain XSTORE: no extra offset. */
++
++static void asm_ahuvload(ASMState *as, IRIns *ir)
++{  /* Guarded load of a tagged 64 bit slot addressed via asm_fuseahuref(). */
++  Reg dest = RID_NONE, type = RID_TMP, idx;
++  RegSet allow = RSET_GPR;
++  int32_t ofs = 0;
++  IRType1 t = ir->t;
++  if (ra_used(ir)) {
++    lj_assertA((irt_isnum(ir->t)) || irt_isint(ir->t) || irt_isaddr(ir->t),
++	       "bad load type %d", irt_type(ir->t));
++    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
++    rset_clear(allow, dest);
++    if (irt_isaddr(t)) {
++      emit_cleartp(as, dest, dest);  /* Strip the type tag from the loaded pointer. */
++    } else if (irt_isint(t))
++      emit_ext(as, RISCVI_SEXT_W, dest, dest);  /* Keep only the sign-extended low 32 bits. */
++  }
++  idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++  if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;  /* VLOAD: op2 is an index of 8 byte slots. */
++  rset_clear(allow, idx);
++  if (irt_isnum(t)) {  /* Number check: exit unless tag < LJ_TISNUM (unsigned). */
++    asm_guard(as, RISCVI_BEQ, RID_TMP, RID_ZERO);
++    emit_dsi(as, RISCVI_SLTIU, RID_TMP, type, (int32_t)LJ_TISNUM);
++  } else {  /* Other types: exit unless the tag matches exactly. */
++    asm_guard(as, RISCVI_BNE, type,
++	      ra_allock(as, (int32_t)irt_toitype(t), allow));
++  }
++  if (ra_hasreg(dest)) {
++    if (irt_isnum(t)) {  /* FP result: load it separately, tag check uses RID_TMP. */
++      emit_lso(as, RISCVI_FLD, dest, idx, ofs);
++      dest = type;
++    }
++  } else {
++    dest = type;  /* Unused result: still load the slot for the type check. */
++  }
++  emit_dsshamt(as, RISCVI_SRAI, type, dest, 47);  /* Tag = raw slot >> 47 (arithmetic). */
++  emit_lso(as, RISCVI_LD, dest, idx, ofs);  /* Executes first: load the raw 64 bit slot. */
++}
++
++static void asm_ahustore(ASMState *as, IRIns *ir)
++{  /* Store a tagged value to the slot addressed via asm_fuseahuref(). */
++  RegSet allow = RSET_GPR;
++  Reg idx, src = RID_NONE, type = RID_NONE;
++  int32_t ofs = 0;
++  if (ir->r == RID_SINK)  /* Sunk stores emit nothing. */
++    return;
++  if (irt_isnum(ir->t)) {  /* Numbers are stored as raw doubles. */
++    src = ra_alloc1(as, ir->op2, RSET_FPR);
++    idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++    emit_lso(as, RISCVI_FSD, src, idx, ofs);
++  } else {
++    Reg tmp = RID_TMP;
++    if (irt_ispri(ir->t)) {  /* Primitives: the whole slot is a constant. */
++      tmp = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
++      rset_clear(allow, tmp);
++    } else {  /* Otherwise combine the payload with the tag shifted to bit 47. */
++      src = ra_alloc1(as, ir->op2, allow);
++      rset_clear(allow, src);
++      type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);
++      rset_clear(allow, type);
++    }
++    idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++    emit_lso(as, RISCVI_SD, tmp, idx, ofs);  /* Executes last: store the assembled slot. */
++    if (ra_hasreg(src)) {
++      if (irt_isinteger(ir->t)) {  /* Zero-extend first so the tag add sees clean upper bits. */
++	emit_ds1s2(as, RISCVI_ADD, tmp, tmp, type);
++  emit_ext(as, RISCVI_ZEXT_W, tmp, src);
++      } else {
++	emit_ds1s2(as, RISCVI_ADD, tmp, src, type);
++      }
++    }
++  }
++}
++
++static void asm_sload(ASMState *as, IRIns *ir)
++{  /* SLOAD: load stack slot op1 relative to BASE; op2 flags select type check/conversion. */
++  Reg dest = RID_NONE, type = RID_NONE, base;
++  RegSet allow = RSET_GPR;
++  IRType1 t = ir->t;
++  int32_t ofs = 8*((int32_t)ir->op1-2);  /* Byte offset of the slot from BASE. */
++  lj_assertA(checki12(ofs), "sload IR operand out of range");
++  lj_assertA(!(ir->op2 & IRSLOAD_PARENT),
++	     "bad parent SLOAD");  /* Handled by asm_head_side(). */
++  lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
++	     "inconsistent SLOAD variant");
++  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
++    dest = ra_scratch(as, RSET_FPR);
++    asm_tointg(as, ir, dest);
++    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
++  } else if (ra_used(ir)) {
++    Reg tmp = RID_NONE;
++    if ((ir->op2 & IRSLOAD_CONVERT))  /* Conversion source lives in the other register class. */
++      tmp = ra_scratch(as, irt_isint(t) ? RSET_FPR : RSET_GPR);
++    lj_assertA((irt_isnum(t)) || irt_isint(t) || irt_isaddr(t),
++	       "bad SLOAD type %d", irt_type(t));
++    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
++    rset_clear(allow, dest);
++    base = ra_alloc1(as, REF_BASE, allow);
++    rset_clear(allow, base);
++    if (irt_isaddr(t)) { /* Clear type from pointers. */
++      emit_cleartp(as, dest, dest);
++    } else if (ir->op2 & IRSLOAD_CONVERT) {
++      if (irt_isint(t)) {  /* Number slot -> int result, truncating (RTZ). */
++	emit_ds(as, RISCVI_FCVT_W_D|RISCVF_RM(RISCVRM_RTZ), dest, tmp);
++  /* If value is already loaded for type check, move it to FPR. */
++	if ((ir->op2 & IRSLOAD_TYPECHECK))
++	  emit_ds(as, RISCVI_FMV_D_X, tmp, dest);
++	else
++	  dest = tmp;
++	t.irt = IRT_NUM;  /* Check for original type. */
++      } else {  /* Int slot -> number result. */
++	emit_ds(as, RISCVI_FCVT_D_W, dest, tmp);
++	dest = tmp;
++	t.irt = IRT_INT;  /* Check for original type. */
++      }
++    } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) {
++      /* Sign-extend integers. */
++      emit_ext(as, RISCVI_SEXT_W, dest, dest);
++    }
++    goto dotypecheck;
++  }
++  base = ra_alloc1(as, REF_BASE, allow);
++  rset_clear(allow, base);
++dotypecheck:
++  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
++    type = dest < RID_MAX_GPR ? dest : RID_TMP;  /* Raw slot goes to dest if it is a GPR. */
++    if (irt_ispri(t)) {  /* Primitives: compare the whole slot against a constant. */
++      asm_guard(as, RISCVI_BNE, type,
++		ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow));
++    } else if ((ir->op2 & IRSLOAD_KEYINDEX)) {  /* Upper 32 bits must equal LJ_KEYINDEX. */
++      asm_guard(as, RISCVI_BNE, RID_TMP,
++               ra_allock(as, (int32_t)LJ_KEYINDEX, allow));
++      emit_dsshamt(as, RISCVI_SRAI, RID_TMP, type, 32);
++    } else {
++      if (irt_isnum(t)) {  /* Number check: exit unless tag < LJ_TISNUM (unsigned). */
++        asm_guard(as, RISCVI_BEQ, RID_TMP, RID_ZERO);
++        emit_dsi(as, RISCVI_SLTIU, RID_TMP, RID_TMP, LJ_TISNUM);
++	if (ra_hasreg(dest)) {
++	  emit_lso(as, RISCVI_FLD, dest, base, ofs);
++	}
++      } else {  /* Other types: exit unless the tag matches exactly. */
++	asm_guard(as, RISCVI_BNE, RID_TMP,
++		  ra_allock(as, (int32_t)irt_toitype(t), allow));
++      }
++      emit_dsshamt(as, RISCVI_SRAI, RID_TMP, type, 47);  /* Tag = raw slot >> 47. */
++    }
++    emit_lso(as, RISCVI_LD, type, base, ofs);  /* Executes first: load the raw slot. */
++  } else if (ra_hasreg(dest)) {  /* No type check: plain load sized by type. */
++    emit_lso(as, irt_isnum(t) ? RISCVI_FLD :
++             irt_isint(t) ? RISCVI_LW : RISCVI_LD,
++             dest, base, ofs);
++  }
++}
++
++/* -- Allocations --------------------------------------------------------- */
++
++#if LJ_HASFFI
++static void asm_cnew(ASMState *as, IRIns *ir)
++{  /* CNEW/CNEWI: allocate a cdata object via a runtime call and initialize it. */
++  CTState *cts = ctype_ctsG(J2G(as->J));
++  CTypeID id = (CTypeID)IR(ir->op1)->i;
++  CTSize sz;
++  CTInfo info = lj_ctype_info(cts, id, &sz);
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
++  IRRef args[4];
++  RegSet drop = RSET_SCRATCH;
++  lj_assertA(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL),
++	     "bad CNEW/CNEWI operands");
++
++  as->gcsteps++;  /* Read back by asm_gc_check(). */
++  if (ra_hasreg(ir->r))
++    rset_clear(drop, ir->r);  /* Dest reg handled below. */
++  ra_evictset(as, drop);
++  if (ra_used(ir))
++    ra_destreg(as, ir, RID_RET);  /* GCcdata * */
++
++  /* Initialize immutable cdata object. */
++  if (ir->o == IR_CNEWI) {
++    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);  /* Value must not sit in a scratch reg. */
++    emit_lso(as, sz == 8 ? RISCVI_SD : RISCVI_SW, ra_alloc1(as, ir->op2, allow),
++	     RID_RET, (sizeof(GCcdata)));
++    lj_assertA(sz == 4 || sz == 8, "bad CNEWI size %d", sz);
++  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
++    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
++    args[0] = ASMREF_L;     /* lua_State *L */
++    args[1] = ir->op1;      /* CTypeID id   */
++    args[2] = ir->op2;      /* CTSize sz    */
++    args[3] = ASMREF_TMP1;  /* CTSize align */
++    asm_gencall(as, ci, args);
++    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
++    return;
++  }
++
++  /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
++  emit_lso(as, RISCVI_SB, RID_RET+1, RID_RET, (offsetof(GCcdata, gct)));
++  emit_lso(as, RISCVI_SH, RID_TMP, RID_RET, (offsetof(GCcdata, ctypeid)));
++  emit_loadk12(as, RID_RET+1, ~LJ_TCDATA);  /* Value for the gct byte. */
++  emit_loadk32(as, RID_TMP, id);  /* Value for the ctypeid field. */
++  args[0] = ASMREF_L;     /* lua_State *L */
++  args[1] = ASMREF_TMP1;  /* MSize size   */
++  asm_gencall(as, ci, args);
++  ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)),
++         ra_releasetmp(as, ASMREF_TMP1));  /* size = payload + cdata header. */
++}
++#endif
++
++/* -- Write barriers ------------------------------------------------------ */
++
++static void asm_tbar(ASMState *as, IRIns *ir)
++{  /* TBAR: table write barrier. Emitted backwards, so read the emit_* calls bottom-up. */
++  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
++  Reg mark = ra_scratch(as, rset_exclude(RSET_GPR, tab));
++  Reg link = RID_TMP;
++  MCLabel l_end = emit_label(as);
++  emit_lso(as, RISCVI_SD, link, tab, (int32_t)offsetof(GCtab, gclist));	/* tab->gclist = old list head. */
++  emit_lso(as, RISCVI_SB, mark, tab, (int32_t)offsetof(GCtab, marked));	/* Write back mark without black bit. */
++  emit_setgl(as, tab, gc.grayagain);	/* Make tab gray again: it becomes the list head. */
++  emit_getgl(as, link, gc.grayagain);
++  emit_branch(as, RISCVI_BEQ, RID_TMP, RID_ZERO, l_end, 0);	/* Not black: skip the barrier. */
++  emit_ds1s2(as, RISCVI_XOR, mark, mark, RID_TMP);	/* Clear the black bit in mark. */
++  emit_dsi(as, RISCVI_ANDI, RID_TMP, mark, LJ_GC_BLACK);
++  emit_lso(as, RISCVI_LBU, mark, tab, ((int32_t)offsetof(GCtab, marked)));
++}
++
++static void asm_obar(ASMState *as, IRIns *ir)
++{  /* OBAR: barrier for a store to a closed upvalue; may call lj_gc_barrieruv(). */
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
++  IRRef args[2];
++  MCLabel l_end;
++  Reg obj, val, tmp;
++  /* No need for other object barriers (yet). */
++  lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");	/* Closed upvalue only. */
++  ra_evictset(as, RSET_SCRATCH);
++  l_end = emit_label(as);
++  args[0] = ASMREF_TMP1;  /* global_State *g */
++  args[1] = ir->op1;      /* TValue *tv      */
++  asm_gencall(as, ci, args);
++  emit_ds(as, RISCVI_MV, ra_releasetmp(as, ASMREF_TMP1), RID_GL);
++  obj = IR(ir->op1)->r;
++  tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj));
++  emit_branch(as, RISCVI_BEQ, tmp, RID_ZERO, l_end, 0);	/* Upvalue not black: skip the call. */
++  emit_branch(as, RISCVI_BEQ, RID_TMP, RID_ZERO, l_end, 0);	/* Value not white: skip the call. */
++  emit_dsi(as, RISCVI_ANDI, tmp, tmp, LJ_GC_BLACK);
++  emit_dsi(as, RISCVI_ANDI, RID_TMP, RID_TMP, LJ_GC_WHITES);
++  val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
++  emit_lso(as, RISCVI_LBU, tmp, obj,
++	   ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv)));  /* obj points at uv->tv. */
++  emit_lso(as, RISCVI_LBU, RID_TMP, val, ((int32_t)offsetof(GChead, marked)));
++}
++
++/* -- Arithmetic and logic operations ------------------------------------- */
++
++static void asm_fparith(ASMState *as, IRIns *ir, RISCVIns riscvi)
++{  /* Two-operand FP arithmetic: emits riscvi with dest, op1, op2 in FPRs. */
++  Reg dest = ra_dest(as, ir, RSET_FPR);
++  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
++  right = (left >> 8); left &= 255;  /* ra_alloc2 result is unpacked: bits 8+ = right, low byte = left. */
++  emit_ds1s2(as, riscvi, dest, left, right);
++}
++
++static void asm_fpunary(ASMState *as, IRIns *ir, RISCVIns riscvi)
++{  /* One-operand FP op; only the opcodes listed in the switch are accepted. */
++  Reg dest = ra_dest(as, ir, RSET_FPR);
++  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR);
++  switch(riscvi) {
++    case RISCVI_FSQRT_S: case RISCVI_FSQRT_D:  /* Single source operand. */
++      emit_ds(as, riscvi, dest, left);
++      break;
++    case RISCVI_FMV_S: case RISCVI_FMV_D:
++    case RISCVI_FABS_S: case RISCVI_FABS_D:
++    case RISCVI_FNEG_S: case RISCVI_FNEG_D:  /* Source goes into both operand slots. */
++      emit_ds1s2(as, riscvi, dest, left, left);
++      break;
++    default:
++      lj_assertA(0, "bad fp unary instruction");
++      return;
++  }
++}
++
++static void asm_fpmath(ASMState *as, IRIns *ir)
++{
++  IRFPMathOp op = (IRFPMathOp)ir->op2;  /* FPMATH sub-operation. */
++  if (op > IRFPM_TRUNC) {
++    if (op == IRFPM_SQRT)  /* sqrt maps to a single FP instruction. */
++      asm_fpunary(as, ir, RISCVI_FSQRT_D);
++    else  /* Everything else is a call, indexed from IRCALL_lj_vm_floor. */
++      asm_callid(as, ir, IRCALL_lj_vm_floor + op);
++  } else asm_callround(as, ir, IRCALL_lj_vm_floor + op);  /* Ops up to IRFPM_TRUNC. */
++}
++
++static void asm_add(ASMState *as, IRIns *ir)
++{  /* ADD for doubles, 64 bit and 32 bit integers, with optional fusion. */
++  IRType1 t = ir->t;
++  if (irt_isnum(t)) {
++    if (!asm_fusemadd(as, ir, RISCVI_FMADD_D, RISCVI_FMADD_D))  /* Try a fused FP multiply-add first. */
++      asm_fparith(as, ir, RISCVI_FADD_D);
++    return;
++  } else {
++    if ((as->flags & JIT_F_RVXThead) && asm_fusemac(as, ir, RISCVI_TH_MULA))  /* XThead only. */
++      return;
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    if (irref_isk(ir->op2)) {
++      intptr_t k = get_kval(as, ir->op2);
++      if (checki12(k)) {  /* Constant fits the 12 bit immediate. */
++  if (irt_is64(t)) {
++    emit_dsi(as, RISCVI_ADDI, dest, left, k);
++  } else {
++	  emit_dsi(as, RISCVI_ADDIW, dest, left, k);
++  }
++	return;
++      }
++    }
++    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++    emit_ds1s2(as, irt_is64(t) ? RISCVI_ADD : RISCVI_ADDW, dest,
++	     left, right);
++  }
++}
++
++static void asm_sub(ASMState *as, IRIns *ir)
++{  /* SUB for doubles, 64 bit and 32 bit integers, with optional fusion. */
++  if (irt_isnum(ir->t)) {
++    if (!asm_fusemadd(as, ir, RISCVI_FMSUB_D, RISCVI_FNMSUB_D))  /* Try a fused FP multiply-sub first. */
++      asm_fparith(as, ir, RISCVI_FSUB_D);
++    return;
++  } else {
++    if ((as->flags & JIT_F_RVXThead) && asm_fusemac(as, ir, RISCVI_TH_MULS))  /* XThead only. */
++      return;
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++    right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++    emit_ds1s2(as, irt_is64(ir->t) ? RISCVI_SUB : RISCVI_SUBW, dest,
++	     left, right);
++  }
++}
++
++static void asm_mul(ASMState *as, IRIns *ir)
++{  /* MUL: FMUL.D for doubles, MUL/MULW for 64/32 bit integers. */
++  if (irt_isnum(ir->t)) {
++    asm_fparith(as, ir, RISCVI_FMUL_D);
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++    right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++    emit_ds1s2(as, irt_is64(ir->t) ? RISCVI_MUL : RISCVI_MULW, dest,
++	     left, right);
++  }
++}
++
++static void asm_fpdiv(ASMState *as, IRIns *ir)
++{  /* FP division: a plain two-operand FDIV.D. */
++    asm_fparith(as, ir, RISCVI_FDIV_D);
++}
++
++static void asm_neg(ASMState *as, IRIns *ir)
++{  /* NEG: FNEG.D for doubles, otherwise subtract from the zero register. */
++  if (irt_isnum(ir->t)) {
++    asm_fpunary(as, ir, RISCVI_FNEG_D);
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    emit_ds1s2(as, irt_is64(ir->t) ? RISCVI_SUB : RISCVI_SUBW, dest,
++	     RID_ZERO, left);  /* dest = 0 - left. */
++  }
++}
++
++#define asm_abs(as, ir)		asm_fpunary(as, ir, RISCVI_FABS_D)  /* ABS always uses the FP path. */
++
++static void asm_arithov(ASMState *as, IRIns *ir)
++{  /* 32 bit ADDOV/SUBOV: compute the result and exit the trace on signed overflow. */
++  Reg right, left, tmp, dest = ra_dest(as, ir, RSET_GPR);
++  lj_assertA(!irt_is64(ir->t), "bad usage");
++  if (irref_isk(ir->op2)) {
++    int k = IR(ir->op2)->i;
++    if (ir->o == IR_SUBOV) k = (int)(~(unsigned int)k+1u);  /* x-k as x+(-k); unsigned negate avoids UB. */
++    if (checki12(k)) {	/* (dest < left) == (k >= 0 ? 1 : 0) */
++      left = ra_alloc1(as, ir->op1, RSET_GPR);
++      asm_guard(as, k >= 0 ? RISCVI_BLT : RISCVI_BGE, dest, dest == left ? RID_TMP : left);
++      emit_dsi(as, RISCVI_ADDIW, dest, left, k);  /* Must wrap at 32 bits; a 64 bit ADDI never trips the guard. */
++      if (dest == left) emit_mv(as, RID_TMP, left);  /* Keep the old value for the compare. */
++      return;
++    }
++  }
++  left = ra_alloc2(as, ir, RSET_GPR);
++  right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++  tmp = ra_scratch(as, rset_exclude(rset_exclude(rset_exclude(RSET_GPR, left),
++						 right), dest));
++  asm_guard(as, RISCVI_BLT, RID_TMP, RID_ZERO);  /* Sign bit set: overflow, exit. */
++  emit_ds1s2(as, RISCVI_AND, RID_TMP, RID_TMP, tmp);
++  if (ir->o == IR_ADDOV) {  /* ((dest^left) & (dest^right)) < 0 */
++    emit_ds1s2(as, RISCVI_XOR, RID_TMP, dest, dest == right ? RID_TMP : right);
++  } else {  /* ((dest^left) & (dest^~right)) < 0 */
++    emit_xnor(as, RID_TMP, dest, dest == right ? RID_TMP : right);
++  }
++  emit_ds1s2(as, RISCVI_XOR, tmp, dest, dest == left ? RID_TMP : left);
++  emit_ds1s2(as, ir->o == IR_ADDOV ? RISCVI_ADDW : RISCVI_SUBW, dest, left, right);
++  if (dest == left || dest == right)  /* Save the operand that dest is about to overwrite. */
++    emit_mv(as, RID_TMP, dest == left ? left : right);
++}
++
++#define asm_addov(as, ir)	asm_arithov(as, ir)
++#define asm_subov(as, ir)	asm_arithov(as, ir)
++
++static void asm_mulov(ASMState *as, IRIns *ir)
++{  /* MULOV: 64 bit multiply; exit if the product differs from its sign-extended low word. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++  right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++  asm_guard(as, RISCVI_BNE, dest, RID_TMP);
++  emit_ext(as, RISCVI_SEXT_W, dest, RID_TMP);	/* dest = bits [31:0] of the product, sign-extended. */
++  emit_ds1s2(as, RISCVI_MUL, RID_TMP, left, right);	/* RID_TMP = bits [63:0] of the product. */
++}
++
++static void asm_bnot(ASMState *as, IRIns *ir)
++{  /* BNOT: plain NOT, or a single XNOR when a fusable BXOR feeds it and Zbb is on. */
++  Reg left, right, dest = ra_dest(as, ir, RSET_GPR);
++  IRIns *irl = IR(ir->op1);
++  if (as->flags & JIT_F_RVZbb && mayfuse(as, ir->op1) && irl->o == IR_BXOR) {
++    left = ra_alloc2(as, irl, RSET_GPR);  /* Operands of the fused BXOR. */
++    right = (left >> 8); left &= 255;
++    emit_ds1s2(as, RISCVI_XNOR, dest, left, right);
++  } else {
++    left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    emit_ds(as, RISCVI_NOT, dest, left);
++  }
++}
++
++static void asm_bswap(ASMState *as, IRIns *ir)
++{  /* BSWAP: byte-reverse via Zbb, XThead or a generic shift/mask sequence. Emitted backwards. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
++  RegSet allow = rset_exclude(rset_exclude(RSET_GPR, dest), left);
++  if (as->flags & JIT_F_RVZbb) {
++    if (!irt_is64(ir->t))  /* 32 bit: shift the reversed word down from the upper half. */
++      emit_dsshamt(as, RISCVI_SRAI, dest, dest, 32);
++    emit_ds(as, RISCVI_REV8, dest, left);
++  } else if (as->flags & JIT_F_RVXThead) {
++    emit_ds(as, irt_is64(ir->t) ? RISCVI_TH_REV : RISCVI_TH_REVW,
++       dest, left);
++  } else if (irt_is64(ir->t)) {  /* Generic 64 bit path. */
++    Reg tmp1, tmp2, tmp3, tmp4;  /* Four scratch registers in addition to RID_TMP. */
++    tmp1 = ra_scratch(as, allow), allow = rset_exclude(allow, tmp1);
++    tmp2 = ra_scratch(as, allow), allow = rset_exclude(allow, tmp2);
++    tmp3 = ra_scratch(as, allow), allow = rset_exclude(allow, tmp3);
++    tmp4 = ra_scratch(as, allow);
++    emit_ds1s2(as, RISCVI_OR, dest, dest, tmp4);  /* Executes last: merge the partial results. */
++    emit_ds1s2(as, RISCVI_OR, dest, dest, tmp3);
++    emit_ds1s2(as, RISCVI_OR, dest, dest, tmp2);
++    emit_dsshamt(as, RISCVI_SLLI, tmp4, tmp4, 40);
++    emit_dsshamt(as, RISCVI_SLLI, dest, left, 56);
++    emit_ds1s2(as, RISCVI_OR, tmp3, tmp1, tmp3);
++    emit_ds1s2(as, RISCVI_AND, tmp4, left, RID_TMP);
++    emit_dsshamt(as, RISCVI_SLLI, tmp3, tmp3, 32);
++    emit_dsshamt(as, RISCVI_SLLI, tmp1, tmp1, 24);
++    emit_dsshamt(as, RISCVI_SRLIW, tmp3, left, 24);
++    emit_ds1s2(as, RISCVI_OR, tmp2, tmp3, tmp2);
++    emit_ds1s2(as, RISCVI_AND, tmp1, left, tmp1);
++    emit_ds1s2(as, RISCVI_OR, tmp3, tmp4, tmp3);
++    emit_dsshamt(as, RISCVI_SLLI, tmp4, tmp4, 24);
++    emit_dsshamt(as, RISCVI_SRLIW, tmp4, tmp4, 24);
++    emit_ds1s2(as, RISCVI_AND, tmp3, tmp3, tmp1);
++    emit_dsshamt(as, RISCVI_SRLI, tmp4, left, 8);
++    emit_dsshamt(as, RISCVI_SRLI, tmp3, left, 24);
++    emit_ds1s2(as, RISCVI_OR, tmp2, tmp2, tmp3);
++    emit_du(as, RISCVI_LUI, tmp1, RISCVF_HI(0xff0000u));  /* tmp1 = 0xff0000 mask. */
++    emit_ds1s2(as, RISCVI_AND, tmp2, tmp2, RID_TMP);
++    emit_dsshamt(as, RISCVI_SRLI, tmp3, left, 56);
++    emit_dsi(as, RISCVI_ADDI, RID_TMP, RID_TMP, RISCVF_LO(0xff00));  /* LUI+ADDI: RID_TMP = 0xff00 mask. */
++    emit_du(as, RISCVI_LUI, RID_TMP, RISCVF_HI(0xff00u));
++    emit_dsshamt(as, RISCVI_SRLI, tmp2, left, 40);  /* Executes first. */
++  } else {  /* Generic 32 bit path. */
++    Reg tmp1, tmp2;
++    tmp1 = ra_scratch(as, allow), allow = rset_exclude(allow, tmp1);
++    tmp2 = ra_scratch(as, allow);
++    emit_ds1s2(as, RISCVI_OR, dest, dest, tmp2);  /* Executes last: merge the partial results. */
++    emit_ds1s2(as, RISCVI_OR, dest, dest, tmp1);
++    emit_dsshamt(as, RISCVI_SLLI, tmp2, RID_TMP, 8);
++    emit_dsshamt(as, RISCVI_SLLIW, dest, left, 24);
++    emit_ds1s2(as, RISCVI_OR, tmp1, tmp1, tmp2);
++    emit_ds1s2(as, RISCVI_AND, RID_TMP, left, RID_TMP);
++    emit_ds1s2(as, RISCVI_AND, tmp1, tmp1, RID_TMP);
++    emit_dsshamt(as, RISCVI_SRLIW, tmp2, left, 24);
++    emit_dsi(as, RISCVI_ADDI, RID_TMP, RID_TMP, RISCVF_LO(0xff00));  /* LUI+ADDI: RID_TMP = 0xff00 mask. */
++    emit_du(as, RISCVI_LUI, RID_TMP, RISCVF_HI(0xff00u));
++    emit_dsshamt(as, RISCVI_SRLI, tmp1, left, 8);  /* Executes first. */
++  }
++}
++
++static void asm_bitop(ASMState *as, IRIns *ir, RISCVIns riscvi, RISCVIns riscvik, RISCVIns riscvin)
++{  /* Bitwise op: riscvi = reg form, riscvik = immediate form, riscvin = Zbb form for a fused BNOT operand. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left, right;
++  IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
++  if (irref_isk(ir->op2)) {
++    intptr_t k = get_kval(as, ir->op2);
++    if (checki12(k)) {  /* Constant fits the 12 bit immediate. */
++      left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++      emit_dsi(as, riscvik, dest, left, k);
++      return;
++    }
++  } else if (as->flags & JIT_F_RVZbb) {
++    if (mayfuse(as, ir->op1) && irl->o == IR_BNOT) {  /* BNOT on the left: swap so it lands in rs2. */
++      left = ra_alloc1(as, irl->op1, RSET_GPR);
++      right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++      emit_ds1s2(as, riscvin, dest, right, left);
++      return;
++    } else if (mayfuse(as, ir->op2) && irr->o == IR_BNOT) {  /* BNOT on the right: already in rs2. */
++      left = ra_alloc1(as, ir->op1, RSET_GPR);
++      right = ra_alloc1(as, irr->op1, rset_exclude(RSET_GPR, left));
++      emit_ds1s2(as, riscvin, dest, left, right);
++      return;
++    }
++  }
++  left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);  /* Fallback: plain register form. */
++  right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++  emit_ds1s2(as, riscvi, dest, left, right);
++}
++
++#define asm_band(as, ir)	asm_bitop(as, ir, RISCVI_AND, RISCVI_ANDI, RISCVI_ANDN)
++#define asm_bor(as, ir)	asm_bitop(as, ir, RISCVI_OR, RISCVI_ORI, RISCVI_ORN)
++#define asm_bxor(as, ir)	asm_bitop(as, ir, RISCVI_XOR, RISCVI_XORI, RISCVI_XNOR)
++
++static void asm_bitshift(ASMState *as, IRIns *ir, RISCVIns riscvi, RISCVIns riscvik)
++{  /* Shift/rotate: riscvi is the register form, riscvik the immediate form. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
++  uint32_t shmsk = irt_is64(ir->t) ? 63 : 31;  /* Constant counts are masked to the operand width. */
++  if (irref_isk(ir->op2)) {  /* Constant shifts. */
++    uint32_t shift = (uint32_t)(IR(ir->op2)->i & shmsk);
++    switch (riscvik) {
++      case RISCVI_SRAI: case RISCVI_SRLI: case RISCVI_SLLI:
++      case RISCVI_SRAIW: case RISCVI_SLLIW: case RISCVI_SRLIW:
++        emit_dsshamt(as, riscvik, dest, left, shift);
++        break;
++      case RISCVI_RORI: case RISCVI_RORIW:  /* Rotates go through emit_roti with RID_TMP as scratch. */
++        emit_roti(as, riscvik, dest, left, RID_TMP, shift);
++        break;
++      default:
++        lj_assertA(0, "bad shift instruction");
++        return;
++    }
++  } else {  /* Variable shifts. */
++    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++    switch (riscvi) {
++      case RISCVI_SRA: case RISCVI_SRL: case RISCVI_SLL:
++      case RISCVI_SRAW: case RISCVI_SRLW: case RISCVI_SLLW:
++        emit_ds1s2(as, riscvi, dest, left, right);
++        break;
++      case RISCVI_ROR: case RISCVI_ROL:
++      case RISCVI_RORW: case RISCVI_ROLW:  /* Rotates go through emit_rot with RID_TMP as scratch. */
++        emit_rot(as, riscvi, dest, left, right, RID_TMP);
++        break;
++      default:
++        lj_assertA(0, "bad shift instruction");
++        return;
++    }
++  }
++}
++
++#define asm_bshl(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, RISCVI_SLL, RISCVI_SLLI) : \
++  asm_bitshift(as, ir, RISCVI_SLLW, RISCVI_SLLIW))
++#define asm_bshr(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, RISCVI_SRL, RISCVI_SRLI) : \
++  asm_bitshift(as, ir, RISCVI_SRLW, RISCVI_SRLIW))
++#define asm_bsar(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, RISCVI_SRA, RISCVI_SRAI) : \
++  asm_bitshift(as, ir, RISCVI_SRAW, RISCVI_SRAIW))
++#define asm_brol(as, ir)	lj_assertA(0, "unexpected BROL")  /* BROL is not expected to reach this backend. */
++#define asm_bror(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, RISCVI_ROR, RISCVI_RORI) : \
++  asm_bitshift(as, ir, RISCVI_RORW, RISCVI_RORIW))
++
++static void asm_min_max(ASMState *as, IRIns *ir, int ismax)
++{  /* MIN/MAX for doubles and integers; ismax != 0 selects MAX. Emitted backwards. */
++  if (irt_isnum(ir->t)) {
++    Reg dest = ra_dest(as, ir, RSET_FPR);
++    Reg right, left = ra_alloc2(as, ir, RSET_FPR);
++    right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++    emit_ds1s2(as, ismax ? RISCVI_FMAX_D : RISCVI_FMIN_D, dest, left, right);
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++    if (as->flags & JIT_F_RVZbb) {  /* Zbb has native integer MIN/MAX. */
++      emit_ds1s2(as, ismax ? RISCVI_MAX : RISCVI_MIN, dest, left, right);
++    } else {
++      if (as->flags & JIT_F_RVXThead) {  /* Conditional move on cond in RID_TMP. */
++	if (left == right) {
++	  if (dest != left) emit_mv(as, dest, left);
++	} else {
++	  if (dest == left) {
++	    emit_ds1s2(as, RISCVI_TH_MVNEZ, dest, right, RID_TMP);
++	  } else {
++	    emit_ds1s2(as, RISCVI_TH_MVEQZ, dest, left, RID_TMP);
++	    if (dest != right) emit_mv(as, dest, right);
++	  }
++	}
++      } else {  /* Branchless select: dest = (first & m) | (second & ~m). */
++	emit_ds1s2(as, RISCVI_OR, dest, dest, RID_TMP);
++	if (dest != right) {  /* Mask left first: m = cond-1, all ones keeps left. */
++	  emit_ds1s2(as, RISCVI_AND, RID_TMP, right, RID_TMP);
++	  emit_ds(as, RISCVI_NOT, RID_TMP, RID_TMP);
++	  emit_ds1s2(as, RISCVI_AND, dest, left, RID_TMP);
++	  emit_dsi(as, RISCVI_ADDI, RID_TMP, RID_TMP, -1);
++	} else {  /* dest aliases right, so mask right first: m = -cond, all ones keeps right. */
++	  emit_ds1s2(as, RISCVI_AND, RID_TMP, left, RID_TMP);
++	  emit_ds(as, RISCVI_NOT, RID_TMP, RID_TMP);
++	  emit_ds1s2(as, RISCVI_AND, dest, right, RID_TMP);
++	  emit_ds1s2(as, RISCVI_SUB, RID_TMP, RID_ZERO, RID_TMP);
++	}
++      }
++      emit_ds1s2(as, RISCVI_SLT, RID_TMP, ismax ? left : right, ismax ? right : left);  /* cond = 1: pick right. */
++    }
++  }
++}
++
++#define asm_min(as, ir)		asm_min_max(as, ir, 0)
++#define asm_max(as, ir)		asm_min_max(as, ir, 1)
++
++/* -- Comparisons --------------------------------------------------------- */
++
++/* FP comparisons. */
++static void asm_fpcomp(ASMState *as, IRIns *ir)
++{  /* The FP compare writes 0/1 to RID_TMP (executes first), then the guard branches on it. */
++  IROp op = ir->o;
++  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
++  right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++  asm_guard(as, (op < IR_EQ ? (op&4) : (op&1))
++            ? RISCVI_BNE : RISCVI_BEQ, RID_TMP, RID_ZERO);  /* Exit polarity depends on the opcode bits. */
++  switch (op) {
++    case IR_LT: case IR_UGE:  /* Same compare, opposite guard polarity. */
++      emit_ds1s2(as, RISCVI_FLT_D, RID_TMP, left, right);
++      break;
++    case IR_LE: case IR_UGT: case IR_ABC:
++      emit_ds1s2(as, RISCVI_FLE_D, RID_TMP, left, right);
++      break;
++    case IR_GT: case IR_ULE:  /* Operands swapped: right < left. */
++      emit_ds1s2(as, RISCVI_FLT_D, RID_TMP, right, left);
++      break;
++    case IR_GE: case IR_ULT:  /* Operands swapped: right <= left. */
++      emit_ds1s2(as, RISCVI_FLE_D, RID_TMP, right, left);
++      break;
++    case IR_EQ: case IR_NE:
++      emit_ds1s2(as, RISCVI_FEQ_D, RID_TMP, left, right);
++      break;
++    default:
++      break;
++  }
++}
++
++/* Integer comparisons. */
++static void asm_intcomp(ASMState *as, IRIns *ir)
++{
++  /* ORDER IR: LT GE LE GT  ULT UGE ULE UGT. */
++  /*           00 01 10 11  100 101 110 111  */
++  IROp op = ir->o;
++  Reg right, left = ra_alloc1(as, ir->op1, RSET_GPR);
++  if (op == IR_ABC) op = IR_UGT;  /* ABC is emitted as an unsigned > comparison. */
++  if ((op&4) == 0 && irref_isk(ir->op2) && get_kval(as, ir->op2) == 0) {  /* Signed compare with 0. */
++    switch (op) {  /* The guard branch is the exit, i.e. the inverse of the IR condition. */
++      case IR_LT: asm_guard(as, RISCVI_BGE, left, RID_ZERO); break;
++      case IR_GE: asm_guard(as, RISCVI_BLT, left, RID_ZERO); break;
++      case IR_LE: asm_guard(as, RISCVI_BLT, RID_ZERO, left); break;
++      case IR_GT: asm_guard(as, RISCVI_BGE, RID_ZERO, left); break;
++      default: break;
++    }
++    return;
++  }
++  if (irref_isk(ir->op2)) {
++    intptr_t k = get_kval(as, ir->op2);
++    if ((op&2)) k++;  /* LE/GT forms test x < k+1. NOTE(review): k+1 wraps for unsigned k == -1; confirm unreachable. */
++    if (checki12(k)) {  /* SLTI[U] puts the compare result in RID_TMP, the guard branches on it. */
++      asm_guard(as, (op&1) ? RISCVI_BNE : RISCVI_BEQ, RID_TMP, RID_ZERO);
++      emit_dsi(as, (op&4) ? RISCVI_SLTIU : RISCVI_SLTI, RID_TMP, left, k);
++      return;
++    }
++  }
++  right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++  asm_guard(as, ((op&4) ? RISCVI_BGEU : RISCVI_BGE) ^ RISCVF_FUNCT3((op^(op>>1))&1),
++             (op&2) ? right : left, (op&2) ? left : right);  /* funct3 bit 0 flips BGE[U] to BLT[U]; op&2 swaps operands. */
++}
++
++static void asm_comp(ASMState *as, IRIns *ir)
++{
++  if (!irt_isnum(ir->t))  /* Non-double operands: integer compare. */
++    asm_intcomp(as, ir);
++  else  /* Double operands: FP compare. */
++    asm_fpcomp(as, ir);
++}
++
++static void asm_equal(ASMState *as, IRIns *ir)
++{  /* Equality guards: doubles go through asm_fpcomp, the rest is a direct register branch. */
++  if (irt_isnum(ir->t)) {
++    asm_fpcomp(as, ir);
++  } else {
++    Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++    right = (left >> 8); left &= 255;  /* Unpack the two registers from ra_alloc2. */
++    asm_guard(as, (ir->o & 1) ? RISCVI_BEQ : RISCVI_BNE, left, right);  /* Odd opcode (presumably NE) exits on equality. */
++  }
++}
++
++/* -- Split register ops -------------------------------------------------- */
++
++/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++static void asm_hiop(ASMState *as, IRIns *ir)
++{
++  /* HIOP is marked as a store because it needs its own DCE logic. */
++  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
++  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;  /* Without DCE both halves count as used. */
++  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
++  switch ((ir-1)->o) {  /* Only call results are accepted as the loword op here. */
++  case IR_CALLN:
++  case IR_CALLL:
++  case IR_CALLS:
++  case IR_CALLXS:
++    if (!uselo)
++      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
++    break;
++  default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
++  }
++}
++
++/* -- Profiling ----------------------------------------------------------- */
++
++static void asm_prof(ASMState *as, IRIns *ir)
++{  /* PROF: exit the trace when HOOK_PROFILE is set in the global hookmask. */
++  UNUSED(ir);
++  asm_guard(as, RISCVI_BNE, RID_TMP, RID_ZERO);  /* Executes last: exit if the bit is set. */
++  emit_dsi(as, RISCVI_ANDI, RID_TMP, RID_TMP, HOOK_PROFILE);
++  emit_lsglptr(as, RISCVI_LBU, RID_TMP,
++         (int32_t)offsetof(global_State, hookmask));  /* Executes first: load the hookmask byte. */
++}
++
++/* -- Stack handling ------------------------------------------------------ */
++
++/* Check Lua stack size for overflow. Use exit handler as fallback. */
++static void asm_stack_check(ASMState *as, BCReg topslot,
++			    IRIns *irp, RegSet allow, ExitNo exitno)
++{
++  /* Try to get an unused temp register, otherwise spill/restore RID_RET*. */
++  Reg tmp, pbase = irp ? (ra_hasreg(irp->r) ? irp->r : RID_TMP) : RID_BASE;
++  ExitNo oldsnap = as->snapno;
++  rset_clear(allow, pbase);
++  as->snapno = exitno;  /* The guard below must exit through exitno. */
++  asm_guard(as, RISCVI_BNE, RID_TMP, RID_ZERO);
++  as->snapno = oldsnap;
++  if (allow) {
++    tmp = rset_pickbot(allow);
++    ra_modified(as, tmp);
++  } else {	/* allow == RSET_EMPTY: no free register, borrow RID_RET. */
++    tmp = RID_RET;
++    emit_lso(as, RISCVI_LD, tmp, RID_SP, 0);	/* Restore tmp1 register. */
++  }
++  emit_dsi(as, RISCVI_SLTIU, RID_TMP, RID_TMP, (int32_t)(8*topslot));  /* TMP = (room < 8*topslot). */
++  emit_ds1s2(as, RISCVI_SUB, RID_TMP, tmp, pbase);  /* room = L->maxstack - base. */
++  emit_lso(as, RISCVI_LD, tmp, tmp, offsetof(lua_State, maxstack));
++  if (pbase == RID_TMP)  /* Parent base has no register: reload it from jit_base. */
++    emit_getgl(as, RID_TMP, jit_base);
++  emit_getgl(as, tmp, cur_L);
++  if (allow == RSET_EMPTY)  /* Spill temp register. */
++    emit_lso(as, RISCVI_SD, tmp, RID_SP, 0);
++}
++
++/* Restore Lua stack from on-trace state. */
++static void asm_stack_restore(ASMState *as, SnapShot *snap)
++{  /* Emit stores that write every restorable snapshot entry back to its stack slot. */
++  SnapEntry *map = &as->T->snapmap[snap->mapofs];
++#ifdef LUA_USE_ASSERT
++  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
++#endif
++  MSize n, nent = snap->nent;
++  /* Store the value of all modified slots to the Lua stack. */
++  for (n = 0; n < nent; n++) {
++    SnapEntry sn = map[n];
++    BCReg s = snap_slot(sn);
++    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);  /* Byte offset of slot s from RID_BASE. */
++    IRRef ref = snap_ref(sn);
++    IRIns *ir = IR(ref);
++    if ((sn & SNAP_NORESTORE))  /* Entry is flagged as not needing a store. */
++      continue;
++    if (irt_isnum(ir->t)) {  /* Numbers are stored as raw doubles. */
++      Reg src = ra_alloc1(as, ref, RSET_FPR);
++      emit_lso(as, RISCVI_FSD, src, RID_BASE, ofs);
++    } else {
++      if ((sn & SNAP_KEYINDEX)) {  /* Slot = LJ_KEYINDEX in the upper word, index in the lower. */
++        RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
++	int64_t kki = (int64_t)LJ_KEYINDEX << 32;
++	if (irref_isk(ref)) {  /* Constant index: store one precomputed 64 bit constant. */
++	  emit_lso(as, RISCVI_SD,
++       ra_allock(as, kki | (int64_t)(uint32_t)ir->i, allow),
++       RID_BASE, ofs);
++	} else {  /* Emitted backwards: the ADD runs before the store. */
++	  Reg src = ra_alloc1(as, ref, allow);
++	  Reg rki = ra_allock(as, kki, rset_exclude(allow, src));
++	  emit_lso(as, RISCVI_SD, RID_TMP, RID_BASE, ofs);
++	  emit_ds1s2(as, RISCVI_ADD, RID_TMP, src, rki);
++	}
++      } else {
++        asm_tvstore64(as, RID_BASE, ofs, ref);  /* Generic tagged 64 bit store. */
++      }
++    }
++    checkmclim(as);
++  }
++  lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
++}
++
++/* -- GC handling --------------------------------------------------------- */
++
++/* Marker (encodes OR TMP, TMP, TMP) to prevent patching the GC check exit. */
++#define RISCV_NOPATCH_GC_CHECK \
++  (RISCVI_OR|RISCVF_D(RID_TMP)|RISCVF_S1(RID_TMP)|RISCVF_S2(RID_TMP))
++
++/* Check GC threshold and do one or more GC steps. */
++static void asm_gc_check(ASMState *as)
++{
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
++  IRRef args[2];
++  MCLabel l_end;  /* Target of the branch that skips the GC step. */
++  Reg tmp;
++  ra_evictset(as, RSET_SCRATCH);
++  l_end = emit_label(as);
++  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
++  asm_guard(as, RISCVI_BNE, RID_RET, RID_ZERO);	/* Assumes asm_snap_prep() already done. */
++  *--as->mcp = RISCV_NOPATCH_GC_CHECK;  /* lj_asm_patchexit() tests for this word. */
++  args[0] = ASMREF_TMP1;  /* global_State *g */
++  args[1] = ASMREF_TMP2;  /* MSize steps     */
++  asm_gencall(as, ci, args);  /* Call lj_gc_step_jit with the two args. */
++  emit_ds(as, RISCVI_MV, ra_releasetmp(as, ASMREF_TMP1), RID_GL);  /* g = RID_GL. */
++  tmp = ra_releasetmp(as, ASMREF_TMP2);
++  emit_loadi(as, tmp, as->gcsteps);  /* steps = as->gcsteps. */
++  /* Jump around GC step if GC total < GC threshold. */
++  emit_branch(as, RISCVI_BLTU, RID_TMP, tmp, l_end, 0);
++  emit_getgl(as, tmp, gc.threshold);
++  emit_getgl(as, RID_TMP, gc.total);
++  as->gcsteps = 0;  /* Steps have been consumed by this check. */
++  checkmclim(as);
++}
++
++/* -- Loop handling ------------------------------------------------------- */
++
++/* Fixup the loop branch. */
++static void asm_loop_fixup(ASMState *as)
++{
++  MCode *p = as->mctop;
++  MCode *target = as->mcp;  /* Branch target: the current emit position. */
++  ptrdiff_t delta;
++  if (as->loopinv) {  /* Inverted loop branch? */
++    delta = (char *)target - (char *)(p - 2);
++    /* asm_guard* already inverted the branch, and patched the final b. */
++    lj_assertA(checki21(delta), "branch target out of range");
++    p[-2] = (p[-2]&0x00000fff) | RISCVF_IMMJ(delta);  /* Keep the low 12 bits. */
++  } else {  /* Plain jump (J) in the last word; same 21 bit range limit. */
++    delta = (char *)target - (char *)(p - 1);
++    lj_assertA(checki21(delta), "branch target out of range");
++    p[-1] = RISCVI_JAL | RISCVF_IMMJ(delta);
++  }
++}
++
++/* Fixup the tail of the loop. */
++static void asm_loop_tail_fixup(ASMState *as)
++{
++  UNUSED(as);  /* No-op. NOTE(review): confirm no tail fixup is needed here. */
++}
++
++/* -- Head of trace ------------------------------------------------------- */
++
++/* Coalesce BASE register for a root trace. */
++static void asm_head_root_base(ASMState *as)
++{
++  IRIns *irb = IR(REF_BASE);
++  Reg rbase = irb->r;
++  if (!ra_hasreg(rbase))
++    return;  /* BASE has no register assigned: nothing to coalesce. */
++  ra_free(as, rbase);
++  if (rset_test(as->modset, rbase) || irt_ismarked(irb->t))
++    irb->r = RID_INIT;  /* A modified BASE register must not be inherited. */
++  if (rbase != RID_BASE)
++    emit_mv(as, rbase, RID_BASE);  /* Copy RID_BASE into the assigned reg. */
++}
++
++/* Coalesce BASE register for a side trace. */
++static Reg asm_head_side_base(ASMState *as, IRIns *irp)
++{
++  IRIns *irb = IR(REF_BASE);
++  Reg rbase = irb->r;
++  if (!ra_hasreg(rbase))
++    return RID_NONE;  /* BASE has no register assigned: nothing to do. */
++  ra_free(as, rbase);
++  if (rset_test(as->modset, rbase) || irt_ismarked(irb->t))
++    irb->r = RID_INIT;  /* A modified BASE register must not be inherited. */
++  if (irp->r == rbase)
++    return rbase;  /* Parent uses the same register: already coalesced. */
++  if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
++    Reg rparent = irp->r;
++    emit_mv(as, rbase, rparent);  /* Move from the coalesced parent reg. */
++    return rparent;
++  }
++  emit_getgl(as, rbase, jit_base);  /* Otherwise reload BASE from jit_base. */
++  return RID_NONE;
++}
++
++/* -- Tail of trace ------------------------------------------------------- */
++
++/* Fixup the tail code. */
++static void asm_tail_fixup(ASMState *as, TraceNo lnk)
++{
++  MCode *p = as->mctop;
++  MCode *target = lnk ? traceref(as->J,lnk)->mcode : (MCode *)lj_vm_exit_interp;
++  int32_t spadj = as->T->spadjust;
++  /* The AUIPC lands in p[-2], so the jump delta is relative to that word. */
++  ptrdiff_t delta = (char *)target - (char *)(p - 2);
++  lj_assertA(checki32(delta), "jump target out of range");
++  if (spadj == 0) {
++    p[-3] = RISCVI_NOP;  /* No stack adjustment needed. */
++  } else {  /* Patch stack adjustment. */
++    p[-3] = RISCVI_ADDI | RISCVF_D(RID_SP) | RISCVF_S1(RID_SP) | RISCVF_IMMI(spadj);
++  }
++  /* Patch exit jump: AUIPC+JALR pair through RID_TMP (hi/lo parts of delta). */
++  p[-2] = RISCVI_AUIPC | RISCVF_D(RID_TMP) | RISCVF_IMMU(RISCVF_HI(delta));
++  p[-1] = RISCVI_JALR | RISCVF_S1(RID_TMP) | RISCVF_IMMI(RISCVF_LO(delta));
++}
++
++/* Prepare tail of code. */
++static void asm_tail_prep(ASMState *as)
++{
++  MCode *tail = as->mctop - 2;  /* Reserve the last two words of the mcode. */
++  tail[0] = tail[1] = RISCVI_NOP;  /* Pre-fill the reserved words with NOPs. */
++  if (as->loopref) {
++    as->mcp = as->invmcp = tail;
++  } else {  /* One more word is reserved for the stack pointer adjustment. */
++    as->invmcp = NULL;
++    as->mcp = tail - 1;
++  }
++}
++
++/* -- Trace setup --------------------------------------------------------- */
++
++/* Ensure there are enough stack slots for call arguments. */
++static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
++{
++  IRRef args[CCI_NARGS_MAX*2];
++  uint32_t i, nargs = CCI_XNARGS(ci);
++  int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;  /* Regs left. */
++  asm_collectargs(as, ir, ci, args);
++  for (i = 0; i < nargs; i++) {
++    if (args[i] && irt_isfp(IR(args[i])->t)) {  /* FP argument. */
++      if (nfpr > 0) {
++        nfpr--; if(ci->flags & CCI_VARARG) ngpr--;  /* Vararg: count both. */
++      } else if (!(ci->flags & CCI_VARARG) && ngpr > 0) ngpr--;  /* Use a GPR. */
++      else nslots += 2;  /* No register left: two stack slots. */
++    } else {  /* Non-FP argument, or an empty argument ref. */
++      if (ngpr > 0) {
++        ngpr--; if(ci->flags & CCI_VARARG) nfpr--;  /* Vararg: count both. */
++      } else nslots += 2;  /* No register left: two stack slots. */
++    }
++  }
++  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
++    as->evenspill = nslots;
++  return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);  /* Result hint. */
++}
++
++static void asm_setup_target(ASMState *as)
++{
++  asm_sparejump_setup(as);  /* Run before the exit stubs are set up. */
++  asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0));  /* +1 if parent. */
++}
++
++/* -- Trace patching ------------------------------------------------------ */
++
++/* Patch exit jumps of existing machine code to a new target. */
++void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
++{
++  MCode *p = T->mcode;
++  MCode *pe = (MCode *)((char *)p + T->szmcode);  /* End of the trace's mcode. */
++  MCode *px = exitstub_trace_addr(T, exitno);  /* Exit stub for exitno. */
++  MCode *cstart = NULL;  /* First patched word, if any. */
++  MCode *mcarea = lj_mcode_patch(J, p, 0);  /* Paired with the call at the end. */
++
++  for (; p < pe; p++) {
++    /* Look for exitstub branch, replace with branch to target. */
++    ptrdiff_t odelta = (char *)px - (char *)(p+1),  /* p+1 -> exit stub. */
++              ndelta = (char *)target - (char *)(p+1);  /* p+1 -> new target. */
++    if ((((p[0] ^ RISCVF_IMMB(8)) & 0xfe000f80u) == 0 &&
++         ((p[0] & 0x0000007fu) == 0x63u) &&  /* p[0]: branch opcode, offset 8. */
++         ((p[1] ^ RISCVF_IMMJ(odelta)) & 0xfffff000u) == 0 &&
++         ((p[1] & 0x0000007fu) == 0x6fu) && p[-1] != RISCV_NOPATCH_GC_CHECK) ||
++        (((p[1] ^ RISCVF_IMMJ(odelta)) & 0xfffff000u) == 0 &&  /* p[1]: JAL to stub. */
++         ((p[1] & 0x0000007fu) == 0x6fu) && p[0] != RISCV_NOPATCH_GC_CHECK)) {
++      lj_assertJ(checki32(ndelta), "branch target out of range");
++      /* Patch jump, if within range. */
++	    patchbranch:
++      if (checki21(ndelta)) { /* Patch jump */
++  p[1] = RISCVI_JAL | RISCVF_IMMJ(ndelta);
++  if (!cstart) cstart = p + 1;
++      } else {  /* Branch out of range. Use spare jump slot in mcarea. */
++  MCode *mcjump = asm_sparejump_use(mcarea, target);
++  if (mcjump) {
++	  lj_mcode_sync(mcjump, mcjump+2);
++    ndelta = (char *)mcjump - (char *)(p+1);  /* Retarget to the spare jump. */
++    if (checki21(ndelta)) {
++      goto patchbranch;
++    } else {
++      lj_assertJ(0, "spare jump out of range: -Osizemcode too big");
++    }
++  }
++	/* Ignore jump slot overflow. Child trace is simply not attached. */
++      }
++    } else if (p+2 == pe) {  /* Last two words of the trace's mcode. */
++      if (p[0] == RISCVI_NOP && p[1] == RISCVI_NOP) {  /* NOP pair: patch it. */
++  ptrdiff_t delta = (char *)target - (char *)p;
++  lj_assertJ(checki32(delta), "jump target out of range");
++  p[0] = RISCVI_AUIPC | RISCVF_D(RID_TMP) | RISCVF_IMMU(RISCVF_HI(delta));
++  p[1] = RISCVI_JALR | RISCVF_S1(RID_TMP) | RISCVF_IMMI(RISCVF_LO(delta));
++  if (!cstart) cstart = p;
++      }
++    }
++  }
++  if (cstart) lj_mcode_sync(cstart, px+1);  /* Only if something was patched. */
++  lj_mcode_patch(J, mcarea, 1);
++}
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_x86.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_asm_x86.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_asm_x86.h
+@@ -1,6 +1,6 @@
+ /*
+ ** x86/x64 IR assembler (SSA IR -> machine code).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Guard handling ------------------------------------------------------ */
+@@ -140,7 +140,8 @@ static IRRef asm_fuseabase(ASMState *as,
+     }
+   } else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
+     /* Fuse base offset (vararg load). */
+-    as->mrm.ofs = IR(irb->op2)->i;
++    IRIns *irk = IR(irb->op2);
++    as->mrm.ofs = irk->o == IR_KINT ? irk->i : (int32_t)ir_kint64(irk)->u64;
+     return irb->op1;
+   }
+   return ref;  /* Otherwise use the given array base. */
+@@ -216,10 +217,17 @@ static void asm_fuseahuref(ASMState *as,
+ #endif
+       }
+       break;
++    case IR_TMPREF:
++#if LJ_GC64
++      as->mrm.ofs = (int32_t)dispofs(as, &J2G(as->J)->tmptv);
++      as->mrm.base = RID_DISPATCH;
++      as->mrm.idx = RID_NONE;
++#else
++      as->mrm.ofs = igcptr(&J2G(as->J)->tmptv);
++      as->mrm.base = as->mrm.idx = RID_NONE;
++#endif
++      return;
+     default:
+-      lj_assertA(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO ||
+-		 ir->o == IR_KKPTR,
+-		 "bad IR op %d", ir->o);
+       break;
+     }
+   }
+@@ -478,8 +486,10 @@ static Reg asm_fuseload(ASMState *as, IR
+ 	asm_fusexref(as, ir->op1, xallow);
+ 	return RID_MRM;
+       }
+-    } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) {
++    } else if (ir->o == IR_VLOAD && IR(ir->op1)->o == IR_AREF &&
++	       !(LJ_GC64 && irt_isaddr(ir->t))) {
+       asm_fuseahuref(as, ir->op1, xallow);
++      as->mrm.ofs += 8 * ir->op2;
+       return RID_MRM;
+     }
+   }
+@@ -651,7 +661,7 @@ static void asm_gencall(ASMState *as, co
+ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
+ {
+   RegSet drop = RSET_SCRATCH;
+-  int hiop = (LJ_32 && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
++  int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
+   if ((ci->flags & CCI_NOFPRCLOBBER))
+     drop &= ~RSET_FPR;
+   if (ra_hasreg(ir->r))
+@@ -691,10 +701,8 @@ static void asm_setupresult(ASMState *as
+ 		  irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
+       }
+ #endif
+-#if LJ_32
+     } else if (hiop) {
+       ra_destpair(as, ir);
+-#endif
+     } else {
+       lj_assertA(!irt_ispri(ir->t), "PRI dest");
+       ra_destreg(as, ir, RID_RET);
+@@ -781,6 +789,21 @@ static void asm_retf(ASMState *as, IRIns
+ #endif
+ }
+ 
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));  /* Scratch GPR != sb. */
++  IRIns irgc;
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, tmp, sb, offsetof(SBuf, L));  /* Store tmp to sb->L. */
++  emit_opgl(as, XO_ARITH(XOg_OR), tmp|REX_GC64, cur_L);  /* OR in cur_L. */
++  emit_gri(as, XG_ARITHi(XOg_AND), tmp, SBUF_MASK_FLAG);  /* Mask with FLAG. */
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));  /* Load tmp from sb->L. */
++}
++#endif
++
+ /* -- Type conversions ---------------------------------------------------- */
+ 
+ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+@@ -924,7 +947,7 @@ static void asm_conv(ASMState *as, IRIns
+       }
+     } else {
+       Reg dest = ra_dest(as, ir, RSET_GPR);
+-      if (st64) {
++      if (st64 && !(ir->op2 & IRCONV_NONE)) {
+ 	Reg left = asm_fuseload(as, lref, RSET_GPR);
+ 	/* This is either a 32 bit reg/reg mov which zeroes the hiword
+ 	** or a load of the loword from a 64 bit address.
+@@ -1050,47 +1073,48 @@ static void asm_strto(ASMState *as, IRIn
+ /* -- Memory references --------------------------------------------------- */
+ 
+ /* Get pointer to TValue. */
+-static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)
+ {
+-  IRIns *ir = IR(ref);
+-  if (irt_isnum(ir->t)) {
+-    /* For numbers use the constant itself or a spill slot as a TValue. */
+-    if (irref_isk(ref))
+-      emit_loada(as, dest, ir_knum(ir));
+-    else
+-      emit_rmro(as, XO_LEA, dest|REX_64, RID_ESP, ra_spill(as, ir));
+-  } else {
+-    /* Otherwise use g->tmptv to hold the TValue. */
++  if ((mode & IRTMPREF_IN1)) {
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if (irref_isk(ref) && !(mode & IRTMPREF_OUT1)) {
++	/* Use the number constant itself as a TValue. */
++	emit_loada(as, dest, ir_knum(ir));
++	return;
++      }
++      emit_rmro(as, XO_MOVSDto, ra_alloc1(as, ref, RSET_FPR), dest, 0);
++    } else {
+ #if LJ_GC64
+-    if (irref_isk(ref)) {
+-      TValue k;
+-      lj_ir_kvalue(as->J->L, &k, ir);
+-      emit_movmroi(as, dest, 4, k.u32.hi);
+-      emit_movmroi(as, dest, 0, k.u32.lo);
+-    } else {
+-      /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
+-      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
+-      if (irt_is64(ir->t)) {
+-	emit_u32(as, irt_toitype(ir->t) << 15);
+-	emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4);
++      if (irref_isk(ref)) {
++	TValue k;
++	lj_ir_kvalue(as->J->L, &k, ir);
++	emit_movmroi(as, dest, 4, k.u32.hi);
++	emit_movmroi(as, dest, 0, k.u32.lo);
+       } else {
+-	/* Currently, no caller passes integers that might end up here. */
+-	emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15));
++	/* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
++	Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
++	if (irt_is64(ir->t)) {
++	  emit_u32(as, irt_toitype(ir->t) << 15);
++	  emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4);
++	} else {
++	  emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15));
++	}
++	emit_movtomro(as, REX_64IR(ir, src), dest, 0);
+       }
+-      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
+-    }
+ #else
+-    if (!irref_isk(ref)) {
+-      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
+-      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
+-    } else if (!irt_ispri(ir->t)) {
+-      emit_movmroi(as, dest, 0, ir->i);
+-    }
+-    if (!(LJ_64 && irt_islightud(ir->t)))
+-      emit_movmroi(as, dest, 4, irt_toitype(ir->t));
++      if (!irref_isk(ref)) {
++	Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
++	emit_movtomro(as, REX_64IR(ir, src), dest, 0);
++      } else if (!irt_ispri(ir->t)) {
++	emit_movmroi(as, dest, 0, ir->i);
++      }
++      if (!(LJ_64 && irt_islightud(ir->t)))
++	emit_movmroi(as, dest, 4, irt_toitype(ir->t));
+ #endif
+-    emit_loada(as, dest, &J2G(as->J)->tmptv);
++    }
+   }
++  emit_loada(as, dest, &J2G(as->J)->tmptv); /* g->tmptv holds the TValue(s). */
+ }
+ 
+ static void asm_aref(ASMState *as, IRIns *ir)
+@@ -1349,24 +1373,31 @@ static void asm_hrefk(ASMState *as, IRIn
+ static void asm_uref(ASMState *as, IRIns *ir)
+ {
+   Reg dest = ra_dest(as, ir, RSET_GPR);
+-  if (irref_isk(ir->op1)) {
++  int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
++  if (irref_isk(ir->op1) && !guarded) {
+     GCfunc *fn = ir_kfunc(IR(ir->op1));
+     MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+     emit_rma(as, XO_MOV, dest|REX_GC64, v);
+   } else {
+     Reg uv = ra_scratch(as, RSET_GPR);
+-    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+-    if (ir->o == IR_UREFC) {
++    if (ir->o == IR_UREFC)
+       emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
+-      asm_guardcc(as, CC_NE);
+-      emit_i8(as, 1);
+-      emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
+-    } else {
++    else
+       emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
++    if (guarded) {
++      asm_guardcc(as, ir->o == IR_UREFC ? CC_E : CC_NE);
++      emit_i8(as, 0);
++      emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
++    }
++    if (irref_isk(ir->op1)) {
++      GCfunc *fn = ir_kfunc(IR(ir->op1));
++      GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
++      emit_loada(as, uv, o);
++    } else {
++      emit_rmro(as, XO_MOV, uv|REX_GC64, ra_alloc1(as, ir->op1, RSET_GPR),
++	        (int32_t)offsetof(GCfuncL, uvptr) +
++	        (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
+     }
+-    emit_rmro(as, XO_MOV, uv|REX_GC64, func,
+-	      (int32_t)offsetof(GCfuncL, uvptr) +
+-	      (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
+   }
+ }
+ 
+@@ -1524,6 +1555,7 @@ static void asm_ahuvload(ASMState *as, I
+     Reg dest = asm_load_lightud64(as, ir, 1);
+     if (ra_hasreg(dest)) {
+       asm_fuseahuref(as, ir->op1, RSET_GPR);
++      if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
+       emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
+     }
+     return;
+@@ -1533,6 +1565,7 @@ static void asm_ahuvload(ASMState *as, I
+     RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+     Reg dest = ra_dest(as, ir, allow);
+     asm_fuseahuref(as, ir->op1, RSET_GPR);
++    if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
+ #if LJ_GC64
+     if (irt_isaddr(ir->t)) {
+       emit_shifti(as, XOg_SHR|REX_64, dest, 17);
+@@ -1560,6 +1593,7 @@ static void asm_ahuvload(ASMState *as, I
+     }
+ #endif
+     asm_fuseahuref(as, ir->op1, gpr);
++    if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
+   }
+   /* Always do the type check, even if the load result is unused. */
+   as->mrm.ofs += 4;
+@@ -1675,7 +1709,8 @@ static void asm_sload(ASMState *as, IRIn
+   lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
+ 	     "inconsistent SLOAD variant");
+   lj_assertA(LJ_DUALNUM ||
+-	     !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)),
++	     !irt_isint(t) ||
++	     (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME|IRSLOAD_KEYINDEX)),
+ 	     "bad SLOAD type");
+   if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
+     Reg left = ra_scratch(as, RSET_FPR);
+@@ -1742,14 +1777,11 @@ static void asm_sload(ASMState *as, IRIn
+   if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+     /* Need type check, even if the load result is unused. */
+     asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
+-    if (LJ_64 && irt_type(t) >= IRT_NUM) {
++    if ((LJ_64 && irt_type(t) >= IRT_NUM) || (ir->op2 & IRSLOAD_KEYINDEX)) {
+       lj_assertA(irt_isinteger(t) || irt_isnum(t),
+ 		 "bad SLOAD type %d", irt_type(t));
+-#if LJ_GC64
+-      emit_u32(as, LJ_TISNUM << 15);
+-#else
+-      emit_u32(as, LJ_TISNUM);
+-#endif
++      emit_u32(as, (ir->op2 & IRSLOAD_KEYINDEX) ? LJ_KEYINDEX :
++		   LJ_GC64 ? (LJ_TISNUM << 15) : LJ_TISNUM);
+       emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
+ #if LJ_GC64
+     } else if (irt_isnil(t)) {
+@@ -1991,19 +2023,6 @@ static void asm_ldexp(ASMState *as, IRIn
+   asm_x87load(as, ir->op2);
+ }
+ 
+-static void asm_fppowi(ASMState *as, IRIns *ir)
+-{
+-  /* The modified regs must match with the *.dasc implementation. */
+-  RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
+-  if (ra_hasreg(ir->r))
+-    rset_clear(drop, ir->r);  /* Dest reg handled below. */
+-  ra_evictset(as, drop);
+-  ra_destreg(as, ir, RID_XMM0);
+-  emit_call(as, lj_vm_powi_sse);
+-  ra_left(as, RID_XMM0, ir->op1);
+-  ra_left(as, RID_EAX, ir->op2);
+-}
+-
+ static int asm_swapops(ASMState *as, IRIns *ir)
+ {
+   IRIns *irl = IR(ir->op1);
+@@ -2584,15 +2603,15 @@ static void asm_comp_int64(ASMState *as,
+ }
+ #endif
+ 
+-/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
++/* -- Split register ops -------------------------------------------------- */
+ 
+-/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++/* Hiword op of a split 32/32 or 64/64 bit op. Previous op is the loword op. */
+ static void asm_hiop(ASMState *as, IRIns *ir)
+ {
+-#if LJ_32 && LJ_HASFFI
+   /* HIOP is marked as a store because it needs its own DCE logic. */
+   int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
+   if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
++#if LJ_32 && LJ_HASFFI
+   if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
+     as->curins--;  /* Always skip the CONV. */
+     if (usehi || uselo)
+@@ -2606,8 +2625,10 @@ static void asm_hiop(ASMState *as, IRIns
+       asm_fxstore(as, ir);
+     return;
+   }
++#endif
+   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
+   switch ((ir-1)->o) {
++#if LJ_32 && LJ_HASFFI
+   case IR_ADD:
+     as->flagmcp = NULL;
+     as->curins--;
+@@ -2630,20 +2651,16 @@ static void asm_hiop(ASMState *as, IRIns
+     asm_neg_not(as, ir-1, XOg_NEG);
+     break;
+     }
+-  case IR_CALLN:
+-  case IR_CALLXS:
+-    if (!uselo)
+-      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
+-    break;
+   case IR_CNEWI:
+     /* Nothing to do here. Handled by CNEWI itself. */
+     break;
++#endif
++  case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
++    if (!uselo)
++      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
++    break;
+   default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
+   }
+-#else
+-  /* Unused on x64 or without FFI. */
+-  UNUSED(as); UNUSED(ir); lj_assertA(0, "unexpected HIOP");
+-#endif
+ }
+ 
+ /* -- Profiling ----------------------------------------------------------- */
+@@ -2704,7 +2721,15 @@ static void asm_stack_restore(ASMState *
+     IRIns *ir = IR(ref);
+     if ((sn & SNAP_NORESTORE))
+       continue;
+-    if (irt_isnum(ir->t)) {
++    if ((sn & SNAP_KEYINDEX)) {
++      emit_movmroi(as, RID_BASE, ofs+4, LJ_KEYINDEX);
++      if (irref_isk(ref)) {
++	emit_movmroi(as, RID_BASE, ofs, ir->i);
++      } else {
++	Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
++	emit_movtomro(as, src, RID_BASE, ofs);
++      }
++    } else if (irt_isnum(ir->t)) {
+       Reg src = ra_alloc1(as, ref, RSET_FPR);
+       emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
+     } else {
+@@ -2837,6 +2862,12 @@ static void asm_loop_fixup(ASMState *as)
+   }
+ }
+ 
++/* Fixup the tail of the loop. */
++static void asm_loop_tail_fixup(ASMState *as)
++{
++  UNUSED(as);  /* Nothing to do; 'as' is intentionally unused. */
++}
++
+ /* -- Head of trace ------------------------------------------------------- */
+ 
+ /* Coalesce BASE register for a root trace. */
+@@ -2854,7 +2885,7 @@ static void asm_head_root_base(ASMState
+ }
+ 
+ /* Coalesce or reload BASE register for a side trace. */
+-static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
++static Reg asm_head_side_base(ASMState *as, IRIns *irp)
+ {
+   IRIns *ir = IR(REF_BASE);
+   Reg r = ir->r;
+@@ -2863,16 +2894,16 @@ static RegSet asm_head_side_base(ASMStat
+     if (rset_test(as->modset, r) || irt_ismarked(ir->t))
+       ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
+     if (irp->r == r) {
+-      rset_clear(allow, r);  /* Mark same BASE register as coalesced. */
++      return r;  /* Same BASE register already coalesced. */
+     } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
+       /* Move from coalesced parent reg. */
+-      rset_clear(allow, irp->r);
+       emit_rr(as, XO_MOV, r|REX_GC64, irp->r);
++      return irp->r;
+     } else {
+       emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
+     }
+   }
+-  return allow;
++  return RID_NONE;
+ }
+ 
+ /* -- Tail of trace ------------------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_assert.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_assert.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_assert.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Internal assertions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_assert_c
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bc.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_bc.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bc.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Bytecode instruction modes.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_bc_c
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_bc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bc.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Bytecode instruction format.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_BC_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bcdump.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_bcdump.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bcdump.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Bytecode dump definitions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_BCDUMP_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bcread.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_bcread.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bcread.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Bytecode reader.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_bcread_c
+@@ -53,11 +53,11 @@ static LJ_NOINLINE void bcread_fill(LexS
+   do {
+     const char *buf;
+     size_t sz;
+-    char *p = sbufB(&ls->sb);
++    char *p = ls->sb.b;
+     MSize n = (MSize)(ls->pe - ls->p);
+     if (n) {  /* Copy remainder to buffer. */
+       if (sbuflen(&ls->sb)) {  /* Move down in buffer. */
+-	lj_assertLS(ls->pe == sbufP(&ls->sb), "bad buffer pointer");
++	lj_assertLS(ls->pe == ls->sb.w, "bad buffer pointer");
+ 	if (ls->p != p) memmove(p, ls->p, n);
+       } else {  /* Copy from buffer provided by reader. */
+ 	p = lj_buf_need(&ls->sb, len);
+@@ -66,7 +66,7 @@ static LJ_NOINLINE void bcread_fill(LexS
+       ls->p = p;
+       ls->pe = p + n;
+     }
+-    setsbufP(&ls->sb, p + n);
++    ls->sb.w = p + n;
+     buf = ls->rfunc(ls->L, ls->rdata, &sz);  /* Get more data from reader. */
+     if (buf == NULL || sz == 0) {  /* EOF? */
+       if (need) bcread_error(ls, LJ_ERR_BCBAD);
+@@ -77,8 +77,8 @@ static LJ_NOINLINE void bcread_fill(LexS
+     if (n) {  /* Append to buffer. */
+       n += (MSize)sz;
+       p = lj_buf_need(&ls->sb, n < len ? len : n);
+-      memcpy(sbufP(&ls->sb), buf, sz);
+-      setsbufP(&ls->sb, p + n);
++      memcpy(ls->sb.w, buf, sz);
++      ls->sb.w = p + n;
+       ls->p = p;
+       ls->pe = p + n;
+     } else {  /* Return buffer provided by reader. */
+@@ -399,11 +399,7 @@ static int bcread_header(LexState *ls)
+   if ((flags & BCDUMP_F_FFI)) {
+ #if LJ_HASFFI
+     lua_State *L = ls->L;
+-    if (!ctype_ctsG(G(L))) {
+-      ptrdiff_t oldtop = savestack(L, L->top);
+-      luaopen_ffi(L);  /* Load FFI library on-demand. */
+-      L->top = restorestack(L, oldtop);
+-    }
++    ctype_loadffi(L);
+ #else
+     return 0;
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bcwrite.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_bcwrite.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_bcwrite.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Bytecode writer.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_bcwrite_c
+@@ -62,7 +62,7 @@ static void bcwrite_ktabk(BCWriteCtx *ct
+       if (num == (lua_Number)k) {  /* -0 is never a constant. */
+ 	*p++ = BCDUMP_KTAB_INT;
+ 	p = lj_strfmt_wuleb128(p, k);
+-	setsbufP(&ctx->sb, p);
++	ctx->sb.w = p;
+ 	return;
+       }
+     }
+@@ -73,7 +73,7 @@ static void bcwrite_ktabk(BCWriteCtx *ct
+     lj_assertBCW(tvispri(o), "unhandled type %d", itype(o));
+     *p++ = BCDUMP_KTAB_NIL+~itype(o);
+   }
+-  setsbufP(&ctx->sb, p);
++  ctx->sb.w = p;
+ }
+ 
+ /* Write a template table. */
+@@ -97,7 +97,7 @@ static void bcwrite_ktab(BCWriteCtx *ctx
+   /* Write number of array slots and hash slots. */
+   p = lj_strfmt_wuleb128(p, narray);
+   p = lj_strfmt_wuleb128(p, nhash);
+-  setsbufP(&ctx->sb, p);
++  ctx->sb.w = p;
+   if (narray) {  /* Write array entries (may contain nil). */
+     MSize i;
+     TValue *o = tvref(t->array);
+@@ -172,7 +172,7 @@ static void bcwrite_kgc(BCWriteCtx *ctx,
+       }
+ #endif
+     }
+-    setsbufP(&ctx->sb, p);
++    ctx->sb.w = p;
+   }
+ }
+ 
+@@ -189,7 +189,8 @@ static void bcwrite_knum(BCWriteCtx *ctx
+       goto save_int;
+     } else {
+       /* Write a 33 bit ULEB128 for the int (lsb=0) or loword (lsb=1). */
+-      if (!LJ_DUALNUM) {  /* Narrow number constants to integers. */
++      if (!LJ_DUALNUM && o->u32.hi != LJ_KEYINDEX) {
++	/* Narrow number constants to integers. */
+ 	lua_Number num = numV(o);
+ 	k = lj_num2int(num);
+ 	if (num == (lua_Number)k) {  /* -0 is never a constant. */
+@@ -206,7 +207,7 @@ static void bcwrite_knum(BCWriteCtx *ctx
+       p = lj_strfmt_wuleb128(p, o->u32.hi);
+     }
+   }
+-  setsbufP(&ctx->sb, p);
++  ctx->sb.w = p;
+ }
+ 
+ /* Write bytecode instructions. */
+@@ -281,7 +282,7 @@ static void bcwrite_proto(BCWriteCtx *ct
+   /* Write bytecode instructions and upvalue refs. */
+   p = bcwrite_bytecode(ctx, p, pt);
+   p = lj_buf_wmem(p, proto_uv(pt), pt->sizeuv*2);
+-  setsbufP(&ctx->sb, p);
++  ctx->sb.w = p;
+ 
+   /* Write constants. */
+   bcwrite_kgc(ctx, pt);
+@@ -291,16 +292,16 @@ static void bcwrite_proto(BCWriteCtx *ct
+   if (sizedbg) {
+     p = lj_buf_more(&ctx->sb, sizedbg);
+     p = lj_buf_wmem(p, proto_lineinfo(pt), sizedbg);
+-    setsbufP(&ctx->sb, p);
++    ctx->sb.w = p;
+   }
+ 
+   /* Pass buffer to writer function. */
+   if (ctx->status == 0) {
+     MSize n = sbuflen(&ctx->sb) - 5;
+     MSize nn = (lj_fls(n)+8)*9 >> 6;
+-    char *q = sbufB(&ctx->sb) + (5 - nn);
++    char *q = ctx->sb.b + (5 - nn);
+     p = lj_strfmt_wuleb128(q, n);  /* Fill in final size. */
+-    lj_assertBCW(p == sbufB(&ctx->sb) + 5, "bad ULEB128 write");
++    lj_assertBCW(p == ctx->sb.b + 5, "bad ULEB128 write");
+     ctx->status = ctx->wfunc(sbufL(&ctx->sb), q, nn+n, ctx->wdata);
+   }
+ }
+@@ -324,8 +325,8 @@ static void bcwrite_header(BCWriteCtx *c
+     p = lj_strfmt_wuleb128(p, len);
+     p = lj_buf_wmem(p, name, len);
+   }
+-  ctx->status = ctx->wfunc(sbufL(&ctx->sb), sbufB(&ctx->sb),
+-			   (MSize)(p - sbufB(&ctx->sb)), ctx->wdata);
++  ctx->status = ctx->wfunc(sbufL(&ctx->sb), ctx->sb.b,
++			   (MSize)(p - ctx->sb.b), ctx->wdata);
+ }
+ 
+ /* Write footer of bytecode dump. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_buf.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_buf.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_buf.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Buffer handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_buf_c
+@@ -20,12 +20,32 @@ static void buf_grow(SBuf *sb, MSize sz)
+ {
+   MSize osz = sbufsz(sb), len = sbuflen(sb), nsz = osz;
+   char *b;
++  GCSize flag;
+   if (nsz < LJ_MIN_SBUF) nsz = LJ_MIN_SBUF;
+   while (nsz < sz) nsz += nsz;
+-  b = (char *)lj_mem_realloc(sbufL(sb), sbufB(sb), osz, nsz);
+-  setmref(sb->b, b);
+-  setmref(sb->p, b + len);
+-  setmref(sb->e, b + nsz);
++  flag = sbufflag(sb);
++  if ((flag & SBUF_FLAG_COW)) {  /* Copy-on-write semantics. */
++    lj_assertG_(G(sbufL(sb)), sb->w == sb->e, "bad SBuf COW");
++    b = (char *)lj_mem_new(sbufL(sb), nsz);
++    setsbufflag(sb, flag & ~(GCSize)SBUF_FLAG_COW);
++    setgcrefnull(sbufX(sb)->cowref);
++    memcpy(b, sb->b, osz);
++  } else {
++    b = (char *)lj_mem_realloc(sbufL(sb), sb->b, osz, nsz);
++  }
++  if ((flag & SBUF_FLAG_EXT)) {
++    sbufX(sb)->r = sbufX(sb)->r - sb->b + b;  /* Adjust read pointer, too. */
++  }
++  /* Adjust buffer pointers. */
++  sb->b = b;
++  sb->w = b + len;
++  sb->e = b + nsz;
++  if ((flag & SBUF_FLAG_BORROW)) {  /* Adjust borrowed buffer pointers. */
++    SBuf *bsb = mref(sbufX(sb)->bsb, SBuf);
++    bsb->b = b;
++    bsb->w = b + len;
++    bsb->e = b + nsz;
++  }
+ }
+ 
+ LJ_NOINLINE char *LJ_FASTCALL lj_buf_need2(SBuf *sb, MSize sz)
+@@ -34,30 +54,51 @@ LJ_NOINLINE char *LJ_FASTCALL lj_buf_nee
+   if (LJ_UNLIKELY(sz > LJ_MAX_BUF))
+     lj_err_mem(sbufL(sb));
+   buf_grow(sb, sz);
+-  return sbufB(sb);
++  return sb->b;
+ }
+ 
+ LJ_NOINLINE char *LJ_FASTCALL lj_buf_more2(SBuf *sb, MSize sz)
+ {
+-  MSize len = sbuflen(sb);
+-  lj_assertG_(G(sbufL(sb)), sz > sbufleft(sb), "SBuf overflow");
+-  if (LJ_UNLIKELY(sz > LJ_MAX_BUF || len + sz > LJ_MAX_BUF))
+-    lj_err_mem(sbufL(sb));
+-  buf_grow(sb, len + sz);
+-  return sbufP(sb);
++  if (sbufisext(sb)) {
++    SBufExt *sbx = (SBufExt *)sb;
++    MSize len = sbufxlen(sbx);
++    if (LJ_UNLIKELY(sz > LJ_MAX_BUF || len + sz > LJ_MAX_BUF))
++      lj_err_mem(sbufL(sbx));
++    if (len + sz > sbufsz(sbx)) {  /* Must grow. */
++      buf_grow((SBuf *)sbx, len + sz);
++    } else if (sbufiscow(sb) || sbufxslack(sbx) < (sbufsz(sbx) >> 3)) {
++      /* Also grow to avoid excessive compactions, if slack < size/8. */
++      buf_grow((SBuf *)sbx, sbuflen(sbx) + sz);  /* Not sbufxlen! */
++      return sbx->w;
++    }
++    if (sbx->r != sbx->b) {  /* Compact by moving down. */
++      memmove(sbx->b, sbx->r, len);
++      sbx->r = sbx->b;
++      sbx->w = sbx->b + len;
++      lj_assertG_(G(sbufL(sbx)), len + sz <= sbufsz(sbx), "bad SBuf compact");
++    }
++  } else {
++    MSize len = sbuflen(sb);
++    lj_assertG_(G(sbufL(sb)), sz > sbufleft(sb), "SBuf overflow");
++    if (LJ_UNLIKELY(sz > LJ_MAX_BUF || len + sz > LJ_MAX_BUF))
++      lj_err_mem(sbufL(sb));
++    buf_grow(sb, len + sz);
++  }
++  return sb->w;
+ }
+ 
+ void LJ_FASTCALL lj_buf_shrink(lua_State *L, SBuf *sb)
+ {
+-  char *b = sbufB(sb);
+-  MSize osz = (MSize)(sbufE(sb) - b);
++  char *b = sb->b;
++  MSize osz = (MSize)(sb->e - b);
+   if (osz > 2*LJ_MIN_SBUF) {
+-    MSize n = (MSize)(sbufP(sb) - b);
++    MSize n = (MSize)(sb->w - b);
+     b = lj_mem_realloc(L, b, osz, (osz >> 1));
+-    setmref(sb->b, b);
+-    setmref(sb->p, b + n);
+-    setmref(sb->e, b + (osz >> 1));
++    sb->b = b;
++    sb->w = b + n;
++    sb->e = b + (osz >> 1);
+   }
++  lj_assertG_(G(sbufL(sb)), !sbufisext(sb), "YAGNI shrink SBufExt");
+ }
+ 
+ char * LJ_FASTCALL lj_buf_tmp(lua_State *L, MSize sz)
+@@ -67,30 +108,62 @@ char * LJ_FASTCALL lj_buf_tmp(lua_State
+   return lj_buf_need(sb, sz);
+ }
+ 
++#if LJ_HASBUFFER && LJ_HASJIT
++void lj_bufx_set(SBufExt *sbx, const char *p, MSize len, GCobj *ref)
++{
++  lua_State *L = sbufL(sbx);
++  lj_bufx_free(L, sbx);
++  lj_bufx_set_cow(L, sbx, p, len);
++  setgcref(sbx->cowref, ref);
++  lj_gc_objbarrier(L, (GCudata *)sbx - 1, ref);
++}
++
++#if LJ_HASFFI
++MSize LJ_FASTCALL lj_bufx_more(SBufExt *sbx, MSize sz)
++{
++  lj_buf_more((SBuf *)sbx, sz);
++  return sbufleft(sbx);
++}
++#endif
++#endif
++
+ /* -- Low-level buffer put operations ------------------------------------- */
+ 
+ SBuf *lj_buf_putmem(SBuf *sb, const void *q, MSize len)
+ {
+-  char *p = lj_buf_more(sb, len);
+-  p = lj_buf_wmem(p, q, len);
+-  setsbufP(sb, p);
++  char *w = lj_buf_more(sb, len);
++  w = lj_buf_wmem(w, q, len);
++  sb->w = w;
+   return sb;
+ }
+ 
+-SBuf * LJ_FASTCALL lj_buf_putchar(SBuf *sb, int c)
++#if LJ_HASJIT || LJ_HASFFI
++static LJ_NOINLINE SBuf * LJ_FASTCALL lj_buf_putchar2(SBuf *sb, int c)
+ {
+-  char *p = lj_buf_more(sb, 1);
+-  *p++ = (char)c;
+-  setsbufP(sb, p);
++  char *w = lj_buf_more2(sb, 1);
++  *w++ = (char)c;
++  sb->w = w;
+   return sb;
+ }
+ 
++SBuf * LJ_FASTCALL lj_buf_putchar(SBuf *sb, int c)
++{
++  char *w = sb->w;
++  if (LJ_LIKELY(w < sb->e)) {
++    *w++ = (char)c;
++    sb->w = w;
++    return sb;
++  }
++  return lj_buf_putchar2(sb, c);
++}
++#endif
++
+ SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *sb, GCstr *s)
+ {
+   MSize len = s->len;
+-  char *p = lj_buf_more(sb, len);
+-  p = lj_buf_wmem(p, strdata(s), len);
+-  setsbufP(sb, p);
++  char *w = lj_buf_more(sb, len);
++  w = lj_buf_wmem(w, strdata(s), len);
++  sb->w = w;
+   return sb;
+ }
+ 
+@@ -99,47 +172,47 @@ SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *s
+ SBuf * LJ_FASTCALL lj_buf_putstr_reverse(SBuf *sb, GCstr *s)
+ {
+   MSize len = s->len;
+-  char *p = lj_buf_more(sb, len), *e = p+len;
++  char *w = lj_buf_more(sb, len), *e = w+len;
+   const char *q = strdata(s)+len-1;
+-  while (p < e)
+-    *p++ = *q--;
+-  setsbufP(sb, p);
++  while (w < e)
++    *w++ = *q--;
++  sb->w = w;
+   return sb;
+ }
+ 
+ SBuf * LJ_FASTCALL lj_buf_putstr_lower(SBuf *sb, GCstr *s)
+ {
+   MSize len = s->len;
+-  char *p = lj_buf_more(sb, len), *e = p+len;
++  char *w = lj_buf_more(sb, len), *e = w+len;
+   const char *q = strdata(s);
+-  for (; p < e; p++, q++) {
++  for (; w < e; w++, q++) {
+     uint32_t c = *(unsigned char *)q;
+ #if LJ_TARGET_PPC
+-    *p = c + ((c >= 'A' && c <= 'Z') << 5);
++    *w = c + ((c >= 'A' && c <= 'Z') << 5);
+ #else
+     if (c >= 'A' && c <= 'Z') c += 0x20;
+-    *p = c;
++    *w = c;
+ #endif
+   }
+-  setsbufP(sb, p);
++  sb->w = w;
+   return sb;
+ }
+ 
+ SBuf * LJ_FASTCALL lj_buf_putstr_upper(SBuf *sb, GCstr *s)
+ {
+   MSize len = s->len;
+-  char *p = lj_buf_more(sb, len), *e = p+len;
++  char *w = lj_buf_more(sb, len), *e = w+len;
+   const char *q = strdata(s);
+-  for (; p < e; p++, q++) {
++  for (; w < e; w++, q++) {
+     uint32_t c = *(unsigned char *)q;
+ #if LJ_TARGET_PPC
+-    *p = c - ((c >= 'a' && c <= 'z') << 5);
++    *w = c - ((c >= 'a' && c <= 'z') << 5);
+ #else
+     if (c >= 'a' && c <= 'z') c -= 0x20;
+-    *p = c;
++    *w = c;
+ #endif
+   }
+-  setsbufP(sb, p);
++  sb->w = w;
+   return sb;
+ }
+ 
+@@ -148,21 +221,21 @@ SBuf *lj_buf_putstr_rep(SBuf *sb, GCstr
+   MSize len = s->len;
+   if (rep > 0 && len) {
+     uint64_t tlen = (uint64_t)rep * len;
+-    char *p;
++    char *w;
+     if (LJ_UNLIKELY(tlen > LJ_MAX_STR))
+       lj_err_mem(sbufL(sb));
+-    p = lj_buf_more(sb, (MSize)tlen);
++    w = lj_buf_more(sb, (MSize)tlen);
+     if (len == 1) {  /* Optimize a common case. */
+       uint32_t c = strdata(s)[0];
+-      do { *p++ = c; } while (--rep > 0);
++      do { *w++ = c; } while (--rep > 0);
+     } else {
+       const char *e = strdata(s) + len;
+       do {
+ 	const char *q = strdata(s);
+-	do { *p++ = *q++; } while (q < e);
++	do { *w++ = *q++; } while (q < e);
+       } while (--rep > 0);
+     }
+-    setsbufP(sb, p);
++    sb->w = w;
+   }
+   return sb;
+ }
+@@ -173,27 +246,27 @@ SBuf *lj_buf_puttab(SBuf *sb, GCtab *t,
+   if (i <= e) {
+     for (;;) {
+       cTValue *o = lj_tab_getint(t, i);
+-      char *p;
++      char *w;
+       if (!o) {
+       badtype:  /* Error: bad element type. */
+-	setsbufP(sb, (void *)(intptr_t)i);  /* Store failing index. */
++	sb->w = (char *)(intptr_t)i;  /* Store failing index. */
+ 	return NULL;
+       } else if (tvisstr(o)) {
+ 	MSize len = strV(o)->len;
+-	p = lj_buf_wmem(lj_buf_more(sb, len + seplen), strVdata(o), len);
++	w = lj_buf_wmem(lj_buf_more(sb, len + seplen), strVdata(o), len);
+       } else if (tvisint(o)) {
+-	p = lj_strfmt_wint(lj_buf_more(sb, STRFMT_MAXBUF_INT+seplen), intV(o));
++	w = lj_strfmt_wint(lj_buf_more(sb, STRFMT_MAXBUF_INT+seplen), intV(o));
+       } else if (tvisnum(o)) {
+-	p = lj_buf_more(lj_strfmt_putfnum(sb, STRFMT_G14, numV(o)), seplen);
++	w = lj_buf_more(lj_strfmt_putfnum(sb, STRFMT_G14, numV(o)), seplen);
+       } else {
+ 	goto badtype;
+       }
+       if (i++ == e) {
+-	setsbufP(sb, p);
++	sb->w = w;
+ 	break;
+       }
+-      if (seplen) p = lj_buf_wmem(p, strdata(sep), seplen);
+-      setsbufP(sb, p);
++      if (seplen) w = lj_buf_wmem(w, strdata(sep), seplen);
++      sb->w = w;
+     }
+   }
+   return sb;
+@@ -203,7 +276,7 @@ SBuf *lj_buf_puttab(SBuf *sb, GCtab *t,
+ 
+ GCstr * LJ_FASTCALL lj_buf_tostr(SBuf *sb)
+ {
+-  return lj_str_new(sbufL(sb), sbufB(sb), sbuflen(sb));
++  return lj_str_new(sbufL(sb), sb->b, sbuflen(sb));
+ }
+ 
+ /* Concatenate two strings. */
+@@ -219,14 +292,14 @@ GCstr *lj_buf_cat2str(lua_State *L, GCst
+ /* Read ULEB128 from buffer. */
+ uint32_t LJ_FASTCALL lj_buf_ruleb128(const char **pp)
+ {
+-  const uint8_t *p = (const uint8_t *)*pp;
+-  uint32_t v = *p++;
++  const uint8_t *w = (const uint8_t *)*pp;
++  uint32_t v = *w++;
+   if (LJ_UNLIKELY(v >= 0x80)) {
+     int sh = 0;
+     v &= 0x7f;
+-    do { v |= ((*p & 0x7f) << (sh += 7)); } while (*p++ >= 0x80);
++    do { v |= ((*w & 0x7f) << (sh += 7)); } while (*w++ >= 0x80);
+   }
+-  *pp = (const char *)p;
++  *pp = (const char *)w;
+   return v;
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_buf.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_buf.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_buf.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Buffer handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_BUF_H
+@@ -10,16 +10,60 @@
+ #include "lj_gc.h"
+ #include "lj_str.h"
+ 
+-/* Resizable string buffers. Struct definition in lj_obj.h. */
+-#define sbufB(sb)	(mref((sb)->b, char))
+-#define sbufP(sb)	(mref((sb)->p, char))
+-#define sbufE(sb)	(mref((sb)->e, char))
+-#define sbufL(sb)	(mref((sb)->L, lua_State))
+-#define sbufsz(sb)	((MSize)(sbufE((sb)) - sbufB((sb))))
+-#define sbuflen(sb)	((MSize)(sbufP((sb)) - sbufB((sb))))
+-#define sbufleft(sb)	((MSize)(sbufE((sb)) - sbufP((sb))))
+-#define setsbufP(sb, q)	(setmref((sb)->p, (q)))
+-#define setsbufL(sb, l)	(setmref((sb)->L, (l)))
++/* Resizable string buffers. */
++
++/* The SBuf struct definition is in lj_obj.h:
++**   char *w;	Write pointer.
++**   char *e;	End pointer.
++**   char *b;	Base pointer.
++**   MRef L;	lua_State, used for buffer resizing. Extension bits in 3 LSB.
++*/
++
++/* Extended string buffer. */
++typedef struct SBufExt {
++  SBufHeader;
++  union {
++    GCRef cowref;	/* Copy-on-write object reference. */
++    MRef bsb;		/* Borrowed string buffer. */
++  };
++  char *r;		/* Read pointer. */
++  GCRef dict_str;	/* Serialization string dictionary table. */
++  GCRef dict_mt;	/* Serialization metatable dictionary table. */
++  int depth;		/* Remaining recursion depth. */
++} SBufExt;
++
++#define sbufsz(sb)		((MSize)((sb)->e - (sb)->b))
++#define sbuflen(sb)		((MSize)((sb)->w - (sb)->b))
++#define sbufleft(sb)		((MSize)((sb)->e - (sb)->w))
++#define sbufxlen(sbx)		((MSize)((sbx)->w - (sbx)->r))
++#define sbufxslack(sbx)		((MSize)((sbx)->r - (sbx)->b))
++
++#define SBUF_MASK_FLAG		(7)
++#define SBUF_MASK_L		(~(GCSize)SBUF_MASK_FLAG)
++#define SBUF_FLAG_EXT		1	/* Extended string buffer. */
++#define SBUF_FLAG_COW		2	/* Copy-on-write buffer. */
++#define SBUF_FLAG_BORROW	4	/* Borrowed string buffer. */
++
++#define sbufL(sb) \
++  ((lua_State *)(void *)(uintptr_t)(mrefu((sb)->L) & SBUF_MASK_L))
++#define setsbufL(sb, l)		(setmref((sb)->L, (l)))
++#define setsbufXL(sb, l, flag) \
++  (setmrefu((sb)->L, (GCSize)(uintptr_t)(void *)(l) + (flag)))
++#define setsbufXL_(sb, l) \
++  (setmrefu((sb)->L, (GCSize)(uintptr_t)(void *)(l) | (mrefu((sb)->L) & SBUF_MASK_FLAG)))
++
++#define sbufflag(sb)		(mrefu((sb)->L))
++#define sbufisext(sb)		(sbufflag((sb)) & SBUF_FLAG_EXT)
++#define sbufiscow(sb)		(sbufflag((sb)) & SBUF_FLAG_COW)
++#define sbufisborrow(sb)	(sbufflag((sb)) & SBUF_FLAG_BORROW)
++#define sbufiscoworborrow(sb)	(sbufflag((sb)) & (SBUF_FLAG_COW|SBUF_FLAG_BORROW))
++#define sbufX(sb) \
++  (lj_assertG_(G(sbufL(sb)), sbufisext(sb), "not an SBufExt"), (SBufExt *)(sb))
++#define setsbufflag(sb, flag)	(setmrefu((sb)->L, (flag)))
++
++#define tvisbuf(o) \
++  (LJ_HASBUFFER && tvisudata(o) && udataV(o)->udtype == UDTYPE_BUFFER)
++#define bufV(o)		check_exp(tvisbuf(o), ((SBufExt *)uddata(udataV(o))))
+ 
+ /* Buffer management */
+ LJ_FUNC char *LJ_FASTCALL lj_buf_need2(SBuf *sb, MSize sz);
+@@ -30,12 +74,12 @@ LJ_FUNC char * LJ_FASTCALL lj_buf_tmp(lu
+ static LJ_AINLINE void lj_buf_init(lua_State *L, SBuf *sb)
+ {
+   setsbufL(sb, L);
+-  setmref(sb->p, NULL); setmref(sb->e, NULL); setmref(sb->b, NULL);
++  sb->w = sb->e = sb->b = NULL;
+ }
+ 
+ static LJ_AINLINE void lj_buf_reset(SBuf *sb)
+ {
+-  setmrefr(sb->p, sb->b);
++  sb->w = sb->b;
+ }
+ 
+ static LJ_AINLINE SBuf *lj_buf_tmp_(lua_State *L)
+@@ -48,26 +92,77 @@ static LJ_AINLINE SBuf *lj_buf_tmp_(lua_
+ 
+ static LJ_AINLINE void lj_buf_free(global_State *g, SBuf *sb)
+ {
+-  lj_mem_free(g, sbufB(sb), sbufsz(sb));
++  lj_assertG(!sbufisext(sb), "bad free of SBufExt");
++  lj_mem_free(g, sb->b, sbufsz(sb));
+ }
+ 
+ static LJ_AINLINE char *lj_buf_need(SBuf *sb, MSize sz)
+ {
+   if (LJ_UNLIKELY(sz > sbufsz(sb)))
+     return lj_buf_need2(sb, sz);
+-  return sbufB(sb);
++  return sb->b;
+ }
+ 
+ static LJ_AINLINE char *lj_buf_more(SBuf *sb, MSize sz)
+ {
+   if (LJ_UNLIKELY(sz > sbufleft(sb)))
+     return lj_buf_more2(sb, sz);
+-  return sbufP(sb);
++  return sb->w;
++}
++
++/* Extended buffer management */
++static LJ_AINLINE void lj_bufx_init(lua_State *L, SBufExt *sbx)
++{
++  memset(sbx, 0, sizeof(SBufExt));
++  setsbufXL(sbx, L, SBUF_FLAG_EXT);
++}
++
++static LJ_AINLINE void lj_bufx_set_borrow(lua_State *L, SBufExt *sbx, SBuf *sb)
++{
++  setsbufXL(sbx, L, SBUF_FLAG_EXT | SBUF_FLAG_BORROW);
++  setmref(sbx->bsb, sb);
++  sbx->r = sbx->w = sbx->b = sb->b;
++  sbx->e = sb->e;
++}
++
++static LJ_AINLINE void lj_bufx_set_cow(lua_State *L, SBufExt *sbx,
++				       const char *p, MSize len)
++{
++  setsbufXL(sbx, L, SBUF_FLAG_EXT | SBUF_FLAG_COW);
++  sbx->r = sbx->b = (char *)p;
++  sbx->w = sbx->e = (char *)p + len;
++}
++
++static LJ_AINLINE void lj_bufx_reset(SBufExt *sbx)
++{
++  if (sbufiscow(sbx)) {
++    setmrefu(sbx->L, (mrefu(sbx->L) & ~(GCSize)SBUF_FLAG_COW));
++    setgcrefnull(sbx->cowref);
++    sbx->b = sbx->e = NULL;
++  }
++  sbx->r = sbx->w = sbx->b;
+ }
+ 
++static LJ_AINLINE void lj_bufx_free(lua_State *L, SBufExt *sbx)
++{
++  if (!sbufiscoworborrow(sbx)) lj_mem_free(G(L), sbx->b, sbufsz(sbx));
++  setsbufXL(sbx, L, SBUF_FLAG_EXT);
++  setgcrefnull(sbx->cowref);
++  sbx->r = sbx->w = sbx->b = sbx->e = NULL;
++}
++
++#if LJ_HASBUFFER && LJ_HASJIT
++LJ_FUNC void lj_bufx_set(SBufExt *sbx, const char *p, MSize len, GCobj *o);
++#if LJ_HASFFI
++LJ_FUNC MSize LJ_FASTCALL lj_bufx_more(SBufExt *sbx, MSize sz);
++#endif
++#endif
++
+ /* Low-level buffer put operations */
+ LJ_FUNC SBuf *lj_buf_putmem(SBuf *sb, const void *q, MSize len);
++#if LJ_HASJIT || LJ_HASFFI
+ LJ_FUNC SBuf * LJ_FASTCALL lj_buf_putchar(SBuf *sb, int c);
++#endif
+ LJ_FUNC SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *sb, GCstr *s);
+ 
+ static LJ_AINLINE char *lj_buf_wmem(char *p, const void *q, MSize len)
+@@ -77,9 +172,9 @@ static LJ_AINLINE char *lj_buf_wmem(char
+ 
+ static LJ_AINLINE void lj_buf_putb(SBuf *sb, int c)
+ {
+-  char *p = lj_buf_more(sb, 1);
+-  *p++ = (char)c;
+-  setsbufP(sb, p);
++  char *w = lj_buf_more(sb, 1);
++  *w++ = (char)c;
++  sb->w = w;
+ }
+ 
+ /* High-level buffer put operations */
+@@ -97,7 +192,7 @@ LJ_FUNC uint32_t LJ_FASTCALL lj_buf_rule
+ 
+ static LJ_AINLINE GCstr *lj_buf_str(lua_State *L, SBuf *sb)
+ {
+-  return lj_str_new(L, sbufB(sb), sbuflen(sb));
++  return lj_str_new(L, sb->b, sbuflen(sb));
+ }
+ 
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_carith.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_carith.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_carith.c
+@@ -1,6 +1,6 @@
+ /*
+ ** C data arithmetic.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -44,9 +44,13 @@ static int carith_checkarg(lua_State *L,
+ 	p = (uint8_t *)cdata_getptr(p, ct->size);
+ 	if (ctype_isref(ct->info)) ct = ctype_rawchild(cts, ct);
+       } else if (ctype_isfunc(ct->info)) {
++	CTypeID id0 = i ? ctype_typeid(cts, ca->ct[0]) : 0;
+ 	p = (uint8_t *)*(void **)p;
+ 	ct = ctype_get(cts,
+ 	  lj_ctype_intern(cts, CTINFO(CT_PTR, CTALIGN_PTR|id), CTSIZE_PTR));
++	if (i) {  /* cts->tab may have been reallocated. */
++	  ca->ct[0] = ctype_get(cts, id0);
++	}
+       }
+       if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
+       ca->ct[i] = ct;
+@@ -207,7 +211,7 @@ static int carith_int64(lua_State *L, CT
+       else
+ 	*up = lj_carith_powu64(u0, u1);
+       break;
+-    case MM_unm: *up = (uint64_t)-(int64_t)u0; break;
++    case MM_unm: *up = ~u0+1u; break;
+     default:
+       lj_assertL(0, "bad metamethod %d", mm);
+       break;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_carith.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_carith.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_carith.h
+@@ -1,6 +1,6 @@
+ /*
+ ** C data arithmetic.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CARITH_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccall.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ccall.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccall.c
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI C call handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -20,12 +20,15 @@
+ #if LJ_TARGET_X86
+ /* -- x86 calling conventions --------------------------------------------- */
+ 
++#define CCALL_PUSH(arg) \
++  *(GPRArg *)((uint8_t *)cc->stack + nsp) = (GPRArg)(arg), nsp += CTSIZE_PTR
++
+ #if LJ_ABI_WIN
+ 
+ #define CCALL_HANDLE_STRUCTRET \
+   /* Return structs bigger than 8 by reference (on stack only). */ \
+   cc->retref = (sz > 8); \
+-  if (cc->retref) cc->stack[nsp++] = (GPRArg)dp;
++  if (cc->retref) CCALL_PUSH(dp);
+ 
+ #define CCALL_HANDLE_COMPLEXRET CCALL_HANDLE_STRUCTRET
+ 
+@@ -40,7 +43,7 @@
+     if (ngpr < maxgpr) \
+       cc->gpr[ngpr++] = (GPRArg)dp; \
+     else \
+-      cc->stack[nsp++] = (GPRArg)dp; \
++      CCALL_PUSH(dp); \
+   } else {  /* Struct with single FP field ends up in FPR. */ \
+     cc->resx87 = ccall_classify_struct(cts, ctr); \
+   }
+@@ -56,7 +59,7 @@
+   if (ngpr < maxgpr) \
+     cc->gpr[ngpr++] = (GPRArg)dp; \
+   else \
+-    cc->stack[nsp++] = (GPRArg)dp;
++    CCALL_PUSH(dp);
+ 
+ #endif
+ 
+@@ -67,7 +70,7 @@
+     if (ngpr < maxgpr) \
+       cc->gpr[ngpr++] = (GPRArg)dp; \
+     else \
+-      cc->stack[nsp++] = (GPRArg)dp; \
++      CCALL_PUSH(dp); \
+   }
+ 
+ #endif
+@@ -278,8 +281,8 @@
+   if (ngpr < maxgpr) { \
+     dp = &cc->gpr[ngpr]; \
+     if (ngpr + n > maxgpr) { \
+-      nsp += ngpr + n - maxgpr;  /* Assumes contiguous gpr/stack fields. */ \
+-      if (nsp > CCALL_MAXSTACK) goto err_nyi;  /* Too many arguments. */ \
++      nsp += (ngpr + n - maxgpr) * CTSIZE_PTR;  /* Assumes contiguous gpr/stack fields. */ \
++      if (nsp > CCALL_SIZE_STACK) goto err_nyi;  /* Too many arguments. */ \
+       ngpr = maxgpr; \
+     } else { \
+       ngpr += n; \
+@@ -334,7 +337,7 @@
+   isfp = sz == 2*sizeof(float) ? 2 : 1;
+ 
+ #define CCALL_HANDLE_REGARG \
+-  if (LJ_TARGET_IOS && isva) { \
++  if (LJ_TARGET_OSX && isva) { \
+     /* IOS: All variadic arguments are on the stack. */ \
+   } else if (isfp) {  /* Try to pass argument in FPRs. */ \
+     int n2 = ctype_isvector(d->info) ? 1 : \
+@@ -345,10 +348,9 @@
+       goto done; \
+     } else { \
+       nfpr = CCALL_NARG_FPR;  /* Prevent reordering. */ \
+-      if (LJ_TARGET_IOS && d->size < 8) goto err_nyi; \
+     } \
+   } else {  /* Try to pass argument in GPRs. */ \
+-    if (!LJ_TARGET_IOS && (d->info & CTF_ALIGN) > CTALIGN_PTR) \
++    if (!LJ_TARGET_OSX && (d->info & CTF_ALIGN) > CTALIGN_PTR) \
+       ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
+     if (ngpr + n <= maxgpr) { \
+       dp = &cc->gpr[ngpr]; \
+@@ -356,7 +358,6 @@
+       goto done; \
+     } else { \
+       ngpr = maxgpr;  /* Prevent reordering. */ \
+-      if (LJ_TARGET_IOS && d->size < 8) goto err_nyi; \
+     } \
+   }
+ 
+@@ -471,8 +472,8 @@
+   if (ngpr < maxgpr) { \
+     dp = &cc->gpr[ngpr]; \
+     if (ngpr + n > maxgpr) { \
+-     nsp += ngpr + n - maxgpr;  /* Assumes contiguous gpr/stack fields. */ \
+-     if (nsp > CCALL_MAXSTACK) goto err_nyi;  /* Too many arguments. */ \
++     nsp += (ngpr + n - maxgpr) * CTSIZE_PTR;  /* Assumes contiguous gpr/stack fields. */ \
++     if (nsp > CCALL_SIZE_STACK) goto err_nyi;  /* Too many arguments. */ \
+      ngpr = maxgpr; \
+     } else { \
+      ngpr += n; \
+@@ -565,8 +566,8 @@
+   if (ngpr < maxgpr) { \
+     dp = &cc->gpr[ngpr]; \
+     if (ngpr + n > maxgpr) { \
+-      nsp += ngpr + n - maxgpr;  /* Assumes contiguous gpr/stack fields. */ \
+-      if (nsp > CCALL_MAXSTACK) goto err_nyi;  /* Too many arguments. */ \
++      nsp += (ngpr + n - maxgpr) * CTSIZE_PTR;  /* Assumes contiguous gpr/stack fields. */ \
++      if (nsp > CCALL_SIZE_STACK) goto err_nyi;  /* Too many arguments. */ \
+       ngpr = maxgpr; \
+     } else { \
+       ngpr += n; \
+@@ -574,6 +575,97 @@
+     goto done; \
+   }
+ 
++#elif LJ_TARGET_RISCV64
++/* -- RISC-V lp64d calling conventions ------------------------------------ */
++
++#define CCALL_HANDLE_STRUCTRET \
++  /* Return structs of size > 16 by reference. */ \
++  cc->retref = !(sz <= 16); \
++  if (cc->retref) cc->gpr[ngpr++] = (GPRArg)dp;
++
+#define CCALL_HANDLE_STRUCTRET2 \
+  unsigned int cl = ccall_classify_struct(cts, ctr); \
+  if ((cl & 4) && (cl >> 8) <= 2) {  /* 1 or 2 floats: one per FPR. */ \
+    CTSize i = (cl >> 8) - 1; \
+    do { ((float *)dp)[i] = cc->fpr[i].f; } while (i--); \
+  } else { \
+    if (cl > 1) {  /* FP-only struct: from FPRs, unless > 2 fields. */ \
+      sp = (uint8_t *)&cc->fpr[0]; \
+      if ((cl >> 8) > 2) \
+        sp = (uint8_t *)&cc->gpr[0]; \
+    } \
+    memcpy(dp, sp, ctr->size); \
+  }
++
++#define CCALL_HANDLE_COMPLEXRET \
++  /* Complex values are returned in 1 or 2 FPRs. */ \
++  cc->retref = 0;
++
++#define CCALL_HANDLE_COMPLEXRET2 \
++  if (ctr->size == 2*sizeof(float)) {  /* Copy complex float from FPRs. */ \
++    ((float *)dp)[0] = cc->fpr[0].f; \
++    ((float *)dp)[1] = cc->fpr[1].f; \
++  } else {  /* Copy complex double from FPRs. */ \
++    ((double *)dp)[0] = cc->fpr[0].d; \
++    ((double *)dp)[1] = cc->fpr[1].d; \
++  }
++
++#define CCALL_HANDLE_COMPLEXARG \
++  /* Pass long double complex by reference. */ \
++  if (sz == 2*sizeof(long double)) { \
++    rp = cdataptr(lj_cdata_new(cts, did, sz)); \
++    sz = CTSIZE_PTR; \
++  } \
++  /* Pass complex in two FPRs or on stack. */ \
++  else if (sz == 2*sizeof(float)) { \
++    isfp = 2; \
++    sz = 2*CTSIZE_PTR; \
++  } else { \
++    isfp = 1; \
++    sz = 2*CTSIZE_PTR; \
++  }
++
++#define CCALL_HANDLE_RET \
++  if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
++    sp = (uint8_t *)&cc->fpr[0].f;
++
++#define CCALL_HANDLE_STRUCTARG \
++  /* Pass structs of size >16 by reference. */ \
++  unsigned int cl = ccall_classify_struct(cts, d); \
++  nff = cl >> 8; \
++  if (sz > 16) { \
++    rp = cdataptr(lj_cdata_new(cts, did, sz)); \
++    sz = CTSIZE_PTR; \
++  } \
++  /* Pass struct in FPRs. */ \
++  if (cl > 1) { \
++    isfp = (cl & 4) ? 2 : 1; \
++  }
++
++
++#define CCALL_HANDLE_REGARG \
++  if (isfp && (!isva)) {  /* Try to pass argument in FPRs. */ \
++    int n2 = ctype_isvector(d->info) ? 1 : \
++            isfp == 1 ? n : 2; \
++    if (nfpr + n2 <= CCALL_NARG_FPR && nff <= 2) { \
++      dp = &cc->fpr[nfpr]; \
++      nfpr += n2; \
++      goto done; \
++    } else { \
++      if (ngpr + n2 <= maxgpr) { \
++       dp = &cc->gpr[ngpr]; \
++       ngpr += n2; \
++       goto done; \
++      } \
++    } \
++  } else {  /* Try to pass argument in GPRs. */ \
++      if (ngpr + n <= maxgpr) { \
++        dp = &cc->gpr[ngpr]; \
++        ngpr += n; \
++        goto done; \
++    } \
++  }
++
+ #else
+ #error "Missing calling convention definitions for this architecture"
+ #endif
+@@ -698,10 +790,11 @@ static int ccall_struct_arg(CCallState *
+   lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg));
+   if (ccall_struct_reg(cc, cts, dp, rcl)) {
+     /* Register overflow? Pass on stack. */
+-    MSize nsp = cc->nsp, n = rcl[1] ? 2 : 1;
+-    if (nsp + n > CCALL_MAXSTACK) return 1;  /* Too many arguments. */
+-    cc->nsp = nsp + n;
+-    memcpy(&cc->stack[nsp], dp, n*CTSIZE_PTR);
++    MSize nsp = cc->nsp, sz = rcl[1] ? 2*CTSIZE_PTR : CTSIZE_PTR;
++    if (nsp + sz > CCALL_SIZE_STACK)
++      return 1;  /* Too many arguments. */
++    cc->nsp = nsp + sz;
++    memcpy((uint8_t *)cc->stack + nsp, dp, sz);
+   }
+   return 0;  /* Ok. */
+ }
+@@ -889,6 +982,51 @@ static void ccall_copy_struct(CCallState
+ 
+ #endif
+ 
++/* -- RISC-V ABI struct classification ---------------------------- */
++
+#if LJ_TARGET_RISCV64
+/* Returns FP size (4/8) + (FP field count << 8), otherwise (size <= 16). */
+static unsigned int ccall_classify_struct(CTState *cts, CType *ct)
+{
+  CTSize sz = ct->size;
+  unsigned int r = 0, n = 0, isu = (ct->info & CTF_UNION);
+  while (ct->sib) {
+    CType *sct;
+    ct = ctype_get(cts, ct->sib);
+    if (ctype_isfield(ct->info)) {
+      sct = ctype_rawchild(cts, ct);
+      if (ctype_isfp(sct->info)) {
+	r |= sct->size;
+	if (!isu) n++; else if (n == 0) n = 1;  /* Union: count the max. */
+      } else if (ctype_iscomplex(sct->info)) {
+	r |= (sct->size >> 1);  /* Size of one component. */
+	if (!isu) n += 2; else if (n < 2) n = 2;
+      } else if (ctype_isstruct(sct->info)) {
+	goto substruct;
+      } else {
+	goto noth;
+      }
+    } else if (ctype_isbitfield(ct->info)) {
+      goto noth;
+    } else if (ctype_isxattrib(ct->info, CTA_SUBTYPE)) {
+      sct = ctype_rawchild(cts, ct);
+    substruct:
+      if (sct->size > 0) {
+	unsigned int s = ccall_classify_struct(cts, sct);
+	if (s <= 1) goto noth;
+	r |= (s & 255);
+	if (!isu) n += (s >> 8); else if (n < (s >>8)) n = (s >> 8);
+      }
+    }
+  }
+  if ((r == 4 || r == 8) && n <= 4)  /* A single FP size, at most 4 fields. */
+    return r + (n << 8);
+noth:  /* Not a homogeneous float/double aggregate. */
+  return (sz <= 16);  /* Return structs of size <= 16 in GPRs. */
+}
+
+#endif
++
+ /* -- Common C call handling ---------------------------------------------- */
+ 
+ /* Infer the destination CTypeID for a vararg argument. */
+@@ -935,6 +1073,10 @@ static int ccall_set_args(lua_State *L,
+ #endif
+ #endif
+ 
++#if LJ_TARGET_RISCV64
++  int nff = 0;
++#endif
++
+   /* Clear unused regs to get some determinism in case of misdeclaration. */
+   memset(cc->gpr, 0, sizeof(cc->gpr));
+ #if CCALL_NUM_FPR
+@@ -983,6 +1125,14 @@ static int ccall_set_args(lua_State *L,
+     fid = ctf->sib;
+   }
+ 
++#if LJ_TARGET_ARM64 && LJ_ABI_WIN
++  if ((ct->info & CTF_VARARG)) {
++    nsp -= maxgpr * CTSIZE_PTR;  /* May end up with negative nsp. */
++    ngpr = maxgpr;
++    nfpr = CCALL_NARG_FPR;
++  }
++#endif
++
+   /* Walk through all passed arguments. */
+   for (o = L->base+1, narg = 1; o < top; o++, narg++) {
+     CTypeID did;
+@@ -1019,25 +1169,31 @@ static int ccall_set_args(lua_State *L,
+       CCALL_HANDLE_STRUCTARG
+     } else if (ctype_iscomplex(d->info)) {
+       CCALL_HANDLE_COMPLEXARG
+-    } else {
++    } else if (!(CCALL_PACK_STACKARG && ctype_isenum(d->info))) {
+       sz = CTSIZE_PTR;
+     }
+-    sz = (sz + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1);
+-    n = sz / CTSIZE_PTR;  /* Number of GPRs or stack slots needed. */
++    n = (sz + CTSIZE_PTR-1) / CTSIZE_PTR;  /* Number of GPRs or stack slots needed. */
+ 
+     CCALL_HANDLE_REGARG  /* Handle register arguments. */
+ 
+     /* Otherwise pass argument on stack. */
+-    if (CCALL_ALIGN_STACKARG && !rp && (d->info & CTF_ALIGN) > CTALIGN_PTR) {
+-      MSize align = (1u << ctype_align(d->info-CTALIGN_PTR)) -1;
+-      nsp = (nsp + align) & ~align;  /* Align argument on stack. */
++    if (CCALL_ALIGN_STACKARG) {  /* Align argument on stack. */
++      MSize align = (1u << ctype_align(d->info)) - 1;
++      if (rp || (CCALL_PACK_STACKARG && isva && align < CTSIZE_PTR-1))
++	align = CTSIZE_PTR-1;
++      nsp = (nsp + align) & ~align;
+     }
+-    if (nsp + n > CCALL_MAXSTACK) {  /* Too many arguments. */
++#if LJ_TARGET_ARM64 && LJ_ABI_WIN
++    /* A negative nsp points into cc->gpr. Blame MS for their messy ABI. */
++    dp = ((uint8_t *)cc->stack) + (int32_t)nsp;
++#else
++    dp = ((uint8_t *)cc->stack) + nsp;
++#endif
++    nsp += CCALL_PACK_STACKARG ? sz : n * CTSIZE_PTR;
++    if ((int32_t)nsp > CCALL_SIZE_STACK) {  /* Too many arguments. */
+     err_nyi:
+       lj_err_caller(L, LJ_ERR_FFI_NYICALL);
+     }
+-    dp = &cc->stack[nsp];
+-    nsp += n;
+     isva = 0;
+ 
+   done:
+@@ -1048,7 +1204,8 @@ static int ccall_set_args(lua_State *L,
+     }
+     lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg));
+     /* Extend passed integers to 32 bits at least. */
+-    if (ctype_isinteger_or_bool(d->info) && d->size < 4) {
++    if (ctype_isinteger_or_bool(d->info) && d->size < 4 &&
++	(!CCALL_PACK_STACKARG || !((uintptr_t)dp & 3))) {  /* Assumes LJ_LE. */
+       if (d->info & CTF_UNSIGNED)
+ 	*(uint32_t *)dp = d->size == 1 ? (uint32_t)*(uint8_t *)dp :
+ 					 (uint32_t)*(uint16_t *)dp;
+@@ -1060,7 +1217,11 @@ static int ccall_set_args(lua_State *L,
+     if (isfp && d->size == sizeof(float))
+       ((float *)dp)[1] = ((float *)dp)[0];  /* Floats occupy high slot. */
+ #endif
+-#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE)
++#if LJ_TARGET_RISCV64
++    if (isfp && d->size == sizeof(float))
++      ((uint32_t *)dp)[1] = 0xffffffffu;  /* Float NaN boxing */
++#endif
++#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE) || LJ_TARGET_RISCV64
+     if ((ctype_isinteger_or_bool(d->info) || ctype_isenum(d->info)
+ #if LJ_TARGET_MIPS64
+ 	 || (isfp && nsp == 0)
+@@ -1090,19 +1251,30 @@ static int ccall_set_args(lua_State *L,
+       CTSize i = (sz >> 2) - 1;
+       do { ((uint64_t *)dp)[i] = ((uint32_t *)dp)[i]; } while (i--);
+     }
+#elif LJ_TARGET_RISCV64
+    if (isfp == 2 && nff <= 2) {  /* NaN-box each float in its own slot. */
+      CTSize i = sz > sizeof(float);  /* 1 or 2 slots; complex has sz 16. */
+      do {
+        ((uint64_t *)dp)[i] = 0xffffffff00000000ul | ((uint32_t *)dp)[i];
+      } while (i--);
+    }
+    nff = 0;  /* Only set for structs; must not leak into the next argument. */
+ #else
+     UNUSED(isfp);
+ #endif
+   }
+   if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG);  /* Too few arguments. */
++#if LJ_TARGET_ARM64 && LJ_ABI_WIN
++  if ((int32_t)nsp < 0) nsp = 0;
++#endif
+ 
+-#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP)
++#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) || LJ_TARGET_RISCV64
+   cc->nfpr = nfpr;  /* Required for vararg functions. */
+ #endif
+-  cc->nsp = nsp;
+-  cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA)*CTSIZE_PTR;
+-  if (nsp > CCALL_SPS_FREE)
+-    cc->spadj += (((nsp-CCALL_SPS_FREE)*CTSIZE_PTR + 15u) & ~15u);
++  cc->nsp = (nsp + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1);
++  cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA) * CTSIZE_PTR;
++  if (cc->nsp > CCALL_SPS_FREE * CTSIZE_PTR)
++    cc->spadj += (((cc->nsp - CCALL_SPS_FREE * CTSIZE_PTR) + 15u) & ~15u);
+   return gcsteps;
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccall.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ccall.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccall.h
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI C call handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CCALL_H
+@@ -75,6 +75,9 @@ typedef union FPRArg {
+ #define CCALL_NARG_FPR		8
+ #define CCALL_NRET_FPR		4
+ #define CCALL_SPS_FREE		0
++#if LJ_TARGET_OSX
++#define CCALL_PACK_STACKARG	1
++#endif
+ 
+ typedef intptr_t GPRArg;
+ typedef union FPRArg {
+@@ -126,6 +129,21 @@ typedef union FPRArg {
+   struct { LJ_ENDIAN_LOHI(float f; , float g;) };
+ } FPRArg;
+ 
++#elif LJ_TARGET_RISCV64
++/* NOTE(review): presumably a0-a7/fa0-fa7 for args, a0-a1/fa0-fa1 for results -- confirm. */
++#define CCALL_NARG_GPR		8
++#define CCALL_NARG_FPR		8
++#define CCALL_NRET_GPR		2
++#define CCALL_NRET_FPR		2
++#define CCALL_SPS_EXTRA		3
++#define CCALL_SPS_FREE		1
++
++typedef intptr_t GPRArg;
++typedef union FPRArg {
++  double d;  /* Either one double or two floats, ordered by LJ_ENDIAN_LOHI. */
++  struct { LJ_ENDIAN_LOHI(float f; , float g;) };
++} FPRArg;
++
+ #else
+ #error "Missing calling convention definitions for this architecture"
+ #endif
+@@ -139,6 +157,9 @@ typedef union FPRArg {
+ #ifndef CCALL_ALIGN_STACKARG
+ #define CCALL_ALIGN_STACKARG	1
+ #endif
++#ifndef CCALL_PACK_STACKARG
++#define CCALL_PACK_STACKARG	0
++#endif
+ #ifndef CCALL_ALIGN_CALLSTATE
+ #define CCALL_ALIGN_CALLSTATE	8
+ #endif
+@@ -152,14 +173,15 @@ typedef union FPRArg {
+ LJ_STATIC_ASSERT(CCALL_NUM_GPR <= CCALL_MAX_GPR);
+ LJ_STATIC_ASSERT(CCALL_NUM_FPR <= CCALL_MAX_FPR);
+ 
+-#define CCALL_MAXSTACK		32
++#define CCALL_NUM_STACK		31
++#define CCALL_SIZE_STACK	(CCALL_NUM_STACK * CTSIZE_PTR)
+ 
+ /* -- C call state -------------------------------------------------------- */
+ 
+ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState {
+   void (*func)(void);		/* Pointer to called function. */
+   uint32_t spadj;		/* Stack pointer adjustment. */
+-  uint8_t nsp;			/* Number of stack slots. */
++  uint8_t nsp;			/* Number of bytes on stack. */
+   uint8_t retref;		/* Return value by reference. */
+ #if LJ_TARGET_X64
+   uint8_t ngpr;			/* Number of arguments in GPRs. */
+@@ -168,7 +190,7 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE)
+   uint8_t resx87;		/* Result on x87 stack: 1:float, 2:double. */
+ #elif LJ_TARGET_ARM64
+   void *retp;			/* Aggregate return pointer in x8. */
+-#elif LJ_TARGET_PPC
++#elif LJ_TARGET_PPC || LJ_TARGET_RISCV64
+   uint8_t nfpr;			/* Number of arguments in FPRs. */
+ #endif
+ #if LJ_32
+@@ -178,7 +200,7 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE)
+   FPRArg fpr[CCALL_NUM_FPR];	/* Arguments/results in FPRs. */
+ #endif
+   GPRArg gpr[CCALL_NUM_GPR];	/* Arguments/results in GPRs. */
+-  GPRArg stack[CCALL_MAXSTACK];	/* Stack slots. */
++  GPRArg stack[CCALL_NUM_STACK];	/* Stack slots. */
+ } CCallState;
+ 
+ /* -- C call handling ----------------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccallback.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ccallback.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccallback.c
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI C callback handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -71,6 +71,10 @@ static MSize CALLBACK_OFS2SLOT(MSize ofs
+ 
+ #define CALLBACK_MCODE_HEAD		52
+ 
++#elif LJ_TARGET_RISCV64
++
++#define CALLBACK_MCODE_HEAD		68  /* = 17 32-bit insns emitted ahead of the slot stubs. */
++
+ #else
+ 
+ /* Missing support for this architecture. */
+@@ -171,13 +175,13 @@ static void *callback_mcode_init(global_
+ static void *callback_mcode_init(global_State *g, uint32_t *page)
+ {
+   uint32_t *p = page;
+-  void *target = (void *)lj_vm_ffi_callback;
++  ASMFunction target = lj_vm_ffi_callback;
+   MSize slot;
+   *p++ = A64I_LE(A64I_LDRLx | A64F_D(RID_X11) | A64F_S19(4));
+   *p++ = A64I_LE(A64I_LDRLx | A64F_D(RID_X10) | A64F_S19(5));
+-  *p++ = A64I_LE(A64I_BR | A64F_N(RID_X11));
++  *p++ = A64I_LE(A64I_BR_AUTH | A64F_N(RID_X11));
+   *p++ = A64I_LE(A64I_NOP);
+-  ((void **)p)[0] = target;
++  ((ASMFunction *)p)[0] = target;
+   ((void **)p)[1] = g;
+   p += 4;
+   for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {
+@@ -238,6 +242,39 @@ static void *callback_mcode_init(global_
+   }
+   return p;
+ }
++#elif LJ_TARGET_RISCV64
++static void *callback_mcode_init(global_State *g, uint32_t *page)
++{  /* Emit the shared head plus one stub per slot into page; return the end pointer. */
++  uint32_t *p = page;
++  uintptr_t target = (uintptr_t)(void *)lj_vm_ffi_callback;
++  uintptr_t ug = (uintptr_t)(void *)g;
++  uintptr_t target_hi = (target >> 32), target_lo = target & 0xffffffffULL;
++  uintptr_t ug_hi = (ug >> 32), ug_lo = ug & 0xffffffffULL;
++  MSize slot;  /* Head: 17 insns building x6 = target and x7 = g, interleaved. */
++  *p++ = RISCVI_LUI  | RISCVF_D(RID_X6) | RISCVF_IMMU(RISCVF_HI(target_hi));  /* Bits 63..32 via LUI+ADDI. */
++  *p++ = RISCVI_LUI  | RISCVF_D(RID_X7) | RISCVF_IMMU(RISCVF_HI(ug_hi));
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_IMMI(RISCVF_LO(target_hi));
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_IMMI(RISCVF_LO(ug_hi));
++  *p++ = RISCVI_SLLI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_SHAMT(11);  /* Then add bits 31..21. */
++  *p++ = RISCVI_SLLI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_SHAMT(11);
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_IMMI(target_lo >> 21);
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_IMMI(ug_lo >> 21);
++  *p++ = RISCVI_SLLI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_SHAMT(11);  /* Then add bits 20..10. */
++  *p++ = RISCVI_SLLI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_SHAMT(11);
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_IMMI((target_lo >> 10) & 0x7ff);
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_IMMI((ug_lo >> 10) & 0x7ff);
++  *p++ = RISCVI_SLLI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_SHAMT(10);  /* Then add bits 9..0. */
++  *p++ = RISCVI_SLLI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_SHAMT(10);
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X6) | RISCVF_S1(RID_X6) | RISCVF_IMMI(target_lo & 0x3ff);
++  *p++ = RISCVI_ADDI | RISCVF_D(RID_X7) | RISCVF_S1(RID_X7) | RISCVF_IMMI(ug_lo & 0x3ff);
++  *p++ = RISCVI_JALR | RISCVF_D(RID_X0) | RISCVF_S1(RID_X6) | RISCVF_IMMJ(0);  /* Jump to x6, rd = X0. NOTE(review): JALR is I-type; IMMJ(0) presumably encodes like IMMI(0) -- confirm. */
++  for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {  /* Two insns per slot stub. */
++    *p++ = RISCVI_LUI | RISCVF_D(RID_X5) | RISCVF_IMMU(slot);  /* Slot number goes into x5 via the LUI immediate. */
++    *p = RISCVI_JAL | RISCVF_IMMJ(((char *)page-(char *)p));  /* Jump back to the head at the start of page. */
++    p++;
++  }
++  return p;
++}
+ #else
+ /* Missing support for this architecture. */
+ #define callback_mcode_init(g, p)	(p)
+@@ -256,6 +293,11 @@ static void *callback_mcode_init(global_
+ #ifndef MAP_ANONYMOUS
+ #define MAP_ANONYMOUS   MAP_ANON
+ #endif
++#ifdef PROT_MPROTECT  /* E.g. NetBSD: mmap() must pre-declare protections a later mprotect() may add. */
++#define CCPROT_CREATE	(PROT_MPROTECT(PROT_EXEC))
++#else
++#define CCPROT_CREATE	0  /* Nothing extra is ORed into the mmap() protection. */
++#endif
+ 
+ #endif
+ 
+@@ -271,7 +313,7 @@ static void callback_mcode_new(CTState *
+   if (!p)
+     lj_err_caller(cts->L, LJ_ERR_FFI_CBACKOV);
+ #elif LJ_TARGET_POSIX
+-  p = mmap(NULL, sz, (PROT_READ|PROT_WRITE), MAP_PRIVATE|MAP_ANONYMOUS,
++  p = mmap(NULL, sz, (PROT_READ|PROT_WRITE|CCPROT_CREATE), MAP_PRIVATE|MAP_ANONYMOUS,
+ 	   -1, 0);
+   if (p == MAP_FAILED)
+     lj_err_caller(cts->L, LJ_ERR_FFI_CBACKOV);
+@@ -409,7 +451,7 @@ void lj_ccallback_mcode_free(CTState *ct
+       nfpr = CCALL_NARG_FPR;  /* Prevent reordering. */ \
+     } \
+   } else { \
+-    if (!LJ_TARGET_IOS && n > 1) \
++    if (!LJ_TARGET_OSX && n > 1) \
+       ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
+     if (ngpr + n <= maxgpr) { \
+       sp = &cts->cb.gpr[ngpr]; \
+@@ -511,6 +553,31 @@ void lj_ccallback_mcode_free(CTState *ct
+   if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
+     ((float *)dp)[1] = *(float *)dp;
+ 
++#elif LJ_TARGET_RISCV64
++/* FP args use FPR slots first, then GPR slots; other args use GPR slots only. */
++#define CALLBACK_HANDLE_REGARG \
++  if (isfp) { \
++    if (nfpr + n <= CCALL_NARG_FPR) { \
++      sp = &cts->cb.fpr[nfpr]; \
++      nfpr += n; \
++      goto done; \
++    } else if (ngpr + n <= maxgpr) { \
++      sp = &cts->cb.gpr[ngpr]; \
++      ngpr += n; \
++      goto done; \
++    } \
++  } else { \
++    if (ngpr + n <= maxgpr) { \
++      sp = &cts->cb.gpr[ngpr]; \
++      ngpr += n; \
++      goto done; \
++    } \
++  }
++/* A float result is duplicated into the 32 bit word following it at dp. */
++#define CALLBACK_HANDLE_RET \
++  if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
++    ((float *)dp)[1] = *(float *)dp;
++
+ #else
+ #error "Missing calling convention definitions for this architecture"
+ #endif
+@@ -657,7 +724,7 @@ static void callback_conv_result(CTState
+ 	*(int32_t *)dp = ctr->size == 1 ? (int32_t)*(int8_t *)dp :
+ 					  (int32_t)*(int16_t *)dp;
+     }
+-#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE)
++#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE) || LJ_TARGET_RISCV64
+     /* Always sign-extend results to 64 bits. Even a soft-fp 'float'. */
+     if (ctr->size <= 4 &&
+ 	(LJ_ABI_SOFTFP || ctype_isinteger_or_bool(ctr->info)))
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccallback.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ccallback.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ccallback.h
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI C callback handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CCALLBACK_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cconv.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_cconv.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cconv.c
+@@ -1,6 +1,6 @@
+ /*
+ ** C type conversions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -8,6 +8,7 @@
+ #if LJ_HASFFI
+ 
+ #include "lj_err.h"
++#include "lj_buf.h"
+ #include "lj_tab.h"
+ #include "lj_ctype.h"
+ #include "lj_cdata.h"
+@@ -568,7 +569,9 @@ void lj_cconv_ct_tv(CTState *cts, CType
+     }
+     s = ctype_raw(cts, sid);
+     if (ctype_isfunc(s->info)) {
++      CTypeID did = ctype_typeid(cts, d);
+       sid = lj_ctype_intern(cts, CTINFO(CT_PTR, CTALIGN_PTR|sid), CTSIZE_PTR);
++      d = ctype_get(cts, did);  /* cts->tab may have been reallocated. */
+     } else {
+       if (ctype_isenum(s->info)) s = ctype_child(cts, s);
+       goto doconv;
+@@ -619,6 +622,8 @@ void lj_cconv_ct_tv(CTState *cts, CType
+     tmpptr = uddata(ud);
+     if (ud->udtype == UDTYPE_IO_FILE)
+       tmpptr = *(void **)tmpptr;
++    else if (ud->udtype == UDTYPE_BUFFER)
++      tmpptr = ((SBufExt *)tmpptr)->r;
+   } else if (tvislightud(o)) {
+     tmpptr = lightudV(cts->g, o);
+   } else if (tvisfunc(o)) {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cconv.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_cconv.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cconv.h
+@@ -1,6 +1,6 @@
+ /*
+ ** C type conversions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CCONV_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cdata.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_cdata.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cdata.c
+@@ -1,6 +1,6 @@
+ /*
+ ** C data management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cdata.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_cdata.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cdata.h
+@@ -1,6 +1,6 @@
+ /*
+ ** C data management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CDATA_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_clib.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_clib.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_clib.c
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI C library loader.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -25,7 +25,7 @@
+ #include <dlfcn.h>
+ #include <stdio.h>
+ 
+-#if defined(RTLD_DEFAULT)
++#if defined(RTLD_DEFAULT) && !defined(NO_RTLD_DEFAULT)
+ #define CLIB_DEFHANDLE	RTLD_DEFAULT
+ #elif LJ_TARGET_OSX || LJ_TARGET_BSD
+ #define CLIB_DEFHANDLE	((void *)(intptr_t)-2)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_clib.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_clib.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_clib.h
+@@ -1,6 +1,6 @@
+ /*
+ ** FFI C library loader.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CLIB_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cparse.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_cparse.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cparse.c
+@@ -1,6 +1,6 @@
+ /*
+ ** C declaration parser.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -133,9 +133,9 @@ LJ_NORET static void cp_errmsg(CPState *
+     tokstr = NULL;
+   } else if (tok == CTOK_IDENT || tok == CTOK_INTEGER || tok == CTOK_STRING ||
+ 	     tok >= CTOK_FIRSTDECL) {
+-    if (sbufP(&cp->sb) == sbufB(&cp->sb)) cp_save(cp, '$');
++    if (cp->sb.w == cp->sb.b) cp_save(cp, '$');
+     cp_save(cp, '\0');
+-    tokstr = sbufB(&cp->sb);
++    tokstr = cp->sb.b;
+   } else {
+     tokstr = cp_tok2str(cp, tok);
+   }
+@@ -175,7 +175,7 @@ static CPToken cp_number(CPState *cp)
+   TValue o;
+   do { cp_save(cp, cp->c); } while (lj_char_isident(cp_get(cp)));
+   cp_save(cp, '\0');
+-  fmt = lj_strscan_scan((const uint8_t *)sbufB(&cp->sb), sbuflen(&cp->sb)-1,
++  fmt = lj_strscan_scan((const uint8_t *)(cp->sb.b), sbuflen(&cp->sb)-1,
+ 			&o, STRSCAN_OPT_C);
+   if (fmt == STRSCAN_INT) cp->val.id = CTID_INT32;
+   else if (fmt == STRSCAN_U32) cp->val.id = CTID_UINT32;
+@@ -279,7 +279,7 @@ static CPToken cp_string(CPState *cp)
+     return CTOK_STRING;
+   } else {
+     if (sbuflen(&cp->sb) != 1) cp_err_token(cp, '\'');
+-    cp->val.i32 = (int32_t)(char)*sbufB(&cp->sb);
++    cp->val.i32 = (int32_t)(char)*cp->sb.b;
+     cp->val.id = CTID_INT32;
+     return CTOK_INTEGER;
+   }
+@@ -468,7 +468,7 @@ static void cp_expr_sizeof(CPState *cp,
+   } else {
+     cp_expr_unary(cp, k);
+   }
+-  info = lj_ctype_info(cp->cts, k->id, &sz);
++  info = lj_ctype_info_raw(cp->cts, k->id, &sz);
+   if (wantsz) {
+     if (sz != CTSIZE_INVALID)
+       k->u32 = sz;
+@@ -488,7 +488,7 @@ static void cp_expr_prefix(CPState *cp,
+   } else if (cp_opt(cp, '+')) {
+     cp_expr_unary(cp, k);  /* Nothing to do (well, integer promotion). */
+   } else if (cp_opt(cp, '-')) {
+-    cp_expr_unary(cp, k); k->i32 = -k->i32;
++    cp_expr_unary(cp, k); k->i32 = (int32_t)(~(uint32_t)k->i32+1);
+   } else if (cp_opt(cp, '~')) {
+     cp_expr_unary(cp, k); k->i32 = ~k->i32;
+   } else if (cp_opt(cp, '!')) {
+@@ -1766,9 +1766,11 @@ static void cp_pragma(CPState *cp, BCLin
+     cp_check(cp, '(');
+     if (cp->tok == CTOK_IDENT) {
+       if (cp_str_is(cp->str, "push")) {
+-	if (cp->curpack < CPARSE_MAX_PACKSTACK) {
++	if (cp->curpack < CPARSE_MAX_PACKSTACK-1) {
+ 	  cp->packstack[cp->curpack+1] = cp->packstack[cp->curpack];
+ 	  cp->curpack++;
++	} else {
++	  cp_errmsg(cp, cp->tok, LJ_ERR_XLEVELS);
+ 	}
+       } else if (cp_str_is(cp->str, "pop")) {
+ 	if (cp->curpack > 0) cp->curpack--;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cparse.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_cparse.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_cparse.h
+@@ -1,6 +1,6 @@
+ /*
+ ** C declaration parser.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CPARSE_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_crecord.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_crecord.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_crecord.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace recorder for C data operations.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_ffrecord_c
+@@ -78,7 +78,7 @@ static CTypeID argv2ctype(jit_State *J,
+     /* Specialize to the string containing the C type declaration. */
+     emitir(IRTG(IR_EQ, IRT_STR), tr, lj_ir_kstr(J, s));
+     cp.L = J->L;
+-    cp.cts = ctype_ctsG(J2G(J));
++    cp.cts = ctype_cts(J->L);
+     oldtop = cp.cts->top;
+     cp.srcname = strdata(s);
+     cp.p = strdata(s);
+@@ -616,10 +616,12 @@ static TRef crec_ct_tv(jit_State *J, CTy
+     sp = lj_ir_kptr(J, NULL);
+   } else if (tref_isudata(sp)) {
+     GCudata *ud = udataV(sval);
+-    if (ud->udtype == UDTYPE_IO_FILE) {
++    if (ud->udtype == UDTYPE_IO_FILE || ud->udtype == UDTYPE_BUFFER) {
+       TRef tr = emitir(IRT(IR_FLOAD, IRT_U8), sp, IRFL_UDATA_UDTYPE);
+-      emitir(IRTGI(IR_EQ), tr, lj_ir_kint(J, UDTYPE_IO_FILE));
+-      sp = emitir(IRT(IR_FLOAD, IRT_PTR), sp, IRFL_UDATA_FILE);
++      emitir(IRTGI(IR_EQ), tr, lj_ir_kint(J, ud->udtype));
++      sp = emitir(IRT(IR_FLOAD, IRT_PTR), sp,
++		  ud->udtype == UDTYPE_IO_FILE ? IRFL_UDATA_FILE :
++						 IRFL_SBUF_R);
+     } else {
+       sp = emitir(IRT(IR_ADD, IRT_PTR), sp, lj_ir_kintp(J, sizeof(GCudata)));
+     }
+@@ -1024,8 +1026,26 @@ static void crec_alloc(jit_State *J, Rec
+ 	crec_ct_tv(J, dc, dp, sp, sval);
+       }
+     } else if (ctype_isstruct(d->info)) {
+-      CTypeID fid = d->sib;
++      CTypeID fid;
+       MSize i = 1;
++      if (!J->base[1]) {  /* Handle zero-fill of struct-of-NYI. */
++	fid = d->sib;
++	while (fid) {
++	  CType *df = ctype_get(cts, fid);
++	  fid = df->sib;
++	  if (ctype_isfield(df->info)) {
++	    CType *dc;
++	    if (!gcref(df->name)) continue;  /* Ignore unnamed fields. */
++	    dc = ctype_rawchild(cts, df);  /* Field type. */
++	    if (!(ctype_isnum(dc->info) || ctype_isptr(dc->info) ||
++		  ctype_isenum(dc->info)))
++	      goto special;
++	  } else if (!ctype_isconstval(df->info)) {
++	    goto special;
++	  }
++	}
++      }
++      fid = d->sib;
+       while (fid) {
+ 	CType *df = ctype_get(cts, fid);
+ 	fid = df->sib;
+@@ -1098,6 +1118,8 @@ static TRef crec_call_args(jit_State *J,
+     ngpr = 1;
+   else if (ctype_cconv(ct->info) == CTCC_FASTCALL)
+     ngpr = 2;
++#elif LJ_TARGET_ARM64 && LJ_TARGET_OSX
++  int ngpr = CCALL_NARG_GPR;
+ #endif
+ 
+   /* Skip initial attributes. */
+@@ -1123,6 +1145,14 @@ static TRef crec_call_args(jit_State *J,
+     } else {
+       if (!(ct->info & CTF_VARARG))
+ 	lj_trace_err(J, LJ_TRERR_NYICALL);  /* Too many arguments. */
++#if LJ_TARGET_ARM64 && LJ_TARGET_OSX
++      if (ngpr >= 0) {
++	ngpr = -1;
++	args[n++] = TREF_NIL;  /* Marker for start of varargs. */
++	if (n >= CCI_NARGS_MAX)
++	  lj_trace_err(J, LJ_TRERR_NYICALL);
++      }
++#endif
+       did = lj_ccall_ctid_vararg(cts, o);  /* Infer vararg type. */
+     }
+     d = ctype_raw(cts, did);
+@@ -1131,6 +1161,15 @@ static TRef crec_call_args(jit_State *J,
+       lj_trace_err(J, LJ_TRERR_NYICALL);
+     tr = crec_ct_tv(J, d, 0, *base, o);
+     if (ctype_isinteger_or_bool(d->info)) {
++#if LJ_TARGET_ARM64 && LJ_TARGET_OSX
++      if (!ngpr) {
++	/* Fixed args passed on the stack use their unpromoted size. */
++	if (d->size != lj_ir_type_size[tref_type(tr)]) {
++	  lj_assertJ(d->size == 1 || d->size==2, "unexpected size %d", d->size);
++	  tr = emitconv(tr, d->size==1 ? IRT_U8 : IRT_U16, tref_type(tr), 0);
++	}
++      } else
++#endif
+       if (d->size < 4) {
+ 	if ((d->info & CTF_UNSIGNED))
+ 	  tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_U8 : IRT_U16, 0);
+@@ -1168,6 +1207,10 @@ static TRef crec_call_args(jit_State *J,
+       }
+     }
+ #endif
++#elif LJ_TARGET_ARM64 && LJ_TARGET_OSX
++    if (!ctype_isfp(d->info) && ngpr) {
++      ngpr--;
++    }
+ #endif
+     args[n] = tr;
+   }
+@@ -1484,9 +1527,13 @@ void LJ_FASTCALL recff_cdata_arith(jit_S
+ 	if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
+ 	goto ok;
+       } else if (ctype_isfunc(ct->info)) {
++	CTypeID id0 = i ? ctype_typeid(cts, s[0]) : 0;
+ 	tr = emitir(IRT(IR_FLOAD, IRT_PTR), tr, IRFL_CDATA_PTR);
+ 	ct = ctype_get(cts,
+ 	  lj_ctype_intern(cts, CTINFO(CT_PTR, CTALIGN_PTR|id), CTSIZE_PTR));
++	if (i) {
++	  s[0] = ctype_get(cts, id0);  /* cts->tab may have been reallocated. */
++	}
+ 	goto ok;
+       } else {
+ 	tr = emitir(IRT(IR_ADD, IRT_PTR), tr, lj_ir_kintp(J, sizeof(GCcdata)));
+@@ -1855,7 +1902,8 @@ TRef recff_bit64_tohex(jit_State *J, Rec
+   } else {
+     n = id ? 16 : 8;
+   }
+-  if (n < 0) { n = -n; sf |= STRFMT_F_UPPER; }
++  if (n < 0) { n = (int32_t)(~n+1u); sf |= STRFMT_F_UPPER; }
++  if ((uint32_t)n > 254) n = 254;
+   sf |= ((SFormat)((n+1)&255) << STRFMT_SH_PREC);
+   if (id) {
+     tr = crec_ct_tv(J, ctype_get(cts, id), 0, J->base[0], &rd->argv[0]);
+@@ -1893,6 +1941,30 @@ void LJ_FASTCALL lj_crecord_tonumber(jit
+   }
+ }
+ 
++TRef lj_crecord_loadiu64(jit_State *J, TRef tr, cTValue *o)
++{  /* Record a load of the IRFL_CDATA_INT64 field; aborts unless the cdata is int64/uint64. */
++  CTypeID id = argv2cdata(J, tr, o)->ctypeid;
++  if (!(id == CTID_INT64 || id == CTID_UINT64))
++    lj_trace_err(J, LJ_TRERR_BADTYPE);
++  lj_needsplit(J);
++  return emitir(IRT(IR_FLOAD, id == CTID_INT64 ? IRT_I64 : IRT_U64), tr,
++		IRFL_CDATA_INT64);
++}
++
++#if LJ_HASBUFFER
++TRef lj_crecord_topcvoid(jit_State *J, TRef tr, cTValue *o)
++{  /* Record conversion of a cdata to ctype CTID_P_CVOID; non-cdata aborts the trace. */
++  CTState *cts = ctype_ctsG(J2G(J));
++  if (!tref_iscdata(tr)) lj_trace_err(J, LJ_TRERR_BADTYPE);
++  return crec_ct_tv(J, ctype_get(cts, CTID_P_CVOID), 0, tr, o);
++}
++
++TRef lj_crecord_topuint8(jit_State *J, TRef tr)
++{  /* Emit IR_CNEWI (type IRT_CDATA) with ctype id CTID_P_UINT8 and operand tr. */
++  return emitir(IRTG(IR_CNEWI, IRT_CDATA), lj_ir_kint(J, CTID_P_UINT8), tr);
++}
++#endif
++
+ #undef IR
+ #undef emitir
+ #undef emitconv
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_crecord.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_crecord.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_crecord.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace recorder for C data operations.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CRECORD_H
+@@ -33,6 +33,11 @@ LJ_FUNC int LJ_FASTCALL recff_bit64_shif
+ LJ_FUNC TRef recff_bit64_tohex(jit_State *J, RecordFFData *rd, TRef hdr);
+ 
+ LJ_FUNC void LJ_FASTCALL lj_crecord_tonumber(jit_State *J, RecordFFData *rd);
++LJ_FUNC TRef lj_crecord_loadiu64(jit_State *J, TRef tr, cTValue *o);
++#if LJ_HASBUFFER
++LJ_FUNC TRef lj_crecord_topcvoid(jit_State *J, TRef tr, cTValue *o);
++LJ_FUNC TRef lj_crecord_topuint8(jit_State *J, TRef tr);
++#endif
+ #endif
+ 
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ctype.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ctype.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ctype.c
+@@ -1,6 +1,6 @@
+ /*
+ ** C type management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include "lj_obj.h"
+@@ -191,8 +191,20 @@ CTypeID lj_ctype_intern(CTState *cts, CT
+   }
+   id = cts->top;
+   if (LJ_UNLIKELY(id >= cts->sizetab)) {
++#ifdef LUAJIT_CTYPE_CHECK_ANCHOR
++    CType *ct;
++#endif
+     if (id >= CTID_MAX) lj_err_msg(cts->L, LJ_ERR_TABOV);
++#ifdef LUAJIT_CTYPE_CHECK_ANCHOR
++    ct = lj_mem_newvec(cts->L, id+1, CType);
++    memcpy(ct, cts->tab, id*sizeof(CType));
++    memset(cts->tab, 0, id*sizeof(CType));
++    lj_mem_freevec(cts->g, cts->tab, cts->sizetab, CType);
++    cts->tab = ct;
++    cts->sizetab = id+1;
++#else
+     lj_mem_growvec(cts->L, cts->tab, cts->sizetab, CTID_MAX, CType);
++#endif
+   }
+   cts->top = id+1;
+   cts->tab[id].info = info;
+@@ -333,6 +345,14 @@ CTInfo lj_ctype_info(CTState *cts, CType
+   return qual;
+ }
+ 
++/* Like lj_ctype_info(), but if id is a reference type, query its ctype_cid() type. */
++CTInfo lj_ctype_info_raw(CTState *cts, CTypeID id, CTSize *szp)
++{
++  CType *ct = ctype_get(cts, id);
++  if (ctype_isref(ct->info)) id = ctype_cid(ct->info);
++  return lj_ctype_info(cts, id, szp);
++}
++
+ /* Get ctype metamethod. */
+ cTValue *lj_ctype_meta(CTState *cts, CTypeID id, MMS mm)
+ {
+@@ -562,7 +582,7 @@ GCstr *lj_ctype_repr_int64(lua_State *L,
+   if (isunsigned) {
+     *--p = 'U';
+   } else if ((int64_t)n < 0) {
+-    n = (uint64_t)-(int64_t)n;
++    n = ~n+1u;
+     sign = 1;
+   }
+   do { *--p = (char)('0' + n % 10); } while (n /= 10);
+@@ -583,7 +603,7 @@ GCstr *lj_ctype_repr_complex(lua_State *
+   lj_strfmt_putfnum(sb, STRFMT_G14, re.n);
+   if (!(im.u32.hi & 0x80000000u) || im.n != im.n) lj_buf_putchar(sb, '+');
+   lj_strfmt_putfnum(sb, STRFMT_G14, im.n);
+-  lj_buf_putchar(sb, sbufP(sb)[-1] >= 'a' ? 'I' : 'i');
++  lj_buf_putchar(sb, sb->w[-1] >= 'a' ? 'I' : 'i');
+   return lj_buf_str(L, sb);
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ctype.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ctype.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ctype.h
+@@ -1,6 +1,6 @@
+ /*
+ ** C type management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_CTYPE_H
+@@ -276,6 +276,8 @@ typedef struct CTState {
+ #define CTTYDEFP(_)
+ #endif
+ 
++#define CTF_LONG_IF8		(CTF_LONG * (sizeof(long) == 8))  /* CTF_LONG if long has 8 bytes, else 0. */
++
+ /* Common types. */
+ #define CTTYDEF(_) \
+   _(NONE,		0,	CT_ATTRIB, CTATTRIB(CTA_BAD)) \
+@@ -289,8 +291,8 @@ typedef struct CTState {
+   _(UINT16,		2,	CT_NUM, CTF_UNSIGNED|CTALIGN(1)) \
+   _(INT32,		4,	CT_NUM, CTALIGN(2)) \
+   _(UINT32,		4,	CT_NUM, CTF_UNSIGNED|CTALIGN(2)) \
+-  _(INT64,		8,	CT_NUM, CTF_LONG|CTALIGN(3)) \
+-  _(UINT64,		8,	CT_NUM, CTF_UNSIGNED|CTF_LONG|CTALIGN(3)) \
++  _(INT64,		8,	CT_NUM, CTF_LONG_IF8|CTALIGN(3)) \
++  _(UINT64,		8,	CT_NUM, CTF_UNSIGNED|CTF_LONG_IF8|CTALIGN(3)) \
+   _(FLOAT,		4,	CT_NUM, CTF_FP|CTALIGN(2)) \
+   _(DOUBLE,		8,	CT_NUM, CTF_FP|CTALIGN(3)) \
+   _(COMPLEX_FLOAT,	8,	CT_ARRAY, CTF_COMPLEX|CTALIGN(2)|CTID_FLOAT) \
+@@ -298,6 +300,7 @@ typedef struct CTState {
+   _(P_VOID,	CTSIZE_PTR,	CT_PTR, CTALIGN_PTR|CTID_VOID) \
+   _(P_CVOID,	CTSIZE_PTR,	CT_PTR, CTALIGN_PTR|CTID_CVOID) \
+   _(P_CCHAR,	CTSIZE_PTR,	CT_PTR, CTALIGN_PTR|CTID_CCHAR) \
++  _(P_UINT8,	CTSIZE_PTR,	CT_PTR, CTALIGN_PTR|CTID_UINT8) \
+   _(A_CCHAR,		-1,	CT_ARRAY, CTF_CONST|CTALIGN(0)|CTID_CCHAR) \
+   _(CTYPEID,		4,	CT_ENUM, CTALIGN(2)|CTID_INT32) \
+   CTTYDEFP(_) \
+@@ -389,6 +392,16 @@ static LJ_AINLINE CTState *ctype_cts(lua
+   return cts;
+ }
+ 
++/* Load FFI library on-demand. L->top is kept as a byte offset (presumably the stack may move). */
++#define ctype_loadffi(L) \
++  do { \
++    if (!ctype_ctsG(G(L))) { \
++      ptrdiff_t oldtop = (char *)L->top - mref(L->stack, char); \
++      luaopen_ffi(L); \
++      L->top = (TValue *)(mref(L->stack, char) + oldtop); \
++    } \
++  } while (0)
++
+ /* Save and restore state of C type table. */
+ #define LJ_CTYPE_SAVE(cts)	CTState savects_ = *(cts)
+ #define LJ_CTYPE_RESTORE(cts) \
+@@ -457,6 +470,7 @@ LJ_FUNC CType *lj_ctype_rawref(CTState *
+ LJ_FUNC CTSize lj_ctype_size(CTState *cts, CTypeID id);
+ LJ_FUNC CTSize lj_ctype_vlsize(CTState *cts, CType *ct, CTSize nelem);
+ LJ_FUNC CTInfo lj_ctype_info(CTState *cts, CTypeID id, CTSize *szp);
++LJ_FUNC CTInfo lj_ctype_info_raw(CTState *cts, CTypeID id, CTSize *szp);
+ LJ_FUNC cTValue *lj_ctype_meta(CTState *cts, CTypeID id, MMS mm);
+ LJ_FUNC GCstr *lj_ctype_repr(lua_State *L, CTypeID id, GCstr *name);
+ LJ_FUNC GCstr *lj_ctype_repr_int64(lua_State *L, uint64_t n, int isunsigned);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_debug.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_debug.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_debug.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Debugging and introspection.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_debug_c
+@@ -101,9 +101,12 @@ static BCPos debug_framepc(lua_State *L,
+   pos = proto_bcpos(pt, ins) - 1;
+ #if LJ_HASJIT
+   if (pos > pt->sizebc) {  /* Undo the effects of lj_trace_exit for JLOOP. */
+-    GCtrace *T = (GCtrace *)((char *)(ins-1) - offsetof(GCtrace, startins));
+-    lj_assertL(bc_isret(bc_op(ins[-1])), "return bytecode expected");
+-    pos = proto_bcpos(pt, mref(T->startpc, const BCIns));
++    if (bc_isret(bc_op(ins[-1]))) {
++      GCtrace *T = (GCtrace *)((char *)(ins-1) - offsetof(GCtrace, startins));
++      pos = proto_bcpos(pt, mref(T->startpc, const BCIns));
++    } else {
++      pos = NO_BCPOS;  /* Punt in case of stack overflow for stitched trace. */
++    }
+   }
+ #endif
+   return pos;
+@@ -648,7 +651,7 @@ void lj_debug_dumpstack(lua_State *L, SB
+     level += dir;
+   }
+   if (lastlen)
+-    setsbufP(sb, sbufB(sb) + lastlen);  /* Zap trailing separator. */
++    sb->w = sb->b + lastlen;  /* Zap trailing separator. */
+ }
+ #endif
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_debug.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_debug.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_debug.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Debugging and introspection.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_DEBUG_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_def.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_def.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_def.h
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT common internal definitions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_DEF_H
+@@ -69,7 +69,7 @@ typedef unsigned int uintptr_t;
+ #define LJ_MAX_UPVAL	60		/* Max. # of upvalues. */
+ 
+ #define LJ_MAX_IDXCHAIN	100		/* __index/__newindex chain limit. */
+-#define LJ_STACK_EXTRA	(5+2*LJ_FR2)	/* Extra stack space (metamethods). */
++#define LJ_STACK_EXTRA	(5+3*LJ_FR2)	/* Extra stack space (metamethods). */
+ 
+ #define LJ_NUM_CBPAGE	1		/* Number of FFI callback pages. */
+ 
+@@ -146,15 +146,9 @@ typedef uintptr_t BloomFilter;
+ #define LJ_UNLIKELY(x)	__builtin_expect(!!(x), 0)
+ 
+ #define lj_ffs(x)	((uint32_t)__builtin_ctz(x))
+-/* Don't ask ... */
+-#if defined(__INTEL_COMPILER) && (defined(__i386__) || defined(__x86_64__))
+-static LJ_AINLINE uint32_t lj_fls(uint32_t x)
+-{
+-  uint32_t r; __asm__("bsrl %1, %0" : "=r" (r) : "rm" (x) : "cc"); return r;
+-}
+-#else
+ #define lj_fls(x)	((uint32_t)(__builtin_clz(x)^31))
+-#endif
++#define lj_ffs64(x)	((uint32_t)__builtin_ctzll(x))  /* Index of lowest set bit; x must be != 0. */
++#define lj_fls64(x)	((uint32_t)(__builtin_clzll(x)^63))  /* Index of highest set bit; x must be != 0. */
+ 
+ #if defined(__arm__)
+ static LJ_AINLINE uint32_t lj_bswap(uint32_t x)
+@@ -277,6 +271,23 @@ static LJ_AINLINE uint32_t lj_fls(uint32
+ {
+   unsigned long r; _BitScanReverse(&r, x); return (uint32_t)r;
+ }
++
++#if defined(_M_X64) || defined(_M_ARM64)
++unsigned char _BitScanForward64(unsigned long *, uint64_t);
++unsigned char _BitScanReverse64(unsigned long *, uint64_t);
++#pragma intrinsic(_BitScanForward64)
++#pragma intrinsic(_BitScanReverse64)
++/* Index of lowest set bit. The intrinsic's result flag is ignored, so x must be != 0. */
++static LJ_AINLINE uint32_t lj_ffs64(uint64_t x)
++{
++  unsigned long r; _BitScanForward64(&r, x); return (uint32_t)r;
++}
++/* Index of highest set bit. The intrinsic's result flag is ignored, so x must be != 0. */
++static LJ_AINLINE uint32_t lj_fls64(uint64_t x)
++{
++  unsigned long r; _BitScanReverse64(&r, x); return (uint32_t)r;
++}
++#endif
+ #endif
+ 
+ unsigned long _byteswap_ulong(unsigned long);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_dispatch.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_dispatch.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_dispatch.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Instruction dispatch handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_dispatch_c
+@@ -68,6 +68,8 @@ void lj_dispatch_init(GG_State *GG)
+   /* The JIT engine is off by default. luaopen_jit() turns it on. */
+   disp[BC_FORL] = disp[BC_IFORL];
+   disp[BC_ITERL] = disp[BC_IITERL];
++  /* Workaround for stable v2.1 bytecode. TODO: Replace with BC_IITERN. */
++  disp[BC_ITERN] = &lj_vm_IITERN;
+   disp[BC_LOOP] = disp[BC_ILOOP];
+   disp[BC_FUNCF] = disp[BC_IFUNCF];
+   disp[BC_FUNCV] = disp[BC_IFUNCV];
+@@ -118,19 +120,21 @@ void lj_dispatch_update(global_State *g)
+   mode |= (g->hookmask & LUA_MASKRET) ? DISPMODE_RET : 0;
+   if (oldmode != mode) {  /* Mode changed? */
+     ASMFunction *disp = G2GG(g)->dispatch;
+-    ASMFunction f_forl, f_iterl, f_loop, f_funcf, f_funcv;
++    ASMFunction f_forl, f_iterl, f_itern, f_loop, f_funcf, f_funcv;
+     g->dispatchmode = mode;
+ 
+     /* Hotcount if JIT is on, but not while recording. */
+     if ((mode & (DISPMODE_JIT|DISPMODE_REC)) == DISPMODE_JIT) {
+       f_forl = makeasmfunc(lj_bc_ofs[BC_FORL]);
+       f_iterl = makeasmfunc(lj_bc_ofs[BC_ITERL]);
++      f_itern = makeasmfunc(lj_bc_ofs[BC_ITERN]);
+       f_loop = makeasmfunc(lj_bc_ofs[BC_LOOP]);
+       f_funcf = makeasmfunc(lj_bc_ofs[BC_FUNCF]);
+       f_funcv = makeasmfunc(lj_bc_ofs[BC_FUNCV]);
+     } else {  /* Otherwise use the non-hotcounting instructions. */
+       f_forl = disp[GG_LEN_DDISP+BC_IFORL];
+       f_iterl = disp[GG_LEN_DDISP+BC_IITERL];
++      f_itern = &lj_vm_IITERN;
+       f_loop = disp[GG_LEN_DDISP+BC_ILOOP];
+       f_funcf = makeasmfunc(lj_bc_ofs[BC_IFUNCF]);
+       f_funcv = makeasmfunc(lj_bc_ofs[BC_IFUNCV]);
+@@ -138,6 +142,7 @@ void lj_dispatch_update(global_State *g)
+     /* Init static counting instruction dispatch first (may be copied below). */
+     disp[GG_LEN_DDISP+BC_FORL] = f_forl;
+     disp[GG_LEN_DDISP+BC_ITERL] = f_iterl;
++    disp[GG_LEN_DDISP+BC_ITERN] = f_itern;
+     disp[GG_LEN_DDISP+BC_LOOP] = f_loop;
+ 
+     /* Set dynamic instruction dispatch. */
+@@ -165,6 +170,7 @@ void lj_dispatch_update(global_State *g)
+       /* Otherwise set dynamic counting ins. */
+       disp[BC_FORL] = f_forl;
+       disp[BC_ITERL] = f_iterl;
++      disp[BC_ITERN] = f_itern;
+       disp[BC_LOOP] = f_loop;
+       /* Set dynamic return dispatch. */
+       if ((mode & DISPMODE_RET)) {
+@@ -301,9 +307,9 @@ int luaJIT_setmode(lua_State *L, int idx
+       } else {
+ 	return 0;  /* Failed. */
+       }
+-      g->bc_cfunc_ext = BCINS_AD(BC_FUNCCW, 0, 0);
++      setbc_op(&g->bc_cfunc_ext, BC_FUNCCW);
+     } else {
+-      g->bc_cfunc_ext = BCINS_AD(BC_FUNCC, 0, 0);
++      setbc_op(&g->bc_cfunc_ext, BC_FUNCC);
+     }
+     break;
+   default:
+@@ -447,7 +453,7 @@ static int call_init(lua_State *L, GCfun
+     int numparams = pt->numparams;
+     int gotparams = (int)(L->top - L->base);
+     int need = pt->framesize;
+-    if ((pt->flags & PROTO_VARARG)) need += 1+gotparams;
++    if ((pt->flags & PROTO_VARARG)) need += 1+LJ_FR2+gotparams;
+     lj_state_checkstack(L, (MSize)need);
+     numparams -= gotparams;
+     return numparams >= 0 ? numparams : 0;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_dispatch.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_dispatch.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_dispatch.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Instruction dispatch handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_DISPATCH_H
+@@ -31,7 +31,7 @@ extern double __divdf3(double a, double
+ #define SFGOTDEF(_)
+ #endif
+ #if LJ_HASJIT
+-#define JITGOTDEF(_)	_(lj_trace_exit) _(lj_trace_hot)
++#define JITGOTDEF(_)	_(lj_err_trace) _(lj_trace_exit) _(lj_trace_hot)
+ #else
+ #define JITGOTDEF(_)
+ #endif
+@@ -46,7 +46,7 @@ extern double __divdf3(double a, double
+   _(asin) _(acos) _(atan) _(sinh) _(cosh) _(tanh) _(frexp) _(modf) _(atan2) \
+   _(pow) _(fmod) _(ldexp) _(lj_vm_modi) \
+   _(lj_dispatch_call) _(lj_dispatch_ins) _(lj_dispatch_stitch) \
+-  _(lj_dispatch_profile) _(lj_err_throw) _(lj_err_run) \
++  _(lj_dispatch_profile) _(lj_err_throw) \
+   _(lj_ffh_coroutine_wrap_err) _(lj_func_closeuv) _(lj_func_newL_gc) \
+   _(lj_gc_barrieruv) _(lj_gc_step) _(lj_gc_step_fixtop) _(lj_meta_arith) \
+   _(lj_meta_call) _(lj_meta_cat) _(lj_meta_comp) _(lj_meta_equal) \
+@@ -89,7 +89,7 @@ typedef uint16_t HotCount;
+ typedef struct GG_State {
+   lua_State L;				/* Main thread. */
+   global_State g;			/* Global state. */
+-#if LJ_TARGET_ARM
++#if LJ_TARGET_ARM && !LJ_TARGET_NX
+   /* Make g reachable via K12 encoded DISPATCH-relative addressing. */
+   uint8_t align1[(16-sizeof(global_State))&15];
+ #endif
+@@ -99,7 +99,7 @@ typedef struct GG_State {
+ #if LJ_HASJIT
+   jit_State J;				/* JIT state. */
+   HotCount hotcount[HOTCOUNT_SIZE];	/* Hot counters. */
+-#if LJ_TARGET_ARM
++#if LJ_TARGET_ARM && !LJ_TARGET_NX
+   /* Ditto for J. */
+   uint8_t align2[(16-sizeof(jit_State)-sizeof(HotCount)*HOTCOUNT_SIZE)&15];
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_arm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_emit_arm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_arm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** ARM instruction emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Constant encoding --------------------------------------------------- */
+@@ -157,7 +157,7 @@ static int emit_kdelta2(ASMState *as, Re
+       if (other) {
+ 	int32_t delta = i - other;
+ 	uint32_t sh, inv = 0, k2, k;
+-	if (delta < 0) { delta = -delta; inv = ARMI_ADD^ARMI_SUB; }
++	if (delta < 0) { delta = (int32_t)(~(uint32_t)delta+1u); inv = ARMI_ADD^ARMI_SUB; }
+ 	sh = lj_ffs(delta) & ~1;
+ 	k2 = emit_isk12(0, delta & (255 << sh));
+ 	k = emit_isk12(0, delta & ~(255 << sh));
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_arm64.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_emit_arm64.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_arm64.h
+@@ -1,6 +1,6 @@
+ /*
+ ** ARM64 instruction emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+ ** Sponsored by Cisco Systems, Inc.
+@@ -20,49 +20,41 @@ static uint64_t get_k64val(ASMState *as,
+   } else {
+     lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL,
+ 	       "bad 64 bit const IR op %d", ir->o);
+-    return ir->i;  /* Sign-extended. */
++    return (uint32_t)ir->i;  /* Zero-extended. */
+   }
+ }
+ 
+ /* Encode constant in K12 format for data processing instructions. */
+ static uint32_t emit_isk12(int64_t n)
+ {
+-  uint64_t k = (n < 0) ? -n : n;
+-  uint32_t m = (n < 0) ? 0x40000000 : 0;
++  uint64_t k = n < 0 ? ~(uint64_t)n+1u : (uint64_t)n;
++  uint32_t m = n < 0 ? 0x40000000 : 0;
+   if (k < 0x1000) {
+-    return A64I_K12|m|A64F_U12(k);
++    return (uint32_t)(A64I_K12|m|A64F_U12(k));
+   } else if ((k & 0xfff000) == k) {
+-    return A64I_K12|m|0x400000|A64F_U12(k>>12);
++    return (uint32_t)(A64I_K12|m|0x400000|A64F_U12(k>>12));
+   }
+   return 0;
+ }
+ 
+-#define emit_clz64(n)	__builtin_clzll(n)
+-#define emit_ctz64(n)	__builtin_ctzll(n)
++#define emit_clz64(n)	(lj_fls64(n)^63)
++#define emit_ctz64(n)	lj_ffs64(n)
+ 
+ /* Encode constant in K13 format for logical data processing instructions. */
+ static uint32_t emit_isk13(uint64_t n, int is64)
+ {
+-  int inv = 0, w = 128, lz, tz;
+-  if (n & 1) { n = ~n; w = 64; inv = 1; }  /* Avoid wrap-around of ones. */
+-  if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
+-  do {  /* Find the repeat width. */
+-    if (is64 && (uint32_t)(n^(n>>32))) break;
+-    n = (uint32_t)n;
+-    if (!n) return 0;  /* Ditto when passing n=0xffffffff and is64=0. */
+-    w = 32; if ((n^(n>>16)) & 0xffff) break;
+-    n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
+-    n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
+-    n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
+-    n = n & 0x3; w = 2;
+-  } while (0);
+-  lz = emit_clz64(n);
+-  tz = emit_ctz64(n);
+-  if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
+-  if (inv)
+-    return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
+-  else
+-    return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
++  /* Thanks to: https://dougallj.wordpress.com/2021/10/30/ */
++  int rot, ones, size, immr, imms;
++  if (!is64) n = ((uint64_t)n << 32) | (uint32_t)n;
++  if ((n+1u) <= 1u) return 0;  /* Neither all-zero nor all-ones are allowed. */
++  rot = (n & (n+1u)) ? emit_ctz64(n & (n+1u)) : 64;
++  n = lj_ror(n, rot & 63);
++  ones = emit_ctz64(~n);
++  size = emit_clz64(n) + ones;
++  if (lj_ror(n, size & 63) != n) return 0;  /* Non-repeating? */
++  immr = -rot & (size - 1);
++  imms = (-(size << 1) | (ones - 1)) & 63;
++  return A64I_K13 | A64F_IMMR(immr | (size & 64)) | A64F_IMMS(imms);
+ }
+ 
+ static uint32_t emit_isfpk64(uint64_t n)
+@@ -121,9 +113,20 @@ static int emit_checkofs(A64Ins ai, int6
+   }
+ }
+ 
+-static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
++static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc)
+ {
+-  int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
++  if (ofs >= 0) {
++    return ai | A64F_U12(ofs>>sc);  /* Subsequent lj_ror checks ofs. */
++  } else if (ofs >= -256) {
++    return (ai^A64I_LS_U) | A64F_S9(ofs & 0x1ff);
++  } else {
++    return A64F_D(31);  /* Will mismatch prev. */
++  }
++}
++
++static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs64)
++{
++  int ot = emit_checkofs(ai, ofs64), sc = (ai >> 30) & 3, ofs = (int)ofs64;
+   lj_assertA(ot, "load/store offset %d out of range", ofs);
+   /* Combine LDR/STR pairs to LDP/STP. */
+   if ((sc == 2 || sc == 3) &&
+@@ -132,18 +135,16 @@ static void emit_lso(ASMState *as, A64In
+     uint32_t prev = *as->mcp & ~A64F_D(31);
+     int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
+     A64Ins aip;
+-    if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
+-	prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
++    if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsm, sc)) {
+       aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
+-    } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
+-	       prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
++    } else if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsp, sc)) {
+       aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
+       ofsm = ofs;
+     } else {
+       goto nopair;
+     }
+-    if (ofsm >= (int)((unsigned int)-64<<sc) && ofsm <= (63<<sc)) {
+-      *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
++    if (lj_ror((unsigned int)ofsm + (64u<<sc), sc) <= 127u) {
++      *as->mcp = aip | A64F_N(rn) | (((ofsm >> sc) & 0x7f) << 15) |
+ 	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
+       return;
+     }
+@@ -158,13 +159,12 @@ nopair:
+ /* -- Emit loads/stores --------------------------------------------------- */
+ 
+ /* Prefer rematerialization of BASE/L from global_State over spills. */
+-#define emit_canremat(ref)	((ref) <= ASMREF_L)
++#define emit_canremat(ref)	((ref) <= REF_BASE)
+ 
+-/* Try to find an N-step delta relative to other consts with N < lim. */
+-static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
++/* Try to find a one-step delta relative to other consts. */
++static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64)
+ {
+-  RegSet work = ~as->freeset & RSET_GPR;
+-  if (lim <= 1) return 0;  /* Can't beat that. */
++  RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL);
+   while (work) {
+     Reg r = rset_picktop(work);
+     IRRef ref = regcost_ref(as->cost[r]);
+@@ -173,13 +173,14 @@ static int emit_kdelta(ASMState *as, Reg
+       uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
+ 				     get_k64val(as, ref);
+       int64_t delta = (int64_t)(k - kx);
++      if (!is64) delta = (int64_t)(int32_t)delta;  /* Sign-extend. */
+       if (delta == 0) {
+-	emit_dm(as, A64I_MOVx, rd, r);
++	emit_dm(as, is64|A64I_MOVw, rd, r);
+ 	return 1;
+       } else {
+-	uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
++	uint32_t k12 = emit_isk12(delta < 0 ? (int64_t)(~(uint64_t)delta+1u) : delta);
+ 	if (k12) {
+-	  emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
++	  emit_dn(as, (delta < 0 ? A64I_SUBw : A64I_ADDw)^is64^k12, rd, r);
+ 	  return 1;
+ 	}
+ 	/* Do other ops or multi-step deltas pay off? Probably not.
+@@ -192,77 +193,101 @@ static int emit_kdelta(ASMState *as, Reg
+   return 0;  /* Failed. */
+ }
+ 
+-static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
++#define glofs(as, k) \
++  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
++#define mcpofs(as, k) \
++  ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
++#define checkmcpofs(as, k) \
++  (A64F_S_OK(mcpofs(as, k)>>2, 19))
++
++/* Try to form a const as ADR or ADRP or ADRP + ADD. */
++static int emit_kadrp(ASMState *as, Reg rd, uint64_t k)
+ {
+-  uint32_t k13 = emit_isk13(u64, is64);
+-  if (k13) {  /* Can the constant be represented as a bitmask immediate? */
+-    emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
+-  } else {
+-    int i, zeros = 0, ones = 0, neg;
+-    if (!is64) u64 = (int64_t)(int32_t)u64;  /* Sign-extend. */
+-    /* Count homogeneous 16 bit fragments. */
+-    for (i = 0; i < 4; i++) {
+-      uint64_t frag = (u64 >> i*16) & 0xffff;
+-      zeros += (frag == 0);
+-      ones += (frag == 0xffff);
++  A64Ins ai = A64I_ADR;
++  int64_t ofs = mcpofs(as, k);
++  if (!A64F_S_OK((uint64_t)ofs, 21)) {
++    uint64_t kpage = k & ~0xfffull;
++    MCode *adrp = as->mcp - 1 - (k != kpage);
++    ofs = (int64_t)(kpage - ((uint64_t)adrp & ~0xfffull)) >> 12;
++    if (!A64F_S_OK(ofs, 21))
++      return 0;  /* Failed. */
++    if (k != kpage)
++      emit_dn(as, (A64I_ADDx^A64I_K12)|A64F_U12(k - kpage), rd, rd);
++    ai = A64I_ADRP;
++  }
++  emit_d(as, ai|(((uint32_t)ofs&3)<<29)|A64F_S19(ofs>>2), rd);
++  return 1;
++}
++
++static void emit_loadk(ASMState *as, Reg rd, uint64_t u64)
++{
++  int zeros = 0, ones = 0, neg, lshift = 0;
++  int is64 = (u64 >> 32) ? A64I_X : 0, i = is64 ? 4 : 2;
++  /* Count non-homogeneous 16 bit fragments. */
++  while (--i >= 0) {
++    uint32_t frag = (u64 >> i*16) & 0xffff;
++    zeros += (frag != 0);
++    ones += (frag != 0xffff);
++  }
++  neg = ones < zeros;  /* Use MOVN if it pays off. */
++  if ((neg ? ones : zeros) > 1) {  /* Need 2+ ins. Try 1 ins encodings. */
++    uint32_t k13 = emit_isk13(u64, is64);
++    if (k13) {
++      emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
++      return;
+     }
+-    neg = ones > zeros;  /* Use MOVN if it pays off. */
+-    if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
+-      int shift = 0, lshift = 0;
+-      uint64_t n64 = neg ? ~u64 : u64;
+-      if (n64 != 0) {
+-	/* Find first/last fragment to be filled. */
+-	shift = (63-emit_clz64(n64)) & ~15;
+-	lshift = emit_ctz64(n64) & ~15;
+-      }
+-      /* MOVK requires the original value (u64). */
+-      while (shift > lshift) {
+-	uint32_t u16 = (u64 >> shift) & 0xffff;
+-	/* Skip fragments that are correctly filled by MOVN/MOVZ. */
+-	if (u16 != (neg ? 0xffff : 0))
+-	  emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
+-	shift -= 16;
+-      }
+-      /* But MOVN needs an inverted value (n64). */
+-      emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
+-		 A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
++    if (emit_kdelta(as, rd, u64, is64)) {
++      return;
++    }
++    if (emit_kadrp(as, rd, u64)) {  /* Either 1 or 2 ins. */
++      return;
++    }
++  }
++  if (neg) {
++    u64 = ~u64;
++    if (!is64) u64 = (uint32_t)u64;
++  }
++  if (u64) {
++    /* Find first/last fragment to be filled. */
++    int shift = (63-emit_clz64(u64)) & ~15;
++    lshift = emit_ctz64(u64) & ~15;
++    for (; shift > lshift; shift -= 16) {
++      uint32_t frag = (u64 >> shift) & 0xffff;
++      if (frag == 0) continue; /* Will be correctly filled by MOVN/MOVZ. */
++      if (neg) frag ^= 0xffff; /* MOVK requires the original value. */
++      emit_d(as, is64 | A64I_MOVKw | A64F_U16(frag) | A64F_LSL16(shift), rd);
+     }
+   }
++  /* But MOVN needs an inverted value. */
++  emit_d(as, is64 | (neg ? A64I_MOVNw : A64I_MOVZw) |
++	     A64F_U16((u64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
+ }
+ 
+ /* Load a 32 bit constant into a GPR. */
+-#define emit_loadi(as, rd, i)	emit_loadk(as, rd, i, 0)
++#define emit_loadi(as, rd, i)	emit_loadk(as, rd, (uint32_t)i)
+ 
+ /* Load a 64 bit constant into a GPR. */
+-#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i, A64I_X)
+-
+-#define emit_loada(as, r, addr)	emit_loadu64(as, (r), (uintptr_t)(addr))
+-
+-#define glofs(as, k) \
+-  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
+-#define mcpofs(as, k) \
+-  ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
+-#define checkmcpofs(as, k) \
+-  (A64F_S_OK(mcpofs(as, k)>>2, 19))
++#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i)
+ 
+ static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+ 
+ /* Get/set from constant pointer. */
+ static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
+ {
+-  /* First, check if ip + offset is in range. */
+-  if ((ai & 0x00400000) && checkmcpofs(as, p)) {
++  Reg base = RID_GL;
++  int64_t ofs = glofs(as, p);
++  if (emit_checkofs(ai, ofs)) {
++    /* GL + offset, might subsequently fuse to LDP/STP. */
++  } else if (ai == A64I_LDRx && checkmcpofs(as, p)) {
++    /* IP + offset is cheaper than allock, but address must be in range. */
+     emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
+-  } else {
+-    Reg base = RID_GL;  /* Next, try GL + offset. */
+-    int64_t ofs = glofs(as, p);
+-    if (!emit_checkofs(ai, ofs)) {  /* Else split up into base reg + offset. */
+-      int64_t i64 = i64ptr(p);
+-      base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
+-      ofs = i64 & 0x7fffull;
+-    }
+-    emit_lso(as, ai, r, base, ofs);
++    return;
++  } else {  /* Split up into base reg + offset. */
++    int64_t i64 = i64ptr(p);
++    base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
++    ofs = i64 & 0x7fffull;
+   }
++  emit_lso(as, ai, r, base, ofs);
+ }
+ 
+ /* Load 64 bit IR constant into register. */
+@@ -346,16 +371,22 @@ static void emit_cnb(ASMState *as, A64In
+ 
+ #define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
+ 
+-static void emit_call(ASMState *as, void *target)
++static void emit_call(ASMState *as, ASMFunction target)
+ {
+   MCode *p = --as->mcp;
+-  ptrdiff_t delta = (char *)target - (char *)p;
++#if LJ_ABI_PAUTH
++  char *targetp = ptrauth_auth_data((char *)target,
++				    ptrauth_key_function_pointer, 0);
++#else
++  char *targetp = (char *)target;
++#endif
++  ptrdiff_t delta = targetp - (char *)p;
+   if (A64F_S_OK(delta>>2, 26)) {
+     *p = A64I_BL | A64F_S26(delta>>2);
+   } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
+     Reg r = ra_allock(as, i64ptr(target),
+ 		      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+-    *p = A64I_BLR | A64F_N(r);
++    *p = A64I_BLR_AUTH | A64F_N(r);
+   }
+ }
+ 
+@@ -415,7 +446,8 @@ static void emit_addptr(ASMState *as, Re
+ {
+   if (ofs)
+     emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
+-		 ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r));
++		 ofs < 0 ? (int32_t)(~(uint32_t)ofs+1u) : ofs,
++		 rset_exclude(RSET_GPR, r));
+ }
+ 
+ #define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_mips.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_emit_mips.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_mips.h
+@@ -1,6 +1,6 @@
+ /*
+ ** MIPS instruction emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #if LJ_64
+@@ -70,7 +70,7 @@ static void emit_rotr(ASMState *as, Reg
+   }
+ }
+ 
+-#if LJ_64
++#if LJ_64 || LJ_HASBUFFER
+ static void emit_tsml(ASMState *as, MIPSIns mi, Reg rt, Reg rs, uint32_t msb,
+ 		      uint32_t lsb)
+ {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_ppc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_emit_ppc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_ppc.h
+@@ -1,6 +1,6 @@
+ /*
+ ** PPC instruction emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Emit basic instructions --------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_riscv.h
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_riscv.h
+@@ -0,0 +1,519 @@
++/*
++** RISC-V instruction emitter.
++** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++static intptr_t get_k64val(ASMState *as, IRRef ref)
++{
++  IRIns *ir = IR(ref);
++  if (ir->o == IR_KINT64) {
++    return (intptr_t)ir_kint64(ir)->u64;
++  } else if (ir->o == IR_KGC) {
++    return (intptr_t)ir_kgc(ir);
++  } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
++    return (intptr_t)ir_kptr(ir);
++  } else {
++    lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL,
++               "bad 64 bit const IR op %d", ir->o);
++    return ir->i;  /* Sign-extended. */
++  }
++}
++
++#define get_kval(as, ref)       get_k64val(as, ref)
++
++/* -- Emit basic instructions --------------------------------------------- */
++
++static void emit_r(ASMState *as, RISCVIns riscvi, Reg rd, Reg rs1, Reg rs2)
++{
++  *--as->mcp = riscvi | RISCVF_D(rd) | RISCVF_S1(rs1) | RISCVF_S2(rs2);
++}
++
++#define emit_ds(as, riscvi, rd, rs1)         emit_r(as, riscvi, rd, rs1, 0)
++#define emit_ds2(as, riscvi, rd, rs2)         emit_r(as, riscvi, rd, 0, rs2)
++#define emit_ds1s2(as, riscvi, rd, rs1, rs2)         emit_r(as, riscvi, rd, rs1, rs2)
++
++static void emit_r4(ASMState *as, RISCVIns riscvi, Reg rd, Reg rs1, Reg rs2, Reg rs3)
++{
++  *--as->mcp = riscvi | RISCVF_D(rd) | RISCVF_S1(rs1) | RISCVF_S2(rs2) | RISCVF_S3(rs3);
++}
++
++#define emit_ds1s2s3(as, riscvi, rd, rs1, rs2, rs3)         emit_r4(as, riscvi, rd, rs1, rs2, rs3)
++
++static void emit_i(ASMState *as, RISCVIns riscvi, Reg rd, Reg rs1, int32_t i)
++{
++  *--as->mcp = riscvi | RISCVF_D(rd) | RISCVF_S1(rs1) | RISCVF_IMMI((uint32_t)i & 0xfff);
++}
++
++#define emit_di(as, riscvi, rd, i)         emit_i(as, riscvi, rd, 0, i)
++#define emit_dsi(as, riscvi, rd, rs1, i)     emit_i(as, riscvi, rd, rs1, i)
++#define emit_dsshamt(as, riscvi, rd, rs1, i) emit_i(as, riscvi, rd, rs1, i&0x3f)
++
++static void emit_s(ASMState *as, RISCVIns riscvi, Reg rs1, Reg rs2, int32_t i)
++{
++  *--as->mcp = riscvi | RISCVF_S1(rs1) | RISCVF_S2(rs2) | RISCVF_IMMS((uint32_t)i & 0xfff);
++}
++
++#define emit_s1s2i(as, riscvi, rs1, rs2, i)  emit_s(as, riscvi, rs1, rs2, i)
++
++/*
++static void emit_b(ASMState *as, RISCVIns riscvi, Reg rs1, Reg rs2, int32_t i)
++{
++  *--as->mcp = riscvi | RISCVF_S1(rs1) | RISCVF_S2(rs2) | RISCVF_IMMB((uint32_t)i & 0x1ffe);
++}
++*/
++
++static void emit_u(ASMState *as, RISCVIns riscvi, Reg rd, uint32_t i)
++{
++  *--as->mcp = riscvi | RISCVF_D(rd) | RISCVF_IMMU(i & 0xfffff);
++}
++
++#define emit_du(as, riscvi, rd, i)           emit_u(as, riscvi, rd, i)
++
++/*
++static void emit_j(ASMState *as, RISCVIns riscvi, Reg rd, int32_t i)
++{
++  *--as->mcp = riscvi | RISCVF_D(rd) | RISCVF_IMMJ((uint32_t)i & 0x1fffffe);
++}
++*/
++
++static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
++static void ra_allockreg(ASMState *as, intptr_t k, Reg r);
++static Reg ra_scratch(ASMState *as, RegSet allow);
++
++static void emit_lso(ASMState *as, RISCVIns riscvi, Reg data, Reg base, int32_t ofs)
++{
++  lj_assertA(checki12(ofs), "load/store offset %d out of range", ofs);
++  switch (riscvi) {
++    case RISCVI_LD: case RISCVI_LW: case RISCVI_LH: case RISCVI_LB:
++    case RISCVI_LWU: case RISCVI_LHU: case RISCVI_LBU:
++    case RISCVI_FLW: case RISCVI_FLD:
++      emit_dsi(as, riscvi, data, base, ofs);
++      break;
++    case RISCVI_SD: case RISCVI_SW: case RISCVI_SH: case RISCVI_SB:
++    case RISCVI_FSW: case RISCVI_FSD:
++      emit_s1s2i(as, riscvi, base, data, ofs);
++      break;
++    default: lj_assertA(0, "invalid lso"); break;
++  }
++}
++
++static void emit_roti(ASMState *as, RISCVIns riscvi, Reg rd, Reg rs1, Reg tmp,
++                       int32_t shamt)
++{
++  if (as->flags & JIT_F_RVZbb || as->flags & JIT_F_RVXThead) {
++    if (as->flags & JIT_F_RVXThead) switch (riscvi) {
++      case RISCVI_RORI: riscvi = RISCVI_TH_SRRI; break;
++      case RISCVI_RORIW: riscvi = RISCVI_TH_SRRIW; break;
++      default: lj_assertA(0, "invalid roti op"); break;
++    }
++    emit_dsshamt(as, riscvi, rd, rs1, shamt);
++  } else {
++    RISCVIns ai, bi;
++    int32_t shwid, shmsk;
++    switch (riscvi) {
++      case RISCVI_RORI:
++        ai = RISCVI_SRLI, bi = RISCVI_SLLI;
++        shwid = 64, shmsk = 63;
++        break;
++      case RISCVI_RORIW:
++        ai = RISCVI_SRLIW, bi = RISCVI_SLLIW;
++        shwid = 32, shmsk = 31;
++        break;
++      default:
++        lj_assertA(0, "invalid roti op");
++        return;
++    }
++    emit_ds1s2(as, RISCVI_OR, rd, rd, tmp);
++    emit_dsshamt(as, bi, rd, rs1, (shwid - shamt)&shmsk);
++    emit_dsshamt(as, ai, tmp, rs1, shamt&shmsk);
++  }
++}
++
++static void emit_rot(ASMState *as, RISCVIns riscvi, Reg rd, Reg rs1, Reg rs2, Reg tmp)
++{
++  if (as->flags & JIT_F_RVZbb) {
++    emit_ds1s2(as, riscvi, rd, rs1, rs2);
++  } else {
++    RISCVIns sai, sbi;
++    switch (riscvi) {
++      case RISCVI_ROL:
++        sai = RISCVI_SLL, sbi = RISCVI_SRL;
++        break;
++      case RISCVI_ROR:
++        sai = RISCVI_SRL, sbi = RISCVI_SLL;
++        break;
++      case RISCVI_ROLW:
++        sai = RISCVI_SLLW, sbi = RISCVI_SRLW;
++        break;
++      case RISCVI_RORW:
++        sai = RISCVI_SRLW, sbi = RISCVI_SLLW;
++        break;
++      default:
++        lj_assertA(0, "invalid rot op");
++        return;
++    }
++    if (rd == rs2) {
++      emit_ds1s2(as, RISCVI_OR, rd, rd, tmp);
++      emit_ds1s2(as, sbi, tmp, rs1, tmp);
++      emit_ds1s2(as, sai, rd, rs1, rs2);
++      emit_ds2(as, RISCVI_NEG, tmp, rs2);
++    } else {
++      emit_ds1s2(as, RISCVI_OR, rd, rd, tmp);
++      emit_ds1s2(as, sai, rd, rs1, rs2);
++      emit_ds1s2(as, sbi, tmp, rs1, tmp);
++      emit_ds2(as, RISCVI_NEG, tmp, rs2);
++    }
++  }
++}
++
++static void emit_ext(ASMState *as, RISCVIns riscvi, Reg rd, Reg rs1)
++{
++  if ((riscvi != RISCVI_ZEXT_W && as->flags & JIT_F_RVZbb) ||
++      (riscvi == RISCVI_ZEXT_W && as->flags & JIT_F_RVZba)) {
++    emit_ds(as, riscvi, rd, rs1);
++  } else if (as->flags & JIT_F_RVXThead) {
++    uint32_t hi, sext;
++    switch (riscvi) {
++      case RISCVI_ZEXT_B:
++      case RISCVI_SEXT_W:
++        emit_ds(as, riscvi, rd, rs1);
++        return;
++      case RISCVI_ZEXT_H:
++        hi = 15, sext = 0;
++        break;
++      case RISCVI_ZEXT_W:
++        hi = 31, sext = 0;
++        break;
++      case RISCVI_SEXT_B:
++        hi = 7, sext = 1;
++        break;
++      case RISCVI_SEXT_H:
++        hi = 15, sext = 1;
++        break;
++      default:
++        lj_assertA(0, "invalid ext op");
++        return;
++    }
++    emit_dsi(as, sext ? RISCVI_TH_EXT : RISCVI_TH_EXTU,
++      rd, rs1, hi << 6);
++  } else {
++    RISCVIns sli, sri;
++    int32_t shamt;
++    switch (riscvi) {
++      case RISCVI_ZEXT_B:
++      case RISCVI_SEXT_W:
++        emit_ds(as, riscvi, rd, rs1);
++        return;
++      case RISCVI_ZEXT_H:
++        sli = RISCVI_SLLI, sri = RISCVI_SRLI;
++        shamt = 48;
++        break;
++      case RISCVI_ZEXT_W:
++        sli = RISCVI_SLLI, sri = RISCVI_SRLI;
++        shamt = 32;
++        break;
++      case RISCVI_SEXT_B:
++        sli = RISCVI_SLLI, sri = RISCVI_SRAI;
++        shamt = 56;
++        break;
++      case RISCVI_SEXT_H:
++        sli = RISCVI_SLLI, sri = RISCVI_SRAI;
++        shamt = 48;
++        break;
++      default:
++        lj_assertA(0, "invalid ext op");
++        return;
++    }
++    emit_dsshamt(as, sri, rd, rd, shamt);   
++    emit_dsshamt(as, sli, rd, rs1, shamt);
++  }
++}
++
++static void emit_cleartp(ASMState *as, Reg rd, Reg rs1)
++{
++  if (as->flags & JIT_F_RVXThead) {
++    emit_dsi(as, RISCVI_TH_EXTU, rd, rs1, 46u << 6);  /* Presumably extracts bits 46..0 -- confirm. */
++  } else {
++    emit_dsshamt(as, RISCVI_SRLI, rd, rd, 17);  /* Emitted first, runs second: shift back down. */
++    emit_dsshamt(as, RISCVI_SLLI, rd, rs1, 17);  /* Emitted last, runs first: drop the top 17 bits. */
++  }
++}
++
++/*
++static void emit_andn(ASMState *as, Reg rd, Reg rs1, Reg rs2, Reg tmp)
++{
++  if (as->flags & JIT_F_RVZbb) {
++    emit_ds1s2(as, RISCVI_ANDN, rd, rs1, rs2);
++  } else {
++    emit_ds1s2(as, RISCVI_AND, rd, rs1, tmp);
++    emit_ds(as, RISCVI_NOT, tmp, rs2);
++  }
++}
++*/
++
++/*
++static void emit_orn(ASMState *as, Reg rd, Reg rs1, Reg rs2, Reg tmp)
++{
++  if (as->flags & JIT_F_RVZbb) {
++    emit_ds1s2(as, RISCVI_ORN, rd, rs1, rs2);
++  } else {
++    emit_ds1s2(as, RISCVI_OR, rd, rs1, tmp);
++    emit_ds(as, RISCVI_NOT, tmp, rs2);
++  }
++}
++*/
++
++static void emit_xnor(ASMState *as, Reg rd, Reg rs1, Reg rs2)
++{
++  if (as->flags & JIT_F_RVZbb) {
++    emit_ds1s2(as, RISCVI_XNOR, rd, rs1, rs2);  /* Single instruction when Zbb is available. */
++  } else {
++    emit_ds(as, RISCVI_NOT, rd, rd);  /* Emitted first, so it runs after the XOR below. */
++    emit_ds1s2(as, RISCVI_XOR, rd, rs1, rs2);
++  }
++}
++
++static void emit_shxadd(ASMState *as, Reg rd, Reg rs1, Reg rs2, Reg tmp, unsigned int shamt)
++{  /* Emit rd = rs1 + (rs2 << shamt). tmp is only written by the generic fallback. */
++  if (as->flags & JIT_F_RVZba) {
++    switch (shamt) {
++      case 1: emit_ds1s2(as, RISCVI_SH1ADD, rd, rs2, rs1); break;
++      case 2: emit_ds1s2(as, RISCVI_SH2ADD, rd, rs2, rs1); break;
++      case 3: emit_ds1s2(as, RISCVI_SH3ADD, rd, rs2, rs1); break;
++      default: lj_assertA(0, "invalid shxadd shamt %u", shamt); return;  /* Nothing emitted. */
++    }
++  } else if (as->flags & JIT_F_RVXThead) {
++    emit_dsi(as, RISCVI_TH_ADDSL|RISCVF_IMMI(shamt<<5), rd, rs1, rs2);  /* rs2 rides in the imm field. */
++  } else {
++    emit_ds1s2(as, RISCVI_ADD, rd, rs1, tmp);  /* Emitted first, runs after the shift. */
++    emit_dsshamt(as, RISCVI_SLLI, tmp, rs2, shamt);
++  }
++}
++
++#define emit_sh1add(as, rd, rs1, rs2, tmp) emit_shxadd(as, rd, rs1, rs2, tmp, 1)
++#define emit_sh2add(as, rd, rs1, rs2, tmp) emit_shxadd(as, rd, rs1, rs2, tmp, 2)
++#define emit_sh3add(as, rd, rs1, rs2, tmp) emit_shxadd(as, rd, rs1, rs2, tmp, 3)
++
++static void emit_loadk12(ASMState *as, Reg rd, int32_t i)
++{
++  emit_di(as, RISCVI_ADDI, rd, i);
++}
++
++static void emit_loadk32(ASMState *as, Reg rd, int32_t i)
++{
++  if (!checki12((int64_t)i)) {
++    /* Code is emitted backwards: the LUI below runs first, then the low
++    ** part is merged in. XORI is picked for positive i with HI == 0x80000. */
++    RISCVIns lowop = LJ_UNLIKELY(RISCVF_HI((uint32_t)i) == 0x80000u && i > 0) ? RISCVI_XORI : RISCVI_ADDI;
++    emit_dsi(as, lowop, rd, rd, RISCVF_LO(i));
++    emit_du(as, RISCVI_LUI, rd, RISCVF_HI((uint32_t)i));
++  } else {
++    emit_loadk12(as, rd, i);
++  }
++}
++
++/* -- Emit loads/stores --------------------------------------------------- */
++
++/* Prefer rematerialization of BASE/L from global_State over spills. */
++#define emit_canremat(ref)	((ref) <= REF_BASE)
++
++
++/* Load a 32 bit constant into a GPR. No trailing ';' so it is safe in if/else. */
++#define emit_loadi(as, r, i)	emit_loadk32(as, r, i)
++
++/* Load a 64 bit constant into a GPR. Emitted back to front: hi32 is loaded first at run time, then lo32 is shifted/added in. */
++static void emit_loadu64(ASMState *as, Reg r, uint64_t u64)
++{
++  if (checki32((int64_t)u64)) {
++    emit_loadk32(as, r, (int32_t)u64);
++  } else {
++    uint32_t lo32 = u64 & 0xfffffffful;
++    RISCVIns instrs[7] = {0};  /* instrs[0] is stored at the highest address, i.e. executes last. */
++    int shamt = 0, step = 0;
++    for(int bit = 0; bit < 32; bit++) {  /* Scan lo32 upwards; each set bit starts a chunk of up to 11 bits. */
++      if (lo32 & (1u << bit)) {
++  if (shamt) instrs[step++] = RISCVI_SLLI | RISCVF_D(r) | RISCVF_S1(r) | RISCVF_IMMI(shamt);
++  int inc = bit+10 > 31 ? 31-bit : 10;
++  bit += inc, shamt = inc+1;
++  uint32_t msk = ((1ul << (bit+1))-1)^((1ul << (((bit-inc) >= 0) ? (bit-inc) : 0))-1);
++  uint16_t payload = (lo32 & msk) >> (((bit-inc) >= 0) ? (bit-inc) : 0);
++  instrs[step++] = RISCVI_ADDI | RISCVF_D(r) | RISCVF_S1(r) | RISCVF_IMMI(payload);
++      } else shamt++;  /* Zero bit: fold it into the next SLLI. */
++    }
++    if (shamt) instrs[step++] = RISCVI_SLLI | RISCVF_D(r) | RISCVF_S1(r) | RISCVF_IMMI(shamt);
++
++    if (step < 6) {  /* Tailored sequence is shorter than the fixed 6-instruction one. */
++      for(int i = 0; i < step; i++)
++        *--as->mcp = instrs[i];
++    } else {  /* Fixed split of lo32: bits 31..21, 20..10, 9..0 (in execution order). */
++      emit_dsi(as, RISCVI_ADDI, r, r, u64 & 0x3ff);
++      emit_dsshamt(as, RISCVI_SLLI, r, r, 10);
++      emit_dsi(as, RISCVI_ADDI, r, r, (u64 >> 10) & 0x7ff);
++      emit_dsshamt(as, RISCVI_SLLI, r, r, 11);
++      emit_dsi(as, RISCVI_ADDI, r, r, (u64 >> 21) & 0x7ff);
++      emit_dsshamt(as, RISCVI_SLLI, r, r, 11);
++    }
++
++    uint32_t hi32 = u64 >> 32;  /* Emitted last, so it executes before the lo32 sequence. */
++    if (hi32 & 0xfff) emit_loadk32(as, r, hi32);
++    else emit_du(as, RISCVI_LUI, r, hi32 >> 12);
++  }
++}
++
++#define emit_loada(as, r, addr)	emit_loadu64(as, (r), u64ptr((addr)))
++
++/* Get/set from constant pointer: base register comes from ra_allock() restricted to 'allow', offset is 0. */
++static void emit_lsptr(ASMState *as, RISCVIns riscvi, Reg r, void *p, RegSet allow)
++{
++  emit_lso(as, riscvi, r, ra_allock(as, igcptr(p), allow), 0);
++}
++
++/* Load 64 bit IR constant into register. */
++static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
++{
++  uint64_t k64 = ir_k64(ir)->u64;
++  if (!rset_test(RSET_FPR, r)) {
++    emit_loadu64(as, r, k64);  /* GPR destination: load directly. */
++  } else {
++    emit_ds(as, RISCVI_FMV_D_X, r, RID_TMP);  /* Emitted first, runs after the load into RID_TMP. */
++    emit_loadu64(as, RID_TMP, k64);
++  }
++}
++
++/* Get/set global_State fields: load/store r at byte offset ofs from RID_GL. */
++static void emit_lsglptr(ASMState *as, RISCVIns riscvi, Reg r, int32_t ofs)
++{
++  emit_lso(as, riscvi, r, RID_GL, ofs);
++}
++
++#define emit_getgl(as, r, field) \
++  emit_lsglptr(as, RISCVI_LD, (r), (int32_t)offsetof(global_State, field))
++#define emit_setgl(as, r, field) \
++  emit_lsglptr(as, RISCVI_SD, (r), (int32_t)offsetof(global_State, field))
++
++/* Trace number is determined from per-trace exit stubs. */
++#define emit_setvmstate(as, i)		UNUSED(i)
++
++/* -- Emit control-flow instructions -------------------------------------- */
++
++/* Label for internal jumps. */
++typedef MCode *MCLabel;
++
++/* Return label pointing to current PC. */
++#define emit_label(as)		((as)->mcp)
++
++static void emit_branch(ASMState *as, RISCVIns riscvi, Reg rs1, Reg rs2, MCode *target, int jump)
++{
++  MCode *p = as->mcp;
++  ptrdiff_t delta = (char *)target - (char *)(p - 1);  /* Relative to the slot at p-1 (branch or JAL). */
++  /* The range check below covers the JAL fallback (21 bit), not just the 13 bit B-type form. */
++  lj_assertA(((delta + 0x100000) >> 21) == 0, "branch target out of range"); /* ^B+J */
++  if (checki13(delta) && !jump) {
++    *--p = riscvi | RISCVF_S1(rs1) | RISCVF_S2(rs2) | RISCVF_IMMB(delta);
++    *--p = RISCVI_NOP;  /* Both forms occupy two slots. */
++  } else {
++    *--p = RISCVI_JAL | RISCVF_IMMJ(delta); /* Poorman's trampoline */
++    *--p = (riscvi^0x00001000) | RISCVF_S1(rs1) | RISCVF_S2(rs2) | RISCVF_IMMB(8);  /* Presumably the inverted branch: +8 skips the JAL. */
++  }
++  as->mcp = p;
++}
++
++static void emit_jmp(ASMState *as, MCode *target)
++{
++  MCode *p = as->mcp;
++  ptrdiff_t delta = (char *)target - (char *)(p - 2);  /* Relative to the first slot (JAL or AUIPC). */
++  /* RISCVF_HI() rounds up: for delta >= 0x7ffff800 it yields 0x80000, which AUIPC sign-extends. */
++  lj_assertA(checki32(delta) && delta < 0x7ffff800, "jump target out of range"); /* AUIPC+JALR */
++  if (checki21(delta)) {
++    *--p = RISCVI_NOP;  /* Filler: both forms occupy two slots. */
++    *--p = RISCVI_JAL | RISCVF_IMMJ(delta);
++  } else {
++    *--p = RISCVI_JALR | RISCVF_S1(RID_TMP) | RISCVF_IMMI(RISCVF_LO(delta));
++    *--p = RISCVI_AUIPC | RISCVF_D(RID_TMP) | RISCVF_IMMU(RISCVF_HI(delta));
++  }
++  as->mcp = p;
++}
++
++#define emit_mv(as, dst, src) \
++  emit_ds(as, RISCVI_MV, (dst), (src))
++
++static void emit_call(ASMState *as, void *target, int needcfa)
++{
++  MCode *p = as->mcp;
++  ptrdiff_t delta = (char *)target - (char *)(p - 2);  /* Relative to the first slot (JAL or AUIPC). */
++  if (checki21(delta)) {
++    *--p = RISCVI_NOP;
++    *--p = RISCVI_JAL | RISCVF_D(RID_RA) | RISCVF_IMMJ(delta);
++  } else if (checki32(delta) && delta < 0x7ffff800) {  /* Above that RISCVF_HI() rounds to 0x80000 and AUIPC sign-extends it. */
++    *--p = RISCVI_JALR | RISCVF_D(RID_RA) | RISCVF_S1(RID_TMP) | RISCVF_IMMI(RISCVF_LO(delta));
++    *--p = RISCVI_AUIPC | RISCVF_D(RID_TMP) | RISCVF_IMMU(RISCVF_HI(delta));
++    needcfa = 1;
++  } else {  /* Out of pc-relative range: indirect call through RID_CFUNCADDR. */
++    *--p = RISCVI_JALR | RISCVF_D(RID_RA) | RISCVF_S1(RID_CFUNCADDR) | RISCVF_IMMI(0);
++    needcfa = 2;
++  }
++  as->mcp = p;
++  if (needcfa > 1)  /* Emitted after the call, so the address load runs before it. */
++    ra_allockreg(as, (intptr_t)target, RID_CFUNCADDR);
++}
++
++/* -- Emit generic operations --------------------------------------------- */
++
++/* Generic move between two regs (GPR or FPR on either side). */
++static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
++{
++  if (src >= RID_MAX_GPR && dst >= RID_MAX_GPR)  /* FPR -> FPR */
++    emit_ds1s2(as, irt_isnum(ir->t) ? RISCVI_FMV_D : RISCVI_FMV_S, dst, src, src);
++  else if (src >= RID_MAX_GPR)  /* FPR -> GPR */
++    emit_ds(as, irt_isnum(ir->t) ? RISCVI_FMV_X_D : RISCVI_FMV_X_W, dst, src);
++  else if (dst >= RID_MAX_GPR)  /* GPR -> FPR */
++    emit_ds(as, irt_isnum(ir->t) ? RISCVI_FMV_D_X : RISCVI_FMV_W_X, dst, src);
++  else  /* GPR -> GPR */
++    emit_mv(as, dst, src);
++}
++
++/* Emit an arithmetic operation with a constant operand (tmp holds k if it needs a register). */
++static void emit_opk(ASMState *as, RISCVIns riscvi, Reg dest, Reg src, Reg tmp, intptr_t k)
++{
++  if (checki12(k)) {  /* Constant fits the 12 bit immediate form. */
++    emit_dsi(as, riscvi, dest, src, k);
++    return;
++  }
++  switch (riscvi) {  /* Switch to the register-register form of the same operation. */
++  case RISCVI_ADDI: riscvi = RISCVI_ADD; break;
++  case RISCVI_XORI: riscvi = RISCVI_XOR; break;
++  case RISCVI_ORI: riscvi = RISCVI_OR; break;
++  case RISCVI_ANDI: riscvi = RISCVI_AND; break;
++  default: lj_assertA(0, "NYI arithmetic RISCVIns"); return;
++  }
++  emit_ds1s2(as, riscvi, dest, src, tmp);
++  emit_loadu64(as, tmp, (uintptr_t)k);
++}
++
++/* Generic load of register with base and (small) offset address. */
++static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
++{
++  RISCVIns ldi = r < RID_MAX_GPR ?
++                 (irt_is64(ir->t) ? RISCVI_LD : RISCVI_LW) :
++                 (irt_isnum(ir->t) ? RISCVI_FLD : RISCVI_FLW);
++  emit_lso(as, ldi, r, base, ofs);
++}
++
++/* Generic store of register with base and (small) offset address. */
++static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
++{
++  RISCVIns sti = r < RID_MAX_GPR ?
++                 (irt_is64(ir->t) ? RISCVI_SD : RISCVI_SW) :
++                 (irt_isnum(ir->t) ? RISCVI_FSD : RISCVI_FSW);
++  emit_lso(as, sti, r, base, ofs);
++}
++
++/* Add offset to pointer (in place; RID_TMP is the scratch register). */
++static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
++{
++  if (ofs == 0) return;  /* Nothing to emit. */
++  emit_opk(as, RISCVI_ADDI, r, r, RID_TMP, ofs);
++}
++
++
++#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_x86.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_emit_x86.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_emit_x86.h
+@@ -1,6 +1,6 @@
+ /*
+ ** x86/x64 instruction emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* -- Emit basic instructions --------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_err.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_err.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_err.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Error handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_err_c
+@@ -29,12 +29,18 @@
+ ** Pros and Cons:
+ **
+ ** - EXT requires unwind tables for *all* functions on the C stack between
+-**   the pcall/catch and the error/throw. This is the default on x64,
+-**   but needs to be manually enabled on x86/PPC for non-C++ code.
++**   the pcall/catch and the error/throw. C modules used by Lua code can
++**   throw errors, so these need to have unwind tables, too. Transitively
++**   this applies to all system libraries used by C modules -- at least
++**   when they have callbacks which may throw an error.
+ **
+-** - INT is faster when actually throwing errors (but this happens rarely).
++** - INT is faster when actually throwing errors, but this happens rarely.
+ **   Setting up error handlers is zero-cost in any case.
+ **
++** - INT needs to save *all* callee-saved registers when entering the
++**   interpreter. EXT only needs to save those actually used inside the
++**   interpreter. JIT-compiled code may need to save some more.
++**
+ ** - EXT provides full interoperability with C++ exceptions. You can throw
+ **   Lua errors or C++ exceptions through a mix of Lua frames and C++ frames.
+ **   C++ destructors are called as needed. C++ exceptions caught by pcall
+@@ -46,27 +52,38 @@
+ **   the wrapper function feature. Lua errors thrown through C++ frames
+ **   cannot be caught by C++ code and C++ destructors are not run.
+ **
+-** EXT is the default on x64 systems and on Windows, INT is the default on all
+-** other systems.
+-**
+-** EXT can be manually enabled on POSIX systems using GCC and DWARF2 stack
+-** unwinding with -DLUAJIT_UNWIND_EXTERNAL. *All* C code must be compiled
+-** with -funwind-tables (or -fexceptions). This includes LuaJIT itself (set
+-** TARGET_CFLAGS), all of your C/Lua binding code, all loadable C modules
+-** and all C libraries that have callbacks which may be used to call back
+-** into Lua. C++ code must *not* be compiled with -fno-exceptions.
+-**
+-** EXT is mandatory on WIN64 since the calling convention has an abundance
+-** of callee-saved registers (rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15).
+-** The POSIX/x64 interpreter only saves r12/r13 for INT (e.g. PS4).
++** - EXT can handle errors from internal helper functions that are called
++**   from JIT-compiled code (except for Windows/x86 and 32 bit ARM).
++**   INT has no choice but to call the panic handler, if this happens.
++**   Note: this is mainly relevant for out-of-memory errors.
++**
++** EXT is the default on all systems where the toolchain produces unwind
++** tables by default (*). This is hard-coded and/or detected in src/Makefile.
++** You can thwart the detection with: TARGET_XCFLAGS=-DLUAJIT_UNWIND_INTERNAL
++**
++** INT is the default on all other systems.
++**
++** EXT can be manually enabled for toolchains that are able to produce
++** conforming unwind tables:
++**   "TARGET_XCFLAGS=-funwind-tables -DLUAJIT_UNWIND_EXTERNAL"
++** As explained above, *all* C code used directly or indirectly by LuaJIT
++** must be compiled with -funwind-tables (or -fexceptions). C++ code must
++** *not* be compiled with -fno-exceptions.
++**
++** If you're unsure whether error handling inside the VM works correctly,
++** try running this and check whether it prints "OK":
++**
++**   luajit -e "print(select(2, load('OK')):match('OK'))"
++**
++** (*) Originally, toolchains only generated unwind tables for C++ code. For
++** interoperability reasons, this can be manually enabled for plain C code,
++** too (with -funwind-tables). With the introduction of the x64 architecture,
++** the corresponding POSIX and Windows ABIs mandated unwind tables for all
++** code. Over the following years most desktop and server platforms have
++** enabled unwind tables by default on all architectures. OTOH mobile and
++** embedded platforms do not consistently mandate unwind tables.
+ */
+ 
+-#if (defined(__GNUC__) || defined(__clang__)) && (LJ_TARGET_X64 || defined(LUAJIT_UNWIND_EXTERNAL)) && !LJ_NO_UNWIND
+-#define LJ_UNWIND_EXT	1
+-#elif LJ_TARGET_WINDOWS
+-#define LJ_UNWIND_EXT	1
+-#endif
+-
+ /* -- Error messages ------------------------------------------------------ */
+ 
+ /* Error message strings. */
+@@ -157,12 +174,15 @@ static void *err_unwind(lua_State *L, vo
+     case FRAME_PCALL:  /* FF pcall() frame. */
+     case FRAME_PCALLH:  /* FF pcall() frame inside hook. */
+       if (errcode) {
++	global_State *g;
+ 	if (errcode == LUA_YIELD) {
+ 	  frame = frame_prevd(frame);
+ 	  break;
+ 	}
++	g = G(L);
++	setgcref(g->cur_L, obj2gco(L));
+ 	if (frame_typep(frame) == FRAME_PCALL)
+-	  hook_leave(G(L));
++	  hook_leave(g);
+ 	L->base = frame_prevd(frame) + 1;
+ 	L->cframe = cf;
+ 	unwindstack(L, L->base);
+@@ -184,7 +204,198 @@ static void *err_unwind(lua_State *L, vo
+ 
+ /* -- External frame unwinding -------------------------------------------- */
+ 
+-#if (defined(__GNUC__) || defined(__clang__)) && !LJ_NO_UNWIND && !LJ_ABI_WIN
++#if LJ_ABI_WIN
++
++/*
++** Someone in Redmond owes me several days of my life. A lot of this is
++** undocumented or just plain wrong on MSDN. Some of it can be gathered
++** from 3rd party docs or must be found by trial-and-error. They really
++** don't want you to write your own language-specific exception handler
++** or to interact gracefully with MSVC. :-(
++*/
++
++#define WIN32_LEAN_AND_MEAN
++#include <windows.h>
++
++#if LJ_TARGET_X86
++typedef void *UndocumentedDispatcherContext;  /* Unused on x86. */
++#else
++/* Taken from: http://www.nynaeve.net/?p=99 */
++typedef struct UndocumentedDispatcherContext {
++  ULONG64 ControlPc;
++  ULONG64 ImageBase;
++  PRUNTIME_FUNCTION FunctionEntry;
++  ULONG64 EstablisherFrame;
++  ULONG64 TargetIp;
++  PCONTEXT ContextRecord;
++  void (*LanguageHandler)(void);
++  PVOID HandlerData;
++  PUNWIND_HISTORY_TABLE HistoryTable;
++  ULONG ScopeIndex;
++  ULONG Fill0;
++} UndocumentedDispatcherContext;
++#endif
++
++/* Another wild guess. */
++extern void __DestructExceptionObject(EXCEPTION_RECORD *rec, int nothrow);
++
++#if LJ_TARGET_X64 && defined(MINGW_SDK_INIT)
++/* Workaround for broken MinGW64 declaration. */
++VOID RtlUnwindEx_FIXED(PVOID,PVOID,PVOID,PVOID,PVOID,PVOID) asm("RtlUnwindEx");
++#define RtlUnwindEx RtlUnwindEx_FIXED
++#endif
++
++#define LJ_MSVC_EXCODE		((DWORD)0xe06d7363)
++#define LJ_GCC_EXCODE		((DWORD)0x20474343)
++
++#define LJ_EXCODE		((DWORD)0xe24c4a00)
++#define LJ_EXCODE_MAKE(c)	(LJ_EXCODE | (DWORD)(c))
++#define LJ_EXCODE_CHECK(cl)	(((cl) ^ LJ_EXCODE) <= 0xff)
++#define LJ_EXCODE_ERRCODE(cl)	((int)((cl) & 0xff))
++
++/* Windows exception handler for interpreter frame. Returns 1 (ExceptionContinueSearch) whenever it returns. */
++LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec,
++  void *f, CONTEXT *ctx, UndocumentedDispatcherContext *dispatch)
++{
++#if LJ_TARGET_X86
++  void *cf = (char *)f - CFRAME_OFS_SEH;
++#elif LJ_TARGET_ARM64
++  void *cf = (char *)f - CFRAME_SIZE;
++#else
++  void *cf = f;
++#endif
++  lua_State *L = cframe_L(cf);
++  int errcode = LJ_EXCODE_CHECK(rec->ExceptionCode) ?
++		LJ_EXCODE_ERRCODE(rec->ExceptionCode) : LUA_ERRRUN;
++  if ((rec->ExceptionFlags & 6)) {  /* EH_UNWINDING|EH_EXIT_UNWIND */
++    if (rec->ExceptionCode == STATUS_LONGJUMP &&
++	rec->ExceptionRecord &&
++	LJ_EXCODE_CHECK(rec->ExceptionRecord->ExceptionCode)) {
++      errcode = LJ_EXCODE_ERRCODE(rec->ExceptionRecord->ExceptionCode);
++      if ((rec->ExceptionFlags & 0x20)) {  /* EH_TARGET_UNWIND */
++	/* Unwinding is about to finish; revert the ExceptionCode so that
++	** RtlRestoreContext does not try to restore from a _JUMP_BUFFER.
++	*/
++	rec->ExceptionCode = 0;
++      }
++    }
++    /* Unwind internal frames. */
++    err_unwind(L, cf, errcode);
++  } else {
++    void *cf2 = err_unwind(L, cf, 0);  /* errcode 0: presumably a search-only pass. */
++    if (cf2) {  /* We catch it, so start unwinding the upper frames. */
++#if !LJ_TARGET_X86
++      EXCEPTION_RECORD rec2;
++#endif
++      if (rec->ExceptionCode == LJ_MSVC_EXCODE ||
++	  rec->ExceptionCode == LJ_GCC_EXCODE) {
++#if !LJ_TARGET_CYGWIN
++	__DestructExceptionObject(rec, 1);
++#endif
++	setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
++      } else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) {
++	/* Don't catch access violations etc. */
++	return 1;  /* ExceptionContinueSearch */
++      }
++#if LJ_TARGET_X86
++      UNUSED(ctx);
++      UNUSED(dispatch);
++      /* Call all handlers for all lower C frames (including ourselves) again
++      ** with EH_UNWINDING set. Then call the specified function, passing cf
++      ** and errcode.
++      */
++      lj_vm_rtlunwind(cf, (void *)rec,
++	(cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ?
++	(void *)lj_vm_unwind_ff : (void *)lj_vm_unwind_c, errcode);
++      /* lj_vm_rtlunwind does not return. */
++#else
++      if (LJ_EXCODE_CHECK(rec->ExceptionCode)) {
++	/* For unwind purposes, wrap the EXCEPTION_RECORD in something that
++	** looks like a longjmp, so that MSVC will execute C++ destructors in
++	** the frames we unwind over. ExceptionInformation[0] should really
++	** contain a _JUMP_BUFFER*, but hopefully nobody is looking too closely
++	** at this point.
++	*/
++	rec2.ExceptionCode = STATUS_LONGJUMP;
++	rec2.ExceptionRecord = rec;
++	rec2.ExceptionAddress = 0;
++	rec2.NumberParameters = 1;
++	rec2.ExceptionInformation[0] = (ULONG_PTR)ctx;
++	rec = &rec2;
++      }
++      /* Unwind the stack and call all handlers for all lower C frames
++      ** (including ourselves) again with EH_UNWINDING set. Then set
++      ** stack pointer = f, result = errcode and jump to the specified target.
++      */
++      RtlUnwindEx(f, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ?
++			      lj_vm_unwind_ff_eh :
++			      lj_vm_unwind_c_eh),
++		  rec, (void *)(uintptr_t)errcode, dispatch->ContextRecord,
++		  dispatch->HistoryTable);
++      /* RtlUnwindEx should never return. */
++#endif
++    }
++  }
++  return 1;  /* ExceptionContinueSearch */
++}
++
++#if LJ_UNWIND_JIT
++
++#if LJ_TARGET_X64
++#define CONTEXT_REG_PC	Rip
++#elif LJ_TARGET_ARM64
++#define CONTEXT_REG_PC	Pc
++#else
++#error "NYI: Windows arch-specific unwinder for JIT-compiled code"
++#endif
++
++/* Windows unwinder for JIT-compiled code. Only returns if unwinding failed. */
++static void err_unwind_win_jit(global_State *g, int errcode)
++{
++  CONTEXT ctx;
++  UNWIND_HISTORY_TABLE hist;
++
++  memset(&hist, 0, sizeof(hist));
++  RtlCaptureContext(&ctx);
++  while (1) {  /* Walk frames from the current context. */
++    DWORD64 frame, base, addr = ctx.CONTEXT_REG_PC;
++    void *hdata;
++    PRUNTIME_FUNCTION func = RtlLookupFunctionEntry(addr, &base, &hist);
++    if (!func) {  /* Found frame without .pdata: must be JIT-compiled code. */
++      ExitNo exitno;
++      uintptr_t stub = lj_trace_unwind(G2J(g), (uintptr_t)(addr - sizeof(MCode)), &exitno);
++      if (stub) {  /* Jump to side exit to unwind the trace. */
++	ctx.CONTEXT_REG_PC = stub;
++	G2J(g)->exitcode = errcode;
++	RtlRestoreContext(&ctx, NULL);  /* Does not return. */
++      }
++      break;
++    }
++    RtlVirtualUnwind(UNW_FLAG_NHANDLER, base, addr, func,
++		     &ctx, &hdata, &frame, NULL);
++    if (!addr) break;  /* NOTE(review): tests the PC captured before RtlVirtualUnwind -- confirm intent. */
++  }
++  /* Unwinding failed, if we end up here. */
++}
++#endif
++
++/* Raise Windows exception. With LJ_UNWIND_JIT and a non-NULL jit_base, the JIT unwinder is tried instead. */
++static void err_raise_ext(global_State *g, int errcode)
++{
++#if LJ_UNWIND_JIT
++  if (tvref(g->jit_base)) {  /* Presumably set while running on-trace. */
++    err_unwind_win_jit(g, errcode);
++    return;  /* Unwinding failed. */
++  }
++#elif LJ_HASJIT
++  /* Cannot catch on-trace errors for Windows/x86 SEH. Unwind to interpreter. */
++  setmref(g->jit_base, NULL);
++#endif
++  UNUSED(g);
++  RaiseException(LJ_EXCODE_MAKE(errcode), 1 /* EH_NONCONTINUABLE */, 0, NULL);
++}
++
++#elif !LJ_NO_UNWIND && (defined(__GNUC__) || defined(__clang__))
+ 
+ /*
+ ** We have to use our own definitions instead of the mandatory (!) unwind.h,
+@@ -194,6 +405,7 @@ static void *err_unwind(lua_State *L, vo
+ typedef struct _Unwind_Context _Unwind_Context;
+ 
+ #define _URC_OK			0
++#define _URC_FATAL_PHASE2_ERROR	2
+ #define _URC_FATAL_PHASE1_ERROR	3
+ #define _URC_HANDLER_FOUND	6
+ #define _URC_INSTALL_CONTEXT	7
+@@ -213,9 +425,11 @@ typedef struct _Unwind_Exception
+   void (*excleanup)(int, struct _Unwind_Exception *);
+   uintptr_t p1, p2;
+ } __attribute__((__aligned__)) _Unwind_Exception;
++#define UNWIND_EXCEPTION_TYPE	_Unwind_Exception
+ 
+ extern uintptr_t _Unwind_GetCFA(_Unwind_Context *);
+ extern void _Unwind_SetGR(_Unwind_Context *, int, uintptr_t);
++extern uintptr_t _Unwind_GetIP(_Unwind_Context *);
+ extern void _Unwind_SetIP(_Unwind_Context *, uintptr_t);
+ extern void _Unwind_DeleteException(_Unwind_Exception *);
+ extern int _Unwind_RaiseException(_Unwind_Exception *);
+@@ -233,7 +447,6 @@ LJ_FUNCA int lj_err_unwind_dwarf(int ver
+   lua_State *L;
+   if (version != 1)
+     return _URC_FATAL_PHASE1_ERROR;
+-  UNUSED(uexclass);
+   cf = (void *)_Unwind_GetCFA(ctx);
+   L = cframe_L(cf);
+   if ((actions & _UA_SEARCH_PHASE)) {
+@@ -260,10 +473,10 @@ LJ_FUNCA int lj_err_unwind_dwarf(int ver
+     if ((actions & _UA_FORCE_UNWIND)) {
+       return _URC_CONTINUE_UNWIND;
+     } else if (cf) {
++      ASMFunction ip;
+       _Unwind_SetGR(ctx, LJ_TARGET_EHRETREG, errcode);
+-      _Unwind_SetIP(ctx, (uintptr_t)(cframe_unwind_ff(cf) ?
+-				     lj_vm_unwind_ff_eh :
+-				     lj_vm_unwind_c_eh));
++      ip = cframe_unwind_ff(cf) ? lj_vm_unwind_ff_eh : lj_vm_unwind_c_eh;
++      _Unwind_SetIP(ctx, (uintptr_t)lj_ptr_strip(ip));
+       return _URC_INSTALL_CONTEXT;
+     }
+ #if LJ_TARGET_X86ORX64
+@@ -281,20 +494,150 @@ LJ_FUNCA int lj_err_unwind_dwarf(int ver
+     ** it on non-x64 because the interpreter restores all callee-saved regs.
+     */
+     lj_err_throw(L, errcode);
++#if LJ_TARGET_X64
++#error "Broken build system -- only use the provided Makefiles!"
++#endif
+ #endif
+   }
+   return _URC_CONTINUE_UNWIND;
+ }
+ 
+-#if LJ_UNWIND_EXT
+-static __thread _Unwind_Exception static_uex;
++#if LJ_UNWIND_EXT && defined(LUA_USE_ASSERT)
++struct dwarf_eh_bases { void *tbase, *dbase, *func; };
++extern const void *_Unwind_Find_FDE(void *pc, struct dwarf_eh_bases *bases);
++
++/* Verify that external error handling actually has a chance to work: asserts that lj_err_throw has an FDE. */
++void lj_err_verify(void)
++{
++#if !LJ_TARGET_OSX
++  /* Check disabled on MacOS due to brilliant software engineering at Apple. */
++  struct dwarf_eh_bases ehb;
++  lj_assertX(_Unwind_Find_FDE((void *)lj_err_throw, &ehb), "broken build: external frame unwinding enabled, but missing -funwind-tables");
++#endif
++  /* Check disabled, because of broken Fedora/ARM64. See #722.
++  lj_assertX(_Unwind_Find_FDE((void *)_Unwind_RaiseException, &ehb), "broken build: external frame unwinding enabled, but system libraries have no unwind tables");
++  */
++}
++#endif
+ 
+-/* Raise DWARF2 exception. */
+-static void err_raise_ext(int errcode)
++#if LJ_UNWIND_JIT
++/* DWARF2 personality handler for JIT-compiled code. */
++static int err_unwind_jit(int version, int actions,
++  uint64_t uexclass, _Unwind_Exception *uex, _Unwind_Context *ctx)
+ {
+-  static_uex.exclass = LJ_UEXCLASS_MAKE(errcode);
+-  static_uex.excleanup = NULL;
+-  _Unwind_RaiseException(&static_uex);
++  /* NYI: FFI C++ exception interoperability. */
++  if (version != 1 || !LJ_UEXCLASS_CHECK(uexclass))
++    return _URC_FATAL_PHASE1_ERROR;
++  if ((actions & _UA_SEARCH_PHASE)) {
++    return _URC_HANDLER_FOUND;
++  }
++  if ((actions & _UA_CLEANUP_PHASE)) {
++    global_State *g = *(global_State **)(uex+1);
++    ExitNo exitno;
++    uintptr_t addr = _Unwind_GetIP(ctx);  /* Return address _after_ call. */
++    uintptr_t stub = lj_trace_unwind(G2J(g), addr - sizeof(MCode), &exitno);
++    lj_assertG(tvref(g->jit_base), "unexpected throw across mcode frame");
++    if (stub) {  /* Jump to side exit to unwind the trace. */
++      G2J(g)->exitcode = LJ_UEXCLASS_ERRCODE(uexclass);
++#ifdef LJ_TARGET_MIPS
++      _Unwind_SetGR(ctx, 4, stub);
++      _Unwind_SetGR(ctx, 5, exitno);
++      _Unwind_SetIP(ctx, (uintptr_t)(void *)lj_vm_unwind_stub);
++#else
++      _Unwind_SetIP(ctx, stub);
++#endif
++      return _URC_INSTALL_CONTEXT;
++    }
++    return _URC_FATAL_PHASE2_ERROR;
++  }
++  return _URC_FATAL_PHASE1_ERROR;
++}
++
++/* DWARF2 template frame info for JIT-compiled code.
++**
++** After copying the template to the start of the mcode segment,
++** the frame handler function and the code size is patched.
++** The frame handler always installs a new context to jump to the exit,
++** so don't bother to add any unwind opcodes.
++*/
++static const uint8_t err_frame_jit_template[] = {
++#if LJ_BE
++  0,0,0,
++#endif
++  LJ_64 ? 0x1c : 0x14,  /* CIE length. */
++#if LJ_LE
++  0,0,0,
++#endif
++  0,0,0,0, 1, 'z','P','R',0,  /* CIE mark, CIE version, augmentation. */
++  1, LJ_64 ? 0x78 : 0x7c, LJ_TARGET_EHRAREG,  /* Code/data align, RA. */
++#if LJ_64
++  10, 0, 0,0,0,0,0,0,0,0, 0x1b,  /* Aug. data ABS handler, PCREL|SDATA4 code. */
++  0,0,0,0,0,  /* Alignment. */
++#else
++  6, 0, 0,0,0,0, 0x1b,  /* Aug. data ABS handler, PCREL|SDATA4 code. */
++  0,  /* Alignment. */
++#endif
++#if LJ_BE
++  0,0,0,
++#endif
++  LJ_64 ? 0x14 : 0x10,  /* FDE length. */
++  0,0,0,
++  LJ_64 ? 0x24 : 0x1c,  /* CIE offset. */
++  0,0,0,
++  LJ_64 ? 0x14 : 0x10,  /* Code offset. After Final FDE. */
++#if LJ_LE
++  0,0,0,
++#endif
++  0,0,0,0, 0, 0,0,0, /* Code size, augmentation length, alignment. */
++#if LJ_64
++  0,0,0,0,  /* Alignment. */
++#endif
++  0,0,0,0  /* Final FDE. */
++};
++
++#define ERR_FRAME_JIT_OFS_HANDLER	0x12
++#define ERR_FRAME_JIT_OFS_FDE		(LJ_64 ? 0x20 : 0x18)
++#define ERR_FRAME_JIT_OFS_CODE_SIZE	(LJ_64 ? 0x2c : 0x24)
++#if LJ_TARGET_OSX
++#define ERR_FRAME_JIT_OFS_REGISTER	ERR_FRAME_JIT_OFS_FDE
++#else
++#define ERR_FRAME_JIT_OFS_REGISTER	0
++#endif
++
++extern void __register_frame(const void *);
++extern void __deregister_frame(const void *);
++
++uint8_t *lj_err_register_mcode(void *base, size_t sz, uint8_t *info)
++{
++  ASMFunction handler = (ASMFunction)err_unwind_jit;
++  memcpy(info, err_frame_jit_template, sizeof(err_frame_jit_template));  /* Copy template, then patch handler and code size. */
++#if LJ_ABI_PAUTH
++#if LJ_TARGET_ARM64
++  handler = ptrauth_auth_and_resign(handler,
++    ptrauth_key_function_pointer, 0,
++    ptrauth_key_process_independent_code, info + ERR_FRAME_JIT_OFS_HANDLER);
++#else
++#error "missing pointer authentication support for this architecture"
++#endif
++#endif
++  memcpy(info + ERR_FRAME_JIT_OFS_HANDLER, &handler, sizeof(handler));
++  *(uint32_t *)(info + ERR_FRAME_JIT_OFS_CODE_SIZE) =  /* Size of the area after the frame info, up to base+sz. */
++    (uint32_t)(sz - sizeof(err_frame_jit_template) - (info - (uint8_t *)base));
++  __register_frame(info + ERR_FRAME_JIT_OFS_REGISTER);
++#ifdef LUA_USE_ASSERT
++  {
++    struct dwarf_eh_bases ehb;
++    lj_assertX(_Unwind_Find_FDE(info + sizeof(err_frame_jit_template)+1, &ehb),
++	       "bad JIT unwind table registration");
++  }
++#endif
++  return info + sizeof(err_frame_jit_template);  /* First byte after the copied frame info. */
++}
++
++void lj_err_deregister_mcode(void *base, size_t sz, uint8_t *info)
++{
++  UNUSED(base); UNUSED(sz);
++  __deregister_frame(info + ERR_FRAME_JIT_OFS_REGISTER);
+ }
+ #endif
+ 
+@@ -306,6 +649,7 @@ static void err_raise_ext(int errcode)
+ #define _US_FORCE_UNWIND		8
+ 
+ typedef struct _Unwind_Control_Block _Unwind_Control_Block;
++#define UNWIND_EXCEPTION_TYPE	_Unwind_Control_Block
+ 
+ struct _Unwind_Control_Block {
+   uint64_t exclass;
+@@ -364,136 +708,63 @@ LJ_FUNCA int lj_err_unwind_arm(int state
+   }
+   if (__gnu_unwind_frame(ucb, ctx) != _URC_OK)
+     return _URC_FAILURE;
++#ifdef LUA_USE_ASSERT
++  /* We should never get here unless this is a forced unwind aka backtrace. */
++  if (_Unwind_GetGR(ctx, 0) == 0xff33aa77) {
++    _Unwind_SetGR(ctx, 0, 0xff33aa88);
++  }
++#endif
+   return _URC_CONTINUE_UNWIND;
+ }
+ 
+-#if LJ_UNWIND_EXT
+-static __thread _Unwind_Control_Block static_uex;
++#if LJ_UNWIND_EXT && defined(LUA_USE_ASSERT)
++typedef int (*_Unwind_Trace_Fn)(_Unwind_Context *, void *);
++extern int _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+ 
+-static void err_raise_ext(int errcode)
++static int err_verify_bt(_Unwind_Context *ctx, int *got)
+ {
+-  memset(&static_uex, 0, sizeof(static_uex));
+-  static_uex.exclass = LJ_UEXCLASS_MAKE(errcode);
+-  _Unwind_RaiseException(&static_uex);
++  if (_Unwind_GetGR(ctx, 0) == 0xff33aa88) { *got = 2; }
++  else if (*got == 0) { *got = 1; _Unwind_SetGR(ctx, 0, 0xff33aa77); }
++  return _URC_OK;
+ }
+-#endif
+ 
+-#endif /* LJ_TARGET_ARM */
+-
+-#elif LJ_ABI_WIN
++/* Verify that external error handling actually has a chance to work (via a backtrace probe). */
++void lj_err_verify(void)
++{
++  int got = 0;  /* err_verify_bt() sets this to 2 once it sees the 0xff33aa88 marker. */
++  _Unwind_Backtrace((_Unwind_Trace_Fn)err_verify_bt, &got);
++  lj_assertX(got == 2, "broken build: external frame unwinding enabled, but missing -funwind-tables");
++}
++#endif
+ 
+ /*
+-** Someone in Redmond owes me several days of my life. A lot of this is
+-** undocumented or just plain wrong on MSDN. Some of it can be gathered
+-** from 3rd party docs or must be found by trial-and-error. They really
+-** don't want you to write your own language-specific exception handler
+-** or to interact gracefully with MSVC. :-(
++** Note: LJ_UNWIND_JIT is not implemented for 32 bit ARM.
+ **
+-** Apparently MSVC doesn't call C++ destructors for foreign exceptions
+-** unless you compile your C++ code with /EHa. Unfortunately this means
+-** catch (...) also catches things like access violations. The use of
+-** _set_se_translator doesn't really help, because it requires /EHa, too.
++** The quirky ARM unwind API doesn't have __register_frame().
++** A potential workaround might involve _Unwind_Backtrace.
++** But most 32 bit ARM targets don't qualify for LJ_UNWIND_EXT, anyway,
++** since they are built without unwind tables by default.
+ */
+ 
+-#define WIN32_LEAN_AND_MEAN
+-#include <windows.h>
+-
+-#if LJ_TARGET_X64
+-/* Taken from: http://www.nynaeve.net/?p=99 */
+-typedef struct UndocumentedDispatcherContext {
+-  ULONG64 ControlPc;
+-  ULONG64 ImageBase;
+-  PRUNTIME_FUNCTION FunctionEntry;
+-  ULONG64 EstablisherFrame;
+-  ULONG64 TargetIp;
+-  PCONTEXT ContextRecord;
+-  void (*LanguageHandler)(void);
+-  PVOID HandlerData;
+-  PUNWIND_HISTORY_TABLE HistoryTable;
+-  ULONG ScopeIndex;
+-  ULONG Fill0;
+-} UndocumentedDispatcherContext;
+-#else
+-typedef void *UndocumentedDispatcherContext;
+-#endif
+-
+-/* Another wild guess. */
+-extern void __DestructExceptionObject(EXCEPTION_RECORD *rec, int nothrow);
+-
+-#if LJ_TARGET_X64 && defined(MINGW_SDK_INIT)
+-/* Workaround for broken MinGW64 declaration. */
+-VOID RtlUnwindEx_FIXED(PVOID,PVOID,PVOID,PVOID,PVOID,PVOID) asm("RtlUnwindEx");
+-#define RtlUnwindEx RtlUnwindEx_FIXED
+-#endif
++#endif /* LJ_TARGET_ARM */
+ 
+-#define LJ_MSVC_EXCODE		((DWORD)0xe06d7363)
+-#define LJ_GCC_EXCODE		((DWORD)0x20474343)
+ 
+-#define LJ_EXCODE		((DWORD)0xe24c4a00)
+-#define LJ_EXCODE_MAKE(c)	(LJ_EXCODE | (DWORD)(c))
+-#define LJ_EXCODE_CHECK(cl)	(((cl) ^ LJ_EXCODE) <= 0xff)
+-#define LJ_EXCODE_ERRCODE(cl)	((int)((cl) & 0xff))
++#if LJ_UNWIND_EXT
++static __thread struct {
++  UNWIND_EXCEPTION_TYPE ex;
++  global_State *g;
++} static_uex;
+ 
+-/* Windows exception handler for interpreter frame. */
+-LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec,
+-  void *f, CONTEXT *ctx, UndocumentedDispatcherContext *dispatch)
++/* Raise external exception. */
++static void err_raise_ext(global_State *g, int errcode)
+ {
+-#if LJ_TARGET_X64
+-  void *cf = f;
+-#else
+-  void *cf = (char *)f - CFRAME_OFS_SEH;
+-#endif
+-  lua_State *L = cframe_L(cf);
+-  int errcode = LJ_EXCODE_CHECK(rec->ExceptionCode) ?
+-		LJ_EXCODE_ERRCODE(rec->ExceptionCode) : LUA_ERRRUN;
+-  if ((rec->ExceptionFlags & 6)) {  /* EH_UNWINDING|EH_EXIT_UNWIND */
+-    /* Unwind internal frames. */
+-    err_unwind(L, cf, errcode);
+-  } else {
+-    void *cf2 = err_unwind(L, cf, 0);
+-    if (cf2) {  /* We catch it, so start unwinding the upper frames. */
+-      if (rec->ExceptionCode == LJ_MSVC_EXCODE ||
+-	  rec->ExceptionCode == LJ_GCC_EXCODE) {
+-#if LJ_TARGET_WINDOWS
+-	__DestructExceptionObject(rec, 1);
+-#endif
+-	setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
+-      } else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) {
+-	/* Don't catch access violations etc. */
+-	return 1;  /* ExceptionContinueSearch */
+-      }
+-#if LJ_TARGET_X64
+-      /* Unwind the stack and call all handlers for all lower C frames
+-      ** (including ourselves) again with EH_UNWINDING set. Then set
+-      ** rsp = cf, rax = errcode and jump to the specified target.
+-      */
+-      RtlUnwindEx(cf, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ?
+-			       lj_vm_unwind_ff_eh :
+-			       lj_vm_unwind_c_eh),
+-		  rec, (void *)(uintptr_t)errcode, ctx, dispatch->HistoryTable);
+-      /* RtlUnwindEx should never return. */
+-#else
+-      UNUSED(ctx);
+-      UNUSED(dispatch);
+-      /* Call all handlers for all lower C frames (including ourselves) again
+-      ** with EH_UNWINDING set. Then call the specified function, passing cf
+-      ** and errcode.
+-      */
+-      lj_vm_rtlunwind(cf, (void *)rec,
+-	(cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ?
+-	(void *)lj_vm_unwind_ff : (void *)lj_vm_unwind_c, errcode);
+-      /* lj_vm_rtlunwind does not return. */
+-#endif
+-    }
+-  }
+-  return 1;  /* ExceptionContinueSearch */
++  memset(&static_uex, 0, sizeof(static_uex));
++  static_uex.ex.exclass = LJ_UEXCLASS_MAKE(errcode);
++  static_uex.g = g;
++  _Unwind_RaiseException(&static_uex.ex);
+ }
+ 
+-/* Raise Windows exception. */
+-static void err_raise_ext(int errcode)
+-{
+-  RaiseException(LJ_EXCODE_MAKE(errcode), 1 /* EH_NONCONTINUABLE */, 0, NULL);
+-}
++#endif
+ 
+ #endif
+ 
+@@ -504,22 +775,23 @@ LJ_NOINLINE void LJ_FASTCALL lj_err_thro
+ {
+   global_State *g = G(L);
+   lj_trace_abort(g);
+-  setmref(g->jit_base, NULL);
+   L->status = LUA_OK;
+ #if LJ_UNWIND_EXT
+-  err_raise_ext(errcode);
++  err_raise_ext(g, errcode);
+   /*
+   ** A return from this function signals a corrupt C stack that cannot be
+   ** unwound. We have no choice but to call the panic function and exit.
+   **
+   ** Usually this is caused by a C function without unwind information.
+-  ** This should never happen on x64, but may happen if you've manually
+-  ** enabled LUAJIT_UNWIND_EXTERNAL and forgot to recompile *every*
+-  ** non-C++ file with -funwind-tables.
++  ** This may happen if you've manually enabled LUAJIT_UNWIND_EXTERNAL
++  ** and forgot to recompile *every* non-C++ file with -funwind-tables.
+   */
+   if (G(L)->panic)
+     G(L)->panic(L);
+ #else
++#if LJ_HASJIT
++  setmref(g->jit_base, NULL);
++#endif
+   {
+     void *cf = err_unwind(L, NULL, errcode);
+     if (cframe_unwind_ff(cf))
+@@ -542,6 +814,11 @@ LJ_NOINLINE void lj_err_mem(lua_State *L
+ {
+   if (L->status == LUA_ERRERR+1)  /* Don't touch the stack during lua_open. */
+     lj_vm_unwind_c(L->cframe, LUA_ERRMEM);
++  if (LJ_HASJIT) {
++    TValue *base = tvref(G(L)->jit_base);
++    if (base) L->base = base;
++  }
++  if (curr_funcisL(L)) L->top = curr_topL(L);
+   setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRMEM));
+   lj_err_throw(L, LUA_ERRMEM);
+ }
+@@ -600,7 +877,7 @@ static ptrdiff_t finderrfunc(lua_State *
+ /* Runtime error. */
+ LJ_NOINLINE void LJ_FASTCALL lj_err_run(lua_State *L)
+ {
+-  ptrdiff_t ef = finderrfunc(L);
++  ptrdiff_t ef = (LJ_HASJIT && tvref(G(L)->jit_base)) ? 0 : finderrfunc(L);
+   if (ef) {
+     TValue *errfunc = restorestack(L, ef);
+     TValue *top = L->top;
+@@ -619,12 +896,26 @@ LJ_NOINLINE void LJ_FASTCALL lj_err_run(
+   lj_err_throw(L, LUA_ERRRUN);
+ }
+ 
++#if LJ_HASJIT
++LJ_NOINLINE void LJ_FASTCALL lj_err_trace(lua_State *L, int errcode)
++{
++  if (errcode == LUA_ERRRUN)
++    lj_err_run(L);
++  else
++    lj_err_throw(L, errcode);
++}
++#endif
++
+ /* Formatted runtime error message. */
+ LJ_NORET LJ_NOINLINE static void err_msgv(lua_State *L, ErrMsg em, ...)
+ {
+   const char *msg;
+   va_list argp;
+   va_start(argp, em);
++  if (LJ_HASJIT) {
++    TValue *base = tvref(G(L)->jit_base);
++    if (base) L->base = base;
++  }
+   if (curr_funcisL(L)) L->top = curr_topL(L);
+   msg = lj_strfmt_pushvf(L, err2msg(em), argp);
+   va_end(argp);
+@@ -699,25 +990,27 @@ LJ_NOINLINE void lj_err_optype_call(lua_
+ /* Error in context of caller. */
+ LJ_NOINLINE void lj_err_callermsg(lua_State *L, const char *msg)
+ {
+-  TValue *frame = L->base-1;
+-  TValue *pframe = NULL;
+-  if (frame_islua(frame)) {
+-    pframe = frame_prevl(frame);
+-  } else if (frame_iscont(frame)) {
+-    if (frame_iscont_fficb(frame)) {
+-      pframe = frame;
+-      frame = NULL;
+-    } else {
+-      pframe = frame_prevd(frame);
++  TValue *frame = NULL, *pframe = NULL;
++  if (!(LJ_HASJIT && tvref(G(L)->jit_base))) {
++    frame = L->base-1;
++    if (frame_islua(frame)) {
++      pframe = frame_prevl(frame);
++    } else if (frame_iscont(frame)) {
++      if (frame_iscont_fficb(frame)) {
++	pframe = frame;
++	frame = NULL;
++      } else {
++	pframe = frame_prevd(frame);
+ #if LJ_HASFFI
+-      /* Remove frame for FFI metamethods. */
+-      if (frame_func(frame)->c.ffid >= FF_ffi_meta___index &&
+-	  frame_func(frame)->c.ffid <= FF_ffi_meta___tostring) {
+-	L->base = pframe+1;
+-	L->top = frame;
+-	setcframe_pc(cframe_raw(L->cframe), frame_contpc(frame));
+-      }
++	/* Remove frame for FFI metamethods. */
++	if (frame_func(frame)->c.ffid >= FF_ffi_meta___index &&
++	    frame_func(frame)->c.ffid <= FF_ffi_meta___tostring) {
++	  L->base = pframe+1;
++	  L->top = frame;
++	  setcframe_pc(cframe_raw(L->cframe), frame_contpc(frame));
++	}
+ #endif
++      }
+     }
+   }
+   lj_debug_addloc(L, msg, pframe, frame);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_err.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_err.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_err.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Error handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_ERR_H
+@@ -23,7 +23,10 @@ LJ_DATA const char *lj_err_allmsg;
+ LJ_FUNC GCstr *lj_err_str(lua_State *L, ErrMsg em);
+ LJ_FUNCA_NORET void LJ_FASTCALL lj_err_throw(lua_State *L, int errcode);
+ LJ_FUNC_NORET void lj_err_mem(lua_State *L);
+-LJ_FUNCA_NORET void LJ_FASTCALL lj_err_run(lua_State *L);
++LJ_FUNC_NORET void LJ_FASTCALL lj_err_run(lua_State *L);
++#if LJ_HASJIT
++LJ_FUNCA_NORET void LJ_FASTCALL lj_err_trace(lua_State *L, int errcode);
++#endif
+ LJ_FUNC_NORET void lj_err_msg(lua_State *L, ErrMsg em);
+ LJ_FUNC_NORET void lj_err_lex(lua_State *L, GCstr *src, const char *tok,
+ 			      BCLine line, ErrMsg em, va_list argp);
+@@ -38,4 +41,18 @@ LJ_FUNC_NORET void lj_err_argv(lua_State
+ LJ_FUNC_NORET void lj_err_argtype(lua_State *L, int narg, const char *xname);
+ LJ_FUNC_NORET void lj_err_argt(lua_State *L, int narg, int tt);
+ 
++#if LJ_UNWIND_JIT && !LJ_ABI_WIN
++LJ_FUNC uint8_t *lj_err_register_mcode(void *base, size_t sz, uint8_t *info);
++LJ_FUNC void lj_err_deregister_mcode(void *base, size_t sz, uint8_t *info);
++#else
++#define lj_err_register_mcode(base, sz, info)	(info)
++#define lj_err_deregister_mcode(base, sz, info)	UNUSED(base)
++#endif
++
++#if LJ_UNWIND_EXT && !LJ_ABI_WIN && defined(LUA_USE_ASSERT)
++LJ_FUNC void lj_err_verify(void);
++#else
++#define lj_err_verify()		((void)0)
++#endif
++
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_errmsg.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_errmsg.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_errmsg.h
+@@ -1,6 +1,6 @@
+ /*
+ ** VM error messages.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* This file may be included multiple times with different ERRDEF macros. */
+@@ -67,6 +67,7 @@ ERRDEF(PROTMT,	"cannot change a protecte
+ ERRDEF(UNPACK,	"too many results to unpack")
+ ERRDEF(RDRSTR,	"reader function must return a string")
+ ERRDEF(PRTOSTR,	LUA_QL("tostring") " must return a string to " LUA_QL("print"))
++ERRDEF(NUMRNG,	"number out of range")
+ ERRDEF(IDXRNG,	"index out of range")
+ ERRDEF(BASERNG,	"base out of range")
+ ERRDEF(LVLRNG,	"level out of range")
+@@ -179,6 +180,19 @@ ERRDEF(FFI_NYIPACKBIT,	"NYI: packed bit
+ ERRDEF(FFI_NYICALL,	"NYI: cannot call this C function (yet)")
+ #endif
+ 
++#if LJ_HASBUFFER
++/* String buffer errors. */
++ERRDEF(BUFFER_SELF,	"cannot put buffer into itself")
++ERRDEF(BUFFER_BADOPT,	"bad options table")
++ERRDEF(BUFFER_BADENC,	"cannot serialize " LUA_QS)
++ERRDEF(BUFFER_BADDEC,	"cannot deserialize tag 0x%02x")
++ERRDEF(BUFFER_BADDICTX,	"cannot deserialize dictionary index %d")
++ERRDEF(BUFFER_DEPTH,	"too deep to serialize")
++ERRDEF(BUFFER_DUPKEY,	"duplicate table key")
++ERRDEF(BUFFER_EOB,	"unexpected end of buffer")
++ERRDEF(BUFFER_LEFTOV,	"left-over data in buffer")
++#endif
++
+ #undef ERRDEF
+ 
+ /* Detecting unused error messages:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ff.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ff.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ff.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Fast function IDs.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_FF_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ffrecord.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ffrecord.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ffrecord.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Fast function call recorder.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_ffrecord_c
+@@ -11,6 +11,7 @@
+ #if LJ_HASJIT
+ 
+ #include "lj_err.h"
++#include "lj_buf.h"
+ #include "lj_str.h"
+ #include "lj_tab.h"
+ #include "lj_frame.h"
+@@ -28,6 +29,7 @@
+ #include "lj_vm.h"
+ #include "lj_strscan.h"
+ #include "lj_strfmt.h"
++#include "lj_serialize.h"
+ 
+ /* Some local macros to save typing. Undef'd at the end. */
+ #define IR(ref)			(&J->cur.ir[(ref)])
+@@ -107,6 +109,10 @@ static void recff_stitch(jit_State *J)
+   const BCIns *pc = frame_pc(base-1);
+   TValue *pframe = frame_prevl(base-1);
+ 
++  /* Check for this now. Throwing in lj_record_stop messes up the stack. */
++  if (J->cur.nsnap >= (MSize)J->param[JIT_P_maxsnap])
++    lj_trace_err(J, LJ_TRERR_SNAPOV);
++
+   /* Move func + args up in Lua stack and insert continuation. */
+   memmove(&base[1], &base[-1-LJ_FR2], sizeof(TValue)*nslot);
+   setframe_ftsz(nframe, ((char *)nframe - (char *)pframe) + FRAME_CONT);
+@@ -182,6 +188,14 @@ static TRef recff_bufhdr(jit_State *J)
+ 		lj_ir_kptr(J, &J2G(J)->tmpbuf), IRBUFHDR_RESET);
+ }
+ 
++/* Emit TMPREF. */
++static TRef recff_tmpref(jit_State *J, TRef tr, int mode)
++{
++  if (!LJ_DUALNUM && tref_isinteger(tr))
++    tr = emitir(IRTN(IR_CONV), tr, IRCONV_NUM_INT);
++  return emitir(IRT(IR_TMPREF, IRT_PGC), tr, mode);
++}
++
+ /* -- Base library fast functions ----------------------------------------- */
+ 
+ static void LJ_FASTCALL recff_assert(jit_State *J, RecordFFData *rd)
+@@ -296,7 +310,7 @@ int32_t lj_ffrecord_select_mode(jit_Stat
+     } else {
+       TRef trptr = emitir(IRT(IR_STRREF, IRT_PGC), tr, lj_ir_kint(J, 0));
+       TRef trchar = emitir(IRT(IR_XLOAD, IRT_U8), trptr, IRXLOAD_READONLY);
+-      emitir(IRTG(IR_EQ, IRT_INT), trchar, lj_ir_kint(J, '#'));
++      emitir(IRTGI(IR_EQ), trchar, lj_ir_kint(J, '#'));
+     }
+     return 0;
+   } else {  /* select(n, ...) */
+@@ -317,9 +331,9 @@ static void LJ_FASTCALL recff_select(jit
+       ptrdiff_t n = (ptrdiff_t)J->maxslot;
+       if (start < 0) start += n;
+       else if (start > n) start = n;
+-      rd->nres = n - start;
+       if (start >= 1) {
+ 	ptrdiff_t i;
++	rd->nres = n - start;
+ 	for (i = 0; i < n - start; i++)
+ 	  J->base[i] = J->base[start+i];
+       }  /* else: Interpreter will throw. */
+@@ -455,6 +469,7 @@ static void LJ_FASTCALL recff_pcall(jit_
+ #endif
+     lj_record_call(J, 0, J->maxslot - 1);
+     rd->nres = -1;  /* Pending call. */
++    J->needsnap = 1;  /* Start catching on-trace errors. */
+   }  /* else: Interpreter will throw. */
+ }
+ 
+@@ -490,6 +505,7 @@ static void LJ_FASTCALL recff_xpcall(jit
+     if (errcode)
+       lj_err_throw(J->L, errcode);  /* Propagate errors. */
+     rd->nres = -1;  /* Pending call. */
++    J->needsnap = 1;  /* Start catching on-trace errors. */
+   }  /* else: Interpreter will throw. */
+ }
+ 
+@@ -505,6 +521,40 @@ static void LJ_FASTCALL recff_getfenv(ji
+   recff_nyiu(J, rd);
+ }
+ 
++static void LJ_FASTCALL recff_next(jit_State *J, RecordFFData *rd)
++{
++#if LJ_BE
++  /* YAGNI: Disabled on big-endian due to issues with lj_vm_next,
++  ** IR_HIOP, RID_RETLO/RID_RETHI and ra_destpair.
++  */
++  recff_nyi(J, rd);
++#else
++  TRef tab = J->base[0];
++  if (tref_istab(tab)) {
++    RecordIndex ix;
++    cTValue *keyv;
++    ix.tab = tab;
++    if (tref_isnil(J->base[1])) {  /* Shortcut for start of traversal. */
++      ix.key = lj_ir_kint(J, 0);
++      keyv = niltvg(J2G(J));
++    } else {
++      TRef tmp = recff_tmpref(J, J->base[1], IRTMPREF_IN1);
++      ix.key = lj_ir_call(J, IRCALL_lj_tab_keyindex, tab, tmp);
++      keyv = &rd->argv[1];
++    }
++    copyTV(J->L, &ix.tabv, &rd->argv[0]);
++    ix.keyv.u32.lo = lj_tab_keyindex(tabV(&ix.tabv), keyv);
++    /* Omit the value, if not used by the caller. */
++    ix.idxchain = (J->framedepth && frame_islua(J->L->base-1) &&
++		   bc_b(frame_pc(J->L->base-1)[-1])-1 < 2);
++    ix.mobj = 0;  /* We don't need the next index. */
++    rd->nres = lj_record_next(J, &ix);
++    J->base[0] = ix.key;
++    J->base[1] = ix.val;
++  }  /* else: Interpreter will throw. */
++#endif
++}
++
+ /* -- Math library fast functions ----------------------------------------- */
+ 
+ static void LJ_FASTCALL recff_math_abs(jit_State *J, RecordFFData *rd)
+@@ -588,8 +638,8 @@ static void LJ_FASTCALL recff_math_call(
+ 
+ static void LJ_FASTCALL recff_math_pow(jit_State *J, RecordFFData *rd)
+ {
+-  J->base[0] = lj_opt_narrow_pow(J, J->base[0], J->base[1],
+-				 &rd->argv[0], &rd->argv[1]);
++  J->base[0] = lj_opt_narrow_arith(J, J->base[0], J->base[1],
++				   &rd->argv[0], &rd->argv[1], IR_POW);
+   UNUSED(rd);
+ }
+ 
+@@ -707,7 +757,7 @@ static void LJ_FASTCALL recff_bit_tohex(
+ #if LJ_HASFFI
+   TRef hdr = recff_bufhdr(J);
+   TRef tr = recff_bit64_tohex(J, rd, hdr);
+-  J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++  J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
+ #else
+   recff_nyiu(J, rd);  /* Don't bother working around this NYI. */
+ #endif
+@@ -833,8 +883,8 @@ static void LJ_FASTCALL recff_string_cha
+   if (i > 1) {  /* Concatenate the strings, if there's more than one. */
+     TRef hdr = recff_bufhdr(J), tr = hdr;
+     for (i = 0; J->base[i] != 0; i++)
+-      tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, J->base[i]);
+-    J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++      tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr, J->base[i]);
++    J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
+   } else if (i == 0) {
+     J->base[0] = lj_ir_kstr(J, &J2G(J)->strempty);
+   }
+@@ -852,19 +902,19 @@ static void LJ_FASTCALL recff_string_rep
+     emitir(IRTGI(vrep > 1 ? IR_GT : IR_LE), rep, lj_ir_kint(J, 1));
+     if (vrep > 1) {
+       TRef hdr2 = recff_bufhdr(J);
+-      TRef tr2 = emitir(IRT(IR_BUFPUT, IRT_PGC), hdr2, sep);
+-      tr2 = emitir(IRT(IR_BUFPUT, IRT_PGC), tr2, str);
+-      str2 = emitir(IRT(IR_BUFSTR, IRT_STR), tr2, hdr2);
++      TRef tr2 = emitir(IRTG(IR_BUFPUT, IRT_PGC), hdr2, sep);
++      tr2 = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr2, str);
++      str2 = emitir(IRTG(IR_BUFSTR, IRT_STR), tr2, hdr2);
+     }
+   }
+   tr = hdr = recff_bufhdr(J);
+   if (str2) {
+-    tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, str);
++    tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr, str);
+     str = str2;
+     rep = emitir(IRTI(IR_ADD), rep, lj_ir_kint(J, -1));
+   }
+   tr = lj_ir_call(J, IRCALL_lj_buf_putstr_rep, tr, str, rep);
+-  J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++  J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
+ }
+ 
+ static void LJ_FASTCALL recff_string_op(jit_State *J, RecordFFData *rd)
+@@ -872,7 +922,7 @@ static void LJ_FASTCALL recff_string_op(
+   TRef str = lj_ir_tostr(J, J->base[0]);
+   TRef hdr = recff_bufhdr(J);
+   TRef tr = lj_ir_call(J, rd->data, hdr, str);
+-  J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++  J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
+ }
+ 
+ static void LJ_FASTCALL recff_string_find(jit_State *J, RecordFFData *rd)
+@@ -935,34 +985,40 @@ static void LJ_FASTCALL recff_string_fin
+   }
+ }
+ 
+-static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd)
++static void recff_format(jit_State *J, RecordFFData *rd, TRef hdr, int sbufx)
+ {
+-  TRef trfmt = lj_ir_tostr(J, J->base[0]);
+-  GCstr *fmt = argv2str(J, &rd->argv[0]);
+-  int arg = 1;
+-  TRef hdr, tr;
++  ptrdiff_t arg = sbufx;
++  TRef tr = hdr, trfmt = lj_ir_tostr(J, J->base[arg]);
++  GCstr *fmt = argv2str(J, &rd->argv[arg]);
+   FormatState fs;
+   SFormat sf;
+   /* Specialize to the format string. */
+   emitir(IRTG(IR_EQ, IRT_STR), trfmt, lj_ir_kstr(J, fmt));
+-  tr = hdr = recff_bufhdr(J);
+   lj_strfmt_init(&fs, strdata(fmt), fmt->len);
+   while ((sf = lj_strfmt_parse(&fs)) != STRFMT_EOF) {  /* Parse format. */
+-    TRef tra = sf == STRFMT_LIT ? 0 : J->base[arg++];
++    TRef tra = sf == STRFMT_LIT ? 0 : J->base[++arg];
+     TRef trsf = lj_ir_kint(J, (int32_t)sf);
+     IRCallID id;
+     switch (STRFMT_TYPE(sf)) {
+     case STRFMT_LIT:
+-      tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr,
++      tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr,
+ 		  lj_ir_kstr(J, lj_str_new(J->L, fs.str, fs.len)));
+       break;
+     case STRFMT_INT:
+       id = IRCALL_lj_strfmt_putfnum_int;
+     handle_int:
+-      if (!tref_isinteger(tra))
++      if (!tref_isinteger(tra)) {
++#if LJ_HASFFI
++	if (tref_iscdata(tra)) {
++	  tra = lj_crecord_loadiu64(J, tra, &rd->argv[arg]);
++	  tr = lj_ir_call(J, IRCALL_lj_strfmt_putfxint, tr, trsf, tra);
++	  break;
++	}
++#endif
+ 	goto handle_num;
++      }
+       if (sf == STRFMT_INT) { /* Shortcut for plain %d. */
+-	tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr,
++	tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr,
+ 		    emitir(IRT(IR_TOSTR, IRT_STR), tra, IRTOSTR_INT));
+       } else {
+ #if LJ_HASFFI
+@@ -989,10 +1045,11 @@ static void LJ_FASTCALL recff_string_for
+     case STRFMT_STR:
+       if (!tref_isstr(tra)) {
+ 	recff_nyiu(J, rd);  /* NYI: __tostring and non-string types for %s. */
++	/* NYI: also buffers. */
+ 	return;
+       }
+       if (sf == STRFMT_STR)  /* Shortcut for plain %s. */
+-	tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, tra);
++	tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr, tra);
+       else if ((sf & STRFMT_T_QUOTED))
+ 	tr = lj_ir_call(J, IRCALL_lj_strfmt_putquoted, tr, tra);
+       else
+@@ -1001,7 +1058,7 @@ static void LJ_FASTCALL recff_string_for
+     case STRFMT_CHAR:
+       tra = lj_opt_narrow_toint(J, tra);
+       if (sf == STRFMT_CHAR)  /* Shortcut for plain %c. */
+-	tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr,
++	tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr,
+ 		    emitir(IRT(IR_TOSTR, IRT_STR), tra, IRTOSTR_CHAR));
+       else
+ 	tr = lj_ir_call(J, IRCALL_lj_strfmt_putfchar, tr, trsf, tra);
+@@ -1013,9 +1070,333 @@ static void LJ_FASTCALL recff_string_for
+       return;
+     }
+   }
+-  J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++  if (sbufx) {
++    emitir(IRT(IR_USE, IRT_NIL), tr, 0);
++  } else {
++    J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
++  }
+ }
+ 
++static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd)
++{
++  recff_format(J, rd, recff_bufhdr(J), 0);
++}
++
++/* -- Buffer library fast functions --------------------------------------- */
++
++#if LJ_HASBUFFER
++
++static LJ_AINLINE TRef recff_sbufx_get_L(jit_State *J, TRef ud)
++{
++  return emitir(IRT(IR_FLOAD, IRT_PGC), ud, IRFL_SBUF_L);
++}
++
++static LJ_AINLINE void recff_sbufx_set_L(jit_State *J, TRef ud, TRef val)
++{
++  TRef fref = emitir(IRT(IR_FREF, IRT_PGC), ud, IRFL_SBUF_L);
++  emitir(IRT(IR_FSTORE, IRT_PGC), fref, val);
++}
++
++static LJ_AINLINE TRef recff_sbufx_get_ptr(jit_State *J, TRef ud, IRFieldID fl)
++{
++  return emitir(IRT(IR_FLOAD, IRT_PTR), ud, fl);
++}
++
++static LJ_AINLINE void recff_sbufx_set_ptr(jit_State *J, TRef ud, IRFieldID fl, TRef val)
++{
++  TRef fref = emitir(IRT(IR_FREF, IRT_PTR), ud, fl);
++  emitir(IRT(IR_FSTORE, IRT_PTR), fref, val);
++}
++
++static LJ_AINLINE TRef recff_sbufx_len(jit_State *J, TRef trr, TRef trw)
++{
++  TRef len = emitir(IRT(IR_SUB, IRT_INTP), trw, trr);
++  if (LJ_64)
++    len = emitir(IRTI(IR_CONV), len, (IRT_INT<<5)|IRT_INTP|IRCONV_NONE);
++  return len;
++}
++
++/* Emit typecheck for string buffer. */
++static TRef recff_sbufx_check(jit_State *J, RecordFFData *rd, ptrdiff_t arg)
++{
++  TRef trtype, ud = J->base[arg];
++  if (!tvisbuf(&rd->argv[arg])) lj_trace_err(J, LJ_TRERR_BADTYPE);
++  trtype = emitir(IRT(IR_FLOAD, IRT_U8), ud, IRFL_UDATA_UDTYPE);
++  emitir(IRTGI(IR_EQ), trtype, lj_ir_kint(J, UDTYPE_BUFFER));
++  J->needsnap = 1;
++  return ud;
++}
++
++/* Emit BUFHDR for write to extended string buffer. */
++static TRef recff_sbufx_write(jit_State *J, TRef ud)
++{
++  TRef trbuf = emitir(IRT(IR_ADD, IRT_PGC), ud, lj_ir_kintpgc(J, sizeof(GCudata)));
++  return emitir(IRT(IR_BUFHDR, IRT_PGC), trbuf, IRBUFHDR_WRITE);
++}
++
++/* Check for integer in range for the buffer API. */
++static TRef recff_sbufx_checkint(jit_State *J, RecordFFData *rd, ptrdiff_t arg)
++{
++  TRef tr = J->base[arg];
++  TRef trlim = lj_ir_kint(J, LJ_MAX_BUF);
++  if (tref_isinteger(tr)) {
++    emitir(IRTGI(IR_ULE), tr, trlim);
++  } else if (tref_isnum(tr)) {
++    tr = emitir(IRTI(IR_CONV), tr, IRCONV_INT_NUM|IRCONV_ANY);
++    emitir(IRTGI(IR_ULE), tr, trlim);
++#if LJ_HASFFI
++  } else if (tref_iscdata(tr)) {
++    tr = lj_crecord_loadiu64(J, tr, &rd->argv[arg]);
++    emitir(IRTG(IR_ULE, IRT_U64), tr, lj_ir_kint64(J, LJ_MAX_BUF));
++    tr = emitir(IRTI(IR_CONV), tr, (IRT_INT<<5)|IRT_I64|IRCONV_NONE);
++#else
++    UNUSED(rd);
++#endif
++  } else {
++    lj_trace_err(J, LJ_TRERR_BADTYPE);
++  }
++  return tr;
++}
++
++static void LJ_FASTCALL recff_buffer_method_reset(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  SBufExt *sbx = bufV(&rd->argv[0]);
++  int iscow = (int)sbufiscow(sbx);
++  TRef trl = recff_sbufx_get_L(J, ud);
++  TRef trcow = emitir(IRT(IR_BAND, IRT_IGC), trl, lj_ir_kintpgc(J, SBUF_FLAG_COW));
++  TRef zeropgc = lj_ir_kintpgc(J, 0);
++  emitir(IRTG(iscow ? IR_NE : IR_EQ, IRT_IGC), trcow, zeropgc);
++  if (iscow) {
++    TRef zerop = lj_ir_kintp(J, 0);
++    trl = emitir(IRT(IR_BXOR, IRT_IGC), trl, lj_ir_kintpgc(J, SBUF_FLAG_COW));
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, zerop);
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_E, zerop);
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_B, zerop);
++    recff_sbufx_set_L(J, ud, trl);
++    emitir(IRT(IR_FSTORE, IRT_PGC),
++	   emitir(IRT(IR_FREF, IRT_PGC), ud, IRFL_SBUF_REF), zeropgc);
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, zerop);
++  } else {
++    TRef trb = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_B);
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, trb);
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, trb);
++  }
++}
++
++static void LJ_FASTCALL recff_buffer_method_skip(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trr = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_R);
++  TRef trw = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W);
++  TRef len = recff_sbufx_len(J, trr, trw);
++  TRef trn = recff_sbufx_checkint(J, rd, 1);
++  len = emitir(IRTI(IR_MIN), len, trn);
++  trr = emitir(IRT(IR_ADD, IRT_PTR), trr, len);
++  recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, trr);
++}
++
++static void LJ_FASTCALL recff_buffer_method_set(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  TRef tr = J->base[1];
++  if (tref_isstr(tr)) {
++    TRef trp = emitir(IRT(IR_STRREF, IRT_PGC), tr, lj_ir_kint(J, 0));
++    TRef len = emitir(IRTI(IR_FLOAD), tr, IRFL_STR_LEN);
++    lj_ir_call(J, IRCALL_lj_bufx_set, trbuf, trp, len, tr);
++#if LJ_HASFFI
++  } else if (tref_iscdata(tr)) {
++    TRef trp = lj_crecord_topcvoid(J, tr, &rd->argv[1]);
++    TRef len = recff_sbufx_checkint(J, rd, 2);
++    lj_ir_call(J, IRCALL_lj_bufx_set, trbuf, trp, len, tr);
++#endif
++  }  /* else: Interpreter will throw. */
++}
++
++static void LJ_FASTCALL recff_buffer_method_put(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  TRef tr;
++  ptrdiff_t arg;
++  if (!J->base[1]) return;
++  for (arg = 1; (tr = J->base[arg]); arg++) {
++    if (tref_isudata(tr)) {
++      TRef ud2 = recff_sbufx_check(J, rd, arg);
++      emitir(IRTG(IR_NE, IRT_PGC), ud, ud2);
++    }
++  }
++  for (arg = 1; (tr = J->base[arg]); arg++) {
++    if (tref_isstr(tr)) {
++      trbuf = emitir(IRTG(IR_BUFPUT, IRT_PGC), trbuf, tr);
++    } else if (tref_isnumber(tr)) {
++      trbuf = emitir(IRTG(IR_BUFPUT, IRT_PGC), trbuf,
++		     emitir(IRT(IR_TOSTR, IRT_STR), tr,
++			    tref_isnum(tr) ? IRTOSTR_NUM : IRTOSTR_INT));
++    } else if (tref_isudata(tr)) {
++      TRef trr = recff_sbufx_get_ptr(J, tr, IRFL_SBUF_R);
++      TRef trw = recff_sbufx_get_ptr(J, tr, IRFL_SBUF_W);
++      TRef len = recff_sbufx_len(J, trr, trw);
++      trbuf = lj_ir_call(J, IRCALL_lj_buf_putmem, trbuf, trr, len);
++    } else {
++      recff_nyiu(J, rd);
++    }
++  }
++  emitir(IRT(IR_USE, IRT_NIL), trbuf, 0);
++}
++
++static void LJ_FASTCALL recff_buffer_method_putf(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  recff_format(J, rd, trbuf, 1);
++}
++
++static void LJ_FASTCALL recff_buffer_method_get(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trr = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_R);
++  TRef trw = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W);
++  TRef tr;
++  ptrdiff_t arg;
++  if (!J->base[1]) { J->base[1] = TREF_NIL; J->base[2] = 0; }
++  for (arg = 0; (tr = J->base[arg+1]); arg++) {
++    if (!tref_isnil(tr)) {
++      J->base[arg+1] = recff_sbufx_checkint(J, rd, arg+1);
++    }
++  }
++  for (arg = 0; (tr = J->base[arg+1]); arg++) {
++    TRef len = recff_sbufx_len(J, trr, trw);
++    if (tref_isnil(tr)) {
++      J->base[arg] = emitir(IRT(IR_XSNEW, IRT_STR), trr, len);
++      trr = trw;
++    } else {
++      TRef tru;
++      len = emitir(IRTI(IR_MIN), len, tr);
++      tru = emitir(IRT(IR_ADD, IRT_PTR), trr, len);
++      J->base[arg] = emitir(IRT(IR_XSNEW, IRT_STR), trr, len);
++      trr = tru;  /* Doing the ADD before the SNEW generates better code. */
++    }
++    recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, trr);
++  }
++  rd->nres = arg;
++}
++
++static void LJ_FASTCALL recff_buffer_method___tostring(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trr = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_R);
++  TRef trw = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W);
++  J->base[0] = emitir(IRT(IR_XSNEW, IRT_STR), trr, recff_sbufx_len(J, trr, trw));
++}
++
++static void LJ_FASTCALL recff_buffer_method___len(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trr = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_R);
++  TRef trw = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W);
++  J->base[0] = recff_sbufx_len(J, trr, trw);
++}
++
++#if LJ_HASFFI
++static void LJ_FASTCALL recff_buffer_method_putcdata(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  TRef tr = lj_crecord_topcvoid(J, J->base[1], &rd->argv[1]);
++  TRef len = recff_sbufx_checkint(J, rd, 2);
++  trbuf = lj_ir_call(J, IRCALL_lj_buf_putmem, trbuf, tr, len);
++  emitir(IRT(IR_USE, IRT_NIL), trbuf, 0);
++}
++
++static void LJ_FASTCALL recff_buffer_method_reserve(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  TRef trsz = recff_sbufx_checkint(J, rd, 1);
++  J->base[1] = lj_ir_call(J, IRCALL_lj_bufx_more, trbuf, trsz);
++  J->base[0] = lj_crecord_topuint8(J, recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W));
++  rd->nres = 2;
++}
++
++static void LJ_FASTCALL recff_buffer_method_commit(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef len = recff_sbufx_checkint(J, rd, 1);
++  TRef trw = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W);
++  TRef tre = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_E);
++  TRef left = emitir(IRT(IR_SUB, IRT_INTP), tre, trw);
++  if (LJ_64)
++    left = emitir(IRTI(IR_CONV), left, (IRT_INT<<5)|IRT_INTP|IRCONV_NONE);
++  emitir(IRTGI(IR_ULE), len, left);
++  trw = emitir(IRT(IR_ADD, IRT_PTR), trw, len);
++  recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, trw);
++}
++
++static void LJ_FASTCALL recff_buffer_method_ref(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trr = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_R);
++  TRef trw = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_W);
++  J->base[0] = lj_crecord_topuint8(J, trr);
++  J->base[1] = recff_sbufx_len(J, trr, trw);
++  rd->nres = 2;
++}
++#endif
++
++static void LJ_FASTCALL recff_buffer_method_encode(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  TRef tmp = recff_tmpref(J, J->base[1], IRTMPREF_IN1);
++  lj_ir_call(J, IRCALL_lj_serialize_put, trbuf, tmp);
++  /* No IR_USE needed, since the call is a store. */
++}
++
++static void LJ_FASTCALL recff_buffer_method_decode(jit_State *J, RecordFFData *rd)
++{
++  TRef ud = recff_sbufx_check(J, rd, 0);
++  TRef trbuf = recff_sbufx_write(J, ud);
++  TRef tmp = recff_tmpref(J, TREF_NIL, IRTMPREF_OUT1);
++  TRef trr = lj_ir_call(J, IRCALL_lj_serialize_get, trbuf, tmp);
++  IRType t = (IRType)lj_serialize_peektype(bufV(&rd->argv[0]));
++  /* No IR_USE needed, since the call is a store. */
++  J->base[0] = lj_record_vload(J, tmp, 0, t);
++  /* The sbx->r store must be after the VLOAD type check, in case it fails. */
++  recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, trr);
++}
++
++static void LJ_FASTCALL recff_buffer_encode(jit_State *J, RecordFFData *rd)
++{
++  TRef tmp = recff_tmpref(J, J->base[0], IRTMPREF_IN1);
++  J->base[0] = lj_ir_call(J, IRCALL_lj_serialize_encode, tmp);
++  /* IR_USE needed for IR_CALLA, because the encoder may throw non-OOM. */
++  emitir(IRT(IR_USE, IRT_NIL), J->base[0], 0);
++  UNUSED(rd);
++}
++
++static void LJ_FASTCALL recff_buffer_decode(jit_State *J, RecordFFData *rd)
++{
++  if (tvisstr(&rd->argv[0])) {
++    GCstr *str = strV(&rd->argv[0]);
++    SBufExt sbx;
++    IRType t;
++    TRef tmp = recff_tmpref(J, TREF_NIL, IRTMPREF_OUT1);
++    TRef tr = lj_ir_call(J, IRCALL_lj_serialize_decode, tmp, J->base[0]);
++    /* IR_USE needed for IR_CALLA, because the decoder may throw non-OOM.
++    ** That's why IRCALL_lj_serialize_decode needs a fake INT result.
++    */
++    emitir(IRT(IR_USE, IRT_NIL), tr, 0);
++    memset(&sbx, 0, sizeof(SBufExt));
++    lj_bufx_set_cow(J->L, &sbx, strdata(str), str->len);
++    t = (IRType)lj_serialize_peektype(&sbx);
++    J->base[0] = lj_record_vload(J, tmp, 0, t);
++  }  /* else: Interpreter will throw. */
++}
++
++#endif
++
+ /* -- Table library fast functions ---------------------------------------- */
+ 
+ static void LJ_FASTCALL recff_table_insert(jit_State *J, RecordFFData *rd)
+@@ -1054,7 +1435,7 @@ static void LJ_FASTCALL recff_table_conc
+     TRef hdr = recff_bufhdr(J);
+     TRef tr = lj_ir_call(J, IRCALL_lj_buf_puttab, hdr, tab, sep, tri, tre);
+     emitir(IRTG(IR_NE, IRT_PTR), tr, lj_ir_kptr(J, NULL));
+-    J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++    J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
+   }  /* else: Interpreter will throw. */
+   UNUSED(rd);
+ }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ffrecord.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ffrecord.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ffrecord.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Fast function call recorder.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_FFRECORD_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_frame.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_frame.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_frame.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Stack frames.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_FRAME_H
+@@ -192,12 +192,12 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CAL
+ #endif
+ #define CFRAME_SHIFT_MULTRES	3
+ #elif LJ_TARGET_ARM64
+-#define CFRAME_OFS_ERRF		196
+-#define CFRAME_OFS_NRES		200
+-#define CFRAME_OFS_PREV		160
+-#define CFRAME_OFS_L		176
+-#define CFRAME_OFS_PC		168
+-#define CFRAME_OFS_MULTRES	192
++#define CFRAME_OFS_ERRF		36
++#define CFRAME_OFS_NRES		40
++#define CFRAME_OFS_PREV		0
++#define CFRAME_OFS_L		16
++#define CFRAME_OFS_PC		8
++#define CFRAME_OFS_MULTRES	32
+ #define CFRAME_SIZE		208
+ #define CFRAME_SHIFT_MULTRES	3
+ #elif LJ_TARGET_PPC
+@@ -264,6 +264,15 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CAL
+ #endif
+ #define CFRAME_OFS_MULTRES	0
+ #define CFRAME_SHIFT_MULTRES	3
++#elif LJ_TARGET_RISCV64
++#define CFRAME_OFS_ERRF		252
++#define CFRAME_OFS_NRES		248
++#define CFRAME_OFS_PREV		240
++#define CFRAME_OFS_L		232
++#define CFRAME_OFS_PC		224
++#define CFRAME_OFS_MULTRES	0
++#define CFRAME_SIZE		256
++#define CFRAME_SHIFT_MULTRES	3
+ #else
+ #error "Missing CFRAME_* definitions for this architecture"
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_func.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_func.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_func.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Function handling (prototypes, functions and upvalues).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_func.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_func.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_func.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Function handling (prototypes, functions and upvalues).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_FUNC_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gc.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_gc.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gc.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Garbage collector.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -27,6 +27,7 @@
+ #include "lj_trace.h"
+ #include "lj_dispatch.h"
+ #include "lj_vm.h"
++#include "lj_vmevent.h"
+ 
+ #define GCSTEPSIZE	1024u
+ #define GCSWEEPMAX	40
+@@ -65,6 +66,15 @@ static void gc_mark(global_State *g, GCo
+     gray2black(o);  /* Userdata are never gray. */
+     if (mt) gc_markobj(g, mt);
+     gc_markobj(g, tabref(gco2ud(o)->env));
++    if (LJ_HASBUFFER && gco2ud(o)->udtype == UDTYPE_BUFFER) {
++      SBufExt *sbx = (SBufExt *)uddata(gco2ud(o));
++      if (sbufiscow(sbx) && gcref(sbx->cowref))
++	gc_markobj(g, gcref(sbx->cowref));
++      if (gcref(sbx->dict_str))
++	gc_markobj(g, gcref(sbx->dict_str));
++      if (gcref(sbx->dict_mt))
++	gc_markobj(g, gcref(sbx->dict_mt));
++    }
+   } else if (LJ_UNLIKELY(gct == ~LJ_TUPVAL)) {
+     GCupval *uv = gco2uv(o);
+     gc_marktv(g, uvval(uv));
+@@ -512,8 +522,13 @@ static void gc_call_finalizer(global_Sta
+   hook_restore(g, oldh);
+   if (LJ_HASPROFILE && (oldh & HOOK_PROFILE)) lj_dispatch_update(g);
+   g->gc.threshold = oldt;  /* Restore GC threshold. */
+-  if (errcode)
+-    lj_err_throw(L, errcode);  /* Propagate errors. */
++  if (errcode) {
++    ptrdiff_t errobj = savestack(L, L->top-1);  /* Stack may be resized. */
++    lj_vmevent_send(L, ERRFIN,
++      copyTV(L, L->top++, restorestack(L, errobj));
++    );
++    L->top--;
++  }
+ }
+ 
+ /* Finalize one userdata or cdata object from the mmudata list. */
+@@ -691,9 +706,12 @@ static size_t gc_onestep(lua_State *L)
+     }
+   case GCSfinalize:
+     if (gcref(g->gc.mmudata) != NULL) {
++      GCSize old = g->gc.total;
+       if (tvref(g->jit_base))  /* Don't call finalizers on trace. */
+ 	return LJ_MAX_MEM;
+       gc_finalize(L);  /* Finalize one userdata object. */
++      if (old >= g->gc.total && g->gc.estimate > old - g->gc.total)
++	g->gc.estimate -= old - g->gc.total;
+       if (g->gc.estimate > GCFINALIZECOST)
+ 	g->gc.estimate -= GCFINALIZECOST;
+       return GCFINALIZECOST;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_gc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gc.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Garbage collector.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_GC_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gdbjit.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_gdbjit.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gdbjit.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Client for the GDB JIT API.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_gdbjit_c
+@@ -306,6 +306,9 @@ enum {
+ #elif LJ_TARGET_MIPS
+   DW_REG_SP = 29,
+   DW_REG_RA = 31,
++#elif LJ_TARGET_RISCV64
++  DW_REG_SP = 2,
++  DW_REG_RA = 1,
+ #else
+ #error "Unsupported target architecture"
+ #endif
+@@ -383,6 +386,8 @@ static const ELFheader elfhdr_template =
+   .machine = 20,
+ #elif LJ_TARGET_MIPS
+   .machine = 8,
++#elif LJ_TARGET_RISCV64
++  .machine = 243,
+ #else
+ #error "Unsupported target architecture"
+ #endif
+@@ -591,6 +596,16 @@ static void LJ_FASTCALL gdbjit_ehframe(G
+       for (i = 23; i >= 16; i--) { DB(DW_CFA_offset|i); DUV(26-i); }
+       for (i = 30; i >= 20; i -= 2) { DB(DW_CFA_offset|32|i); DUV(42-i); }
+     }
++#elif LJ_TARGET_RISCV64
++    {
++      int i;
++      for (i = 27; i >= 18; i--) { DB(DW_CFA_offset|i); DUV(27-i+7); }
++      DB(DW_CFA_offset|9); DUV(17);
++      DB(DW_CFA_offset|8); DUV(18);
++      for (i = 27; i >= 18; i--) { DB(DW_CFA_offset|32|i); DUV(27-i+19); }
++      DB(DW_CFA_offset|32|9); DUV(29);
++      DB(DW_CFA_offset|32|8); DUV(30);
++    }
+ #else
+ #error "Unsupported target architecture"
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gdbjit.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_gdbjit.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_gdbjit.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Client for the GDB JIT API.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_GDBJIT_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ir.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ir.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ir.c
+@@ -1,6 +1,6 @@
+ /*
+ ** SSA IR (Intermediate Representation) emitter.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_ir_c
+@@ -30,6 +30,7 @@
+ #endif
+ #include "lj_vm.h"
+ #include "lj_strscan.h"
++#include "lj_serialize.h"
+ #include "lj_strfmt.h"
+ #include "lj_prng.h"
+ 
+@@ -147,7 +148,7 @@ TRef lj_ir_call(jit_State *J, IRCallID i
+ }
+ 
+ /* Load field of type t from GG_State + offset. Must be 32 bit aligned. */
+-LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs)
++TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs)
+ {
+   lj_assertJ((ofs & 3) == 0, "unaligned GG_State field offset");
+   ofs >>= 2;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ir.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ir.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ir.h
+@@ -1,6 +1,6 @@
+ /*
+ ** SSA IR (Intermediate Representation) format.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_IR_H
+@@ -76,8 +76,8 @@
+   \
+   _(ABS,	N , ref, ref) \
+   _(LDEXP,	N , ref, ref) \
+-  _(MIN,	C , ref, ref) \
+-  _(MAX,	C , ref, ref) \
++  _(MIN,	N , ref, ref) \
++  _(MAX,	N , ref, ref) \
+   _(FPMATH,	N , ref, lit) \
+   \
+   /* Overflow-checking arithmetic ops. */ \
+@@ -95,6 +95,7 @@
+   _(UREFO,	LW, ref, lit) \
+   _(UREFC,	LW, ref, lit) \
+   _(FREF,	R , ref, lit) \
++  _(TMPREF,	S , ref, lit) \
+   _(STRREF,	N , ref, ref) \
+   _(LREF,	L , ___, ___) \
+   \
+@@ -105,7 +106,7 @@
+   _(FLOAD,	L , ref, lit) \
+   _(XLOAD,	L , ref, lit) \
+   _(SLOAD,	L , lit, lit) \
+-  _(VLOAD,	L , ref, ___) \
++  _(VLOAD,	L , ref, lit) \
+   _(ALEN,	L , ref, ref) \
+   \
+   _(ASTORE,	S , ref, ref) \
+@@ -124,8 +125,8 @@
+   \
+   /* Buffer operations. */ \
+   _(BUFHDR,	L , ref, lit) \
+-  _(BUFPUT,	L , ref, ref) \
+-  _(BUFSTR,	A , ref, ref) \
++  _(BUFPUT,	LW, ref, ref) \
++  _(BUFSTR,	AW, ref, ref) \
+   \
+   /* Barriers. */ \
+   _(TBAR,	S , ref, ___) \
+@@ -139,9 +140,9 @@
+   _(STRTO,	N , ref, ___) \
+   \
+   /* Calls. */ \
+-  _(CALLN,	N , ref, lit) \
+-  _(CALLA,	A , ref, lit) \
+-  _(CALLL,	L , ref, lit) \
++  _(CALLN,	NW, ref, lit) \
++  _(CALLA,	AW, ref, lit) \
++  _(CALLL,	LW, ref, lit) \
+   _(CALLS,	S , ref, lit) \
+   _(CALLXS,	S , ref, ref) \
+   _(CARG,	N , ref, ref) \
+@@ -204,9 +205,15 @@ IRFPMDEF(FPMENUM)
+   _(UDATA_META,	offsetof(GCudata, metatable)) \
+   _(UDATA_UDTYPE, offsetof(GCudata, udtype)) \
+   _(UDATA_FILE,	sizeof(GCudata)) \
++  _(SBUF_W,	sizeof(GCudata) + offsetof(SBufExt, w)) \
++  _(SBUF_E,	sizeof(GCudata) + offsetof(SBufExt, e)) \
++  _(SBUF_B,	sizeof(GCudata) + offsetof(SBufExt, b)) \
++  _(SBUF_L,	sizeof(GCudata) + offsetof(SBufExt, L)) \
++  _(SBUF_REF,	sizeof(GCudata) + offsetof(SBufExt, cowref)) \
++  _(SBUF_R,	sizeof(GCudata) + offsetof(SBufExt, r)) \
+   _(CDATA_CTYPEID, offsetof(GCcdata, ctypeid)) \
+   _(CDATA_PTR,	sizeof(GCcdata)) \
+-  _(CDATA_INT, sizeof(GCcdata)) \
++  _(CDATA_INT,	sizeof(GCcdata)) \
+   _(CDATA_INT64, sizeof(GCcdata)) \
+   _(CDATA_INT64_4, sizeof(GCcdata) + 4)
+ 
+@@ -217,6 +224,11 @@ IRFLDEF(FLENUM)
+   IRFL__MAX
+ } IRFieldID;
+ 
++/* TMPREF mode bits, stored in op2. */
++#define IRTMPREF_IN1		0x01	/* First input value. */
++#define IRTMPREF_OUT1		0x02	/* First output value. */
++#define IRTMPREF_OUT2		0x04	/* Second output value. */
++
+ /* SLOAD mode bits, stored in op2. */
+ #define IRSLOAD_PARENT		0x01	/* Coalesce with parent trace. */
+ #define IRSLOAD_FRAME		0x02	/* Load 32 bits of ftsz. */
+@@ -224,15 +236,17 @@ IRFLDEF(FLENUM)
+ #define IRSLOAD_CONVERT		0x08	/* Number to integer conversion. */
+ #define IRSLOAD_READONLY	0x10	/* Read-only, omit slot store. */
+ #define IRSLOAD_INHERIT		0x20	/* Inherited by exits/side traces. */
++#define IRSLOAD_KEYINDEX	0x40	/* Table traversal key index. */
+ 
+-/* XLOAD mode, stored in op2. */
+-#define IRXLOAD_READONLY	1	/* Load from read-only data. */
+-#define IRXLOAD_VOLATILE	2	/* Load from volatile data. */
+-#define IRXLOAD_UNALIGNED	4	/* Unaligned load. */
++/* XLOAD mode bits, stored in op2. */
++#define IRXLOAD_READONLY	0x01	/* Load from read-only data. */
++#define IRXLOAD_VOLATILE	0x02	/* Load from volatile data. */
++#define IRXLOAD_UNALIGNED	0x04	/* Unaligned load. */
+ 
+ /* BUFHDR mode, stored in op2. */
+ #define IRBUFHDR_RESET		0	/* Reset buffer. */
+ #define IRBUFHDR_APPEND		1	/* Append to buffer. */
++#define IRBUFHDR_WRITE		2	/* Write to string buffer. */
+ 
+ /* CONV mode, stored in op2. */
+ #define IRCONV_SRCMASK		0x001f	/* Source IRType. */
+@@ -249,6 +263,7 @@ IRFLDEF(FLENUM)
+ #define IRCONV_ANY    (1<<IRCONV_CSH)	/* Any FP number is ok. */
+ #define IRCONV_INDEX  (2<<IRCONV_CSH)	/* Check + special backprop rules. */
+ #define IRCONV_CHECK  (3<<IRCONV_CSH)	/* Number checked for integerness. */
++#define IRCONV_NONE   IRCONV_ANY	/* INT|*64 no conv, but change type. */
+ 
+ /* TOSTR mode, stored in op2. */
+ #define IRTOSTR_INT		0	/* Convert integer to string. */
+@@ -481,6 +496,7 @@ typedef uint32_t TRef;
+ #define TREF_REFMASK		0x0000ffff
+ #define TREF_FRAME		0x00010000
+ #define TREF_CONT		0x00020000
++#define TREF_KEYINDEX		0x00100000
+ 
+ #define TREF(ref, t)		((TRef)((ref) + ((t)<<24)))
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ircall.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_ircall.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_ircall.h
+@@ -1,6 +1,6 @@
+ /*
+ ** IR CALL* instruction definitions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_IRCALL_H
+@@ -30,10 +30,12 @@ typedef struct CCallInfo {
+ #define CCI_CALL_L		(IR_CALLL << CCI_OPSHIFT)
+ #define CCI_CALL_S		(IR_CALLS << CCI_OPSHIFT)
+ #define CCI_CALL_FN		(CCI_CALL_N|CCI_CC_FASTCALL)
++#define CCI_CALL_FA		(CCI_CALL_A|CCI_CC_FASTCALL)
+ #define CCI_CALL_FL		(CCI_CALL_L|CCI_CC_FASTCALL)
+ #define CCI_CALL_FS		(CCI_CALL_S|CCI_CC_FASTCALL)
+ 
+ /* C call info flags. */
++#define CCI_T			(IRT_GUARD << CCI_OTSHIFT)  /* May throw. */
+ #define CCI_L			0x0100	/* Implicit L arg. */
+ #define CCI_CASTU64		0x0200	/* Cast u64 result to number. */
+ #define CCI_NOFPRCLOBBER	0x0400	/* Does not clobber any FPRs. */
+@@ -61,7 +63,7 @@ typedef struct CCallInfo {
+ /* Helpers for conditional function definitions. */
+ #define IRCALLCOND_ANY(x)		x
+ 
+-#if LJ_TARGET_X86ORX64
++#if LJ_TARGET_X86ORX64 || LJ_TARGET_ARM64
+ #define IRCALLCOND_FPMATH(x)		NULL
+ #else
+ #define IRCALLCOND_FPMATH(x)		x
+@@ -111,6 +113,18 @@ typedef struct CCallInfo {
+ #define IRCALLCOND_FFI32(x)		NULL
+ #endif
+ 
++#if LJ_HASBUFFER
++#define IRCALLCOND_BUFFER(x)		x
++#else
++#define IRCALLCOND_BUFFER(x)		NULL
++#endif
++
++#if LJ_HASBUFFER && LJ_HASFFI
++#define IRCALLCOND_BUFFFI(x)		x
++#else
++#define IRCALLCOND_BUFFFI(x)		NULL
++#endif
++
+ #if LJ_SOFTFP
+ #define XA_FP		CCI_XA
+ #define XA2_FP		(CCI_XA+CCI_XA)
+@@ -139,39 +153,47 @@ typedef struct CCallInfo {
+ #define IRCALLDEF(_) \
+   _(ANY,	lj_str_cmp,		2,  FN, INT, CCI_NOFPRCLOBBER) \
+   _(ANY,	lj_str_find,		4,   N, PGC, 0) \
+-  _(ANY,	lj_str_new,		3,   S, STR, CCI_L) \
++  _(ANY,	lj_str_new,		3,   S, STR, CCI_L|CCI_T) \
+   _(ANY,	lj_strscan_num,		2,  FN, INT, 0) \
+-  _(ANY,	lj_strfmt_int,		2,  FN, STR, CCI_L) \
+-  _(ANY,	lj_strfmt_num,		2,  FN, STR, CCI_L) \
+-  _(ANY,	lj_strfmt_char,		2,  FN, STR, CCI_L) \
+-  _(ANY,	lj_strfmt_putint,	2,  FL, PGC, 0) \
+-  _(ANY,	lj_strfmt_putnum,	2,  FL, PGC, 0) \
+-  _(ANY,	lj_strfmt_putquoted,	2,  FL, PGC, 0) \
+-  _(ANY,	lj_strfmt_putfxint,	3,   L, PGC, XA_64) \
+-  _(ANY,	lj_strfmt_putfnum_int,	3,   L, PGC, XA_FP) \
+-  _(ANY,	lj_strfmt_putfnum_uint,	3,   L, PGC, XA_FP) \
+-  _(ANY,	lj_strfmt_putfnum,	3,   L, PGC, XA_FP) \
+-  _(ANY,	lj_strfmt_putfstr,	3,   L, PGC, 0) \
+-  _(ANY,	lj_strfmt_putfchar,	3,   L, PGC, 0) \
+-  _(ANY,	lj_buf_putmem,		3,   S, PGC, 0) \
+-  _(ANY,	lj_buf_putstr,		2,  FL, PGC, 0) \
+-  _(ANY,	lj_buf_putchar,		2,  FL, PGC, 0) \
+-  _(ANY,	lj_buf_putstr_reverse,	2,  FL, PGC, 0) \
+-  _(ANY,	lj_buf_putstr_lower,	2,  FL, PGC, 0) \
+-  _(ANY,	lj_buf_putstr_upper,	2,  FL, PGC, 0) \
+-  _(ANY,	lj_buf_putstr_rep,	3,   L, PGC, 0) \
+-  _(ANY,	lj_buf_puttab,		5,   L, PGC, 0) \
+-  _(ANY,	lj_buf_tostr,		1,  FL, STR, 0) \
+-  _(ANY,	lj_tab_new_ah,		3,   A, TAB, CCI_L) \
+-  _(ANY,	lj_tab_new1,		2,  FS, TAB, CCI_L) \
+-  _(ANY,	lj_tab_dup,		2,  FS, TAB, CCI_L) \
++  _(ANY,	lj_strfmt_int,		2,  FN, STR, CCI_L|CCI_T) \
++  _(ANY,	lj_strfmt_num,		2,  FN, STR, CCI_L|CCI_T) \
++  _(ANY,	lj_strfmt_char,		2,  FN, STR, CCI_L|CCI_T) \
++  _(ANY,	lj_strfmt_putint,	2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_strfmt_putnum,	2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_strfmt_putquoted,	2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_strfmt_putfxint,	3,   L, PGC, XA_64|CCI_T) \
++  _(ANY,	lj_strfmt_putfnum_int,	3,   L, PGC, XA_FP|CCI_T) \
++  _(ANY,	lj_strfmt_putfnum_uint,	3,   L, PGC, XA_FP|CCI_T) \
++  _(ANY,	lj_strfmt_putfnum,	3,   L, PGC, XA_FP|CCI_T) \
++  _(ANY,	lj_strfmt_putfstr,	3,   L, PGC, CCI_T) \
++  _(ANY,	lj_strfmt_putfchar,	3,   L, PGC, CCI_T) \
++  _(ANY,	lj_buf_putmem,		3,   S, PGC, CCI_T) \
++  _(ANY,	lj_buf_putstr,		2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_buf_putchar,		2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_buf_putstr_reverse,	2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_buf_putstr_lower,	2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_buf_putstr_upper,	2,  FL, PGC, CCI_T) \
++  _(ANY,	lj_buf_putstr_rep,	3,   L, PGC, CCI_T) \
++  _(ANY,	lj_buf_puttab,		5,   L, PGC, CCI_T) \
++  _(BUFFER,	lj_bufx_set,		4,   S, NIL, 0) \
++  _(BUFFFI,	lj_bufx_more,		2,  FS, INT, CCI_T) \
++  _(BUFFER,	lj_serialize_put,	2,  FS, PGC, CCI_T) \
++  _(BUFFER,	lj_serialize_get,	2,  FS, PTR, CCI_T) \
++  _(BUFFER,	lj_serialize_encode,	2,  FA, STR, CCI_L|CCI_T) \
++  _(BUFFER,	lj_serialize_decode,	3,   A, INT, CCI_L|CCI_T) \
++  _(ANY,	lj_buf_tostr,		1,  FL, STR, CCI_T) \
++  _(ANY,	lj_tab_new_ah,		3,   A, TAB, CCI_L|CCI_T) \
++  _(ANY,	lj_tab_new1,		2,  FA, TAB, CCI_L|CCI_T) \
++  _(ANY,	lj_tab_dup,		2,  FA, TAB, CCI_L|CCI_T) \
+   _(ANY,	lj_tab_clear,		1,  FS, NIL, 0) \
+-  _(ANY,	lj_tab_newkey,		3,   S, PGC, CCI_L) \
++  _(ANY,	lj_tab_newkey,		3,   S, PGC, CCI_L|CCI_T) \
++  _(ANY,	lj_tab_keyindex,	2,  FL, INT, 0) \
++  _(ANY,	lj_vm_next,		2,  FL, PTR, 0) \
+   _(ANY,	lj_tab_len,		1,  FL, INT, 0) \
+   _(ANY,	lj_tab_len_hint,	2,  FL, INT, 0) \
+   _(ANY,	lj_gc_step_jit,		2,  FS, NIL, CCI_L) \
+   _(ANY,	lj_gc_barrieruv,	2,  FS, NIL, 0) \
+-  _(ANY,	lj_mem_newgco,		2,  FS, PGC, CCI_L) \
++  _(ANY,	lj_mem_newgco,		2,  FA, PGC, CCI_L|CCI_T) \
+   _(ANY,	lj_prng_u64d,		1,  FS, NUM, CCI_CASTU64) \
+   _(ANY,	lj_vm_modi,		2,  FN, INT, 0) \
+   _(ANY,	log10,			1,   N, NUM, XA_FP) \
+@@ -195,7 +217,6 @@ typedef struct CCallInfo {
+   _(FPMATH,	sqrt,			1,   N, NUM, XA_FP) \
+   _(ANY,	log,			1,   N, NUM, XA_FP) \
+   _(ANY,	lj_vm_log2,		1,   N, NUM, XA_FP) \
+-  _(ANY,	lj_vm_powi,		2,   N, NUM, XA_FP) \
+   _(ANY,	pow,			2,   N, NUM, XA2_FP) \
+   _(ANY,	atan2,			2,   N, NUM, XA2_FP) \
+   _(ANY,	ldexp,			2,   N, NUM, XA_FP) \
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_iropt.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_iropt.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_iropt.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Common header for IR emitter and optimizations.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_IROPT_H
+@@ -56,6 +56,12 @@ LJ_FUNC TRef lj_ir_ktrace(jit_State *J);
+ #define lj_ir_kintp(J, k)	lj_ir_kint(J, (int32_t)(k))
+ #endif
+ 
++#if LJ_GC64
++#define lj_ir_kintpgc		lj_ir_kintp
++#else
++#define lj_ir_kintpgc		lj_ir_kint
++#endif
++
+ static LJ_AINLINE TRef lj_ir_knum(jit_State *J, lua_Number n)
+ {
+   TValue tv;
+@@ -124,6 +130,7 @@ LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_alen
+ LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_hrefk(jit_State *J);
+ LJ_FUNC int LJ_FASTCALL lj_opt_fwd_href_nokey(jit_State *J);
+ LJ_FUNC int LJ_FASTCALL lj_opt_fwd_tptr(jit_State *J, IRRef lim);
++LJ_FUNC int LJ_FASTCALL lj_opt_fwd_sbuf(jit_State *J, IRRef lim);
+ LJ_FUNC int lj_opt_fwd_wasnonnil(jit_State *J, IROpT loadop, IRRef xref);
+ 
+ /* Dead-store elimination. */
+@@ -144,7 +151,6 @@ LJ_FUNC TRef lj_opt_narrow_arith(jit_Sta
+ 				 TValue *vb, TValue *vc, IROp op);
+ LJ_FUNC TRef lj_opt_narrow_unm(jit_State *J, TRef rc, TValue *vc);
+ LJ_FUNC TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc);
+-LJ_FUNC TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc);
+ LJ_FUNC IRType lj_opt_narrow_forl(jit_State *J, cTValue *forbase);
+ 
+ /* Optimization passes. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_jit.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_jit.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_jit.h
+@@ -1,12 +1,13 @@
+ /*
+ ** Common definitions for the JIT compiler.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_JIT_H
+ #define _LJ_JIT_H
+ 
+ #include "lj_obj.h"
++#if LJ_HASJIT
+ #include "lj_ir.h"
+ 
+ /* -- JIT engine flags ---------------------------------------------------- */
+@@ -66,6 +67,15 @@
+ #endif
+ #endif
+ 
++#elif LJ_TARGET_RISCV64
++
++#define JIT_F_RVC		(JIT_F_CPU << 0)
++#define JIT_F_RVZba		(JIT_F_CPU << 1)
++#define JIT_F_RVZbb		(JIT_F_CPU << 2)
++#define JIT_F_RVXThead		(JIT_F_CPU << 3)
++
++#define JIT_F_CPUSTRING		"\003RVC\003Zba\003Zbb\006XThead"
++
+ #else
+ 
+ #define JIT_F_CPUSTRING		""
+@@ -86,10 +96,11 @@
+ #define JIT_F_OPT_ABC		(JIT_F_OPT << 7)
+ #define JIT_F_OPT_SINK		(JIT_F_OPT << 8)
+ #define JIT_F_OPT_FUSE		(JIT_F_OPT << 9)
++#define JIT_F_OPT_FMA		(JIT_F_OPT << 10)
+ 
+ /* Optimizations names for -O. Must match the order above. */
+ #define JIT_F_OPTSTRING	\
+-  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
++  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma"
+ 
+ /* Optimization levels set a fixed combination of flags. */
+ #define JIT_F_OPT_0	0
+@@ -98,11 +109,12 @@
+ #define JIT_F_OPT_3	(JIT_F_OPT_2|\
+   JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
+ #define JIT_F_OPT_DEFAULT	JIT_F_OPT_3
++/* Note: FMA is not set by default. */
+ 
+ /* -- JIT engine parameters ----------------------------------------------- */
+ 
+ #if LJ_TARGET_WINDOWS || LJ_64
+-/* See: http://blogs.msdn.com/oldnewthing/archive/2003/10/08/55239.aspx */
++/* See: https://devblogs.microsoft.com/oldnewthing/20031008-00/?p=42223 */
+ #define JIT_P_sizemcode_DEFAULT		64
+ #else
+ /* Could go as low as 4K, but the mmap() overhead would be rather high. */
+@@ -150,6 +162,7 @@ typedef enum {
+   LJ_TRACE_IDLE,	/* Trace compiler idle. */
+   LJ_TRACE_ACTIVE = 0x10,
+   LJ_TRACE_RECORD,	/* Bytecode recording active. */
++  LJ_TRACE_RECORD_1ST,	/* Record 1st instruction, too. */
+   LJ_TRACE_START,	/* New trace started. */
+   LJ_TRACE_END,		/* End of trace. */
+   LJ_TRACE_ASM,		/* Assemble trace. */
+@@ -184,6 +197,7 @@ typedef struct MCLink {
+ typedef struct SnapShot {
+   uint32_t mapofs;	/* Offset into snapshot map. */
+   IRRef1 ref;		/* First IR ref for this snapshot. */
++  uint16_t mcofs;	/* Offset into machine code in MCode units. */
+   uint8_t nslots;	/* Number of valid slots. */
+   uint8_t topslot;	/* Maximum frame extent. */
+   uint8_t nent;		/* Number of compressed entries. */
+@@ -199,12 +213,15 @@ typedef uint32_t SnapEntry;
+ #define SNAP_CONT		0x020000	/* Continuation slot. */
+ #define SNAP_NORESTORE		0x040000	/* No need to restore slot. */
+ #define SNAP_SOFTFPNUM		0x080000	/* Soft-float number. */
++#define SNAP_KEYINDEX		0x100000	/* Traversal key index. */
+ LJ_STATIC_ASSERT(SNAP_FRAME == TREF_FRAME);
+ LJ_STATIC_ASSERT(SNAP_CONT == TREF_CONT);
++LJ_STATIC_ASSERT(SNAP_KEYINDEX == TREF_KEYINDEX);
+ 
+ #define SNAP(slot, flags, ref)	(((SnapEntry)(slot) << 24) + (flags) + (ref))
+ #define SNAP_TR(slot, tr) \
+-  (((SnapEntry)(slot) << 24) + ((tr) & (TREF_CONT|TREF_FRAME|TREF_REFMASK)))
++  (((SnapEntry)(slot) << 24) + \
++   ((tr) & (TREF_KEYINDEX|TREF_CONT|TREF_FRAME|TREF_REFMASK)))
+ #if !LJ_FR2
+ #define SNAP_MKPC(pc)		((SnapEntry)u32ptr(pc))
+ #endif
+@@ -265,6 +282,9 @@ typedef struct GCtrace {
+   BCIns startins;	/* Original bytecode of starting instruction. */
+   MSize szmcode;	/* Size of machine code. */
+   MCode *mcode;		/* Start of machine code. */
++#if LJ_ABI_PAUTH
++  ASMFunction mcauth;	/* Start of machine code, with ptr auth applied. */
++#endif
+   MSize mcloop;		/* Offset of loop start in machine code. */
+   uint16_t nchild;	/* Number of child traces (root trace only). */
+   uint16_t spadjust;	/* Stack pointer adjustment (offset in bytes). */
+@@ -366,6 +386,7 @@ enum {
+ #endif
+   LJ_K64__MAX,
+ };
++#define LJ_K64__USED	(LJ_TARGET_X86ORX64 || LJ_TARGET_MIPS)
+ 
+ enum {
+ #if LJ_TARGET_X86ORX64
+@@ -384,6 +405,7 @@ enum {
+ #endif
+   LJ_K32__MAX
+ };
++#define LJ_K32__USED	(LJ_TARGET_X86ORX64 || LJ_TARGET_PPC || LJ_TARGET_MIPS)
+ 
+ /* Get 16 byte aligned pointer to SIMD constant. */
+ #define LJ_KSIMD(J, n) \
+@@ -438,9 +460,13 @@ typedef struct jit_State {
+   int32_t framedepth;	/* Current frame depth. */
+   int32_t retdepth;	/* Return frame depth (count of RETF). */
+ 
++#if LJ_K32__USED
+   uint32_t k32[LJ_K32__MAX];  /* Common 4 byte constants used by backends. */
++#endif
+   TValue ksimd[LJ_KSIMD__MAX*2+1];  /* 16 byte aligned SIMD constants. */
++#if LJ_K64__USED
+   TValue k64[LJ_K64__MAX];  /* Common 8 byte constants. */
++#endif
+ 
+   IRIns *irbuf;		/* Temp. IR instruction buffer. Biased with REF_BIAS. */
+   IRRef irtoplim;	/* Upper limit of instuction buffer (biased). */
+@@ -485,6 +511,7 @@ typedef struct jit_State {
+   const BCIns *startpc;	/* Bytecode PC of starting instruction. */
+   TraceNo parent;	/* Parent of current side trace (0 for root traces). */
+   ExitNo exitno;	/* Exit number in parent of current side trace. */
++  int exitcode;		/* Exit code from unwound trace. */
+ 
+   BCIns *patchpc;	/* PC for pending re-patch. */
+   BCIns patchins;	/* Instruction for pending re-patch. */
+@@ -510,5 +537,6 @@ typedef struct jit_State {
+ #else
+ #define lj_assertJ(c, ...)	((void)J)
+ #endif
++#endif
+ 
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lex.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_lex.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lex.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Lexical analyzer.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -105,7 +105,7 @@ static void lex_number(LexState *ls, TVa
+     lex_savenext(ls);
+   }
+   lex_save(ls, '\0');
+-  fmt = lj_strscan_scan((const uint8_t *)sbufB(&ls->sb), sbuflen(&ls->sb)-1, tv,
++  fmt = lj_strscan_scan((const uint8_t *)ls->sb.b, sbuflen(&ls->sb)-1, tv,
+ 	  (LJ_DUALNUM ? STRSCAN_OPT_TOINT : STRSCAN_OPT_TONUM) |
+ 	  (LJ_HASFFI ? (STRSCAN_OPT_LL|STRSCAN_OPT_IMAG) : 0));
+   if (LJ_DUALNUM && fmt == STRSCAN_INT) {
+@@ -118,11 +118,7 @@ static void lex_number(LexState *ls, TVa
+     GCcdata *cd;
+     lj_assertLS(fmt == STRSCAN_I64 || fmt == STRSCAN_U64 || fmt == STRSCAN_IMAG,
+ 		"unexpected number format %d", fmt);
+-    if (!ctype_ctsG(G(L))) {
+-      ptrdiff_t oldtop = savestack(L, L->top);
+-      luaopen_ffi(L);  /* Load FFI library on-demand. */
+-      L->top = restorestack(L, oldtop);
+-    }
++    ctype_loadffi(L);
+     if (fmt == STRSCAN_IMAG) {
+       cd = lj_cdata_new_(L, CTID_COMPLEX_DOUBLE, 2*sizeof(double));
+       ((double *)cdataptr(cd))[0] = 0;
+@@ -180,7 +176,7 @@ static void lex_longstring(LexState *ls,
+     }
+   } endloop:
+   if (tv) {
+-    GCstr *str = lj_parse_keepstr(ls, sbufB(&ls->sb) + (2 + (MSize)sep),
++    GCstr *str = lj_parse_keepstr(ls, ls->sb.b + (2 + (MSize)sep),
+ 				      sbuflen(&ls->sb) - 2*(2 + (MSize)sep));
+     setstrV(ls->L, tv, str);
+   }
+@@ -286,7 +282,7 @@ static void lex_string(LexState *ls, TVa
+   }
+   lex_savenext(ls);  /* Skip trailing delimiter. */
+   setstrV(ls->L, tv,
+-	  lj_parse_keepstr(ls, sbufB(&ls->sb)+1, sbuflen(&ls->sb)-2));
++	  lj_parse_keepstr(ls, ls->sb.b+1, sbuflen(&ls->sb)-2));
+ }
+ 
+ /* -- Main lexical scanner ------------------------------------------------ */
+@@ -306,7 +302,7 @@ static LexToken lex_scan(LexState *ls, T
+       do {
+ 	lex_savenext(ls);
+       } while (lj_char_isident(ls->c));
+-      s = lj_parse_keepstr(ls, sbufB(&ls->sb), sbuflen(&ls->sb));
++      s = lj_parse_keepstr(ls, ls->sb.b, sbuflen(&ls->sb));
+       setstrV(ls->L, tv, s);
+       if (s->reserved > 0)  /* Reserved word? */
+ 	return TK_OFS + s->reserved;
+@@ -496,7 +492,7 @@ void lj_lex_error(LexState *ls, LexToken
+     tokstr = NULL;
+   } else if (tok == TK_name || tok == TK_string || tok == TK_number) {
+     lex_save(ls, '\0');
+-    tokstr = sbufB(&ls->sb);
++    tokstr = ls->sb.b;
+   } else {
+     tokstr = lj_lex_token2str(ls, tok);
+   }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lex.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_lex.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lex.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Lexical analyzer.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_LEX_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lib.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_lib.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lib.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Library function support.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_lib_c
+@@ -16,6 +16,9 @@
+ #include "lj_func.h"
+ #include "lj_bc.h"
+ #include "lj_dispatch.h"
++#if LJ_HASFFI
++#include "lj_ctype.h"
++#endif
+ #include "lj_vm.h"
+ #include "lj_strscan.h"
+ #include "lj_strfmt.h"
+@@ -301,3 +304,56 @@ int lj_lib_checkopt(lua_State *L, int na
+   return def;
+ }
+ 
++/* -- Strict type checks -------------------------------------------------- */
++
++/* The following type checks do not coerce between strings and numbers.
++** And they handle plain int64_t/uint64_t FFI numbers, too.
++*/
++
++#if LJ_HASBUFFER
++GCstr *lj_lib_checkstrx(lua_State *L, int narg)
++{  /* Return argument narg if it is a string; numbers are not coerced. */
++  TValue *arg = L->base + narg-1;
++  if (arg >= L->top || !tvisstr(arg)) lj_err_argt(L, narg, LUA_TSTRING);
++  return strV(arg);
++}
++
++int32_t lj_lib_checkintrange(lua_State *L, int narg, int32_t a, int32_t b)
++{  /* Return argument narg as an int32_t within [a, b], else raise an argument error. */
++  TValue *o = L->base + narg-1;
++  lj_assertL(b >= 0, "expected range must be non-negative");
++  if (o < L->top) {  /* Argument present? A missing one ends up at badtype. */
++    if (LJ_LIKELY(tvisint(o))) {
++      int32_t i = intV(o);
++      if (i >= a && i <= b) return i;
++    } else if (LJ_LIKELY(tvisnum(o))) {
++      /* For performance reasons, this doesn't check for integerness or
++      ** integer overflow. Overflow detection still works, since all FPUs
++      ** return either MININT or MAXINT, which is then out of range.
++      */
++      int32_t i = (int32_t)numV(o);
++      if (i >= a && i <= b) return i;
++#if LJ_HASFFI
++    } else if (tviscdata(o)) {  /* Only plain int64_t/uint64_t cdata are accepted. */
++      GCcdata *cd = cdataV(o);
++      if (cd->ctypeid == CTID_INT64) {
++	int64_t i = *(int64_t *)cdataptr(cd);
++	if (i >= (int64_t)a && i <= (int64_t)b) return (int32_t)i;
++      } else if (cd->ctypeid == CTID_UINT64) {  /* a < 0: lower bound always holds. */
++	uint64_t i = *(uint64_t *)cdataptr(cd);
++	if ((a < 0 || i >= (uint64_t)a) && i <= (uint64_t)b) return (int32_t)i;
++      } else {
++	goto badtype;
++      }
++#endif
++    } else {
++      goto badtype;
++    }
++    lj_err_arg(L, narg, LJ_ERR_NUMRNG);  /* Accepted type, but value out of range. */
++  }
++badtype:
++  lj_err_argt(L, narg, LUA_TNUMBER);
++  return 0;  /* unreachable */
++}
++#endif
++
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lib.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_lib.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_lib.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Library function support.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_LIB_H
+@@ -46,6 +46,12 @@ LJ_FUNC GCtab *lj_lib_checktab(lua_State
+ LJ_FUNC GCtab *lj_lib_checktabornil(lua_State *L, int narg);
+ LJ_FUNC int lj_lib_checkopt(lua_State *L, int narg, int def, const char *lst);
+ 
++#if LJ_HASBUFFER
++LJ_FUNC GCstr *lj_lib_checkstrx(lua_State *L, int narg);
++LJ_FUNC int32_t lj_lib_checkintrange(lua_State *L, int narg,
++				     int32_t a, int32_t b);
++#endif
++
+ /* Avoid including lj_frame.h. */
+ #if LJ_GC64
+ #define lj_lib_upvalue(L, n) \
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_load.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_load.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_load.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Load and dump code.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include <errno.h>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_mcode.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_mcode.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_mcode.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Machine code management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_mcode_c
+@@ -29,6 +29,11 @@
+ #include <valgrind/valgrind.h>
+ #endif
+ 
++#if LJ_TARGET_WINDOWS
++#define WIN32_LEAN_AND_MEAN
++#include <windows.h>
++#endif
++
+ #if LJ_TARGET_IOS
+ void sys_icache_invalidate(void *start, size_t len);
+ #endif
+@@ -41,6 +46,8 @@ void lj_mcode_sync(void *start, void *en
+ #endif
+ #if LJ_TARGET_X86ORX64
+   UNUSED(start); UNUSED(end);
++#elif LJ_TARGET_WINDOWS
++  FlushInstructionCache(GetCurrentProcess(), start, (char *)end-(char *)start);
+ #elif LJ_TARGET_IOS
+   sys_icache_invalidate(start, (char *)end-(char *)start);
+ #elif LJ_TARGET_PPC
+@@ -58,9 +65,6 @@ void lj_mcode_sync(void *start, void *en
+ 
+ #if LJ_TARGET_WINDOWS
+ 
+-#define WIN32_LEAN_AND_MEAN
+-#include <windows.h>
+-
+ #define MCPROT_RW	PAGE_READWRITE
+ #define MCPROT_RX	PAGE_EXECUTE_READ
+ #define MCPROT_RWX	PAGE_EXECUTE_READWRITE
+@@ -97,10 +101,15 @@ static int mcode_setprot(void *p, size_t
+ #define MCPROT_RW	(PROT_READ|PROT_WRITE)
+ #define MCPROT_RX	(PROT_READ|PROT_EXEC)
+ #define MCPROT_RWX	(PROT_READ|PROT_WRITE|PROT_EXEC)
++#ifdef PROT_MPROTECT
++#define MCPROT_CREATE	(PROT_MPROTECT(MCPROT_RWX))
++#else
++#define MCPROT_CREATE	0
++#endif
+ 
+ static void *mcode_alloc_at(jit_State *J, uintptr_t hint, size_t sz, int prot)
+ {
+-  void *p = mmap((void *)hint, sz, prot, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
++  void *p = mmap((void *)hint, sz, prot|MCPROT_CREATE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+   if (p == MAP_FAILED) {
+     if (!hint) lj_trace_err(J, LJ_TRERR_MCODEAL);
+     p = NULL;
+@@ -163,7 +172,7 @@ static void mcode_protect(jit_State *J,
+ #define MCPROT_RUN	MCPROT_RX
+ 
+ /* Protection twiddling failed. Probably due to kernel security. */
+-static LJ_NOINLINE void mcode_protfail(jit_State *J)
++static LJ_NORET LJ_NOINLINE void mcode_protfail(jit_State *J)
+ {
+   lua_CFunction panic = J2G(J)->panic;
+   if (panic) {
+@@ -171,6 +180,7 @@ static LJ_NOINLINE void mcode_protfail(j
+     setstrV(L, L->top++, lj_err_str(L, LJ_ERR_JITPROT));
+     panic(L);
+   }
++  exit(EXIT_FAILURE);
+ }
+ 
+ /* Change protection of MCode area. */
+@@ -238,7 +248,7 @@ static void *mcode_alloc(jit_State *J, s
+ /* All memory addresses are reachable by relative jumps. */
+ static void *mcode_alloc(jit_State *J, size_t sz)
+ {
+-#if defined(__OpenBSD__) || LJ_TARGET_UWP
++#if defined(__OpenBSD__) || defined(__NetBSD__) || LJ_TARGET_UWP
+   /* Allow better executable memory allocation for OpenBSD W^X mode. */
+   void *p = mcode_alloc_at(J, 0, sz, MCPROT_RUN);
+   if (p && mcode_setprot(p, sz, MCPROT_GEN)) {
+@@ -269,6 +279,7 @@ static void mcode_allocarea(jit_State *J
+   ((MCLink *)J->mcarea)->next = oldarea;
+   ((MCLink *)J->mcarea)->size = sz;
+   J->szallmcarea += sz;
++  J->mcbot = (MCode *)lj_err_register_mcode(J->mcarea, sz, (uint8_t *)J->mcbot);
+ }
+ 
+ /* Free all MCode areas. */
+@@ -279,7 +290,9 @@ void lj_mcode_free(jit_State *J)
+   J->szallmcarea = 0;
+   while (mc) {
+     MCode *next = ((MCLink *)mc)->next;
+-    mcode_free(J, mc, ((MCLink *)mc)->size);
++    size_t sz = ((MCLink *)mc)->size;
++    lj_err_deregister_mcode(mc, sz, (uint8_t *)mc + sizeof(MCLink));
++    mcode_free(J, mc, sz);
+     mc = next;
+   }
+ }
+@@ -314,21 +327,21 @@ void lj_mcode_abort(jit_State *J)
+ /* Set/reset protection to allow patching of MCode areas. */
+ MCode *lj_mcode_patch(jit_State *J, MCode *ptr, int finish)
+ {
+-#if LUAJIT_SECURITY_MCODE == 0
+-  UNUSED(J); UNUSED(ptr); UNUSED(finish);
+-  return NULL;
+-#else
+   if (finish) {
++#if LUAJIT_SECURITY_MCODE
+     if (J->mcarea == ptr)
+       mcode_protect(J, MCPROT_RUN);
+     else if (LJ_UNLIKELY(mcode_setprot(ptr, ((MCLink *)ptr)->size, MCPROT_RUN)))
+       mcode_protfail(J);
++#endif
+     return NULL;
+   } else {
+     MCode *mc = J->mcarea;
+     /* Try current area first to use the protection cache. */
+     if (ptr >= mc && ptr < (MCode *)((char *)mc + J->szmcarea)) {
++#if LUAJIT_SECURITY_MCODE
+       mcode_protect(J, MCPROT_GEN);
++#endif
+       return mc;
+     }
+     /* Otherwise search through the list of MCode areas. */
+@@ -336,13 +349,14 @@ MCode *lj_mcode_patch(jit_State *J, MCod
+       mc = ((MCLink *)mc)->next;
+       lj_assertJ(mc != NULL, "broken MCode area chain");
+       if (ptr >= mc && ptr < (MCode *)((char *)mc + ((MCLink *)mc)->size)) {
++#if LUAJIT_SECURITY_MCODE
+ 	if (LJ_UNLIKELY(mcode_setprot(mc, ((MCLink *)mc)->size, MCPROT_GEN)))
+ 	  mcode_protfail(J);
++#endif
+ 	return mc;
+       }
+     }
+   }
+-#endif
+ }
+ 
+ /* Limit of MCode reservation reached. */
+@@ -353,7 +367,7 @@ void lj_mcode_limiterr(jit_State *J, siz
+   sizemcode = (size_t)J->param[JIT_P_sizemcode] << 10;
+   sizemcode = (sizemcode + LJ_PAGESIZE-1) & ~(size_t)(LJ_PAGESIZE - 1);
+   maxmcode = (size_t)J->param[JIT_P_maxmcode] << 10;
+-  if ((size_t)need > sizemcode)
++  if (need * sizeof(MCode) > sizemcode)
+     lj_trace_err(J, LJ_TRERR_MCODEOV);  /* Too long for any area. */
+   if (J->szallmcarea + sizemcode > maxmcode)
+     lj_trace_err(J, LJ_TRERR_MCODEAL);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_mcode.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_mcode.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_mcode.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Machine code management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_MCODE_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_meta.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_meta.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_meta.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Metamethod handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -240,8 +240,8 @@ TValue *lj_meta_cat(lua_State *L, TValue
+   int fromc = 0;
+   if (left < 0) { left = -left; fromc = 1; }
+   do {
+-    if (!(tvisstr(top) || tvisnumber(top)) ||
+-	!(tvisstr(top-1) || tvisnumber(top-1))) {
++    if (!(tvisstr(top) || tvisnumber(top) || tvisbuf(top)) ||
++	!(tvisstr(top-1) || tvisnumber(top-1) || tvisbuf(top-1))) {
+       cTValue *mo = lj_meta_lookup(L, top-1, MM_concat);
+       if (tvisnil(mo)) {
+ 	mo = lj_meta_lookup(L, top, MM_concat);
+@@ -277,10 +277,12 @@ TValue *lj_meta_cat(lua_State *L, TValue
+       ** next step: [...][CAT stack ............]
+       */
+       TValue *e, *o = top;
+-      uint64_t tlen = tvisstr(o) ? strV(o)->len : STRFMT_MAXBUF_NUM;
++      uint64_t tlen = tvisstr(o) ? strV(o)->len :
++		      tvisbuf(o) ? sbufxlen(bufV(o)) : STRFMT_MAXBUF_NUM;
+       SBuf *sb;
+       do {
+-	o--; tlen += tvisstr(o) ? strV(o)->len : STRFMT_MAXBUF_NUM;
++	o--; tlen += tvisstr(o) ? strV(o)->len :
++		     tvisbuf(o) ? sbufxlen(bufV(o)) : STRFMT_MAXBUF_NUM;
+       } while (--left > 0 && (tvisstr(o-1) || tvisnumber(o-1)));
+       if (tlen >= LJ_MAX_STR) lj_err_msg(L, LJ_ERR_STROV);
+       sb = lj_buf_tmp_(L);
+@@ -290,6 +292,9 @@ TValue *lj_meta_cat(lua_State *L, TValue
+ 	  GCstr *s = strV(o);
+ 	  MSize len = s->len;
+ 	  lj_buf_putmem(sb, strdata(s), len);
++	} else if (tvisbuf(o)) {
++	  SBufExt *sbx = bufV(o);
++	  lj_buf_putmem(sb, sbx->r, sbufxlen(sbx));
+ 	} else if (tvisint(o)) {
+ 	  lj_strfmt_putint(sb, intV(o));
+ 	} else {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_meta.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_meta.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_meta.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Metamethod handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_META_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_obj.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_obj.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_obj.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Miscellaneous object handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_obj_c
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_obj.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_obj.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_obj.h
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT VM tags, values and objects.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -34,13 +34,17 @@ typedef struct MRef {
+ 
+ #if LJ_GC64
+ #define mref(r, t)	((t *)(void *)(r).ptr64)
++#define mrefu(r)	((r).ptr64)
+ 
+ #define setmref(r, p)	((r).ptr64 = (uint64_t)(void *)(p))
++#define setmrefu(r, u)	((r).ptr64 = (uint64_t)(u))
+ #define setmrefr(r, v)	((r).ptr64 = (v).ptr64)
+ #else
+ #define mref(r, t)	((t *)(void *)(uintptr_t)(r).ptr32)
++#define mrefu(r)	((r).ptr32)
+ 
+ #define setmref(r, p)	((r).ptr32 = (uint32_t)(uintptr_t)(void *)(p))
++#define setmrefu(r, u)	((r).ptr32 = (uint32_t)(u))
+ #define setmrefr(r, v)	((r).ptr32 = (v).ptr32)
+ #endif
+ 
+@@ -153,11 +157,9 @@ typedef int32_t BCLine;  /* Bytecode lin
+ typedef void (*ASMFunction)(void);
+ 
+ /* Resizable string buffer. Need this here, details in lj_buf.h. */
++#define SBufHeader	char *w, *e, *b; MRef L
+ typedef struct SBuf {
+-  MRef p;		/* String buffer pointer. */
+-  MRef e;		/* String buffer end pointer. */
+-  MRef b;		/* String buffer base. */
+-  MRef L;		/* lua_State, used for buffer resizing. */
++  SBufHeader;
+ } SBuf;
+ 
+ /* -- Tags and values ----------------------------------------------------- */
+@@ -282,6 +284,9 @@ typedef const TValue cTValue;
+ #define LJ_TISGCV		(LJ_TSTR+1)
+ #define LJ_TISTABUD		LJ_TTAB
+ 
++/* Type marker for slot holding a traversal index. Must be lightuserdata. */
++#define LJ_KEYINDEX		0xfffe7fffu
++
+ #if LJ_GC64
+ #define LJ_GCVMASK		(((uint64_t)1 << 47) - 1)
+ #endif
+@@ -330,6 +335,7 @@ enum {
+   UDTYPE_USERDATA,	/* Regular userdata. */
+   UDTYPE_IO_FILE,	/* I/O library FILE. */
+   UDTYPE_FFI_CLIB,	/* FFI C library namespace. */
++  UDTYPE_BUFFER,	/* String buffer. */
+   UDTYPE__MAX
+ };
+ 
+@@ -407,7 +413,7 @@ typedef struct GCproto {
+ #define PROTO_UV_IMMUTABLE	0x4000	/* Immutable upvalue. */
+ 
+ #define proto_kgc(pt, idx) \
+-  check_exp((uintptr_t)(intptr_t)(idx) >= (uintptr_t)-(intptr_t)(pt)->sizekgc, \
++  check_exp((uintptr_t)(intptr_t)(idx) >= ~(uintptr_t)(pt)->sizekgc+1u, \
+ 	    gcref(mref((pt)->k, GCRef)[(idx)]))
+ #define proto_knumtv(pt, idx) \
+   check_exp((uintptr_t)(idx) < (pt)->sizekn, &mref((pt)->k, TValue)[(idx)])
+@@ -505,7 +511,7 @@ typedef struct GCtab {
+ } GCtab;
+ 
+ #define sizetabcolo(n)	((n)*sizeof(TValue) + sizeof(GCtab))
+-#define tabref(r)	(&gcref((r))->tab)
++#define tabref(r)	((GCtab *)gcref((r)))
+ #define noderef(r)	(mref((r), Node))
+ #define nextnode(n)	(mref((n)->next, Node))
+ #if LJ_GC64
+@@ -839,6 +845,7 @@ static LJ_AINLINE void *lightudV(global_
+   uint64_t seg = lightudseg(u);
+   uint32_t *segmap = mref(g->gc.lightudseg, uint32_t);
+   lj_assertG(tvislightud(o), "lightuserdata expected");
++  if (seg == (1 << LJ_LIGHTUD_BITS_SEG)-1) return NULL;
+   lj_assertG(seg <= g->gc.lightudnum, "bad lightuserdata segment %d", seg);
+   return (void *)(((uint64_t)segmap[seg] << 32) | lightudlo(u));
+ }
+@@ -920,7 +927,7 @@ static LJ_AINLINE void setgcV(lua_State
+ }
+ 
+ #define define_setV(name, type, tag) \
+-static LJ_AINLINE void name(lua_State *L, TValue *o, type *v) \
++static LJ_AINLINE void name(lua_State *L, TValue *o, const type *v) \
+ { \
+   setgcV(L, o, obj2gco(v), tag); \
+ }
+@@ -1035,4 +1042,18 @@ LJ_DATA const char *const lj_obj_itypena
+ LJ_FUNC int LJ_FASTCALL lj_obj_equal(cTValue *o1, cTValue *o2);
+ LJ_FUNC const void * LJ_FASTCALL lj_obj_ptr(global_State *g, cTValue *o);
+ 
++#if LJ_ABI_PAUTH
++#if LJ_TARGET_ARM64
++#include <ptrauth.h>
++#define lj_ptr_sign(ptr, ctx) \
++  ptrauth_sign_unauthenticated((ptr), ptrauth_key_function_pointer, (ctx))
++#define lj_ptr_strip(ptr) ptrauth_strip((ptr), ptrauth_key_function_pointer)
++#else
++#error "No support for pointer authentication for this architecture"
++#endif
++#else
++#define lj_ptr_sign(ptr, ctx) (ptr)
++#define lj_ptr_strip(ptr) (ptr)
++#endif
++
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_dce.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_dce.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_dce.c
+@@ -1,6 +1,6 @@
+ /*
+ ** DCE: Dead Code Elimination. Pre-LOOP only -- ASM already performs DCE.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_dce_c
+@@ -44,12 +44,12 @@ static void dce_propagate(jit_State *J)
+     IRIns *ir = IR(ins);
+     if (irt_ismarked(ir->t)) {
+       irt_clearmark(ir->t);
+-      pchain[ir->o] = &ir->prev;
+     } else if (!ir_sideeff(ir)) {
+       *pchain[ir->o] = ir->prev;  /* Reroute original instruction chain. */
+       lj_ir_nop(ir);
+       continue;
+     }
++    pchain[ir->o] = &ir->prev;
+     if (ir->op1 >= REF_FIRST) irt_setmark(IR(ir->op1)->t);
+     if (ir->op2 >= REF_FIRST) irt_setmark(IR(ir->op2)->t);
+   }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_fold.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_fold.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_fold.c
+@@ -2,7 +2,7 @@
+ ** FOLD: Constant Folding, Algebraic Simplifications and Reassociation.
+ ** ABCelim: Array Bounds Check Elimination.
+ ** CSE: Common-Subexpression Elimination.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_fold_c
+@@ -236,14 +236,10 @@ LJFOLDF(kfold_fpcall2)
+   return NEXTFOLD;
+ }
+ 
+-LJFOLD(POW KNUM KINT)
+ LJFOLD(POW KNUM KNUM)
+ LJFOLDF(kfold_numpow)
+ {
+-  lua_Number a = knumleft;
+-  lua_Number b = fright->o == IR_KINT ? (lua_Number)fright->i : knumright;
+-  lua_Number y = lj_vm_foldarith(a, b, IR_POW - IR_ADD);
+-  return lj_ir_knum(J, y);
++  return lj_ir_knum(J, lj_vm_foldarith(knumleft, knumright, IR_POW - IR_ADD));
+ }
+ 
+ /* Must not use kfold_kref for numbers (could be NaN). */
+@@ -271,7 +267,7 @@ static int32_t kfold_intop(int32_t k1, i
+   case IR_SUB: k1 -= k2; break;
+   case IR_MUL: k1 *= k2; break;
+   case IR_MOD: k1 = lj_vm_modi(k1, k2); break;
+-  case IR_NEG: k1 = -k1; break;
++  case IR_NEG: k1 = (int32_t)(~(uint32_t)k1+1u); break;
+   case IR_BAND: k1 &= k2; break;
+   case IR_BOR: k1 |= k2; break;
+   case IR_BXOR: k1 ^= k2; break;
+@@ -381,10 +377,10 @@ static uint64_t kfold_int64arith(jit_Sta
+   case IR_BOR: k1 |= k2; break;
+   case IR_BXOR: k1 ^= k2; break;
+   case IR_BSHL: k1 <<= (k2 & 63); break;
+-  case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 63)); break;
+-  case IR_BSAR: k1 >>= (k2 & 63); break;
+-  case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 63)); break;
+-  case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 63)); break;
++  case IR_BSHR: k1 >>= (k2 & 63); break;
++  case IR_BSAR: k1 = (uint64_t)((int64_t)k1 >> (k2 & 63)); break;
++  case IR_BROL: k1 = lj_rol(k1, (k2 & 63)); break;
++  case IR_BROR: k1 = lj_ror(k1, (k2 & 63)); break;
+   default: lj_assertJ(0, "bad IR op %d", op); break;
+   }
+ #else
+@@ -514,6 +510,7 @@ LJFOLDF(kfold_snew_kptr)
+ }
+ 
+ LJFOLD(SNEW any KINT)
++LJFOLD(XSNEW any KINT)
+ LJFOLDF(kfold_snew_empty)
+ {
+   if (fright->i == 0)
+@@ -577,22 +574,51 @@ LJFOLDF(kfold_strcmp)
+ ** The compromise is to declare them as loads, emit them like stores and
+ ** CSE whole chains manually when the BUFSTR is to be emitted. Any chain
+ ** fragments left over from CSE are eliminated by DCE.
++**
++** The string buffer methods emit a USE instead of a BUFSTR to keep the
++** chain alive.
+ */
+ 
+-/* BUFHDR is emitted like a store, see below. */
++LJFOLD(BUFHDR any any)
++LJFOLDF(bufhdr_merge)
++{
++  return fins->op2 == IRBUFHDR_WRITE ? CSEFOLD : EMITFOLD;
++}
+ 
+-LJFOLD(BUFPUT BUFHDR BUFSTR)
+-LJFOLDF(bufput_append)
++LJFOLD(BUFPUT any BUFSTR)
++LJFOLDF(bufput_bufstr)
+ {
+-  /* New buffer, no other buffer op inbetween and same buffer? */
+-  if ((J->flags & JIT_F_OPT_FWD) &&
+-      !(fleft->op2 & IRBUFHDR_APPEND) &&
+-      fleft->prev == fright->op2 &&
+-      fleft->op1 == IR(fright->op2)->op1) {
+-    IRRef ref = fins->op1;
+-    IR(ref)->op2 = (fleft->op2 | IRBUFHDR_APPEND);  /* Modify BUFHDR. */
+-    IR(ref)->op1 = fright->op1;
+-    return ref;
++  if ((J->flags & JIT_F_OPT_FWD)) {
++    IRRef hdr = fright->op2;
++    /* New buffer, no other buffer op inbetween and same buffer? */
++    if (fleft->o == IR_BUFHDR && fleft->op2 == IRBUFHDR_RESET &&
++	fleft->prev == hdr &&
++	fleft->op1 == IR(hdr)->op1 &&
++	!(irt_isphi(fright->t) && IR(hdr)->prev) &&
++	(!LJ_HASBUFFER || J->chain[IR_CALLA] < hdr)) {
++      IRRef ref = fins->op1;
++      IR(ref)->op2 = IRBUFHDR_APPEND;  /* Modify BUFHDR. */
++      IR(ref)->op1 = fright->op1;
++      return ref;
++    }
++    /* Replay puts to global temporary buffer. */
++    if (IR(hdr)->op2 == IRBUFHDR_RESET && !irt_isphi(fright->t)) {
++      IRIns *ir = IR(fright->op1);
++      /* For now only handle single string.reverse .lower .upper .rep. */
++      if (ir->o == IR_CALLL &&
++	  ir->op2 >= IRCALL_lj_buf_putstr_reverse &&
++	  ir->op2 <= IRCALL_lj_buf_putstr_rep) {
++	IRIns *carg1 = IR(ir->op1);
++	if (ir->op2 == IRCALL_lj_buf_putstr_rep) {
++	  IRIns *carg2 = IR(carg1->op1);
++	  if (carg2->op1 == hdr) {
++	    return lj_ir_call(J, ir->op2, fins->op1, carg2->op2, carg1->op2);
++	  }
++	} else if (carg1->op1 == hdr) {
++	  return lj_ir_call(J, ir->op2, fins->op1, carg1->op2);
++	}
++      }
++    }
+   }
+   return EMITFOLD;  /* Always emit, CSE later. */
+ }
+@@ -626,14 +652,14 @@ LJFOLDF(bufstr_kfold_cse)
+ 	     "bad buffer constructor IR op %d", fleft->o);
+   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
+     if (fleft->o == IR_BUFHDR) {  /* No put operations? */
+-      if (!(fleft->op2 & IRBUFHDR_APPEND))  /* Empty buffer? */
++      if (fleft->op2 == IRBUFHDR_RESET)  /* Empty buffer? */
+ 	return lj_ir_kstr(J, &J2G(J)->strempty);
+       fins->op1 = fleft->op1;
+       fins->op2 = fleft->prev;  /* Relies on checks in bufput_append. */
+       return CSEFOLD;
+     } else if (fleft->o == IR_BUFPUT) {
+       IRIns *irb = IR(fleft->op1);
+-      if (irb->o == IR_BUFHDR && !(irb->op2 & IRBUFHDR_APPEND))
++      if (irb->o == IR_BUFHDR && irb->op2 == IRBUFHDR_RESET)
+ 	return fleft->op2;  /* Shortcut for a single put operation. */
+     }
+   }
+@@ -646,7 +672,7 @@ LJFOLDF(bufstr_kfold_cse)
+ 	lj_assertJ(ira->o == IR_BUFHDR || ira->o == IR_BUFPUT ||
+ 		   ira->o == IR_CALLL || ira->o == IR_CARG,
+ 		   "bad buffer constructor IR op %d", ira->o);
+-	if (ira->o == IR_BUFHDR && !(ira->op2 & IRBUFHDR_APPEND))
++	if (ira->o == IR_BUFHDR && ira->op2 == IRBUFHDR_RESET)
+ 	  return ref;  /* CSE succeeded. */
+ 	if (ira->o == IR_CALLL && ira->op2 == IRCALL_lj_buf_puttab)
+ 	  break;
+@@ -1009,8 +1035,7 @@ LJFOLDF(simplify_numadd_xneg)
+ LJFOLD(SUB any KNUM)
+ LJFOLDF(simplify_numsub_k)
+ {
+-  lua_Number n = knumright;
+-  if (n == 0.0)  /* x - (+-0) ==> x */
++  if (ir_knum(fright)->u64 == 0)  /* x - (+0) ==> x */
+     return LEFTFOLD;
+   return NEXTFOLD;
+ }
+@@ -1084,61 +1109,17 @@ LJFOLDF(simplify_nummuldiv_negneg)
+   return RETRYFOLD;
+ }
+ 
+-LJFOLD(POW any KINT)
+-LJFOLDF(simplify_numpow_xkint)
++LJFOLD(POW any KNUM)
++LJFOLDF(simplify_numpow_k)
+ {
+-  int32_t k = fright->i;
+-  TRef ref = fins->op1;
+-  if (k == 0)  /* x ^ 0 ==> 1 */
++  if (knumright == 0.0)  /* x ^ 0 ==> 1 */
+     return lj_ir_knum_one(J);  /* Result must be a number, not an int. */
+-  if (k == 1)  /* x ^ 1 ==> x */
++  else if (knumright == 1.0)  /* x ^ 1 ==> x */
+     return LEFTFOLD;
+-  if ((uint32_t)(k+65536) > 2*65536u)  /* Limit code explosion. */
++  else if (knumright == 2.0)  /* x ^ 2 ==> x * x */
++    return emitir(IRTN(IR_MUL), fins->op1, fins->op1);
++  else
+     return NEXTFOLD;
+-  if (k < 0) {  /* x ^ (-k) ==> (1/x) ^ k. */
+-    ref = emitir(IRTN(IR_DIV), lj_ir_knum_one(J), ref);
+-    k = -k;
+-  }
+-  /* Unroll x^k for 1 <= k <= 65536. */
+-  for (; (k & 1) == 0; k >>= 1)  /* Handle leading zeros. */
+-    ref = emitir(IRTN(IR_MUL), ref, ref);
+-  if ((k >>= 1) != 0) {  /* Handle trailing bits. */
+-    TRef tmp = emitir(IRTN(IR_MUL), ref, ref);
+-    for (; k != 1; k >>= 1) {
+-      if (k & 1)
+-	ref = emitir(IRTN(IR_MUL), ref, tmp);
+-      tmp = emitir(IRTN(IR_MUL), tmp, tmp);
+-    }
+-    ref = emitir(IRTN(IR_MUL), ref, tmp);
+-  }
+-  return ref;
+-}
+-
+-LJFOLD(POW any KNUM)
+-LJFOLDF(simplify_numpow_xknum)
+-{
+-  if (knumright == 0.5)  /* x ^ 0.5 ==> sqrt(x) */
+-    return emitir(IRTN(IR_FPMATH), fins->op1, IRFPM_SQRT);
+-  return NEXTFOLD;
+-}
+-
+-LJFOLD(POW KNUM any)
+-LJFOLDF(simplify_numpow_kx)
+-{
+-  lua_Number n = knumleft;
+-  if (n == 2.0 && irt_isint(fright->t)) {  /* 2.0 ^ i ==> ldexp(1.0, i) */
+-#if LJ_TARGET_X86ORX64
+-    /* Different IR_LDEXP calling convention on x86/x64 requires conversion. */
+-    fins->o = IR_CONV;
+-    fins->op1 = fins->op2;
+-    fins->op2 = IRCONV_NUM_INT;
+-    fins->op2 = (IRRef1)lj_opt_fold(J);
+-#endif
+-    fins->op1 = (IRRef1)lj_ir_knum_one(J);
+-    fins->o = IR_LDEXP;
+-    return RETRYFOLD;
+-  }
+-  return NEXTFOLD;
+ }
+ 
+ /* -- Simplify conversions ------------------------------------------------ */
+@@ -1297,6 +1278,10 @@ LJFOLD(CONV SUB IRCONV_U32_U64)
+ LJFOLD(CONV MUL IRCONV_U32_U64)
+ LJFOLDF(simplify_conv_narrow)
+ {
++#if LJ_64
++  UNUSED(J);
++  return NEXTFOLD;
++#else
+   IROp op = (IROp)fleft->o;
+   IRType t = irt_type(fins->t);
+   IRRef op1 = fleft->op1, op2 = fleft->op2, mode = fins->op2;
+@@ -1307,6 +1292,7 @@ LJFOLDF(simplify_conv_narrow)
+   fins->op1 = op1;
+   fins->op2 = op2;
+   return RETRYFOLD;
++#endif
+ }
+ 
+ /* Special CSE rule for CONV. */
+@@ -1380,7 +1366,7 @@ LJFOLDF(simplify_intsub_k)
+   if (fright->i == 0)  /* i - 0 ==> i */
+     return LEFTFOLD;
+   fins->o = IR_ADD;  /* i - k ==> i + (-k) */
+-  fins->op2 = (IRRef1)lj_ir_kint(J, -fright->i);  /* Overflow for -2^31 ok. */
++  fins->op2 = (IRRef1)lj_ir_kint(J, (int32_t)(~(uint32_t)fright->i+1u));  /* Overflow for -2^31 ok. */
+   return RETRYFOLD;
+ }
+ 
+@@ -1411,7 +1397,7 @@ LJFOLDF(simplify_intsub_k64)
+   if (k == 0)  /* i - 0 ==> i */
+     return LEFTFOLD;
+   fins->o = IR_ADD;  /* i - k ==> i + (-k) */
+-  fins->op2 = (IRRef1)lj_ir_kint64(J, (uint64_t)-(int64_t)k);
++  fins->op2 = (IRRef1)lj_ir_kint64(J, ~k+1u);
+   return RETRYFOLD;
+ }
+ 
+@@ -1926,14 +1912,15 @@ LJFOLDF(abc_fwd)
+ LJFOLD(ABC any KINT)
+ LJFOLDF(abc_k)
+ {
++  PHIBARRIER(fleft);
+   if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
+     IRRef ref = J->chain[IR_ABC];
+     IRRef asize = fins->op1;
+     while (ref > asize) {
+       IRIns *ir = IR(ref);
+       if (ir->op1 == asize && irref_isk(ir->op2)) {
+-	int32_t k = IR(ir->op2)->i;
+-	if (fright->i > k)
++	uint32_t k = (uint32_t)IR(ir->op2)->i;
++	if ((uint32_t)fright->i > k)
+ 	  ir->op2 = fins->op2;
+ 	return DROPFOLD;
+       }
+@@ -1985,7 +1972,10 @@ LJFOLD(NE any any)
+ LJFOLDF(comm_equal)
+ {
+   /* For non-numbers only: x == x ==> drop; x ~= x ==> fail */
+-  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
++  if (fins->op1 == fins->op2 &&
++      (!irt_isnum(fins->t) ||
++       (fleft->o == IR_CONV &&  /* Converted integers cannot be NaN. */
++	(uint32_t)(fleft->op2 & IRCONV_SRCMASK) - (uint32_t)IRT_I8 <= (uint32_t)(IRT_U64 - IRT_U8))))
+     return CONDFOLD(fins->o == IR_EQ);
+   return fold_comm_swap(J);
+ }
+@@ -2144,8 +2134,26 @@ LJFOLDX(lj_opt_fwd_uload)
+ LJFOLD(ALEN any any)
+ LJFOLDX(lj_opt_fwd_alen)
+ 
++/* Try to merge UREFO/UREFC into referenced instruction. */
++static TRef merge_uref(jit_State *J, IRRef ref, IRIns* ir)
++{
++  if (ir->o == IR_UREFO && irt_isguard(ir->t)) {
++    /* Might be pointing to some other coroutine's stack.
++    ** And GC might shrink said stack, thereby repointing the upvalue.
++    ** GC might even collect said coroutine, thereby closing the upvalue.
++    */
++    if (gcstep_barrier(J, ref))
++      return EMITFOLD;  /* So cannot merge. */
++    /* Current fins wants a check, but ir doesn't have one. */
++    if ((irt_t(fins->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC) &&
++	irt_type(ir->t) == IRT_IGC)
++      ir->t.irt += IRT_PGC-IRT_IGC;  /* So install a check. */
++  }
++  return ref;  /* Not a TRef, but the caller doesn't care. */
++}
++
+ /* Upvalue refs are really loads, but there are no corresponding stores.
+-** So CSE is ok for them, except for UREFO across a GC step (see below).
++** So CSE is ok for them, except for guarded UREFO across a GC step.
+ ** If the referenced function is const, its upvalue addresses are const, too.
+ ** This can be used to improve CSE by looking for the same address,
+ ** even if the upvalues originate from a different function.
+@@ -2163,9 +2171,7 @@ LJFOLDF(cse_uref)
+       if (irref_isk(ir->op1)) {
+ 	GCfunc *fn2 = ir_kfunc(IR(ir->op1));
+ 	if (gco2uv(gcref(fn2->l.uvptr[(ir->op2 >> 8)])) == uv) {
+-	  if (fins->o == IR_UREFO && gcstep_barrier(J, ref))
+-	    break;
+-	  return ref;
++	  return merge_uref(J, ref, ir);
+ 	}
+       }
+       ref = ir->prev;
+@@ -2174,6 +2180,24 @@ LJFOLDF(cse_uref)
+   return EMITFOLD;
+ }
+ 
++/* Custom CSE for UREFO: look for an earlier UREFO with the same operands. */
++LJFOLD(UREFO any any)
++LJFOLDF(cse_urefo)
++{
++  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
++    IRRef ref = J->chain[IR_UREFO];  /* Head of the UREFO chain. */
++    IRRef lim = fins->op1;  /* Only refs above the first operand are searched. */
++    IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);  /* Both operands packed. */
++    while (ref > lim) {
++      IRIns *ir = IR(ref);
++      if (ir->op12 == op12)  /* Same op1 and op2 in one compare. */
++	return merge_uref(J, ref, ir);  /* May still decide to emit instead. */
++      ref = ir->prev;
++    }
++  }
++  return EMITFOLD;  /* No match found or CSE disabled. */
++}
++
+ LJFOLD(HREFK any any)
+ LJFOLDX(lj_opt_fwd_hrefk)
+ 
+@@ -2275,6 +2299,27 @@ LJFOLDF(fload_str_len_tostr)
+   return NEXTFOLD;
+ }
+ 
++LJFOLD(FLOAD any IRFL_SBUF_W)
++LJFOLD(FLOAD any IRFL_SBUF_E)
++LJFOLD(FLOAD any IRFL_SBUF_B)
++LJFOLD(FLOAD any IRFL_SBUF_L)
++LJFOLD(FLOAD any IRFL_SBUF_REF)
++LJFOLD(FLOAD any IRFL_SBUF_R)
++LJFOLDF(fload_sbuf)
++{
++  TRef tr = lj_opt_fwd_fload(J);
++  return lj_opt_fwd_sbuf(J, tref_ref(tr)) ? tr : EMITFOLD;
++}
++
++/* The fast function ID of function objects is immutable. */
++LJFOLD(FLOAD KGC IRFL_FUNC_FFID)
++LJFOLDF(fload_func_ffid_kgc)
++{
++  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
++    return INTFOLD((int32_t)ir_kfunc(fleft)->c.ffid);
++  return NEXTFOLD;
++}
++
+ /* The C type ID of cdata objects is immutable. */
+ LJFOLD(FLOAD KGC IRFL_CDATA_CTYPEID)
+ LJFOLDF(fload_cdata_typeid_kgc)
+@@ -2358,18 +2403,24 @@ LJFOLDF(xload_kptr)
+ LJFOLD(XLOAD any any)
+ LJFOLDX(lj_opt_fwd_xload)
+ 
++/* -- Frame handling ------------------------------------------------------ */
++
++/* Prevent CSE of a REF_BASE operand across IR_RETF. */
++LJFOLD(SUB any BASE)
++LJFOLD(SUB BASE any)
++LJFOLD(EQ any BASE)
++LJFOLDF(fold_base)
++{
++  return lj_opt_cselim(J, J->chain[IR_RETF]);
++}
++
+ /* -- Write barriers ------------------------------------------------------ */
+ 
+ /* Write barriers are amenable to CSE, but not across any incremental
+ ** GC steps.
+-**
+-** The same logic applies to open upvalue references, because a stack
+-** may be resized during a GC step (not the current stack, but maybe that
+-** of a coroutine).
+ */
+ LJFOLD(TBAR any)
+ LJFOLD(OBAR any any)
+-LJFOLD(UREFO any any)
+ LJFOLDF(barrier_tab)
+ {
+   TRef tr = lj_opt_cse(J);
+@@ -2421,6 +2472,7 @@ LJFOLD(XSTORE any any)
+ LJFOLDX(lj_opt_dse_xstore)
+ 
+ LJFOLD(NEWREF any any)  /* Treated like a store. */
++LJFOLD(TMPREF any any)
+ LJFOLD(CALLA any any)
+ LJFOLD(CALLL any any)  /* Safeguard fallback. */
+ LJFOLD(CALLS any any)
+@@ -2431,7 +2483,6 @@ LJFOLD(TNEW any any)
+ LJFOLD(TDUP any)
+ LJFOLD(CNEW any any)
+ LJFOLD(XSNEW any any)
+-LJFOLD(BUFHDR any any)
+ LJFOLDX(lj_ir_emit)
+ 
+ /* ------------------------------------------------------------------------ */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_loop.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_loop.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_loop.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LOOP: Loop Optimizations.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_loop_c
+@@ -225,6 +225,7 @@ static void loop_subst_snap(jit_State *J
+   /* Setup new snapshot. */
+   snap->mapofs = (uint32_t)nmapofs;
+   snap->ref = (IRRef1)J->cur.nins;
++  snap->mcofs = 0;
+   snap->nslots = nslots;
+   snap->topslot = osnap->topslot;
+   snap->count = 0;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_mem.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_mem.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_mem.c
+@@ -3,7 +3,7 @@
+ ** AA: Alias Analysis using high-level semantic disambiguation.
+ ** FWD: Load Forwarding (L2L) + Store Forwarding (S2L).
+ ** DSE: Dead-Store Elimination.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_mem_c
+@@ -72,6 +72,34 @@ static AliasRet aa_table(jit_State *J, I
+   return aa_escape(J, taba, tabb);
+ }
+ 
++/* Check whether there's no aliasing table.clear for table ta among the CALLS with ref > lim. Returns 1 if none, 0 on conflict. */
++static int fwd_aa_tab_clear(jit_State *J, IRRef lim, IRRef ta)
++{
++  IRRef ref = J->chain[IR_CALLS];  /* Head of the CALLS chain. */
++  while (ref > lim) {  /* Follow the ->prev links down to lim (exclusive). */
++    IRIns *calls = IR(ref);
++    if (calls->op2 == IRCALL_lj_tab_clear &&  /* Other calls are ignored. */
++	(ta == calls->op1 || aa_table(J, ta, calls->op1) != ALIAS_NO))
++      return 0;  /* Conflict. */
++    ref = calls->prev;
++  }
++  return 1;  /* No conflict. Can safely FOLD/CSE. */
++}
++
++/* Check whether there's no aliasing NEWREF/table.clear (with ref > lim) for the left operand of fins. Returns 1 if none, 0 on conflict. */
++int LJ_FASTCALL lj_opt_fwd_tptr(jit_State *J, IRRef lim)
++{
++  IRRef ta = fins->op1;  /* Table reference: the left operand. */
++  IRRef ref = J->chain[IR_NEWREF];  /* Head of the NEWREF chain. */
++  while (ref > lim) {  /* Follow the ->prev links down to lim (exclusive). */
++    IRIns *newref = IR(ref);
++    if (ta == newref->op1 || aa_table(J, ta, newref->op1) != ALIAS_NO)
++      return 0;  /* Conflict. */
++    ref = newref->prev;
++  }
++  return fwd_aa_tab_clear(J, lim, ta);  /* No NEWREF conflict: result now depends on table.clear only. */
++}
++
+ /* Alias analysis for array and hash access using key-based disambiguation. */
+ static AliasRet aa_ahref(jit_State *J, IRIns *refa, IRIns *refb)
+ {
+@@ -154,9 +182,11 @@ static TRef fwd_ahload(jit_State *J, IRR
+     IRIns *ir = (xr->o == IR_HREFK || xr->o == IR_AREF) ? IR(xr->op1) : xr;
+     IRRef tab = ir->op1;
+     ir = IR(tab);
+-    if (ir->o == IR_TNEW || (ir->o == IR_TDUP && irref_isk(xr->op2))) {
++    if ((ir->o == IR_TNEW || (ir->o == IR_TDUP && irref_isk(xr->op2))) &&
++	fwd_aa_tab_clear(J, tab, tab)) {
+       /* A NEWREF with a number key may end up pointing to the array part.
+       ** But it's referenced from HSTORE and not found in the ASTORE chain.
++      ** Or a NEWREF may rehash the table and move unrelated number keys.
+       ** For now simply consider this a conflict without forwarding anything.
+       */
+       if (xr->o == IR_AREF) {
+@@ -167,6 +197,11 @@ static TRef fwd_ahload(jit_State *J, IRR
+ 	    goto cselim;
+ 	  ref2 = newref->prev;
+ 	}
++      } else {
++	IRIns *key = IR(xr->op2);
++	if (key->o == IR_KSLOT) key = IR(key->op1);
++	if (irt_isnum(key->t) && J->chain[IR_NEWREF] > tab)
++	  goto cselim;
+       }
+       /* NEWREF inhibits CSE for HREF, and dependent FLOADs from HREFK/AREF.
+       ** But the above search for conflicting stores was limited by xref.
+@@ -194,8 +229,8 @@ static TRef fwd_ahload(jit_State *J, IRR
+ 	if (key->o == IR_KSLOT) key = IR(key->op1);
+ 	lj_ir_kvalue(J->L, &keyv, key);
+ 	tv = lj_tab_get(J->L, ir_ktab(IR(ir->op1)), &keyv);
+-	lj_assertJ(itype2irt(tv) == irt_type(fins->t),
+-		   "mismatched type in constant table");
++	if (itype2irt(tv) != irt_type(fins->t))
++	  return 0;  /* Type instability in loop-carried dependency. */
+ 	if (irt_isnum(fins->t))
+ 	  return lj_ir_knum_u64(J, tv->u64);
+ 	else if (LJ_DUALNUM && irt_isint(fins->t))
+@@ -269,7 +304,7 @@ TRef LJ_FASTCALL lj_opt_fwd_hrefk(jit_St
+   while (ref > tab) {
+     IRIns *newref = IR(ref);
+     if (tab == newref->op1) {
+-      if (fright->op1 == newref->op2)
++      if (fright->op1 == newref->op2 && fwd_aa_tab_clear(J, ref, tab))
+ 	return ref;  /* Forward from NEWREF. */
+       else
+ 	goto docse;
+@@ -279,7 +314,7 @@ TRef LJ_FASTCALL lj_opt_fwd_hrefk(jit_St
+     ref = newref->prev;
+   }
+   /* No conflicting NEWREF: key location unchanged for HREFK of TDUP. */
+-  if (IR(tab)->o == IR_TDUP)
++  if (IR(tab)->o == IR_TDUP && fwd_aa_tab_clear(J, tab, tab))
+     fins->t.irt &= ~IRT_GUARD;  /* Drop HREFK guard. */
+ docse:
+   return CSEFOLD;
+@@ -313,34 +348,6 @@ int LJ_FASTCALL lj_opt_fwd_href_nokey(ji
+   return 1;  /* No conflict. Can fold to niltv. */
+ }
+ 
+-/* Check whether there's no aliasing table.clear. */
+-static int fwd_aa_tab_clear(jit_State *J, IRRef lim, IRRef ta)
+-{
+-  IRRef ref = J->chain[IR_CALLS];
+-  while (ref > lim) {
+-    IRIns *calls = IR(ref);
+-    if (calls->op2 == IRCALL_lj_tab_clear &&
+-	(ta == calls->op1 || aa_table(J, ta, calls->op1) != ALIAS_NO))
+-      return 0;  /* Conflict. */
+-    ref = calls->prev;
+-  }
+-  return 1;  /* No conflict. Can safely FOLD/CSE. */
+-}
+-
+-/* Check whether there's no aliasing NEWREF/table.clear for the left operand. */
+-int LJ_FASTCALL lj_opt_fwd_tptr(jit_State *J, IRRef lim)
+-{
+-  IRRef ta = fins->op1;
+-  IRRef ref = J->chain[IR_NEWREF];
+-  while (ref > lim) {
+-    IRIns *newref = IR(ref);
+-    if (ta == newref->op1 || aa_table(J, ta, newref->op1) != ALIAS_NO)
+-      return 0;  /* Conflict. */
+-    ref = newref->prev;
+-  }
+-  return fwd_aa_tab_clear(J, lim, ta);
+-}
+-
+ /* ASTORE/HSTORE elimination. */
+ TRef LJ_FASTCALL lj_opt_dse_ahstore(jit_State *J)
+ {
+@@ -364,7 +371,10 @@ TRef LJ_FASTCALL lj_opt_dse_ahstore(jit_
+       /* Different value: try to eliminate the redundant store. */
+       if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
+ 	IRIns *ir;
+-	/* Check for any intervening guards (includes conflicting loads). */
++	/* Check for any intervening guards (includes conflicting loads).
++	** Note that lj_tab_keyindex and lj_vm_next don't need guards,
++	** since they are followed by at least one guarded VLOAD.
++	*/
+ 	for (ir = IR(J->cur.nins-1); ir > store; ir--)
+ 	  if (irt_isguard(ir->t) || ir->o == IR_ALEN)
+ 	    goto doemit;  /* No elimination possible. */
+@@ -428,7 +438,7 @@ TRef LJ_FASTCALL lj_opt_fwd_alen(jit_Sta
+ 	    fins->op2 = aref->op2;  /* Set ALEN hint. */
+ 	  }
+ 	  goto doemit;  /* Conflicting store, possibly giving a hint. */
+-	} else if (aa_table(J, tab, fref->op1) == ALIAS_NO) {
++	} else if (aa_table(J, tab, fref->op1) != ALIAS_NO) {
+ 	  goto doemit;  /* Conflicting store. */
+ 	}
+ 	sref = store->prev;
+@@ -454,18 +464,23 @@ doemit:
+ */
+ static AliasRet aa_uref(IRIns *refa, IRIns *refb)
+ {
+-  if (refa->o != refb->o)
+-    return ALIAS_NO;  /* Different UREFx type. */
+   if (refa->op1 == refb->op1) {  /* Same function. */
+     if (refa->op2 == refb->op2)
+       return ALIAS_MUST;  /* Same function, same upvalue idx. */
+     else
+       return ALIAS_NO;  /* Same function, different upvalue idx. */
+   } else {  /* Different functions, check disambiguation hash values. */
+-    if (((refa->op2 ^ refb->op2) & 0xff))
++    if (((refa->op2 ^ refb->op2) & 0xff)) {
+       return ALIAS_NO;  /* Upvalues with different hash values cannot alias. */
+-    else
+-      return ALIAS_MAY;  /* No conclusion can be drawn for same hash value. */
++    } else if (refa->o != refb->o) {
++      /* Different UREFx type, but need to confirm the UREFO really is open. */
++      if (irt_type(refa->t) == IRT_IGC) refa->t.irt += IRT_PGC-IRT_IGC;
++      else if (irt_type(refb->t) == IRT_IGC) refb->t.irt += IRT_PGC-IRT_IGC;
++      return ALIAS_NO;
++    } else {
++      /* No conclusion can be drawn for same hash value and same UREFx type. */
++      return ALIAS_MAY;
++    }
+   }
+ }
+ 
+@@ -620,8 +635,9 @@ TRef LJ_FASTCALL lj_opt_dse_fstore(jit_S
+ 	goto doemit;
+       break;  /* Otherwise continue searching. */
+     case ALIAS_MUST:
+-      if (store->op2 == val)  /* Same value: drop the new store. */
+-	return DROPFOLD;
++      if (store->op2 == val &&
++	  !(xr->op2 >= IRFL_SBUF_W && xr->op2 <= IRFL_SBUF_R))
++	return DROPFOLD;  /* Same value: drop the new store. */
+       /* Different value: try to eliminate the redundant store. */
+       if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
+ 	IRIns *ir;
+@@ -642,6 +658,29 @@ doemit:
+   return EMITFOLD;  /* Otherwise we have a conflict or simply no match. */
+ }
+ 
++/* Check whether there's no aliasing buffer op between IRFL_SBUF_*. Only instructions with ref > lim are considered. Returns 1 if none, 0 on conflict. */
++int LJ_FASTCALL lj_opt_fwd_sbuf(jit_State *J, IRRef lim)
++{
++  IRRef ref;
++  if (J->chain[IR_BUFPUT] > lim)  /* Any BUFPUT above lim counts; its operands are not inspected. */
++    return 0;  /* Conflict. */
++  ref = J->chain[IR_CALLS];  /* First pass: the CALLS chain. */
++  while (ref > lim) {
++    IRIns *ir = IR(ref);
++    /* NOTE(review): the half-open id range presumably covers the buffer-writing helpers and relies on IRCALL enum order -- confirm. */
++    if (ir->op2 >= IRCALL_lj_strfmt_putint && ir->op2 < IRCALL_lj_buf_tostr)
++      return 0;  /* Conflict. */
++    ref = ir->prev;
++  }
++  ref = J->chain[IR_CALLL];  /* Second pass: the CALLL chain, same id range test. */
++  while (ref > lim) {
++    IRIns *ir = IR(ref);
++    if (ir->op2 >= IRCALL_lj_strfmt_putint && ir->op2 < IRCALL_lj_buf_tostr)
++      return 0;  /* Conflict. */
++    ref = ir->prev;
++  }
++  return 1;  /* No conflict. Can safely FOLD/CSE. */
++}
++
+ /* -- XLOAD forwarding and XSTORE elimination ----------------------------- */
+ 
+ /* Find cdata allocation for a reference (if any). */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_narrow.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_narrow.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_narrow.c
+@@ -1,7 +1,7 @@
+ /*
+ ** NARROW: Narrowing of numbers to integers (double to int32_t).
+ ** STRIPOV: Stripping of overflow checks.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_narrow_c
+@@ -584,36 +584,6 @@ TRef lj_opt_narrow_mod(jit_State *J, TRe
+   return emitir(IRTN(IR_SUB), rb, tmp);
+ }
+ 
+-/* Narrowing of power operator or math.pow. */
+-TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc)
+-{
+-  rb = conv_str_tonum(J, rb, vb);
+-  rb = lj_ir_tonum(J, rb);  /* Left arg is always treated as an FP number. */
+-  rc = conv_str_tonum(J, rc, vc);
+-  /* Narrowing must be unconditional to preserve (-x)^i semantics. */
+-  if (tvisint(vc) || numisint(numV(vc))) {
+-    int checkrange = 0;
+-    /* pow() is faster for bigger exponents. But do this only for (+k)^i. */
+-    if (tref_isk(rb) && (int32_t)ir_knum(IR(tref_ref(rb)))->u32.hi >= 0) {
+-      int32_t k = numberVint(vc);
+-      if (!(k >= -65536 && k <= 65536)) goto force_pow_num;
+-      checkrange = 1;
+-    }
+-    if (!tref_isinteger(rc)) {
+-      /* Guarded conversion to integer! */
+-      rc = emitir(IRTGI(IR_CONV), rc, IRCONV_INT_NUM|IRCONV_CHECK);
+-    }
+-    if (checkrange && !tref_isk(rc)) {  /* Range guard: -65536 <= i <= 65536 */
+-      TRef tmp = emitir(IRTI(IR_ADD), rc, lj_ir_kint(J, 65536));
+-      emitir(IRTGI(IR_ULE), tmp, lj_ir_kint(J, 2*65536));
+-    }
+-  } else {
+-force_pow_num:
+-    rc = lj_ir_tonum(J, rc);  /* Want POW(num, num), not POW(num, int). */
+-  }
+-  return emitir(IRTN(IR_POW), rb, rc);
+-}
+-
+ /* -- Predictive narrowing of induction variables ------------------------- */
+ 
+ /* Narrow a single runtime value. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_sink.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_sink.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_sink.c
+@@ -1,6 +1,6 @@
+ /*
+ ** SINK: Allocation Sinking and Store Sinking.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_sink_c
+@@ -36,12 +36,14 @@ static IRIns *sink_checkalloc(jit_State
+ }
+ 
+ /* Recursively check whether a value depends on a PHI. */
+-static int sink_phidep(jit_State *J, IRRef ref)
++static int sink_phidep(jit_State *J, IRRef ref, int *workp)
+ {
+   IRIns *ir = IR(ref);
++  if (!*workp) return 1;  /* Give up and pretend it does. */
++  (*workp)--;
+   if (irt_isphi(ir->t)) return 1;
+-  if (ir->op1 >= REF_FIRST && sink_phidep(J, ir->op1)) return 1;
+-  if (ir->op2 >= REF_FIRST && sink_phidep(J, ir->op2)) return 1;
++  if (ir->op1 >= REF_FIRST && sink_phidep(J, ir->op1, workp)) return 1;
++  if (ir->op2 >= REF_FIRST && sink_phidep(J, ir->op2, workp)) return 1;
+   return 0;
+ }
+ 
+@@ -56,7 +58,13 @@ static int sink_checkphi(jit_State *J, I
+       return 1;  /* Sinkable PHI. */
+     }
+     /* Otherwise the value must be loop-invariant. */
+-    return ref < J->loopref && !sink_phidep(J, ref);
++    if (ref < J->loopref) {
++      /* Check for PHI dependencies, but give up after reasonable effort. */
++      int work = 64;
++      return !sink_phidep(J, ref, &work);
++    } else {
++      return 0;  /* Loop-variant. */
++    }
+   }
+   return 1;  /* Constant (non-PHI). */
+ }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_split.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_opt_split.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_opt_split.c
+@@ -1,6 +1,6 @@
+ /*
+ ** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_opt_split_c
+@@ -400,7 +400,7 @@ static void split_ir(jit_State *J)
+ 	hi = split_call_ll(J, hisubst, oir, ir, IRCALL_softfp_div);
+ 	break;
+       case IR_POW:
+-	hi = split_call_li(J, hisubst, oir, ir, IRCALL_lj_vm_powi);
++	hi = split_call_ll(J, hisubst, oir, ir, IRCALL_pow);
+ 	break;
+       case IR_FPMATH:
+ 	hi = split_call_l(J, hisubst, oir, ir, IRCALL_lj_vm_floor + ir->op2);
+@@ -645,7 +645,7 @@ static void split_ir(jit_State *J)
+       tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
+ #endif
+       ir->prev = split_emit(J, IRTI(IR_CALLN), tmp, IRCALL_lj_vm_tobit);
+-    } else if (ir->o == IR_TOSTR) {
++    } else if (ir->o == IR_TOSTR || ir->o == IR_TMPREF) {
+       if (hisubst[ir->op1]) {
+ 	if (irref_isk(ir->op1))
+ 	  nir->op1 = ir->op1;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_parse.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_parse.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_parse.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Lua parser (source code -> bytecode).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -964,22 +964,22 @@ static void bcemit_unop(FuncState *fs, B
+ #if LJ_HASFFI
+       if (e->k == VKCDATA) {  /* Fold in-place since cdata is not interned. */
+ 	GCcdata *cd = cdataV(&e->u.nval);
+-	int64_t *p = (int64_t *)cdataptr(cd);
++	uint64_t *p = (uint64_t *)cdataptr(cd);
+ 	if (cd->ctypeid == CTID_COMPLEX_DOUBLE)
+-	  p[1] ^= (int64_t)U64x(80000000,00000000);
++	  p[1] ^= U64x(80000000,00000000);
+ 	else
+-	  *p = -*p;
++	  *p = ~*p+1u;
+ 	return;
+       } else
+ #endif
+       if (expr_isnumk(e) && !expr_numiszero(e)) {  /* Avoid folding to -0. */
+ 	TValue *o = expr_numtv(e);
+ 	if (tvisint(o)) {
+-	  int32_t k = intV(o);
+-	  if (k == -k)
++	  int32_t k = intV(o), negk = (int32_t)(~(uint32_t)k+1u);
++	  if (k == negk)
+ 	    setnumV(o, -(lua_Number)k);
+ 	  else
+-	    setintV(o, -k);
++	    setintV(o, negk);
+ 	  return;
+ 	} else {
+ 	  o->u64 ^= U64x(80000000,00000000);
+@@ -1465,7 +1465,7 @@ static size_t fs_prep_var(LexState *ls,
+     MSize len = s->len+1;
+     char *p = lj_buf_more(&ls->sb, len);
+     p = lj_buf_wmem(p, strdata(s), len);
+-    setsbufP(&ls->sb, p);
++    ls->sb.w = p;
+   }
+   *ofsvar = sbuflen(&ls->sb);
+   lastpc = 0;
+@@ -1486,7 +1486,7 @@ static size_t fs_prep_var(LexState *ls,
+       startpc = vs->startpc;
+       p = lj_strfmt_wuleb128(p, startpc-lastpc);
+       p = lj_strfmt_wuleb128(p, vs->endpc-startpc);
+-      setsbufP(&ls->sb, p);
++      ls->sb.w = p;
+       lastpc = startpc;
+     }
+   }
+@@ -1499,7 +1499,7 @@ static void fs_fixup_var(LexState *ls, G
+ {
+   setmref(pt->uvinfo, p);
+   setmref(pt->varinfo, (char *)p + ofsvar);
+-  memcpy(p, sbufB(&ls->sb), sbuflen(&ls->sb));  /* Copy from temp. buffer. */
++  memcpy(p, ls->sb.b, sbuflen(&ls->sb));  /* Copy from temp. buffer. */
+ }
+ #else
+ 
+@@ -1554,7 +1554,7 @@ static void fs_fixup_ret(FuncState *fs)
+ 	/* Replace with UCLO plus branch. */
+ 	fs->bcbase[pc].ins = BCINS_AD(BC_UCLO, 0, offset);
+ 	break;
+-      case BC_UCLO:
++      case BC_FNEW:
+ 	return;  /* We're done. */
+       default:
+ 	break;
+@@ -2513,11 +2513,14 @@ static void parse_for_num(LexState *ls,
+ */
+ static int predict_next(LexState *ls, FuncState *fs, BCPos pc)
+ {
+-  BCIns ins = fs->bcbase[pc].ins;
++  BCIns ins;
+   GCstr *name;
+   cTValue *o;
++  if (pc >= fs->bclim) return 0;
++  ins = fs->bcbase[pc].ins;
+   switch (bc_op(ins)) {
+   case BC_MOV:
++    if (bc_d(ins) >= fs->nactvar) return 0;
+     name = gco2str(gcref(var_get(ls, fs, bc_d(ins)).name));
+     break;
+   case BC_UGET:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_parse.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_parse.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_parse.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Lua parser (source code -> bytecode).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_PARSE_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_prng.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_prng.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_prng.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Pseudo-random number generation.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_prng_c
+@@ -83,10 +83,14 @@ extern int XNetRandom(void *buf, unsigne
+ 
+ extern int sys_get_random_number(void *buf, uint64_t len);
+ 
+-#elif LJ_TARGET_PS4 || LJ_TARGET_PSVITA
++#elif LJ_TARGET_PS4 || LJ_TARGET_PS5 || LJ_TARGET_PSVITA
+ 
+ extern int sceRandomGetRandomNumber(void *buf, size_t len);
+ 
++#elif LJ_TARGET_NX
++
++#include <unistd.h>
++
+ #elif LJ_TARGET_WINDOWS || LJ_TARGET_XBOXONE
+ 
+ #define WIN32_LEAN_AND_MEAN
+@@ -109,18 +113,24 @@ static PRGR libfunc_rgr;
+ #include <sys/syscall.h>
+ #else
+ 
+-#if LJ_TARGET_OSX
++#if LJ_TARGET_OSX && !LJ_TARGET_IOS
++/*
++** In their infinite wisdom Apple decided to disallow getentropy() in the
++** iOS App Store. Even though the call is common to all BSD-ish OS, it's
++** recommended by Apple in their own security-related docs, and, to top
++** off the foolery, /dev/urandom is handled by the same kernel code,
++** yet accessing it is actually permitted (but less efficient).
++*/
+ #include <Availability.h>
+-#if __MAC_OS_X_VERSION_MIN_REQUIRED >= 101200 || \
+-    __IPHONE_OS_VERSION_MIN_REQUIRED >= 100000
++#if __MAC_OS_X_VERSION_MIN_REQUIRED >= 101200
+ #define LJ_TARGET_HAS_GETENTROPY	1
+ #endif
+-#elif LJ_TARGET_BSD || LJ_TARGET_SOLARIS || LJ_TARGET_CYGWIN
++#elif (LJ_TARGET_BSD && !defined(__NetBSD__)) || LJ_TARGET_SOLARIS || LJ_TARGET_CYGWIN || LJ_TARGET_QNX
+ #define LJ_TARGET_HAS_GETENTROPY	1
+ #endif
+ 
+ #if LJ_TARGET_HAS_GETENTROPY
+-extern int getentropy(void *buf, size_t len);
++extern int getentropy(void *buf, size_t len)
+ #ifdef __ELF__
+   __attribute__((weak))
+ #endif
+@@ -165,9 +175,14 @@ int LJ_FASTCALL lj_prng_seed_secure(PRNG
+   if (sys_get_random_number(rs->u, sizeof(rs->u)) == 0)
+     goto ok;
+ 
+-#elif LJ_TARGET_PS4 || LJ_TARGET_PSVITA
++#elif LJ_TARGET_PS4 || LJ_TARGET_PS5 || LJ_TARGET_PSVITA
++
++  if (sceRandomGetRandomNumber(rs->u, sizeof(rs->u)) == 0)
++    goto ok;
++
++#elif LJ_TARGET_NX
+ 
+-  if (sceRandomGetRandomNumber(rs->u, sizeof(rs->u) == 0)
++  if (getentropy(rs->u, sizeof(rs->u)) == 0)
+     goto ok;
+ 
+ #elif LJ_TARGET_UWP || LJ_TARGET_XBOXONE
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_prng.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_prng.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_prng.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Pseudo-random number generation.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_PRNG_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_profile.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_profile.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_profile.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Low-overhead profiling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_profile_c
+@@ -185,7 +185,11 @@ static void profile_timer_start(ProfileS
+   tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000;
+   tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000;
+   setitimer(ITIMER_PROF, &tm, NULL);
++#if LJ_TARGET_QNX
++  sa.sa_flags = 0;
++#else
+   sa.sa_flags = SA_RESTART;
++#endif
+   sa.sa_handler = profile_signal;
+   sigemptyset(&sa.sa_mask);
+   sigaction(SIGPROF, &sa, &ps->oldsa);
+@@ -346,8 +350,7 @@ LUA_API void luaJIT_profile_stop(lua_Sta
+     lj_trace_flushall(L);
+ #endif
+     lj_buf_free(g, &ps->sb);
+-    setmref(ps->sb.b, NULL);
+-    setmref(ps->sb.e, NULL);
++    ps->sb.w = ps->sb.e = NULL;
+     ps->g = NULL;
+   }
+ }
+@@ -362,7 +365,7 @@ LUA_API const char *luaJIT_profile_dumps
+   lj_buf_reset(sb);
+   lj_debug_dumpstack(L, sb, fmt, depth);
+   *len = (size_t)sbuflen(sb);
+-  return sbufB(sb);
++  return sb->b;
+ }
+ 
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_profile.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_profile.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_profile.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Low-overhead profiling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_PROFILE_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_record.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_record.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_record.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace recorder (bytecode -> SSA IR).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_record_c
+@@ -116,6 +116,7 @@ static void rec_check_slots(jit_State *J
+       cTValue *tv = &base[s];
+       IRRef ref = tref_ref(tr);
+       IRIns *ir = NULL;  /* Silence compiler. */
++      lj_assertJ(tv < J->L->top, "slot %d above top of Lua stack", s);
+       if (!LJ_FR2 || ref || !(tr & (TREF_FRAME | TREF_CONT))) {
+ 	lj_assertJ(ref >= J->cur.nk && ref < J->cur.nins,
+ 		   "slot %d ref %04d out of range", s, ref - REF_BIAS);
+@@ -156,6 +157,9 @@ static void rec_check_slots(jit_State *J
+ 	lj_assertJ((J->slot[s+1+LJ_FR2] & TREF_FRAME),
+ 		   "cont slot %d not followed by frame", s);
+ 	depth++;
++      } else if ((tr & TREF_KEYINDEX)) {
++	lj_assertJ(tref_isint(tr), "keyindex slot %d bad type %d",
++				   s, tref_type(tr));
+       } else {
+ 	/* Number repr. may differ, but other types must be the same. */
+ 	lj_assertJ(tvisnumber(tv) ? tref_isnumber(tr) :
+@@ -259,6 +263,14 @@ TRef lj_record_constify(jit_State *J, cT
+     return 0;  /* Can't represent lightuserdata (pointless). */
+ }
+ 
++/* Emit a VLOAD of type t from ref at index idx. For primitive types the canonical TREF_PRI(t) is returned instead of the VLOAD result. */
++TRef lj_record_vload(jit_State *J, TRef ref, MSize idx, IRType t)
++{
++  TRef tr = emitir(IRTG(IR_VLOAD, t), ref, idx);  /* The load is emitted for every type, primitive or not. */
++  if (irtype_ispri(t)) tr = TREF_PRI(t);  /* Canonicalize primitives. */
++  return tr;
++}
++
+ /* -- Record loop ops ----------------------------------------------------- */
+ 
+ /* Loop event. */
+@@ -275,9 +287,9 @@ static void canonicalize_slots(jit_State
+   if (LJ_DUALNUM) return;
+   for (s = J->baseslot+J->maxslot-1; s >= 1; s--) {
+     TRef tr = J->slot[s];
+-    if (tref_isinteger(tr)) {
++    if (tref_isinteger(tr) && !(tr & TREF_KEYINDEX)) {
+       IRIns *ir = IR(tref_ref(tr));
+-      if (!(ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_READONLY)))
++      if (!(ir->o == IR_SLOAD && (ir->op2 & (IRSLOAD_READONLY))))
+ 	J->slot[s] = emitir(IRTN(IR_CONV), tr, IRCONV_NUM_INT);
+     }
+   }
+@@ -598,6 +610,7 @@ static void rec_loop_interp(jit_State *J
+ {
+   if (J->parent == 0 && J->exitno == 0) {
+     if (pc == J->startpc && J->framedepth + J->retdepth == 0) {
++      if (bc_op(J->cur.startins) == BC_ITERN) return;  /* See rec_itern(). */
+       /* Same loop? */
+       if (ev == LOOPEV_LEAVE)  /* Must loop back to form a root trace. */
+ 	lj_trace_err(J, LJ_TRERR_LLEAVE);
+@@ -638,6 +651,77 @@ static void rec_loop_jit(jit_State *J, T
+   }  /* Side trace continues across a loop that's left or not entered. */
+ }
+ 
++/* Record ITERN: one traversal step using slot ra-2 (table) and ra-1 (key index). Returns LOOPEV_ENTER when looping back or when the trace is stopped as a loop, LOOPEV_LEAVE otherwise. */
++static LoopEvent rec_itern(jit_State *J, BCReg ra, BCReg rb)
++{
++#if LJ_BE
++  /* YAGNI: Disabled on big-endian due to issues with lj_vm_next,
++  ** IR_HIOP, RID_RETLO/RID_RETHI and ra_destpair.
++  */
++  UNUSED(ra); UNUSED(rb);
++  setintV(&J->errinfo, (int32_t)BC_ITERN);  /* Store the offending bytecode op in errinfo. */
++  lj_trace_err_info(J, LJ_TRERR_NYIBC);  /* NOTE(review): presumably does not return -- no value is returned on this path. */
++#else
++  RecordIndex ix;
++  /* Since ITERN is recorded at the start, we need our own loop detection. */
++  if (J->pc == J->startpc &&
++      J->framedepth + J->retdepth == 0 && J->parent == 0 && J->exitno == 0) {
++    IRRef ref = REF_FIRST + LJ_HASPROFILE;  /* Threshold for "something has already been recorded". */
++#ifdef LUAJIT_ENABLE_CHECKHOOK
++    ref += 3;  /* NOTE(review): presumably the hook check occupies 3 instructions -- confirm. */
++#endif
++    if (J->cur.nins > ref ||
++       (LJ_HASPROFILE && J->cur.nins == ref && J->cur.ir[ref-1].o != IR_PROF)) {
++      J->instunroll = 0;  /* Cannot continue unrolling across an ITERN. */
++      lj_record_stop(J, LJ_TRLINK_LOOP, J->cur.traceno);  /* Looping trace. */
++      return LOOPEV_ENTER;
++    }
++  }
++  J->maxslot = ra;
++  lj_snap_add(J);  /* Required to make JLOOP the first ins in a side-trace. */
++  ix.tab = getslot(J, ra-2);  /* Table being traversed. */
++  ix.key = J->base[ra-1] ? J->base[ra-1] :
++	   sloadt(J, (int32_t)(ra-1), IRT_GUARD|IRT_INT,
++		  IRSLOAD_TYPECHECK|IRSLOAD_KEYINDEX);
++  copyTV(J->L, &ix.tabv, &J->L->base[ra-2]);  /* Runtime values matching ix.tab/ix.key. */
++  copyTV(J->L, &ix.keyv, &J->L->base[ra-1]);
++  ix.idxchain = (rb < 3);  /* Omit value type check, if unused. */
++  ix.mobj = 1;  /* We need the next index, too. */
++  J->maxslot = ra + lj_record_next(J, &ix);  /* lj_record_next() returns 1 or 2. */
++  J->needsnap = 1;
++  if (!tref_isnil(ix.key)) {  /* Looping back? */
++    J->base[ra-1] = ix.mobj | TREF_KEYINDEX;  /* Control var has next index. */
++    J->base[ra] = ix.key;
++    J->base[ra+1] = ix.val;
++    J->pc += bc_j(J->pc[1])+2;  /* Take the jump encoded in the instruction at pc[1]. */
++    return LOOPEV_ENTER;
++  } else {
++    J->maxslot = ra-3;
++    J->pc += 2;  /* No jump: advance by two instructions. */
++    return LOOPEV_LEAVE;
++  }
++#endif  /* LJ_BE */
++}
++
++/* Record ISNEXT: if slots ra-3..ra-1 hold the next() fast function, a table and nil, emit the matching guards and seed slot ra-1 with key index 0; otherwise abort the trace. */
++static void rec_isnext(jit_State *J, BCReg ra)
++{
++  cTValue *b = &J->L->base[ra-3];  /* Runtime values of the three control slots. */
++  if (tvisfunc(b) && funcV(b)->c.ffid == FF_next &&
++      tvistab(b+1) && tvisnil(b+2)) {
++    /* These checks are folded away for a compiled pairs(). */
++    TRef func = getslot(J, ra-3);
++    TRef trid = emitir(IRT(IR_FLOAD, IRT_U8), func, IRFL_FUNC_FFID);
++    emitir(IRTGI(IR_EQ), trid, lj_ir_kint(J, FF_next));  /* Guard: the function's ffid is FF_next. */
++    (void)getslot(J, ra-2); /* Type check for table. */
++    (void)getslot(J, ra-1); /* Type check for nil key. */
++    J->base[ra-1] = lj_ir_kint(J, 0) | TREF_KEYINDEX;  /* Constant 0 tagged as a key index. */
++    J->maxslot = ra;
++  } else {  /* Abort trace. Interpreter will despecialize bytecode. */
++    lj_trace_err(J, LJ_TRERR_RECERR);
++  }
++}
++
+ /* -- Record profiler hook checks ----------------------------------------- */
+ 
+ #if LJ_HASPROFILE
+@@ -708,7 +792,7 @@ static TRef rec_call_specialize(jit_Stat
+       /* NYI: io_file_iter doesn't have an ffid, yet. */
+       {  /* Specialize to the ffid. */
+ 	TRef trid = emitir(IRT(IR_FLOAD, IRT_U8), tr, IRFL_FUNC_FFID);
+-	emitir(IRTG(IR_EQ, IRT_INT), trid, lj_ir_kint(J, fn->c.ffid));
++	emitir(IRTGI(IR_EQ), trid, lj_ir_kint(J, fn->c.ffid));
+       }
+       return tr;
+     default:
+@@ -832,6 +916,7 @@ void lj_record_ret(jit_State *J, BCReg r
+     J->base -= cbase;
+     J->base[--rbase] = TREF_TRUE;  /* Prepend true to results. */
+     frame = frame_prevd(frame);
++    J->needsnap = 1;  /* Stop catching on-trace errors. */
+   }
+   /* Return to lower frame via interpreter for unhandled cases. */
+   if (J->framedepth == 0 && J->pt && bc_isret(bc_op(*J->pc)) &&
+@@ -891,6 +976,7 @@ void lj_record_ret(jit_State *J, BCReg r
+       emitir(IRTG(IR_RETF, IRT_PGC), trpt, trpc);
+       J->retdepth++;
+       J->needsnap = 1;
++      J->scev.idx = REF_NIL;
+       lj_assertJ(J->baseslot == 1+LJ_FR2, "bad baseslot for return");
+       /* Shift result slots up and clear the slots of the new frame below. */
+       memmove(J->base + cbase, J->base-1-LJ_FR2, sizeof(TRef)*nresults);
+@@ -918,6 +1004,9 @@ void lj_record_ret(jit_State *J, BCReg r
+       TRef tr = gotresults ? J->base[cbase+rbase] : TREF_NIL;
+       if (bslot != J->maxslot) {  /* Concatenate the remainder. */
+ 	TValue *b = J->L->base, save;  /* Simulate lower frame and result. */
++	/* Can't handle MM_concat + CALLT + fast func side-effects. */
++	if (J->postproc != LJ_POST_NONE)
++	  lj_trace_err(J, LJ_TRERR_NYIRETL);
+ 	J->base[J->maxslot] = tr;
+ 	copyTV(J->L, &save, b-(2<<LJ_FR2));
+ 	if (gotresults)
+@@ -1366,16 +1455,16 @@ static TRef rec_idx_key(jit_State *J, Re
+     key = emitir(IRTN(IR_CONV), key, IRCONV_NUM_INT);
+   if (tref_isk(key)) {
+     /* Optimize lookup of constant hash keys. */
+-    MSize hslot = (MSize)((char *)ix->oldv - (char *)&noderef(t->node)[0].val);
+-    if (t->hmask > 0 && hslot <= t->hmask*(MSize)sizeof(Node) &&
+-	hslot <= 65535*(MSize)sizeof(Node)) {
++    GCSize hslot = (GCSize)((char *)ix->oldv-(char *)&noderef(t->node)[0].val);
++    if (hslot <= t->hmask*(GCSize)sizeof(Node) &&
++	hslot <= 65535*(GCSize)sizeof(Node)) {
+       TRef node, kslot, hm;
+       *rbref = J->cur.nins;  /* Mark possible rollback point. */
+       *rbguard = J->guardemit;
+       hm = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_HMASK);
+       emitir(IRTGI(IR_EQ), hm, lj_ir_kint(J, (int32_t)t->hmask));
+       node = emitir(IRT(IR_FLOAD, IRT_PGC), ix->tab, IRFL_TAB_NODE);
+-      kslot = lj_ir_kslot(J, key, hslot / sizeof(Node));
++      kslot = lj_ir_kslot(J, key, (IRRef)(hslot / sizeof(Node)));
+       return emitir(IRTG(IR_HREFK, IRT_PGC), node, kslot);
+     }
+   }
+@@ -1433,6 +1522,16 @@ TRef lj_record_idx(jit_State *J, RecordI
+ 	return 0;  /* No result yet. */
+       }
+     }
++#if LJ_HASBUFFER
++    /* The index table of buffer objects is treated as immutable. */
++    if (ix->mt == TREF_NIL && !ix->val &&
++	tref_isudata(ix->tab) && udataV(&ix->tabv)->udtype == UDTYPE_BUFFER &&
++	tref_istab(ix->mobj) && tref_isstr(ix->key) && tref_isk(ix->key)) {
++      cTValue *val = lj_tab_getstr(tabV(&ix->mobjv), strV(&ix->keyv));
++      TRef tr = lj_record_constify(J, val);
++      if (tr) return tr;  /* Specialize to the value, i.e. a method. */
++    }
++#endif
+     /* Otherwise retry lookup with metaobject. */
+     ix->tab = ix->mobj;
+     copyTV(J->L, &ix->tabv, &ix->mobjv);
+@@ -1501,8 +1600,16 @@ TRef lj_record_idx(jit_State *J, RecordI
+       lj_assertJ(!hasmm, "inconsistent metamethod handling");
+       if (oldv == niltvg(J2G(J))) {  /* Need to insert a new key. */
+ 	TRef key = ix->key;
+-	if (tref_isinteger(key))  /* NEWREF needs a TValue as a key. */
++	if (tref_isinteger(key)) {  /* NEWREF needs a TValue as a key. */
+ 	  key = emitir(IRTN(IR_CONV), key, IRCONV_NUM_INT);
++	} else if (tref_isnum(key)) {
++	  if (tref_isk(key)) {
++	    if (tvismzero(&ix->keyv))
++	      key = lj_ir_knum_zero(J);  /* Canonicalize -0.0 to +0.0. */
++	  } else {
++	    emitir(IRTG(IR_EQ, IRT_NUM), key, key);  /* Check for !NaN. */
++	  }
++	}
+ 	xref = emitir(IRT(IR_NEWREF, IRT_PGC), ix->tab, key);
+ 	keybarrier = 0;  /* NEWREF already takes care of the key barrier. */
+ #ifdef LUAJIT_ENABLE_TABLE_BUMP
+@@ -1543,6 +1650,47 @@ TRef lj_record_idx(jit_State *J, RecordI
+   }
+ }
+ 
+/* Determine result types of table traversal: key type + (value type << 8). */
+static IRType rec_next_types(GCtab *t, uint32_t idx)
+{
+  for (; idx < t->asize; idx++) {  /* Scan the array part from idx onwards. */
+    cTValue *a = arrayslot(t, idx);
+    if (LJ_LIKELY(!tvisnil(a)))  /* Array part: key type is INT or NUM. */
+      return (LJ_DUALNUM ? IRT_INT : IRT_NUM) + (itype2irt(a) << 8);
+  }
+  idx -= t->asize;  /* Rebase the index onto the hash part. */
+  for (; idx <= t->hmask; idx++) {  /* Scan hash nodes for a non-nil value. */
+    Node *n = &noderef(t->node)[idx];
+    if (!tvisnil(&n->val))
+      return itype2irt(&n->key) + (itype2irt(&n->val) << 8);
+  }
+  return IRT_NIL + (IRT_NIL << 8);  /* Nothing found: nil key and nil value. */
+}
++
+/* Record a table traversal step aka next(). Returns 1 or 2 (see below). */
+int lj_record_next(jit_State *J, RecordIndex *ix)
+{
+  IRType t, tkey, tval;
+  TRef trvk;
+  t = rec_next_types(tabV(&ix->tabv), ix->keyv.u32.lo);
+  tkey = (t & 0xff); tval = (t >> 8);  /* Unpack predicted key/value types. */
+  trvk = lj_ir_call(J, IRCALL_lj_vm_next, ix->tab, ix->key);
+  if (ix->mobj || tkey == IRT_NIL) {
+    TRef idx = emitir(IRTI(IR_HIOP), trvk, trvk);
+    /* Always check for invalid key from next() for nil result. */
+    if (!ix->mobj) emitir(IRTGI(IR_NE), idx, lj_ir_kint(J, -1));
+    ix->mobj = idx;  /* ix->mobj now holds the index TRef. */
+  }
+  ix->key = lj_record_vload(J, trvk, 1, tkey);
+  if (tkey == IRT_NIL || ix->idxchain) {  /* Omit value type check. */
+    ix->val = TREF_NIL;
+    return 1;  /* Only ix->key was loaded. */
+  } else {  /* Need value. */
+    ix->val = lj_record_vload(J, trvk, 0, tval);
+    return 2;  /* Both ix->key and ix->val were loaded. */
+  }
+}
++
+ static void rec_tsetm(jit_State *J, BCReg ra, BCReg rn, int32_t i)
+ {
+   RecordIndex ix;
+@@ -1625,16 +1773,16 @@ noconstify:
+   /* Note: this effectively limits LJ_MAX_UPVAL to 127. */
+   uv = (uv << 8) | (hashrot(uvp->dhash, uvp->dhash + HASH_BIAS) & 0xff);
+   if (!uvp->closed) {
+-    uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_PGC), fn, uv));
+     /* In current stack? */
+     if (uvval(uvp) >= tvref(J->L->stack) &&
+ 	uvval(uvp) < tvref(J->L->maxstack)) {
+       int32_t slot = (int32_t)(uvval(uvp) - (J->L->base - J->baseslot));
+       if (slot >= 0) {  /* Aliases an SSA slot? */
++	uref = tref_ref(emitir(IRT(IR_UREFO, IRT_PGC), fn, uv));
+ 	emitir(IRTG(IR_EQ, IRT_PGC),
+ 	       REF_BASE,
+ 	       emitir(IRT(IR_ADD, IRT_PGC), uref,
+-		      lj_ir_kint(J, (slot - 1 - LJ_FR2) * -8)));
++		      lj_ir_kintpgc(J, (slot - 1 - LJ_FR2) * -8)));
+ 	slot -= (int32_t)J->baseslot;  /* Note: slot number may be negative! */
+ 	if (val == 0) {
+ 	  return getslot(J, slot);
+@@ -1645,12 +1793,21 @@ noconstify:
+ 	}
+       }
+     }
++    /* IR_UREFO+IRT_IGC is not checked for open-ness at runtime.
++    ** Always marked as a guard, since it might get promoted to IRT_PGC later.
++    */
++    uref = emitir(IRTG(IR_UREFO, tref_isgcv(val) ? IRT_PGC : IRT_IGC), fn, uv);
++    uref = tref_ref(uref);
+     emitir(IRTG(IR_UGT, IRT_PGC),
+ 	   emitir(IRT(IR_SUB, IRT_PGC), uref, REF_BASE),
+-	   lj_ir_kint(J, (J->baseslot + J->maxslot) * 8));
++	   lj_ir_kintpgc(J, (J->baseslot + J->maxslot) * 8));
+   } else {
++    /* If fn is constant, then so is the GCupval*, and the upvalue cannot
++    ** transition back to open, so no guard is required in this case.
++    */
++    IRType t = (tref_isk(fn) ? 0 : IRT_GUARD) | IRT_PGC;
++    uref = tref_ref(emitir(IRT(IR_UREFC, t), fn, uv));
+     needbarrier = 1;
+-    uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_PGC), fn, uv));
+   }
+   if (val == 0) {  /* Upvalue load */
+     IRType t = itype2irt(uvval(uvp));
+@@ -1801,12 +1958,14 @@ static void rec_varg(jit_State *J, BCReg
+   if (J->framedepth > 0) {  /* Simple case: varargs defined on-trace. */
+     ptrdiff_t i;
+     if (nvararg < 0) nvararg = 0;
+-    if (nresults == -1) {
+-      nresults = nvararg;
+-      J->maxslot = dst + (BCReg)nvararg;
+-    } else if (dst + nresults > J->maxslot) {
++    if (nresults != 1) {
++      if (nresults == -1) nresults = nvararg;
+       J->maxslot = dst + (BCReg)nresults;
++    } else if (dst >= J->maxslot) {
++      J->maxslot = dst + 1;
+     }
++    if (J->baseslot + J->maxslot >= LJ_MAX_JSLOTS)
++      lj_trace_err(J, LJ_TRERR_STACKOV);
+     for (i = 0; i < nresults; i++)
+       J->base[dst+i] = i < nvararg ? getslot(J, i - nvararg - 1 - LJ_FR2) : TREF_NIL;
+   } else {  /* Unknown number of varargs passed to trace. */
+@@ -1823,14 +1982,11 @@ static void rec_varg(jit_State *J, BCReg
+ 	  emitir(IRTGI(IR_EQ), fr,
+ 		 lj_ir_kint(J, (int32_t)frame_ftsz(J->L->base-1)));
+ 	vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr);
+-	vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8));
++	vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase,
++		       lj_ir_kintpgc(J, frofs-8*(1+LJ_FR2)));
+ 	for (i = 0; i < nload; i++) {
+ 	  IRType t = itype2irt(&J->L->base[i-1-LJ_FR2-nvararg]);
+-	  TRef aref = emitir(IRT(IR_AREF, IRT_PGC),
+-			     vbase, lj_ir_kint(J, (int32_t)i));
+-	  TRef tr = emitir(IRTG(IR_VLOAD, t), aref, 0);
+-	  if (irtype_ispri(t)) tr = TREF_PRI(t);  /* Canonicalize primitives. */
+-	  J->base[dst+i] = tr;
++	  J->base[dst+i] = lj_record_vload(J, vbase, (MSize)i, t);
+ 	}
+       } else {
+ 	emitir(IRTGI(IR_LE), fr, lj_ir_kint(J, frofs));
+@@ -1838,15 +1994,19 @@ static void rec_varg(jit_State *J, BCReg
+       }
+       for (i = nvararg; i < nresults; i++)
+ 	J->base[dst+i] = TREF_NIL;
+-      if (dst + (BCReg)nresults > J->maxslot)
++      if (nresults != 1 || dst >= J->maxslot) {
+ 	J->maxslot = dst + (BCReg)nresults;
++      }
+     } else if (select_detect(J)) {  /* y = select(x, ...) */
+       TRef tridx = J->base[dst-1];
+       TRef tr = TREF_NIL;
+       ptrdiff_t idx = lj_ffrecord_select_mode(J, tridx, &J->L->base[dst-1]);
+       if (idx < 0) goto nyivarg;
+-      if (idx != 0 && !tref_isinteger(tridx))
++      if (idx != 0 && !tref_isinteger(tridx)) {
++	if (tref_isstr(tridx))
++	  tridx = emitir(IRTG(IR_STRTO, IRT_NUM), tridx, 0);
+ 	tridx = emitir(IRTGI(IR_CONV), tridx, IRCONV_INT_NUM|IRCONV_INDEX);
++      }
+       if (idx != 0 && tref_isk(tridx)) {
+ 	emitir(IRTGI(idx <= nvararg ? IR_GE : IR_LT),
+ 	       fr, lj_ir_kint(J, frofs+8*(int32_t)idx));
+@@ -1874,11 +2034,10 @@ static void rec_varg(jit_State *J, BCReg
+ 	IRType t;
+ 	TRef aref, vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr);
+ 	vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase,
+-		       lj_ir_kint(J, frofs-(8<<LJ_FR2)));
++		       lj_ir_kintpgc(J, frofs-(8<<LJ_FR2)));
+ 	t = itype2irt(&J->L->base[idx-2-LJ_FR2-nvararg]);
+ 	aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, tridx);
+-	tr = emitir(IRTG(IR_VLOAD, t), aref, 0);
+-	if (irtype_ispri(t)) tr = TREF_PRI(t);  /* Canonicalize primitives. */
++	tr = lj_record_vload(J, aref, 0, t);
+       }
+       J->base[dst-2-LJ_FR2] = tr;
+       J->maxslot = dst-1-LJ_FR2;
+@@ -1889,8 +2048,6 @@ static void rec_varg(jit_State *J, BCReg
+       lj_trace_err_info(J, LJ_TRERR_NYIBC);
+     }
+   }
+-  if (J->baseslot + J->maxslot >= LJ_MAX_JSLOTS)
+-    lj_trace_err(J, LJ_TRERR_STACKOV);
+ }
+ 
+ /* -- Record allocations -------------------------------------------------- */
+@@ -1915,7 +2072,7 @@ static TRef rec_tnew(jit_State *J, uint3
+ static TRef rec_cat(jit_State *J, BCReg baseslot, BCReg topslot)
+ {
+   TRef *top = &J->base[topslot];
+-  TValue savetv[5];
++  TValue savetv[5+LJ_FR2];
+   BCReg s;
+   RecordIndex ix;
+   lj_assertJ(baseslot < topslot, "bad CAT arg");
+@@ -1935,9 +2092,9 @@ static TRef rec_cat(jit_State *J, BCReg
+     tr = hdr = emitir(IRT(IR_BUFHDR, IRT_PGC),
+ 		      lj_ir_kptr(J, &J2G(J)->tmpbuf), IRBUFHDR_RESET);
+     do {
+-      tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, *trp++);
++      tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr, *trp++);
+     } while (trp <= top);
+-    tr = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
++    tr = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr);
+     J->maxslot = (BCReg)(xbase - J->base);
+     if (xbase == base) return tr;  /* Return simple concatenation result. */
+     /* Pass partial result. */
+@@ -2050,7 +2207,7 @@ void lj_record_ins(jit_State *J)
+   /* Need snapshot before recording next bytecode (e.g. after a store). */
+   if (J->needsnap) {
+     J->needsnap = 0;
+-    lj_snap_purge(J);
++    if (J->pt) lj_snap_purge(J);
+     lj_snap_add(J);
+     J->mergesnap = 1;
+   }
+@@ -2105,6 +2262,7 @@ void lj_record_ins(jit_State *J)
+   case BCMpri: setpriV(rcv, ~rc); ix.key = rc = TREF_PRI(IRT_NIL+rc); break;
+   case BCMnum: { cTValue *tv = proto_knumtv(J->pt, rc);
+     copyTV(J->L, rcv, tv); ix.key = rc = tvisint(tv) ? lj_ir_kint(J, intV(tv)) :
++    tv->u32.hi == LJ_KEYINDEX ? (lj_ir_kint(J, 0) | TREF_KEYINDEX) :
+     lj_ir_knumint(J, numV(tv)); } break;
+   case BCMstr: { GCstr *s = gco2str(proto_kgc(J->pt, ~(ptrdiff_t)rc));
+     setstrV(J->L, rcv, s); ix.key = rc = lj_ir_kstr(J, s); } break;
+@@ -2267,7 +2425,7 @@ void lj_record_ins(jit_State *J)
+ 
+   case BC_POW:
+     if (tref_isnumber_str(rb) && tref_isnumber_str(rc))
+-      rc = lj_opt_narrow_pow(J, rb, rc, rbv, rcv);
++      rc = lj_opt_narrow_arith(J, rb, rc, rbv, rcv, IR_POW);
+     else
+       rc = rec_mm_arith(J, &ix, MM_pow);
+     break;
+@@ -2341,6 +2499,7 @@ void lj_record_ins(jit_State *J)
+ 
+   case BC_TSETM:
+     rec_tsetm(J, ra, (BCReg)(J->L->top - J->L->base), (int32_t)rcv->u32.lo);
++    J->maxslot = ra;  /* The table slot at ra-1 is the highest used slot. */
+     break;
+ 
+   case BC_TNEW:
+@@ -2423,6 +2582,9 @@ void lj_record_ins(jit_State *J)
+   case BC_ITERL:
+     rec_loop_interp(J, pc, rec_iterl(J, *pc));
+     break;
++  case BC_ITERN:
++    rec_loop_interp(J, pc, rec_itern(J, ra, rb));
++    break;
+   case BC_LOOP:
+     rec_loop_interp(J, pc, rec_loop(J, ra, 1));
+     break;
+@@ -2435,7 +2597,8 @@ void lj_record_ins(jit_State *J)
+     break;
+   case BC_JLOOP:
+     rec_loop_jit(J, rc, rec_loop(J, ra,
+-				 !bc_isret(bc_op(traceref(J, rc)->startins))));
++				 !bc_isret(bc_op(traceref(J, rc)->startins)) &&
++				 bc_op(traceref(J, rc)->startins) != BC_ITERN));
+     break;
+ 
+   case BC_IFORL:
+@@ -2451,6 +2614,10 @@ void lj_record_ins(jit_State *J)
+       J->maxslot = ra;  /* Shrink used slots. */
+     break;
+ 
++  case BC_ISNEXT:
++    rec_isnext(J, ra);
++    break;
++
+   /* -- Function headers -------------------------------------------------- */
+ 
+   case BC_FUNCF:
+@@ -2480,8 +2647,6 @@ void lj_record_ins(jit_State *J)
+       break;
+     }
+     /* fallthrough */
+-  case BC_ITERN:
+-  case BC_ISNEXT:
+   case BC_UCLO:
+   case BC_FNEW:
+     setintV(&J->errinfo, (int32_t)op);
+@@ -2526,6 +2691,8 @@ static const BCIns *rec_setup_root(jit_S
+     J->bc_min = pc;
+     break;
+   case BC_ITERL:
++    if (bc_op(pc[-1]) == BC_JLOOP)
++      lj_trace_err(J, LJ_TRERR_LINNER);
+     lj_assertJ(bc_op(pc[-1]) == BC_ITERC, "no ITERC before ITERL");
+     J->maxslot = ra + bc_b(pc[-1]) - 1;
+     J->bc_extent = (MSize)(-bc_j(ins))*sizeof(BCIns);
+@@ -2533,6 +2700,13 @@ static const BCIns *rec_setup_root(jit_S
+     lj_assertJ(bc_op(pc[-1]) == BC_JMP, "ITERL does not point to JMP+1");
+     J->bc_min = pc;
+     break;
++  case BC_ITERN:
++    lj_assertJ(bc_op(pc[1]) == BC_ITERL, "no ITERL after ITERN");
++    J->maxslot = ra;
++    J->bc_extent = (MSize)(-bc_j(pc[1]))*sizeof(BCIns);
++    J->bc_min = pc+2 + bc_j(pc[1]);
++    J->state = LJ_TRACE_RECORD_1ST;  /* Record the first ITERN, too. */
++    break;
+   case BC_LOOP:
+     /* Only check BC range for real loops, but not for "repeat until true". */
+     pcj = pc + bc_j(ins);
+@@ -2629,9 +2803,14 @@ void lj_record_setup(jit_State *J)
+     }
+     lj_snap_replay(J, T);
+   sidecheck:
+-    if (traceref(J, J->cur.root)->nchild >= J->param[JIT_P_maxside] ||
+-	T->snap[J->exitno].count >= J->param[JIT_P_hotexit] +
+-				    J->param[JIT_P_tryside]) {
++    if ((traceref(J, J->cur.root)->nchild >= J->param[JIT_P_maxside] ||
++	 T->snap[J->exitno].count >= J->param[JIT_P_hotexit] +
++				     J->param[JIT_P_tryside])) {
++      if (bc_op(*J->pc) == BC_JLOOP) {
++	BCIns startins = traceref(J, bc_d(*J->pc))->startins;
++	if (bc_op(startins) == BC_ITERN)
++	  rec_itern(J, bc_a(startins), bc_b(startins));
++      }
+       lj_record_stop(J, LJ_TRLINK_INTERP, 0);
+     }
+   } else {  /* Root trace. */
+@@ -2640,6 +2819,7 @@ void lj_record_setup(jit_State *J)
+     J->pc = rec_setup_root(J);
+     /* Note: the loop instruction itself is recorded at the end and not
+     ** at the start! So snapshot #0 needs to point to the *next* instruction.
++    ** The one exception is BC_ITERN, which sets LJ_TRACE_RECORD_1ST.
+     */
+     lj_snap_add(J);
+     if (bc_op(J->cur.startins) == BC_FORL)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_record.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_record.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_record.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace recorder (bytecode -> SSA IR).
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_RECORD_H
+@@ -30,6 +30,7 @@ LJ_FUNC int lj_record_objcmp(jit_State *
+ 			     cTValue *av, cTValue *bv);
+ LJ_FUNC void lj_record_stop(jit_State *J, TraceLink linktype, TraceNo lnk);
+ LJ_FUNC TRef lj_record_constify(jit_State *J, cTValue *o);
++LJ_FUNC TRef lj_record_vload(jit_State *J, TRef ref, MSize idx, IRType t);
+ 
+ LJ_FUNC void lj_record_call(jit_State *J, BCReg func, ptrdiff_t nargs);
+ LJ_FUNC void lj_record_tailcall(jit_State *J, BCReg func, ptrdiff_t nargs);
+@@ -37,6 +38,7 @@ LJ_FUNC void lj_record_ret(jit_State *J,
+ 
+ LJ_FUNC int lj_record_mm_lookup(jit_State *J, RecordIndex *ix, MMS mm);
+ LJ_FUNC TRef lj_record_idx(jit_State *J, RecordIndex *ix);
++LJ_FUNC int lj_record_next(jit_State *J, RecordIndex *ix);
+ 
+ LJ_FUNC void lj_record_ins(jit_State *J);
+ LJ_FUNC void lj_record_setup(jit_State *J);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_serialize.c
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_serialize.c
+@@ -0,0 +1,539 @@
++/*
++** Object de/serialization.
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++#define lj_serialize_c
++#define LUA_CORE
++
++#include "lj_obj.h"
++
++#if LJ_HASBUFFER
++#include "lj_err.h"
++#include "lj_buf.h"
++#include "lj_str.h"
++#include "lj_tab.h"
++#include "lj_udata.h"
++#if LJ_HASFFI
++#include "lj_ctype.h"
++#include "lj_cdata.h"
++#endif
++#if LJ_HASJIT
++#include "lj_ir.h"
++#endif
++#include "lj_serialize.h"
++
+/* Tags for internal serialization format. Tags are written as U124 values. */
+enum {
+  SER_TAG_NIL,		/* 0x00 */
+  SER_TAG_FALSE,
+  SER_TAG_TRUE,
+  SER_TAG_NULL,
+  SER_TAG_LIGHTUD32,
+  SER_TAG_LIGHTUD64,
+  SER_TAG_INT,
+  SER_TAG_NUM,
+  SER_TAG_TAB,		/* 0x08: +1 = hash count, +2 or +4 = array count follows. */
+  SER_TAG_DICT_MT = SER_TAG_TAB+6,  /* 0x0e: metatable dictionary index. */
+  SER_TAG_DICT_STR,  /* 0x0f: string dictionary index. */
+  SER_TAG_INT64,	/* 0x10 */
+  SER_TAG_UINT64,
+  SER_TAG_COMPLEX,
+  SER_TAG_0x13,  /* 0x13..0x1f: never written by the encoder in this file. */
+  SER_TAG_0x14,
+  SER_TAG_0x15,
+  SER_TAG_0x16,
+  SER_TAG_0x17,
+  SER_TAG_0x18,		/* 0x18 */
+  SER_TAG_0x19,
+  SER_TAG_0x1a,
+  SER_TAG_0x1b,
+  SER_TAG_0x1c,
+  SER_TAG_0x1d,
+  SER_TAG_0x1e,
+  SER_TAG_0x1f,
+  SER_TAG_STR,		/* 0x20 + str->len */
+};
+LJ_STATIC_ASSERT((SER_TAG_TAB & 7) == 0);  /* Low 3 bits carry table flags. */
++
++/* -- Helper functions ---------------------------------------------------- */
++
+static LJ_AINLINE char *serialize_more(char *w, SBufExt *sbx, MSize sz)
+{  /* Returns w unchanged unless sz exceeds the space left before sbx->e. */
+  if (LJ_UNLIKELY(sz > (MSize)(sbx->e - w))) {
+    sbx->w = w;  /* Store the write position before calling lj_buf_more2(). */
+    w = lj_buf_more2((SBuf *)sbx, sz);  /* Continue at the returned pointer. */
+  }
+  return w;
+}
++
+/* Write U124 to buffer. Out-of-line part for v >= 0xe0 (2 or 5 bytes). */
+static LJ_NOINLINE char *serialize_wu124_(char *w, uint32_t v)
+{
+  if (v < 0x1fe0) {  /* 2-byte form: lead byte 0xe0..0xfe + low byte. */
+    v -= 0xe0;
+    *w++ = (char)(0xe0 | (v >> 8)); *w++ = (char)v;
+  } else {  /* 5-byte form: 0xff marker + 32 bit value. */
+    *w++ = (char)0xff;
+#if LJ_BE
+    v = lj_bswap(v);
+#endif
+    memcpy(w, &v, 4); w += 4;
+  }
+  return w;
+}
++
+static LJ_AINLINE char *serialize_wu124(char *w, uint32_t v)
+{  /* Write U124 at w and return the advanced pointer. No bounds check. */
+  if (LJ_LIKELY(v < 0xe0)) {  /* 1-byte form: the value itself. */
+    *w++ = (char)v;
+    return w;
+  } else {
+    return serialize_wu124_(w, v);
+  }
+}
++
+static LJ_NOINLINE char *serialize_ru124_(char *r, char *w, uint32_t *pv)
+{  /* Out-of-line part: *pv holds the lead byte, r points behind it. */
+  uint32_t v = *pv;
+  if (v != 0xff) {  /* 2-byte form. */
+    if (r >= w) return NULL;  /* Input ends before the second byte. */
+    v = ((v & 0x1f) << 8) + *(uint8_t *)r + 0xe0; r++;
+  } else {  /* 5-byte form. */
+    if (r + 4 > w) return NULL;  /* Input ends before the 32 bit value. */
+    v = lj_getu32(r); r += 4;
+#if LJ_BE
+    v = lj_bswap(v);
+#endif
+  }
+  *pv = v;
+  return r;
+}
++
+static LJ_AINLINE char *serialize_ru124(char *r, char *w, uint32_t *pv)
+{  /* Read U124 from [r,w) into *pv. Returns new read pointer or NULL. */
+  if (LJ_LIKELY(r < w)) {
+    uint32_t v = *(uint8_t *)r; r++;
+    *pv = v;  /* Values below 0xe0 are complete after one byte. */
+    if (LJ_UNLIKELY(v >= 0xe0)) {
+      r = serialize_ru124_(r, w, pv);
+    }
+    return r;
+  }
+  return NULL;  /* Nothing left to read. */
+}
++
+/* Prepare string dictionary for use (once): add string -> index hash keys. */
+void LJ_FASTCALL lj_serialize_dict_prep_str(lua_State *L, GCtab *dict)
+{
+  if (!dict->hmask) {  /* No hash part means not prepared, yet. */
+    MSize i, len = lj_tab_len(dict);
+    if (!len) return;  /* Empty dictionary: nothing to prepare. */
+    lj_tab_resize(L, dict, dict->asize, hsize2hbits(len));
+    for (i = 1; i <= len && i < dict->asize; i++) {
+      cTValue *o = arrayslot(dict, i);
+      if (tvisstr(o)) {
+	if (!lj_tab_getstr(dict, strV(o))) {  /* Ignore dups. */
+	  lj_tab_newkey(L, dict, o)->u64 = (uint64_t)(i-1);  /* 0-based index. */
+	}
+      } else if (!tvisfalse(o)) {  /* Only strings and false are accepted. */
+	lj_err_caller(L, LJ_ERR_BUFFER_BADOPT);
+      }
+    }
+  }
+}
++
+/* Prepare metatable dictionary for use (once): add table -> index hash keys. */
+void LJ_FASTCALL lj_serialize_dict_prep_mt(lua_State *L, GCtab *dict)
+{
+  if (!dict->hmask) {  /* No hash part means not prepared, yet. */
+    MSize i, len = lj_tab_len(dict);
+    if (!len) return;  /* Empty dictionary: nothing to prepare. */
+    lj_tab_resize(L, dict, dict->asize, hsize2hbits(len));
+    for (i = 1; i <= len && i < dict->asize; i++) {
+      cTValue *o = arrayslot(dict, i);
+      if (tvistab(o)) {
+	if (tvisnil(lj_tab_get(L, dict, o))) {  /* Ignore dups. */
+	  lj_tab_newkey(L, dict, o)->u64 = (uint64_t)(i-1);  /* 0-based index. */
+	}
+      } else if (!tvisfalse(o)) {  /* Only tables and false are accepted. */
+	lj_err_caller(L, LJ_ERR_BUFFER_BADOPT);
+      }
+    }
+  }
+}
++
++/* -- Internal serializer ------------------------------------------------- */
++
+/* Put serialized object into buffer. Returns the new write pointer. */
+static char *serialize_put(char *w, SBufExt *sbx, cTValue *o)
+{
+  if (LJ_LIKELY(tvisstr(o))) {
+    const GCstr *str = strV(o);
+    MSize len = str->len;
+    w = serialize_more(w, sbx, 5+len);  /* U124 needs at most 5 bytes. */
+    w = serialize_wu124(w, SER_TAG_STR + len);
+    w = lj_buf_wmem(w, strdata(str), len);
+  } else if (tvisint(o)) {
+    uint32_t x = LJ_BE ? lj_bswap((uint32_t)intV(o)) : (uint32_t)intV(o);
+    w = serialize_more(w, sbx, 1+4);
+    *w++ = SER_TAG_INT; memcpy(w, &x, 4); w += 4;
+  } else if (tvisnum(o)) {
+    uint64_t x = LJ_BE ? lj_bswap64(o->u64) : o->u64;
+    w = serialize_more(w, sbx, 1+sizeof(lua_Number));
+    *w++ = SER_TAG_NUM; memcpy(w, &x, 8); w += 8;
+  } else if (tvispri(o)) {
+    w = serialize_more(w, sbx, 1);
+    *w++ = (char)(SER_TAG_NIL + ~itype(o));
+  } else if (tvistab(o)) {
+    const GCtab *t = tabV(o);
+    uint32_t narray = 0, nhash = 0, one = 2;  /* one: tag flag for the array. */
+    if (sbx->depth <= 0) lj_err_caller(sbufL(sbx), LJ_ERR_BUFFER_DEPTH);
+    sbx->depth--;  /* Restored after the table has been written. */
+    if (t->asize > 0) {  /* Determine max. length of array part. */
+      ptrdiff_t i;
+      TValue *array = tvref(t->array);
+      for (i = (ptrdiff_t)t->asize-1; i >= 0; i--)
+	if (!tvisnil(&array[i]))
+	  break;
+      narray = (uint32_t)(i+1);
+      if (narray && tvisnil(&array[0])) one = 4;  /* Slot 0 is not written. */
+    }
+    if (t->hmask > 0) {  /* Count number of used hash slots. */
+      uint32_t i, hmask = t->hmask;
+      Node *node = noderef(t->node);
+      for (i = 0; i <= hmask; i++)
+	nhash += !tvisnil(&node[i].val);
+    }
+    /* Write metatable index. */
+    if (LJ_UNLIKELY(tabref(sbx->dict_mt)) && tabref(t->metatable)) {
+      TValue mto;
+      Node *n;
+      settabV(sbufL(sbx), &mto, tabref(t->metatable));
+      n = hashgcref(tabref(sbx->dict_mt), mto.gcr);
+      do {
+	if (n->key.u64 == mto.u64) {
+	  uint32_t idx = n->val.u32.lo;
+	  w = serialize_more(w, sbx, 1+5);
+	  *w++ = SER_TAG_DICT_MT;
+	  w = serialize_wu124(w, idx);
+	  break;
+	}
+      } while ((n = nextnode(n)));
+    }
+    /* Write number of array slots and hash slots. */
+    w = serialize_more(w, sbx, 1+2*5);
+    *w++ = (char)(SER_TAG_TAB + (nhash ? 1 : 0) + (narray ? one : 0));
+    if (narray) w = serialize_wu124(w, narray);
+    if (nhash) w = serialize_wu124(w, nhash);
+    if (narray) {  /* Write array entries. */
+      cTValue *oa = tvref(t->array) + (one >> 2);  /* Start at 1 if one == 4. */
+      cTValue *oe = tvref(t->array) + narray;
+      while (oa < oe) w = serialize_put(w, sbx, oa++);
+    }
+    if (nhash) {  /* Write hash entries. */
+      const Node *node = noderef(t->node) + t->hmask;
+      GCtab *dict_str = tabref(sbx->dict_str);
+      if (LJ_UNLIKELY(dict_str)) {
+	for (;; node--)
+	  if (!tvisnil(&node->val)) {
+	    if (LJ_LIKELY(tvisstr(&node->key))) {
+	      /* Inlined lj_tab_getstr is 30% faster. */
+	      const GCstr *str = strV(&node->key);
+	      Node *n = hashstr(dict_str, str);
+	      do {
+		if (tvisstr(&n->key) && strV(&n->key) == str) {
+		  uint32_t idx = n->val.u32.lo;
+		  w = serialize_more(w, sbx, 1+5);
+		  *w++ = SER_TAG_DICT_STR;
+		  w = serialize_wu124(w, idx);
+		  break;
+		}
+		n = nextnode(n);
+		if (!n) {
+		  MSize len = str->len;
+		  w = serialize_more(w, sbx, 5+len);
+		  w = serialize_wu124(w, SER_TAG_STR + len);
+		  w = lj_buf_wmem(w, strdata(str), len);
+		  break;
+		}
+	      } while (1);
+	    } else {
+	      w = serialize_put(w, sbx, &node->key);
+	    }
+	    w = serialize_put(w, sbx, &node->val);
+	    if (--nhash == 0) break;
+	  }
+      } else {
+	for (;; node--)
+	  if (!tvisnil(&node->val)) {
+	    w = serialize_put(w, sbx, &node->key);
+	    w = serialize_put(w, sbx, &node->val);
+	    if (--nhash == 0) break;
+	  }
+      }
+    }
+    sbx->depth++;  /* Done with this nesting level. */
+#if LJ_HASFFI
+  } else if (tviscdata(o)) {
+    CTState *cts = ctype_cts(sbufL(sbx));
+    CType *s = ctype_raw(cts, cdataV(o)->ctypeid);
+    uint8_t *sp = cdataptr(cdataV(o));
+    if (ctype_isinteger(s->info) && s->size == 8) {
+      w = serialize_more(w, sbx, 1+8);
+      *w++ = (s->info & CTF_UNSIGNED) ? SER_TAG_UINT64 : SER_TAG_INT64;
+#if LJ_BE
+      { uint64_t u = lj_bswap64(*(uint64_t *)sp); memcpy(w, &u, 8); }
+#else
+      memcpy(w, sp, 8);
+#endif
+      w += 8;
+    } else if (ctype_iscomplex(s->info) && s->size == 16) {
+      w = serialize_more(w, sbx, 1+16);
+      *w++ = SER_TAG_COMPLEX;
+#if LJ_BE
+      {  /* Only swap the doubles. The re/im order stays the same. */
+	uint64_t u = lj_bswap64(((uint64_t *)sp)[0]); memcpy(w, &u, 8);
+	u = lj_bswap64(((uint64_t *)sp)[1]); memcpy(w+8, &u, 8);
+      }
+#else
+      memcpy(w, sp, 16);
+#endif
+      w += 16;
+    } else {
+      goto badenc;  /* NYI other cdata */
+    }
+#endif
+  } else if (tvislightud(o)) {
+    uintptr_t ud = (uintptr_t)lightudV(G(sbufL(sbx)), o);
+    w = serialize_more(w, sbx, 1+sizeof(ud));
+    if (ud == 0) {
+      *w++ = SER_TAG_NULL;
+    } else if (LJ_32 || checku32(ud)) {
+#if LJ_BE && LJ_64
+      ud = lj_bswap64(ud);
+#elif LJ_BE
+      ud = lj_bswap(ud);
+#endif
+      *w++ = SER_TAG_LIGHTUD32; memcpy(w, &ud, 4); w += 4;
+#if LJ_64
+    } else {
+#if LJ_BE
+      ud = lj_bswap64(ud);
+#endif
+      *w++ = SER_TAG_LIGHTUD64; memcpy(w, &ud, 8); w += 8;
+#endif
+    }
+  } else {
+    /* NYI userdata */
+#if LJ_HASFFI
+  badenc:
+#endif
+    lj_err_callerv(sbufL(sbx), LJ_ERR_BUFFER_BADENC, lj_typename(o));
+  }
+  return w;
+}
++
+/* Get serialized object from buffer. Returns the new read pointer. */
+static char *serialize_get(char *r, SBufExt *sbx, TValue *o)
+{
+  char *w = sbx->w;  /* Upper bound for every read below. */
+  uint32_t tp;
+  r = serialize_ru124(r, w, &tp); if (LJ_UNLIKELY(!r)) goto eob;
+  if (LJ_LIKELY(tp >= SER_TAG_STR)) {
+    uint32_t len = tp - SER_TAG_STR;
+    if (LJ_UNLIKELY(len > (uint32_t)(w - r))) goto eob;
+    setstrV(sbufL(sbx), o, lj_str_new(sbufL(sbx), r, len));
+    r += len;
+  } else if (tp == SER_TAG_INT) {
+    if (LJ_UNLIKELY(r + 4 > w)) goto eob;
+    setintV(o, (int32_t)(LJ_BE ? lj_bswap(lj_getu32(r)) : lj_getu32(r)));
+    r += 4;
+  } else if (tp == SER_TAG_NUM) {
+    if (LJ_UNLIKELY(r + 8 > w)) goto eob;
+    memcpy(o, r, 8); r += 8;
+#if LJ_BE
+    o->u64 = lj_bswap64(o->u64);
+#endif
+    if (!tvisnum(o)) setnanV(o);  /* Fix non-canonical NaNs. */
+  } else if (tp <= SER_TAG_TRUE) {  /* SER_TAG_NIL, _FALSE or _TRUE. */
+    setpriV(o, ~tp);
+  } else if (tp == SER_TAG_DICT_STR) {
+    GCtab *dict_str;
+    uint32_t idx;
+    r = serialize_ru124(r, w, &idx); if (LJ_UNLIKELY(!r)) goto eob;
+    idx++;  /* Encoded index is 0-based, the array slot is one higher. */
+    dict_str = tabref(sbx->dict_str);
+    if (dict_str && idx < dict_str->asize && tvisstr(arrayslot(dict_str, idx)))
+      copyTV(sbufL(sbx), o, arrayslot(dict_str, idx));
+    else
+      lj_err_callerv(sbufL(sbx), LJ_ERR_BUFFER_BADDICTX, idx);
+  } else if (tp >= SER_TAG_TAB && tp <= SER_TAG_DICT_MT) {
+    uint32_t narray = 0, nhash = 0;
+    GCtab *t, *mt = NULL;
+    if (sbx->depth <= 0) lj_err_caller(sbufL(sbx), LJ_ERR_BUFFER_DEPTH);
+    sbx->depth--;  /* Restored after the table has been read. */
+    if (tp == SER_TAG_DICT_MT) {
+      GCtab *dict_mt;
+      uint32_t idx;
+      r = serialize_ru124(r, w, &idx); if (LJ_UNLIKELY(!r)) goto eob;
+      idx++;  /* Encoded index is 0-based, the array slot is one higher. */
+      dict_mt = tabref(sbx->dict_mt);
+      if (dict_mt && idx < dict_mt->asize && tvistab(arrayslot(dict_mt, idx)))
+	mt = tabV(arrayslot(dict_mt, idx));
+      else
+	lj_err_callerv(sbufL(sbx), LJ_ERR_BUFFER_BADDICTX, idx);
+      r = serialize_ru124(r, w, &tp); if (LJ_UNLIKELY(!r)) goto eob;
+      if (!(tp >= SER_TAG_TAB && tp < SER_TAG_DICT_MT)) goto badtag;
+    }
+    if (tp >= SER_TAG_TAB+2) {  /* Array count present. */
+      r = serialize_ru124(r, w, &narray); if (LJ_UNLIKELY(!r)) goto eob;
+    }
+    if ((tp & 1)) {  /* Hash count present. */
+      r = serialize_ru124(r, w, &nhash); if (LJ_UNLIKELY(!r)) goto eob;
+    }
+    t = lj_tab_new(sbufL(sbx), narray, hsize2hbits(nhash));
+    /* NOBARRIER: The table is new (marked white). */
+    setgcref(t->metatable, obj2gco(mt));
+    settabV(sbufL(sbx), o, t);
+    if (narray) {
+      TValue *oa = tvref(t->array) + (tp >= SER_TAG_TAB+4);  /* Skip slot 0? */
+      TValue *oe = tvref(t->array) + narray;
+      while (oa < oe) r = serialize_get(r, sbx, oa++);
+    }
+    if (nhash) {
+      do {
+	TValue k, *v;
+	r = serialize_get(r, sbx, &k);
+	v = lj_tab_set(sbufL(sbx), t, &k);
+	if (LJ_UNLIKELY(!tvisnil(v)))
+	  lj_err_caller(sbufL(sbx), LJ_ERR_BUFFER_DUPKEY);
+	r = serialize_get(r, sbx, v);
+      } while (--nhash);
+    }
+    sbx->depth++;  /* Done with this nesting level. */
+#if LJ_HASFFI
+  } else if (tp >= SER_TAG_INT64 &&  tp <= SER_TAG_COMPLEX) {
+    uint32_t sz = tp == SER_TAG_COMPLEX ? 16 : 8;
+    GCcdata *cd;
+    if (LJ_UNLIKELY(r + sz > w)) goto eob;
+    if (LJ_UNLIKELY(!ctype_ctsG(G(sbufL(sbx))))) goto badtag;
+    cd = lj_cdata_new_(sbufL(sbx),
+	   tp == SER_TAG_INT64 ? CTID_INT64 :
+	   tp == SER_TAG_UINT64 ? CTID_UINT64 : CTID_COMPLEX_DOUBLE,
+	   sz);
+    memcpy(cdataptr(cd), r, sz); r += sz;
+#if LJ_BE
+    *(uint64_t *)cdataptr(cd) = lj_bswap64(*(uint64_t *)cdataptr(cd));
+    if (sz == 16)
+      ((uint64_t *)cdataptr(cd))[1] = lj_bswap64(((uint64_t *)cdataptr(cd))[1]);
+#endif
+    if (sz == 16) {  /* Fix non-canonical NaNs. */
+      TValue *cdo = (TValue *)cdataptr(cd);
+      if (!tvisnum(&cdo[0])) setnanV(&cdo[0]);
+      if (!tvisnum(&cdo[1])) setnanV(&cdo[1]);
+    }
+    setcdataV(sbufL(sbx), o, cd);
+#endif
+  } else if (tp <= (LJ_64 ? SER_TAG_LIGHTUD64 : SER_TAG_LIGHTUD32)) {
+    uintptr_t ud = 0;  /* Stays 0 for SER_TAG_NULL. */
+    if (tp == SER_TAG_LIGHTUD32) {
+      if (LJ_UNLIKELY(r + 4 > w)) goto eob;
+      ud = (uintptr_t)(LJ_BE ? lj_bswap(lj_getu32(r)) : lj_getu32(r));
+      r += 4;
+    }
+#if LJ_64
+    else if (tp == SER_TAG_LIGHTUD64) {
+      if (LJ_UNLIKELY(r + 8 > w)) goto eob;
+      memcpy(&ud, r, 8); r += 8;
+#if LJ_BE
+      ud = lj_bswap64(ud);
+#endif
+    }
+    setrawlightudV(o, lj_lightud_intern(sbufL(sbx), (void *)ud));
+#else
+    setrawlightudV(o, (void *)ud);
+#endif
+  } else {
+badtag:
+    lj_err_callerv(sbufL(sbx), LJ_ERR_BUFFER_BADDEC, tp);
+  }
+  return r;
+eob:
+  lj_err_caller(sbufL(sbx), LJ_ERR_BUFFER_EOB);
+  return NULL;  /* NOTE(review): presumably unreachable, confirm. */
+}
++
++/* -- External serialization API ------------------------------------------ */
++
+/* Encode to buffer. Resets the depth limit, advances sbx->w, returns sbx. */
+SBufExt * LJ_FASTCALL lj_serialize_put(SBufExt *sbx, cTValue *o)
+{
+  sbx->depth = LJ_SERIALIZE_DEPTH;
+  sbx->w = serialize_put(sbx->w, sbx, o);
+  return sbx;
+}
++
+/* Decode from buffer. Returns new read pointer; sbx->r is not updated here. */
+char * LJ_FASTCALL lj_serialize_get(SBufExt *sbx, TValue *o)
+{
+  sbx->depth = LJ_SERIALIZE_DEPTH;
+  return serialize_get(sbx->r, sbx, o);
+}
++
+/* Stand-alone encoding, borrowing from global temporary buffer. */
+GCstr * LJ_FASTCALL lj_serialize_encode(lua_State *L, cTValue *o)
+{
+  SBufExt sbx;
+  char *w;
+  memset(&sbx, 0, sizeof(SBufExt));  /* Also leaves both dictionaries unset. */
+  lj_bufx_set_borrow(L, &sbx, &G(L)->tmpbuf);
+  sbx.depth = LJ_SERIALIZE_DEPTH;
+  w = serialize_put(sbx.w, &sbx, o);
+  return lj_str_new(L, sbx.b, (size_t)(w - sbx.b));  /* Bytes from sbx.b to w. */
+}
++
+/* Stand-alone decoding, copy-on-write from string. */
+void lj_serialize_decode(lua_State *L, TValue *o, GCstr *str)
+{
+  SBufExt sbx;
+  char *r;
+  memset(&sbx, 0, sizeof(SBufExt));  /* Also leaves both dictionaries unset. */
+  lj_bufx_set_cow(L, &sbx, strdata(str), str->len);
+  /* No need to set sbx.cowref here. */
+  sbx.depth = LJ_SERIALIZE_DEPTH;
+  r = serialize_get(sbx.r, &sbx, o);
+  if (r != sbx.w) lj_err_caller(L, LJ_ERR_BUFFER_LEFTOV);  /* Must end at w. */
+}
++
+#if LJ_HASJIT
+/* Peek into buffer to find the result IRType for specialization purposes. */
+LJ_FUNC MSize LJ_FASTCALL lj_serialize_peektype(SBufExt *sbx)
+{
+  uint32_t tp;
+  if (serialize_ru124(sbx->r, sbx->w, &tp)) {  /* sbx->r is not advanced. */
+    /* This must match the handling of all tags in the decoder above. */
+    switch (tp) {
+    case SER_TAG_NIL: return IRT_NIL;
+    case SER_TAG_FALSE: return IRT_FALSE;
+    case SER_TAG_TRUE: return IRT_TRUE;
+    case SER_TAG_NULL: case SER_TAG_LIGHTUD32: case SER_TAG_LIGHTUD64:
+      return IRT_LIGHTUD;
+    case SER_TAG_INT: return LJ_DUALNUM ? IRT_INT : IRT_NUM;
+    case SER_TAG_NUM: return IRT_NUM;
+    case SER_TAG_TAB: case SER_TAG_TAB+1: case SER_TAG_TAB+2:
+    case SER_TAG_TAB+3: case SER_TAG_TAB+4: case SER_TAG_TAB+5:
+    case SER_TAG_DICT_MT:
+      return IRT_TAB;
+    case SER_TAG_INT64: case SER_TAG_UINT64: case SER_TAG_COMPLEX:
+      return IRT_CDATA;
+    case SER_TAG_DICT_STR:
+    default:  /* Covers tags >= SER_TAG_STR and the unused 0x13..0x1f range. */
+      return IRT_STR;
+    }
+  }
+  return IRT_NIL;  /* Will fail on actual decode. */
+}
+#endif
++
++#endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_serialize.h
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_serialize.h
+@@ -0,0 +1,28 @@
++/*
++** Object de/serialization.
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++#ifndef _LJ_SERIALIZE_H
++#define _LJ_SERIALIZE_H
++
++#include "lj_obj.h"
++#include "lj_buf.h"
++
++#if LJ_HASBUFFER
++
++#define LJ_SERIALIZE_DEPTH	100	/* Default depth. */
++
++LJ_FUNC void LJ_FASTCALL lj_serialize_dict_prep_str(lua_State *L, GCtab *dict);
++LJ_FUNC void LJ_FASTCALL lj_serialize_dict_prep_mt(lua_State *L, GCtab *dict);
++LJ_FUNC SBufExt * LJ_FASTCALL lj_serialize_put(SBufExt *sbx, cTValue *o);
++LJ_FUNC char * LJ_FASTCALL lj_serialize_get(SBufExt *sbx, TValue *o);
++LJ_FUNC GCstr * LJ_FASTCALL lj_serialize_encode(lua_State *L, cTValue *o);
++LJ_FUNC void lj_serialize_decode(lua_State *L, TValue *o, GCstr *str);
++#if LJ_HASJIT
++LJ_FUNC MSize LJ_FASTCALL lj_serialize_peektype(SBufExt *sbx);
++#endif
++
++#endif
++
++#endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_snap.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_snap.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_snap.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Snapshot handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_snap_c
+@@ -171,6 +171,7 @@ static void snapshot_stack(jit_State *J,
+   nent += snapshot_framelinks(J, p + nent, &snap->topslot);
+   snap->mapofs = (uint32_t)nsnapmap;
+   snap->ref = (IRRef1)J->cur.nins;
++  snap->mcofs = 0;
+   snap->nslots = (uint8_t)nslots;
+   snap->count = 0;
+   J->cur.nsnapmap = (uint32_t)(nsnapmap + nent);
+@@ -251,7 +252,12 @@ static BCReg snap_usedef(jit_State *J, u
+       BCReg minslot = bc_a(ins);
+       if (op >= BC_FORI && op <= BC_JFORL) minslot += FORL_EXT;
+       else if (op >= BC_ITERL && op <= BC_JITERL) minslot += bc_b(pc[-2])-1;
+-      else if (op == BC_UCLO) { pc += bc_j(ins); break; }
++      else if (op == BC_UCLO) {
++	ptrdiff_t delta = bc_j(ins);
++	if (delta < 0) return maxslot;  /* Prevent loop. */
++	pc += delta;
++	break;
++      }
+       for (s = minslot; s < maxslot; s++) DEF_SLOT(s);
+       return minslot < maxslot ? minslot : maxslot;
+       }
+@@ -275,7 +281,7 @@ static BCReg snap_usedef(jit_State *J, u
+        if (!(op == BC_ISTC || op == BC_ISFC)) DEF_SLOT(bc_a(ins));
+        break;
+     case BCMbase:
+-      if (op >= BC_CALLM && op <= BC_VARG) {
++      if (op >= BC_CALLM && op <= BC_ITERN) {
+ 	BCReg top = (op == BC_CALLM || op == BC_CALLMT || bc_c(ins) == 0) ?
+ 		    maxslot : (bc_a(ins) + bc_c(ins)+LJ_FR2);
+ 	if (LJ_FR2) DEF_SLOT(bc_a(ins)+1);
+@@ -286,6 +292,8 @@ static BCReg snap_usedef(jit_State *J, u
+ 	  for (s = 0; s < bc_a(ins); s++) DEF_SLOT(s);
+ 	  return 0;
+ 	}
++      } else if (op == BC_VARG) {
++	return maxslot;  /* NYI: punt. */
+       } else if (op == BC_KNIL) {
+ 	for (s = bc_a(ins); s <= bc_d(ins); s++) DEF_SLOT(s);
+       } else if (op == BC_TSETM) {
+@@ -304,15 +312,45 @@ static BCReg snap_usedef(jit_State *J, u
+   return 0;  /* unreachable */
+ }
+ 
++/* Mark slots used by upvalues of child prototypes as used. */
++static void snap_useuv(GCproto *pt, uint8_t *udf)
++{
++  /* This is a coarse check, because it's difficult to correlate the lifetime
++  ** of slots and closures. But the number of false positives is quite low.
++  ** A false positive may cause a slot not to be purged, which is just
++  ** a missed optimization.
++  */
++  if ((pt->flags & PROTO_CHILD)) {  /* Nothing to scan without this flag. */
++    ptrdiff_t i, j, n = pt->sizekgc;
++    GCRef *kr = mref(pt->k, GCRef) - 1;  /* Walk n refs down from pt->k - 1. */
++    for (i = 0; i < n; i++, kr--) {
++      GCobj *o = gcref(*kr);
++      if (o->gch.gct == ~LJ_TPROTO) {  /* Only prototype constants matter. */
++	for (j = 0; j < gco2pt(o)->sizeuv; j++) {
++	  uint32_t v = proto_uv(gco2pt(o))[j];
++	  if ((v & PROTO_UV_LOCAL)) {
++	    udf[(v & 0xff)] = 0;  /* Low 8 bits select the udf[] entry. */
++	  }
++	}
++      }
++    }
++  }
++}
++
+ /* Purge dead slots before the next snapshot. */
+ void lj_snap_purge(jit_State *J)
+ {
+   uint8_t udf[SNAP_USEDEF_SLOTS];
+-  BCReg maxslot = J->maxslot;
+-  BCReg s = snap_usedef(J, udf, J->pc, maxslot);
+-  for (; s < maxslot; s++)
+-    if (udf[s] != 0)
+-      J->base[s] = 0;  /* Purge dead slots. */
++  BCReg s, maxslot = J->maxslot;
++  if (bc_op(*J->pc) == BC_FUNCV && maxslot > J->pt->numparams)
++    maxslot = J->pt->numparams;
++  s = snap_usedef(J, udf, J->pc, maxslot);
++  if (s < maxslot) {
++    snap_useuv(J->pt, udf);
++    for (; s < maxslot; s++)
++      if (udf[s] != 0)
++	J->base[s] = 0;  /* Purge dead slots. */
++  }
+ }
+ 
+ /* Shrink last snapshot. */
+@@ -325,6 +363,7 @@ void lj_snap_shrink(jit_State *J)
+   BCReg maxslot = J->maxslot;
+   BCReg baseslot = J->baseslot;
+   BCReg minslot = snap_usedef(J, udf, snap_pc(&map[nent]), maxslot);
++  if (minslot < maxslot) snap_useuv(J->pt, udf);
+   maxslot += baseslot;
+   minslot += baseslot;
+   snap->nslots = (uint8_t)maxslot;
+@@ -424,7 +463,7 @@ static TRef snap_dedup(jit_State *J, Sna
+   MSize j;
+   for (j = 0; j < nmax; j++)
+     if (snap_ref(map[j]) == ref)
+-      return J->slot[snap_slot(map[j])] & ~(SNAP_CONT|SNAP_FRAME);
++      return J->slot[snap_slot(map[j])] & ~(SNAP_KEYINDEX|SNAP_CONT|SNAP_FRAME);
+   return 0;
+ }
+ 
+@@ -499,10 +538,12 @@ void lj_snap_replay(jit_State *J, GCtrac
+       uint32_t mode = IRSLOAD_INHERIT|IRSLOAD_PARENT;
+       if (LJ_SOFTFP32 && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM;
+       if (ir->o == IR_SLOAD) mode |= (ir->op2 & IRSLOAD_READONLY);
++      if ((sn & SNAP_KEYINDEX)) mode |= IRSLOAD_KEYINDEX;
+       tr = emitir_raw(IRT(IR_SLOAD, t), s, mode);
+     }
+   setslot:
+-    J->slot[s] = tr | (sn&(SNAP_CONT|SNAP_FRAME));  /* Same as TREF_* flags. */
++    /* Same as TREF_* flags. */
++    J->slot[s] = tr | (sn&(SNAP_KEYINDEX|SNAP_CONT|SNAP_FRAME));
+     J->framedepth += ((sn & (SNAP_CONT|SNAP_FRAME)) && (s != LJ_FR2));
+     if ((sn & SNAP_FRAME))
+       J->baseslot = s+1;
+@@ -839,11 +880,19 @@ static void snap_unsink(jit_State *J, GC
+ 		   irs->o == IR_FSTORE,
+ 		   "sunk store with bad op %d", irs->o);
+ 	if (irk->o == IR_FREF) {
+-	  lj_assertJ(irk->op2 == IRFL_TAB_META,
+-		     "sunk store with bad field %d", irk->op2);
+-	  snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp);
+-	  /* NOBARRIER: The table is new (marked white). */
+-	  setgcref(t->metatable, obj2gco(tabV(&tmp)));
++	  switch (irk->op2) {
++	  case IRFL_TAB_META:
++	    snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp);
++	    /* NOBARRIER: The table is new (marked white). */
++	    setgcref(t->metatable, obj2gco(tabV(&tmp)));
++	    break;
++	  case IRFL_TAB_NOMM:
++	    /* Negative metamethod cache invalidated by lj_tab_set() below. */
++	    break;
++	  default:
++	    lj_assertJ(0, "sunk store with bad field %d", irk->op2);
++	    break;
++	  }
+ 	} else {
+ 	  irk = &T->ir[irk->op2];
+ 	  if (irk->o == IR_KSLOT) irk = &T->ir[irk->op1];
+@@ -922,6 +971,10 @@ const BCIns *lj_snap_restore(jit_State *
+ 	setframe_ftsz(o, snap_slot(sn) != 0 ? (int32_t)*flinks-- : ftsz0);
+ 	L->base = o+1;
+ #endif
++      } else if ((sn & SNAP_KEYINDEX)) {
++	/* An IRT_INT key index slot is restored as a number. Undo this. */
++	o->u32.lo = (uint32_t)(LJ_DUALNUM ? intV(o) : lj_num2int(numV(o)));
++	o->u32.hi = LJ_KEYINDEX;
+       }
+     }
+   }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_snap.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_snap.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_snap.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Snapshot handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_SNAP_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_state.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_state.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_state.c
+@@ -1,6 +1,6 @@
+ /*
+ ** State and stack handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -103,8 +103,17 @@ void lj_state_shrinkstack(lua_State *L,
+ void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need)
+ {
+   MSize n;
+-  if (L->stacksize > LJ_STACK_MAXEX)  /* Overflow while handling overflow? */
+-    lj_err_throw(L, LUA_ERRERR);
++  if (L->stacksize >= LJ_STACK_MAXEX) {
++    /* 4. Throw 'error in error handling' when we are _over_ the limit. */
++    if (L->stacksize > LJ_STACK_MAXEX)
++      lj_err_throw(L, LUA_ERRERR);  /* Does not invoke an error handler. */
++    /* 1. We are _at_ the limit after the last growth. */
++    if (L->status < LUA_ERRRUN) {  /* 2. Throw 'stack overflow'. */
++      L->status = LUA_ERRRUN;  /* Prevent ending here again for pushed msg. */
++      lj_err_msg(L, LJ_ERR_STKOV);  /* May invoke an error handler. */
++    }
++    /* 3. Add space (over the limit) for pushed message and error handler. */
++  }
+   n = L->stacksize + need;
+   if (n > LJ_STACK_MAX) {
+     n += 2*LUA_MINSTACK;
+@@ -114,8 +123,6 @@ void LJ_FASTCALL lj_state_growstack(lua_
+       n = LJ_STACK_MAX;
+   }
+   resizestack(L, n);
+-  if (L->stacksize > LJ_STACK_MAXEX)
+-    lj_err_msg(L, LJ_ERR_STKOV);
+ }
+ 
+ void LJ_FASTCALL lj_state_growstack1(lua_State *L)
+@@ -123,6 +130,18 @@ void LJ_FASTCALL lj_state_growstack1(lua
+   lj_state_growstack(L, 1);
+ }
+ 
++static TValue *cpgrowstack(lua_State *co, lua_CFunction dummy, void *ud)
++{
++  UNUSED(dummy);  /* Only co and ud are used by this callback. */
++  lj_state_growstack(co, *(MSize *)ud);  /* ud points to the MSize 'need'. */
++  return NULL;
++}
++
++int LJ_FASTCALL lj_state_cpgrowstack(lua_State *L, MSize need)
++{
++  return lj_vm_cpcall(L, NULL, &need, cpgrowstack);  /* cpcall's result. */
++}
++
+ /* Allocate basic stack for new state. */
+ static void stack_init(lua_State *L1, lua_State *L)
+ {
+@@ -156,6 +175,7 @@ static TValue *cpluaopen(lua_State *L, l
+   fixstring(lj_err_str(L, LJ_ERR_ERRMEM));  /* Preallocate memory error msg. */
+   g->gc.threshold = 4*g->gc.total;
+   lj_trace_initstate(g);
++  lj_err_verify();
+   return NULL;
+ }
+ 
+@@ -326,8 +346,11 @@ void LJ_FASTCALL lj_state_free(global_St
+   lj_assertG(L != mainthread(g), "free of main thread");
+   if (obj2gco(L) == gcref(g->cur_L))
+     setgcrefnull(g->cur_L);
+-  lj_func_closeuv(L, tvref(L->stack));
+-  lj_assertG(gcref(L->openupval) == NULL, "stale open upvalues");
++  if (gcref(L->openupval) != NULL) {
++    lj_func_closeuv(L, tvref(L->stack));
++    lj_trace_abort(g);  /* For aa_uref soundness. */
++    lj_assertG(gcref(L->openupval) == NULL, "stale open upvalues");
++  }
+   lj_mem_freevec(g, tvref(L->stack), L->stacksize, TValue);
+   lj_mem_freet(g, L);
+ }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_state.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_state.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_state.h
+@@ -1,6 +1,6 @@
+ /*
+ ** State and stack handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_STATE_H
+@@ -18,6 +18,7 @@ LJ_FUNC void lj_state_relimitstack(lua_S
+ LJ_FUNC void lj_state_shrinkstack(lua_State *L, MSize used);
+ LJ_FUNCA void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need);
+ LJ_FUNC void LJ_FASTCALL lj_state_growstack1(lua_State *L);
++LJ_FUNC int LJ_FASTCALL lj_state_cpgrowstack(lua_State *L, MSize need);
+ 
+ static LJ_AINLINE void lj_state_checkstack(lua_State *L, MSize need)
+ {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_str.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_str.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_str.c
+@@ -1,6 +1,6 @@
+ /*
+ ** String handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_str_c
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_str.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_str.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_str.h
+@@ -1,6 +1,6 @@
+ /*
+ ** String handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_STR_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strfmt.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_strfmt.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strfmt.c
+@@ -1,6 +1,6 @@
+ /*
+ ** String formatting.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include <stdio.h>
+@@ -9,11 +9,17 @@
+ #define LUA_CORE
+ 
+ #include "lj_obj.h"
++#include "lj_err.h"
+ #include "lj_buf.h"
+ #include "lj_str.h"
++#include "lj_meta.h"
+ #include "lj_state.h"
+ #include "lj_char.h"
+ #include "lj_strfmt.h"
++#if LJ_HASFFI
++#include "lj_ctype.h"
++#endif
++#include "lj_lib.h"
+ 
+ /* -- Format parser ------------------------------------------------------- */
+ 
+@@ -96,7 +102,7 @@ retlit:
+ char * LJ_FASTCALL lj_strfmt_wint(char *p, int32_t k)
+ {
+   uint32_t u = (uint32_t)k;
+-  if (k < 0) { u = (uint32_t)-k; *p++ = '-'; }
++  if (k < 0) { u = ~u+1u; *p++ = '-'; }
+   if (u < 10000) {
+     if (u < 10) goto dig1;
+     if (u < 100) goto dig2;
+@@ -161,6 +167,10 @@ const char *lj_strfmt_wstrnum(lua_State
+   if (tvisstr(o)) {
+     *lenp = strV(o)->len;
+     return strVdata(o);
++  } else if (tvisbuf(o)) {
++    SBufExt *sbx = bufV(o);
++    *lenp = sbufxlen(sbx);
++    return sbx->r;
+   } else if (tvisint(o)) {
+     sb = lj_strfmt_putint(lj_buf_tmp_(L), intV(o));
+   } else if (tvisnum(o)) {
+@@ -169,7 +179,7 @@ const char *lj_strfmt_wstrnum(lua_State
+     return NULL;
+   }
+   *lenp = sbuflen(sb);
+-  return sbufB(sb);
++  return sb->b;
+ }
+ 
+ /* -- Unformatted conversions to buffer ----------------------------------- */
+@@ -177,7 +187,7 @@ const char *lj_strfmt_wstrnum(lua_State
+ /* Add integer to buffer. */
+ SBuf * LJ_FASTCALL lj_strfmt_putint(SBuf *sb, int32_t k)
+ {
+-  setsbufP(sb, lj_strfmt_wint(lj_buf_more(sb, STRFMT_MAXBUF_INT), k));
++  sb->w = lj_strfmt_wint(lj_buf_more(sb, STRFMT_MAXBUF_INT), k);
+   return sb;
+ }
+ 
+@@ -191,80 +201,93 @@ SBuf * LJ_FASTCALL lj_strfmt_putnum(SBuf
+ 
+ SBuf * LJ_FASTCALL lj_strfmt_putptr(SBuf *sb, const void *v)
+ {
+-  setsbufP(sb, lj_strfmt_wptr(lj_buf_more(sb, STRFMT_MAXBUF_PTR), v));
++  sb->w = lj_strfmt_wptr(lj_buf_more(sb, STRFMT_MAXBUF_PTR), v);
+   return sb;
+ }
+ 
+ /* Add quoted string to buffer. */
+-SBuf * LJ_FASTCALL lj_strfmt_putquoted(SBuf *sb, GCstr *str)
++static SBuf *strfmt_putquotedlen(SBuf *sb, const char *s, MSize len)
+ {
+-  const char *s = strdata(str);
+-  MSize len = str->len;
+   lj_buf_putb(sb, '"');
+   while (len--) {
+     uint32_t c = (uint32_t)(uint8_t)*s++;
+-    char *p = lj_buf_more(sb, 4);
++    char *w = lj_buf_more(sb, 4);
+     if (c == '"' || c == '\\' || c == '\n') {
+-      *p++ = '\\';
++      *w++ = '\\';
+     } else if (lj_char_iscntrl(c)) {  /* This can only be 0-31 or 127. */
+       uint32_t d;
+-      *p++ = '\\';
++      *w++ = '\\';
+       if (c >= 100 || lj_char_isdigit((uint8_t)*s)) {
+-	*p++ = (char)('0'+(c >= 100)); if (c >= 100) c -= 100;
++	*w++ = (char)('0'+(c >= 100)); if (c >= 100) c -= 100;
+ 	goto tens;
+       } else if (c >= 10) {
+       tens:
+-	d = (c * 205) >> 11; c -= d * 10; *p++ = (char)('0'+d);
++	d = (c * 205) >> 11; c -= d * 10; *w++ = (char)('0'+d);
+       }
+       c += '0';
+     }
+-    *p++ = (char)c;
+-    setsbufP(sb, p);
++    *w++ = (char)c;
++    sb->w = w;
+   }
+   lj_buf_putb(sb, '"');
+   return sb;
+ }
+ 
++#if LJ_HASJIT
++SBuf * LJ_FASTCALL lj_strfmt_putquoted(SBuf *sb, GCstr *str)
++{
++  return strfmt_putquotedlen(sb, strdata(str), str->len);  /* GCstr form. */
++}
++#endif  /* LJ_HASJIT */
++
+ /* -- Formatted conversions to buffer ------------------------------------- */
+ 
+ /* Add formatted char to buffer. */
+ SBuf *lj_strfmt_putfchar(SBuf *sb, SFormat sf, int32_t c)
+ {
+   MSize width = STRFMT_WIDTH(sf);
+-  char *p = lj_buf_more(sb, width > 1 ? width : 1);
+-  if ((sf & STRFMT_F_LEFT)) *p++ = (char)c;
+-  while (width-- > 1) *p++ = ' ';
+-  if (!(sf & STRFMT_F_LEFT)) *p++ = (char)c;
+-  setsbufP(sb, p);
++  char *w = lj_buf_more(sb, width > 1 ? width : 1);
++  if ((sf & STRFMT_F_LEFT)) *w++ = (char)c;
++  while (width-- > 1) *w++ = ' ';
++  if (!(sf & STRFMT_F_LEFT)) *w++ = (char)c;
++  sb->w = w;
+   return sb;
+ }
+ 
+ /* Add formatted string to buffer. */
+-SBuf *lj_strfmt_putfstr(SBuf *sb, SFormat sf, GCstr *str)
++static SBuf *strfmt_putfstrlen(SBuf *sb, SFormat sf, const char *s, MSize len)
+ {
+-  MSize len = str->len <= STRFMT_PREC(sf) ? str->len : STRFMT_PREC(sf);
+   MSize width = STRFMT_WIDTH(sf);
+-  char *p = lj_buf_more(sb, width > len ? width : len);
+-  if ((sf & STRFMT_F_LEFT)) p = lj_buf_wmem(p, strdata(str), len);
+-  while (width-- > len) *p++ = ' ';
+-  if (!(sf & STRFMT_F_LEFT)) p = lj_buf_wmem(p, strdata(str), len);
+-  setsbufP(sb, p);
++  char *w;
++  if (len > STRFMT_PREC(sf)) len = STRFMT_PREC(sf);
++  w = lj_buf_more(sb, width > len ? width : len);
++  if ((sf & STRFMT_F_LEFT)) w = lj_buf_wmem(w, s, len);
++  while (width-- > len) *w++ = ' ';
++  if (!(sf & STRFMT_F_LEFT)) w = lj_buf_wmem(w, s, len);
++  sb->w = w;
+   return sb;
+ }
+ 
++#if LJ_HASJIT
++SBuf *lj_strfmt_putfstr(SBuf *sb, SFormat sf, GCstr *str)
++{
++  return strfmt_putfstrlen(sb, sf, strdata(str), str->len);  /* GCstr form. */
++}
++#endif  /* LJ_HASJIT */
++
+ /* Add formatted signed/unsigned integer to buffer. */
+ SBuf *lj_strfmt_putfxint(SBuf *sb, SFormat sf, uint64_t k)
+ {
+-  char buf[STRFMT_MAXBUF_XINT], *q = buf + sizeof(buf), *p;
++  char buf[STRFMT_MAXBUF_XINT], *q = buf + sizeof(buf), *w;
+ #ifdef LUA_USE_ASSERT
+-  char *ps;
++  char *ws;
+ #endif
+   MSize prefix = 0, len, prec, pprec, width, need;
+ 
+   /* Figure out signed prefixes. */
+   if (STRFMT_TYPE(sf) == STRFMT_INT) {
+     if ((int64_t)k < 0) {
+-      k = (uint64_t)-(int64_t)k;
++      k = ~k+1u;
+       prefix = 256 + '-';
+     } else if ((sf & STRFMT_F_PLUS)) {
+       prefix = 256 + '+';
+@@ -301,27 +324,27 @@ SBuf *lj_strfmt_putfxint(SBuf *sb, SForm
+   width = STRFMT_WIDTH(sf);
+   pprec = prec + (prefix >> 8);
+   need = width > pprec ? width : pprec;
+-  p = lj_buf_more(sb, need);
++  w = lj_buf_more(sb, need);
+ #ifdef LUA_USE_ASSERT
+-  ps = p;
++  ws = w;
+ #endif
+ 
+   /* Format number with leading/trailing whitespace and zeros. */
+   if ((sf & (STRFMT_F_LEFT|STRFMT_F_ZERO)) == 0)
+-    while (width-- > pprec) *p++ = ' ';
++    while (width-- > pprec) *w++ = ' ';
+   if (prefix) {
+-    if ((char)prefix >= 'X') *p++ = '0';
+-    *p++ = (char)prefix;
++    if ((char)prefix >= 'X') *w++ = '0';
++    *w++ = (char)prefix;
+   }
+   if ((sf & (STRFMT_F_LEFT|STRFMT_F_ZERO)) == STRFMT_F_ZERO)
+-    while (width-- > pprec) *p++ = '0';
+-  while (prec-- > len) *p++ = '0';
+-  while (q < buf + sizeof(buf)) *p++ = *q++;  /* Add number itself. */
++    while (width-- > pprec) *w++ = '0';
++  while (prec-- > len) *w++ = '0';
++  while (q < buf + sizeof(buf)) *w++ = *q++;  /* Add number itself. */
+   if ((sf & STRFMT_F_LEFT))
+-    while (width-- > pprec) *p++ = ' ';
++    while (width-- > pprec) *w++ = ' ';
+ 
+-  lj_assertX(need == (MSize)(p - ps), "miscalculated format size");
+-  setsbufP(sb, p);
++  lj_assertX(need == (MSize)(w - ws), "miscalculated format size");
++  sb->w = w;
+   return sb;
+ }
+ 
+@@ -346,6 +369,117 @@ SBuf *lj_strfmt_putfnum_uint(SBuf *sb, S
+   return lj_strfmt_putfxint(sb, sf, (uint64_t)k);
+ }
+ 
++/* Format stack arguments to buffer. */
++int lj_strfmt_putarg(lua_State *L, SBuf *sb, int arg, int retry)
++{
++  int narg = (int)(L->top - L->base);  /* Slots between base and top. */
++  GCstr *fmt = lj_lib_checkstr(L, arg);  /* Format string is argument 'arg'. */
++  FormatState fs;
++  SFormat sf;
++  lj_strfmt_init(&fs, strdata(fmt), fmt->len);
++  while ((sf = lj_strfmt_parse(&fs)) != STRFMT_EOF) {
++    if (sf == STRFMT_LIT) {  /* Literal run of the format string. */
++      lj_buf_putmem(sb, fs.str, fs.len);
++    } else if (sf == STRFMT_ERR) {  /* Parser reported a bad format item. */
++      lj_err_callerv(L, LJ_ERR_STRFMT,
++		     strdata(lj_str_new(L, fs.str, fs.len)));
++    } else {  /* Conversion that consumes one stack argument. */
++      TValue *o = &L->base[arg++];  /* arg is now o's 1-based index. */
++      if (arg > narg)  /* No argument left for this item. */
++	lj_err_arg(L, arg, LJ_ERR_NOVAL);
++      switch (STRFMT_TYPE(sf)) {
++      case STRFMT_INT:
++	if (tvisint(o)) {
++	  int32_t k = intV(o);
++	  if (sf == STRFMT_INT)
++	    lj_strfmt_putint(sb, k);  /* Shortcut for plain %d. */
++	  else
++	    lj_strfmt_putfxint(sb, sf, k);
++	  break;
++	}
++#if LJ_HASFFI
++	if (tviscdata(o)) {  /* 64 bit integer cdata is formatted directly. */
++	  GCcdata *cd = cdataV(o);
++	  if (cd->ctypeid == CTID_INT64 || cd->ctypeid == CTID_UINT64) {
++	    lj_strfmt_putfxint(sb, sf, *(uint64_t *)cdataptr(cd));
++	    break;
++	  }
++	}
++#endif
++	lj_strfmt_putfnum_int(sb, sf, lj_lib_checknum(L, arg));
++	break;
++      case STRFMT_UINT:
++	if (tvisint(o)) {
++	  lj_strfmt_putfxint(sb, sf, intV(o));
++	  break;
++	}
++#if LJ_HASFFI
++	if (tviscdata(o)) {  /* 64 bit integer cdata is formatted directly. */
++	  GCcdata *cd = cdataV(o);
++	  if (cd->ctypeid == CTID_INT64 || cd->ctypeid == CTID_UINT64) {
++	    lj_strfmt_putfxint(sb, sf, *(uint64_t *)cdataptr(cd));
++	    break;
++	  }
++	}
++#endif
++	lj_strfmt_putfnum_uint(sb, sf, lj_lib_checknum(L, arg));
++	break;
++      case STRFMT_NUM:
++	lj_strfmt_putfnum(sb, sf, lj_lib_checknum(L, arg));
++	break;
++      case STRFMT_STR: {
++	MSize len;
++	const char *s;
++	cTValue *mo;
++	if (LJ_UNLIKELY(!tvisstr(o) && !tvisbuf(o)) && retry >= 0 &&
++	    !tvisnil(mo = lj_meta_lookup(L, o, MM_tostring))) {
++	  /* Call __tostring metamethod once. */
++	  copyTV(L, L->top++, mo);
++	  copyTV(L, L->top++, o);
++	  lua_call(L, 1, 1);
++	  o = &L->base[arg-1];  /* Stack may have been reallocated. */
++	  copyTV(L, o, --L->top);  /* Replace inline for retry. */
++	  if (retry < 2) {  /* Global buffer may have been overwritten. */
++	    retry = 1;  /* Reported to the caller via the return value. */
++	    break;  /* Nothing is emitted for this item on this pass. */
++	  }
++	}
++	if (LJ_LIKELY(tvisstr(o))) {
++	  len = strV(o)->len;
++	  s = strVdata(o);
++#if LJ_HASBUFFER
++	} else if (tvisbuf(o)) {
++	  SBufExt *sbx = bufV(o);
++	  if (sbx == (SBufExt *)sb) lj_err_arg(L, arg+1, LJ_ERR_BUFFER_SELF);
++	  len = sbufxlen(sbx);
++	  s = sbx->r;
++#endif
++	} else {  /* Everything else goes through lj_strfmt_obj(). */
++	  GCstr *str = lj_strfmt_obj(L, o);
++	  len = str->len;
++	  s = strdata(str);
++	}
++	if ((sf & STRFMT_T_QUOTED))
++	  strfmt_putquotedlen(sb, s, len);  /* No formatting. */
++	else
++	  strfmt_putfstrlen(sb, sf, s, len);
++	break;
++	}
++      case STRFMT_CHAR:
++	lj_strfmt_putfchar(sb, sf, lj_lib_checkint(L, arg));
++	break;
++      case STRFMT_PTR:  /* No formatting. */
++	lj_strfmt_putptr(sb, lj_obj_ptr(G(L), o));
++	break;
++      default:
++	lj_assertL(0, "bad string format type");
++	break;
++      }
++    }
++  }
++  return retry;  /* Input value, or 1 if __tostring ran with retry < 2. */
++}
++
+ /* -- Conversions to strings ---------------------------------------------- */
+ 
+ /* Convert integer to string. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strfmt.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_strfmt.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strfmt.h
+@@ -1,6 +1,6 @@
+ /*
+ ** String formatting.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_STRFMT_H
+@@ -95,7 +95,9 @@ LJ_FUNC SBuf * LJ_FASTCALL lj_strfmt_put
+ LJ_FUNC SBuf * LJ_FASTCALL lj_strfmt_putnum(SBuf *sb, cTValue *o);
+ #endif
+ LJ_FUNC SBuf * LJ_FASTCALL lj_strfmt_putptr(SBuf *sb, const void *v);
++#if LJ_HASJIT
+ LJ_FUNC SBuf * LJ_FASTCALL lj_strfmt_putquoted(SBuf *sb, GCstr *str);
++#endif
+ 
+ /* Formatted conversions to buffer. */
+ LJ_FUNC SBuf *lj_strfmt_putfxint(SBuf *sb, SFormat sf, uint64_t k);
+@@ -103,7 +105,10 @@ LJ_FUNC SBuf *lj_strfmt_putfnum_int(SBuf
+ LJ_FUNC SBuf *lj_strfmt_putfnum_uint(SBuf *sb, SFormat sf, lua_Number n);
+ LJ_FUNC SBuf *lj_strfmt_putfnum(SBuf *sb, SFormat, lua_Number n);
+ LJ_FUNC SBuf *lj_strfmt_putfchar(SBuf *sb, SFormat, int32_t c);
++#if LJ_HASJIT
+ LJ_FUNC SBuf *lj_strfmt_putfstr(SBuf *sb, SFormat, GCstr *str);
++#endif
++LJ_FUNC int lj_strfmt_putarg(lua_State *L, SBuf *sb, int arg, int retry);
+ 
+ /* Conversions to strings. */
+ LJ_FUNC GCstr * LJ_FASTCALL lj_strfmt_int(lua_State *L, int32_t k);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strfmt_num.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_strfmt_num.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strfmt_num.c
+@@ -1,6 +1,6 @@
+ /*
+ ** String formatting for floating-point numbers.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ ** Contributed by Peter Cawley.
+ */
+ 
+@@ -576,7 +576,7 @@ static char *lj_strfmt_wfnum(SBuf *sb, S
+ /* Add formatted floating-point number to buffer. */
+ SBuf *lj_strfmt_putfnum(SBuf *sb, SFormat sf, lua_Number n)
+ {
+-  setsbufP(sb, lj_strfmt_wfnum(sb, sf, n, NULL));
++  sb->w = lj_strfmt_wfnum(sb, sf, n, NULL);
+   return sb;
+ }
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strscan.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_strscan.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strscan.c
+@@ -1,6 +1,6 @@
+ /*
+ ** String scanning.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include <math.h>
+@@ -63,6 +63,7 @@
+ #define STRSCAN_MAXDIG	800		/* 772 + extra are sufficient. */
+ #define STRSCAN_DDIG	(STRSCAN_DIG/2)
+ #define STRSCAN_DMASK	(STRSCAN_DDIG-1)
++#define STRSCAN_MAXEXP	(1 << 20)
+ 
+ /* Helpers for circular buffer. */
+ #define DNEXT(a)	(((a)+1) & STRSCAN_DMASK)
+@@ -121,20 +122,21 @@ static StrScanFmt strscan_hex(const uint
+   /* Format-specific handling. */
+   switch (fmt) {
+   case STRSCAN_INT:
+-    if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg) {
+-      o->i = neg ? -(int32_t)x : (int32_t)x;
++    if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg &&
++	!(x == 0 && neg)) {
++      o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+       return STRSCAN_INT;  /* Fast path for 32 bit integers. */
+     }
+     if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; break; }
+     /* fallthrough */
+   case STRSCAN_U32:
+     if (dig > 8) return STRSCAN_ERROR;
+-    o->i = neg ? -(int32_t)x : (int32_t)x;
++    o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+     return STRSCAN_U32;
+   case STRSCAN_I64:
+   case STRSCAN_U64:
+     if (dig > 16) return STRSCAN_ERROR;
+-    o->u64 = neg ? (uint64_t)-(int64_t)x : x;
++    o->u64 = neg ? ~x+1u : x;
+     return fmt;
+   default:
+     break;
+@@ -166,12 +168,12 @@ static StrScanFmt strscan_oct(const uint
+     /* fallthrough */
+   case STRSCAN_U32:
+     if ((x >> 32)) return STRSCAN_ERROR;
+-    o->i = neg ? -(int32_t)x : (int32_t)x;
++    o->i = neg ? (int32_t)(~(uint32_t)x+1u) : (int32_t)x;
+     break;
+   default:
+   case STRSCAN_I64:
+   case STRSCAN_U64:
+-    o->u64 = neg ? (uint64_t)-(int64_t)x : x;
++    o->u64 = neg ? ~x+1u : x;
+     break;
+   }
+   return fmt;
+@@ -227,18 +229,18 @@ static StrScanFmt strscan_dec(const uint
+       switch (fmt) {
+       case STRSCAN_INT:
+ 	if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg) {
+-	  o->i = neg ? -(int32_t)x : (int32_t)x;
++	  o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+ 	  return STRSCAN_INT;  /* Fast path for 32 bit integers. */
+ 	}
+ 	if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; goto plainnumber; }
+ 	/* fallthrough */
+       case STRSCAN_U32:
+ 	if ((x >> 32) != 0) return STRSCAN_ERROR;
+-	o->i = neg ? -(int32_t)x : (int32_t)x;
++	o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+ 	return STRSCAN_U32;
+       case STRSCAN_I64:
+       case STRSCAN_U64:
+-	o->u64 = neg ? (uint64_t)-(int64_t)x : x;
++	o->u64 = neg ? ~x+1u : x;
+ 	return fmt;
+       default:
+       plainnumber:  /* Fast path for plain numbers < 2^63. */
+@@ -346,18 +348,18 @@ static StrScanFmt strscan_bin(const uint
+   switch (fmt) {
+   case STRSCAN_INT:
+     if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg) {
+-      o->i = neg ? -(int32_t)x : (int32_t)x;
++      o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+       return STRSCAN_INT;  /* Fast path for 32 bit integers. */
+     }
+     if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; break; }
+     /* fallthrough */
+   case STRSCAN_U32:
+     if (dig > 32) return STRSCAN_ERROR;
+-    o->i = neg ? -(int32_t)x : (int32_t)x;
++    o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+     return STRSCAN_U32;
+   case STRSCAN_I64:
+   case STRSCAN_U64:
+-    o->u64 = neg ? (uint64_t)-(int64_t)x : x;
++    o->u64 = neg ? ~x+1u : x;
+     return fmt;
+   default:
+     break;
+@@ -448,6 +450,7 @@ StrScanFmt lj_strscan_scan(const uint8_t
+       if (dig) {
+ 	ex = (int32_t)(dp-(p-1)); dp = p-1;
+ 	while (ex < 0 && *dp-- == '0') ex++, dig--;  /* Skip trailing zeros. */
++	if (ex <= -STRSCAN_MAXEXP) return STRSCAN_ERROR;
+ 	if (base == 16) ex *= 4;
+       }
+     }
+@@ -461,10 +464,11 @@ StrScanFmt lj_strscan_scan(const uint8_t
+       if (!lj_char_isdigit(*p)) return STRSCAN_ERROR;
+       xx = (*p++ & 15);
+       while (lj_char_isdigit(*p)) {
+-	if (xx < 65536) xx = xx * 10 + (*p & 15);
++	xx = xx * 10 + (*p & 15);
++	if (xx >= STRSCAN_MAXEXP) return STRSCAN_ERROR;
+ 	p++;
+       }
+-      ex += negx ? -(int32_t)xx : (int32_t)xx;
++      ex += negx ? (int32_t)(~xx+1u) : (int32_t)xx;
+     }
+ 
+     /* Parse suffix. */
+@@ -499,8 +503,11 @@ StrScanFmt lj_strscan_scan(const uint8_t
+       if ((opt & STRSCAN_OPT_TONUM)) {
+ 	o->n = neg ? -(double)x : (double)x;
+ 	return STRSCAN_NUM;
++      } else if (x == 0 && neg) {
++	o->n = -0.0;
++	return STRSCAN_NUM;
+       } else {
+-	o->i = neg ? -(int32_t)x : (int32_t)x;
++	o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;
+ 	return STRSCAN_INT;
+       }
+     }
+@@ -516,7 +523,7 @@ StrScanFmt lj_strscan_scan(const uint8_t
+       fmt = strscan_dec(sp, o, fmt, opt, ex, neg, dig);
+ 
+     /* Try to convert number to integer, if requested. */
+-    if (fmt == STRSCAN_NUM && (opt & STRSCAN_OPT_TOINT)) {
++    if (fmt == STRSCAN_NUM && (opt & STRSCAN_OPT_TOINT) && !tvismzero(o)) {
+       double n = o->n;
+       int32_t i = lj_num2int(n);
+       if (n == (lua_Number)i) { o->i = i; return STRSCAN_INT; }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strscan.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_strscan.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_strscan.h
+@@ -1,6 +1,6 @@
+ /*
+ ** String scanning.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_STRSCAN_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_tab.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_tab.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_tab.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Table handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -16,25 +16,6 @@
+ 
+ /* -- Object hashing ------------------------------------------------------ */
+ 
+-/* Hash values are masked with the table hash mask and used as an index. */
+-static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash)
+-{
+-  Node *n = noderef(t->node);
+-  return &n[hash & t->hmask];
+-}
+-
+-/* String IDs are generated when a string is interned. */
+-#define hashstr(t, s)		hashmask(t, (s)->sid)
+-
+-#define hashlohi(t, lo, hi)	hashmask((t), hashrot((lo), (hi)))
+-#define hashnum(t, o)		hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1))
+-#if LJ_GC64
+-#define hashgcref(t, r) \
+-  hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32))
+-#else
+-#define hashgcref(t, r)		hashlohi((t), gcrefu(r), gcrefu(r) + HASH_BIAS)
+-#endif
+-
+ /* Hash an arbitrary key and return its anchor position in the hash table. */
+ static Node *hashkey(const GCtab *t, cTValue *key)
+ {
+@@ -413,7 +394,7 @@ cTValue * LJ_FASTCALL lj_tab_getinth(GCt
+   return NULL;
+ }
+ 
+-cTValue *lj_tab_getstr(GCtab *t, GCstr *key)
++cTValue *lj_tab_getstr(GCtab *t, const GCstr *key)
+ {
+   Node *n = hashstr(t, key);
+   do {
+@@ -546,7 +527,7 @@ TValue *lj_tab_setinth(lua_State *L, GCt
+   return lj_tab_newkey(L, t, &k);
+ }
+ 
+-TValue *lj_tab_setstr(lua_State *L, GCtab *t, GCstr *key)
++TValue *lj_tab_setstr(lua_State *L, GCtab *t, const GCstr *key)
+ {
+   TValue k;
+   Node *n = hashstr(t, key);
+@@ -587,56 +568,66 @@ TValue *lj_tab_set(lua_State *L, GCtab *
+ 
+ /* -- Table traversal ----------------------------------------------------- */
+ 
+-/* Get the traversal index of a key. */
+-static uint32_t keyindex(lua_State *L, GCtab *t, cTValue *key)
++/* Table traversal indexes:
++**
++** Array key index: [0 .. t->asize-1]
++** Hash key index:  [t->asize .. t->asize+t->hmask]
++** Invalid key:     ~0
++*/
++
++/* Get the successor traversal index of a key. */
++uint32_t LJ_FASTCALL lj_tab_keyindex(GCtab *t, cTValue *key)
+ {
+   TValue tmp;
+   if (tvisint(key)) {
+     int32_t k = intV(key);
+     if ((uint32_t)k < t->asize)
+-      return (uint32_t)k;  /* Array key indexes: [0..t->asize-1] */
++      return (uint32_t)k + 1;
+     setnumV(&tmp, (lua_Number)k);
+     key = &tmp;
+   } else if (tvisnum(key)) {
+     lua_Number nk = numV(key);
+     int32_t k = lj_num2int(nk);
+     if ((uint32_t)k < t->asize && nk == (lua_Number)k)
+-      return (uint32_t)k;  /* Array key indexes: [0..t->asize-1] */
++      return (uint32_t)k + 1;
+   }
+   if (!tvisnil(key)) {
+     Node *n = hashkey(t, key);
+     do {
+       if (lj_obj_equal(&n->key, key))
+-	return t->asize + (uint32_t)(n - noderef(t->node));
+-	/* Hash key indexes: [t->asize..t->asize+t->nmask] */
++	return t->asize + (uint32_t)((n+1) - noderef(t->node));
+     } while ((n = nextnode(n)));
+-    if (key->u32.hi == 0xfffe7fff)  /* ITERN was despecialized while running. */
+-      return key->u32.lo - 1;
+-    lj_err_msg(L, LJ_ERR_NEXTIDX);
+-    return 0;  /* unreachable */
+-  }
+-  return ~0u;  /* A nil key starts the traversal. */
+-}
+-
+-/* Advance to the next step in a table traversal. */
+-int lj_tab_next(lua_State *L, GCtab *t, TValue *key)
+-{
+-  uint32_t i = keyindex(L, t, key);  /* Find predecessor key index. */
+-  for (i++; i < t->asize; i++)  /* First traverse the array keys. */
+-    if (!tvisnil(arrayslot(t, i))) {
+-      setintV(key, i);
+-      copyTV(L, key+1, arrayslot(t, i));
++    if (key->u32.hi == LJ_KEYINDEX)  /* Despecialized ITERN while running. */
++      return key->u32.lo;
++    return ~0u;  /* Invalid key to next. */
++  }
++  return 0;  /* A nil key starts the traversal. */
++}
++
++/* Get the next key/value pair of a table traversal. */
++int lj_tab_next(GCtab *t, cTValue *key, TValue *o)
++{
++  uint32_t idx = lj_tab_keyindex(t, key);  /* Find successor index of key. */
++  /* First traverse the array part. */
++  for (; idx < t->asize; idx++) {
++    cTValue *a = arrayslot(t, idx);
++    if (LJ_LIKELY(!tvisnil(a))) {
++      setintV(o, idx);
++      o[1] = *a;
+       return 1;
+     }
+-  for (i -= t->asize; i <= t->hmask; i++) {  /* Then traverse the hash keys. */
+-    Node *n = &noderef(t->node)[i];
++  }
++  idx -= t->asize;
++  /* Then traverse the hash part. */
++  for (; idx <= t->hmask; idx++) {
++    Node *n = &noderef(t->node)[idx];
+     if (!tvisnil(&n->val)) {
+-      copyTV(L, key, &n->key);
+-      copyTV(L, key+1, &n->val);
++      o[0] = n->key;
++      o[1] = n->val;
+       return 1;
+     }
+   }
+-  return 0;  /* End of traversal. */
++  return (int32_t)idx < 0 ? -1 : 0;  /* Invalid key or end of traversal. */
+ }
+ 
+ /* -- Table length calculation -------------------------------------------- */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_tab.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_tab.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_tab.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Table handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TAB_H
+@@ -31,6 +31,25 @@ static LJ_AINLINE uint32_t hashrot(uint3
+   return hi;
+ }
+ 
++/* Hash values are masked with the table hash mask and used as an index. */
++static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash)
++{
++  Node *n = noderef(t->node);
++  return &n[hash & t->hmask];
++}
++
++/* String IDs are generated when a string is interned. */
++#define hashstr(t, s)		hashmask(t, (s)->sid)
++
++#define hashlohi(t, lo, hi)	hashmask((t), hashrot((lo), (hi)))
++#define hashnum(t, o)		hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1))
++#if LJ_GC64
++#define hashgcref(t, r) \
++  hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32))
++#else
++#define hashgcref(t, r)		hashlohi((t), gcrefu(r), gcrefu(r) + HASH_BIAS)
++#endif
++
+ #define hsize2hbits(s)	((s) ? ((s)==1 ? 1 : 1+lj_fls((uint32_t)((s)-1))) : 0)
+ 
+ LJ_FUNCA GCtab *lj_tab_new(lua_State *L, uint32_t asize, uint32_t hbits);
+@@ -50,14 +69,14 @@ LJ_FUNCA void lj_tab_reasize(lua_State *
+ /* Caveat: all getters except lj_tab_get() can return NULL! */
+ 
+ LJ_FUNCA cTValue * LJ_FASTCALL lj_tab_getinth(GCtab *t, int32_t key);
+-LJ_FUNC cTValue *lj_tab_getstr(GCtab *t, GCstr *key);
++LJ_FUNC cTValue *lj_tab_getstr(GCtab *t, const GCstr *key);
+ LJ_FUNCA cTValue *lj_tab_get(lua_State *L, GCtab *t, cTValue *key);
+ 
+ /* Caveat: all setters require a write barrier for the stored value. */
+ 
+ LJ_FUNCA TValue *lj_tab_newkey(lua_State *L, GCtab *t, cTValue *key);
+ LJ_FUNCA TValue *lj_tab_setinth(lua_State *L, GCtab *t, int32_t key);
+-LJ_FUNC TValue *lj_tab_setstr(lua_State *L, GCtab *t, GCstr *key);
++LJ_FUNC TValue *lj_tab_setstr(lua_State *L, GCtab *t, const GCstr *key);
+ LJ_FUNC TValue *lj_tab_set(lua_State *L, GCtab *t, cTValue *key);
+ 
+ #define inarray(t, key)		((MSize)(key) < (MSize)(t)->asize)
+@@ -67,7 +86,8 @@ LJ_FUNC TValue *lj_tab_set(lua_State *L,
+ #define lj_tab_setint(L, t, key) \
+   (inarray((t), (key)) ? arrayslot((t), (key)) : lj_tab_setinth(L, (t), (key)))
+ 
+-LJ_FUNCA int lj_tab_next(lua_State *L, GCtab *t, TValue *key);
++LJ_FUNC uint32_t LJ_FASTCALL lj_tab_keyindex(GCtab *t, cTValue *key);
++LJ_FUNCA int lj_tab_next(GCtab *t, cTValue *key, TValue *o);
+ LJ_FUNCA MSize LJ_FASTCALL lj_tab_len(GCtab *t);
+ #if LJ_HASJIT
+ LJ_FUNC MSize LJ_FASTCALL lj_tab_len_hint(GCtab *t, size_t hint);
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_target.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Definitions for target CPU.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TARGET_H
+@@ -55,10 +55,16 @@ typedef uint32_t RegSP;
+ /* Bitset for registers. 32 registers suffice for most architectures.
+ ** Note that one set holds bits for both GPRs and FPRs.
+ */
+-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 || LJ_TARGET_RISCV64
+ typedef uint64_t RegSet;
++#define RSET_BITS		6
++#define rset_picktop_(rs)	((Reg)lj_fls64(rs))
++#define rset_pickbot_(rs)	((Reg)lj_ffs64(rs))
+ #else
+ typedef uint32_t RegSet;
++#define RSET_BITS		5
++#define rset_picktop_(rs)	((Reg)lj_fls(rs))
++#define rset_pickbot_(rs)	((Reg)lj_ffs(rs))
+ #endif
+ 
+ #define RID2RSET(r)		(((RegSet)1) << (r))
+@@ -69,13 +75,6 @@ typedef uint32_t RegSet;
+ #define rset_set(rs, r)		(rs |= RID2RSET(r))
+ #define rset_clear(rs, r)	(rs &= ~RID2RSET(r))
+ #define rset_exclude(rs, r)	(rs & ~RID2RSET(r))
+-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
+-#define rset_picktop(rs)	((Reg)(__builtin_clzll(rs)^63))
+-#define rset_pickbot(rs)	((Reg)__builtin_ctzll(rs))
+-#else
+-#define rset_picktop(rs)	((Reg)lj_fls(rs))
+-#define rset_pickbot(rs)	((Reg)lj_ffs(rs))
+-#endif
+ 
+ /* -- Register allocation cost -------------------------------------------- */
+ 
+@@ -144,6 +143,8 @@ typedef uint32_t RegCost;
+ #include "lj_target_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "lj_target_mips.h"
++#elif LJ_TARGET_RISCV64
++#include "lj_target_riscv.h"
+ #else
+ #error "Missing include for target CPU"
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_arm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_target_arm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_arm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Definitions for ARM CPUs.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TARGET_ARM_H
+@@ -211,6 +211,7 @@ typedef enum ARMIns {
+   /* ARMv6T2 */
+   ARMI_MOVW = 0xe3000000,
+   ARMI_MOVT = 0xe3400000,
++  ARMI_BFI = 0xe7c00010,
+ 
+   /* VFP */
+   ARMI_VMOV_D = 0xeeb00b40,
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_arm64.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_target_arm64.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_arm64.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Definitions for ARM64 CPUs.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TARGET_ARM64_H
+@@ -31,6 +31,8 @@ enum {
+ 
+   /* Calling conventions. */
+   RID_RET = RID_X0,
++  RID_RETLO = RID_X0,
++  RID_RETHI = RID_X1,
+   RID_FPRET = RID_D0,
+ 
+   /* These definitions must match with the *.dasc file(s): */
+@@ -210,6 +212,8 @@ typedef enum A64Ins {
+ 
+   A64I_EXTRw = 0x13800000,
+   A64I_EXTRx = 0x93c00000,
++  A64I_BFMw = 0x33000000,
++  A64I_BFMx = 0xb3400000,
+   A64I_SBFMw = 0x13000000,
+   A64I_SBFMx = 0x93400000,
+   A64I_SXTBw = 0x13001c00,
+@@ -230,6 +234,8 @@ typedef enum A64Ins {
+   A64I_MOVZx = 0xd2800000,
+   A64I_MOVNw = 0x12800000,
+   A64I_MOVNx = 0x92800000,
++  A64I_ADR = 0x10000000,
++  A64I_ADRP = 0x90000000,
+ 
+   A64I_LDRB = 0x39400000,
+   A64I_LDRH = 0x79400000,
+@@ -256,6 +262,9 @@ typedef enum A64Ins {
+   A64I_CBZ = 0x34000000,
+   A64I_CBNZ = 0x35000000,
+ 
++  A64I_BRAAZ = 0xd61f081f,
++  A64I_BLRAAZ = 0xd63f081f,
++
+   A64I_NOP = 0xd503201f,
+ 
+   /* FP */
+@@ -313,6 +322,9 @@ typedef enum A64Ins {
+   A64I_FMOV_DI = 0x1e601000,
+ } A64Ins;
+ 
++#define A64I_BR_AUTH	(LJ_ABI_PAUTH ? A64I_BRAAZ : A64I_BR)
++#define A64I_BLR_AUTH	(LJ_ABI_PAUTH ? A64I_BLRAAZ : A64I_BLR)
++
+ typedef enum A64Shift {
+   A64SH_LSL, A64SH_LSR, A64SH_ASR, A64SH_ROR
+ } A64Shift;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_mips.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_target_mips.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_mips.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Definitions for MIPS CPUs.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TARGET_MIPS_H
+@@ -256,6 +256,8 @@ typedef enum MIPSIns {
+   MIPSI_ROTRV = 0x00000046,	/* MIPSXXR2 */
+   MIPSI_DROTRV = 0x00000056,
+ 
++  MIPSI_INS = 0x7c000004,	/* MIPSXXR2 */
++
+   MIPSI_SEB = 0x7c000420,	/* MIPSXXR2 */
+   MIPSI_SEH = 0x7c000620,	/* MIPSXXR2 */
+   MIPSI_WSBH = 0x7c0000a0,	/* MIPSXXR2 */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_ppc.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_target_ppc.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_ppc.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Definitions for PPC CPUs.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TARGET_PPC_H
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_riscv.h
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_riscv.h
+@@ -0,0 +1,513 @@
++/*
++** Definitions for RISC-V CPUs.
++** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++#ifndef _LJ_TARGET_RISCV_H
++#define _LJ_TARGET_RISCV_H
++
++/* -- Registers IDs ------------------------------------------------------- */
++
++#if LJ_ARCH_EMBEDDED
++#define GPRDEF(_) \
++  _(X0) _(RA) _(SP) _(X3) _(X4) _(X5) _(X6) _(X7) \
++  _(X8) _(X9) _(X10) _(X11) _(X12) _(X13) _(X14) _(X15)
++#else
++#define GPRDEF(_) \
++  _(X0) _(RA) _(SP) _(X3) _(X4) _(X5) _(X6) _(X7) \
++  _(X8) _(X9) _(X10) _(X11) _(X12) _(X13) _(X14) _(X15) \
++  _(X16) _(X17) _(X18) _(X19) _(X20) _(X21) _(X22) _(X23) \
++  _(X24) _(X25) _(X26) _(X27) _(X28) _(X29) _(X30) _(X31)
++#endif
++#if LJ_SOFTFP
++#define FPRDEF(_)
++#else
++#define FPRDEF(_) \
++  _(F0) _(F1) _(F2) _(F3) _(F4) _(F5) _(F6) _(F7) \
++  _(F8) _(F9) _(F10) _(F11) _(F12) _(F13) _(F14) _(F15) \
++  _(F16) _(F17) _(F18) _(F19) _(F20) _(F21) _(F22) _(F23) \
++  _(F24) _(F25) _(F26) _(F27) _(F28) _(F29) _(F30) _(F31)
++#endif
++#define VRIDDEF(_)
++
++#define RIDENUM(name)	RID_##name,
++
++enum {
++  GPRDEF(RIDENUM)		/* General-purpose registers (GPRs). */
++  FPRDEF(RIDENUM)		/* Floating-point registers (FPRs). */
++  RID_MAX,
++  RID_ZERO = RID_X0,
++  RID_TMP = RID_RA,
++  RID_GP = RID_X3,
++  RID_TP = RID_X4,
++
++  /* Calling conventions. */
++  RID_RET = RID_X10,
++#if LJ_LE
++  RID_RETHI = RID_X11,
++  RID_RETLO = RID_X10,
++#else
++  RID_RETHI = RID_X10,
++  RID_RETLO = RID_X11,
++#endif
++#if LJ_SOFTFP
++  RID_FPRET = RID_X10,
++#else
++  RID_FPRET = RID_F10,
++#endif
++  RID_CFUNCADDR = RID_X5,
++
++  /* These definitions must match with the *.dasc file(s): */
++  RID_BASE = RID_X18,		/* Interpreter BASE. */
++  RID_LPC = RID_X20,		/* Interpreter PC. */
++  RID_GL = RID_X21,		/* Interpreter GL. */
++  RID_LREG = RID_X23,		/* Interpreter L. */
++
++  /* Register ranges [min, max) and number of registers. */
++  RID_MIN_GPR = RID_X0,
++  RID_MAX_GPR = RID_X31+1,
++  RID_MIN_FPR = RID_MAX_GPR,
++#if LJ_SOFTFP
++  RID_MAX_FPR = RID_MIN_FPR,
++#else
++  RID_MAX_FPR = RID_F31+1,
++#endif
++  RID_NUM_GPR = RID_MAX_GPR - RID_MIN_GPR,
++  RID_NUM_FPR = RID_MAX_FPR - RID_MIN_FPR	/* F0..F31, or 0 if LJ_SOFTFP. */
++};
++
++#define RID_NUM_KREF		RID_NUM_GPR
++#define RID_MIN_KREF		RID_X0
++
++/* -- Register sets ------------------------------------------------------- */
++
++/* Make use of all registers, except ZERO, TMP, SP, GP, TP and GL. */
++#define RSET_FIXED \
++  (RID2RSET(RID_ZERO)|RID2RSET(RID_TMP)|RID2RSET(RID_SP)|\
++   RID2RSET(RID_GP)|RID2RSET(RID_TP)|RID2RSET(RID_GL))
++#define RSET_GPR	(RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
++#if LJ_SOFTFP
++#define RSET_FPR	0
++#else
++#define RSET_FPR	RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
++#endif
++
++#define RSET_ALL	(RSET_GPR|RSET_FPR)
++#define RSET_INIT	RSET_ALL
++
++#define RSET_SCRATCH_GPR \
++  (RSET_RANGE(RID_X5, RID_X7+1)|RSET_RANGE(RID_X28, RID_X31+1)|\
++   RSET_RANGE(RID_X10, RID_X17+1))
++
++#if LJ_SOFTFP
++#define RSET_SCRATCH_FPR	0
++#else
++#define RSET_SCRATCH_FPR \
++  (RSET_RANGE(RID_F0, RID_F7+1)|RSET_RANGE(RID_F10, RID_F17+1)|\
++   RSET_RANGE(RID_F28, RID_F31+1))
++#endif
++#define RSET_SCRATCH		(RSET_SCRATCH_GPR|RSET_SCRATCH_FPR)
++
++#define REGARG_FIRSTGPR		RID_X10
++#define REGARG_LASTGPR		RID_X17
++#define REGARG_NUMGPR		8
++
++#if LJ_ABI_SOFTFP
++#define REGARG_FIRSTFPR		0
++#define REGARG_LASTFPR		0
++#define REGARG_NUMFPR		0
++#else
++#define REGARG_FIRSTFPR		RID_F10
++#define REGARG_LASTFPR		RID_F17
++#define REGARG_NUMFPR		8
++#endif
++
++/* -- Spill slots --------------------------------------------------------- */
++
++/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
++**
++** SPS_FIXED: Available fixed spill slots in interpreter frame.
++** This definition must match with the *.dasc file(s).
++**
++** SPS_FIRST: First spill slot for general use.
++*/
++#if LJ_32
++#define SPS_FIXED	5
++#else
++#define SPS_FIXED	4
++#endif
++#define SPS_FIRST	4
++
++#define SPOFS_TMP	0
++
++#define sps_scale(slot)		(4 * (int32_t)(slot))
++#define sps_align(slot)		(((slot) - SPS_FIXED + 3) & ~3)
++
++/* -- Exit state ---------------------------------------------------------- */
++/* This definition must match with the *.dasc file(s). */
++typedef struct {
++#if !LJ_SOFTFP
++  lua_Number fpr[RID_NUM_FPR];	/* Floating-point registers. */
++#endif
++  intptr_t gpr[RID_NUM_GPR];	/* General-purpose registers. */
++  int32_t spill[256];		/* Spill slots. */
++} ExitState;
++
++/* Highest exit + 1 indicates stack check. */
++#define EXITSTATE_CHECKEXIT	1
++
++/* Return the address of a per-trace exit stub. */
++static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno)
++{
++  while (*p == 0x00000013) p++;  /* Skip RISCVI_NOP. */
++  return p + 4 + exitno;
++}
++/* Avoid dependence on lj_jit.h if only including lj_target.h. */
++#define exitstub_trace_addr(T, exitno) \
++  exitstub_trace_addr_((MCode *)((char *)(T)->mcode + (T)->szmcode), (exitno))
++
++/* -- Instructions -------------------------------------------------------- */
++
++/* Instruction fields. */
++#define RISCVF_D(d)	(((d)&31) << 7)
++#define RISCVF_S1(r)	(((r)&31) << 15)
++#define RISCVF_S2(r)	(((r)&31) << 20)
++#define RISCVF_S3(r)	(((r)&31) << 27)
++#define RISCVF_FUNCT2(f)	(((f)&3) << 25)
++#define RISCVF_FUNCT3(f)	(((f)&7) << 12)
++#define RISCVF_FUNCT7(f)	(((f)&127) << 25)
++#define RISCVF_SHAMT(s)	((s) << 20)
++#define RISCVF_RM(m)	(((m)&7) << 12)
++#define RISCVF_IMMI(i)	((i) << 20)
++#define RISCVF_IMMS(i)	(((i)&0xfe0) << 20 | ((i)&0x1f) << 7)
++#define RISCVF_IMMB(i)	(((i)&0x1000) << 19 | ((i)&0x800) >> 4 | ((i)&0x7e0) << 20 | ((i)&0x1e) << 7)
++#define RISCVF_IMMU(i)	(((i)&0xfffff) << 12)
++#define RISCVF_IMMJ(i)	(((i)&0x100000) << 11 | ((i)&0xff000) | ((i)&0x800) << 9 | ((i)&0x7fe) << 20)
++
++/* Encode helpers. */
++#define RISCVF_W_HI(w)  ((w) - ((((w)&0xfff)^0x800) - 0x800))
++#define RISCVF_W_LO(w)  ((w)&0xfff)
++#define RISCVF_HI(i)	((RISCVF_W_HI(i) >> 12) & 0xfffff)
++#define RISCVF_LO(i)	RISCVF_W_LO(i)
++
++/* Check for valid field range. */
++#define RISCVF_SIMM_OK(x, b)	((((x) + (1 << (b-1))) >> (b)) == 0)
++#define checki12(i)		RISCVF_SIMM_OK(i, 12)
++#define checki13(i)		RISCVF_SIMM_OK(i, 13)
++#define checki20(i)		RISCVF_SIMM_OK(i, 20)
++#define checki21(i)		RISCVF_SIMM_OK(i, 21)
++
++typedef enum RISCVIns {
++
++  /* --- RVI --- */
++  RISCVI_LUI = 0x00000037,
++  RISCVI_AUIPC = 0x00000017,
++
++  RISCVI_JAL = 0x0000006f,
++  RISCVI_JALR = 0x00000067,
++
++  RISCVI_ADDI = 0x00000013,
++  RISCVI_SLTI = 0x00002013,
++  RISCVI_SLTIU = 0x00003013,
++  RISCVI_XORI = 0x00004013,
++  RISCVI_ORI = 0x00006013,
++  RISCVI_ANDI = 0x00007013,
++
++  RISCVI_SLLI = 0x00001013,
++  RISCVI_SRLI = 0x00005013,
++  RISCVI_SRAI = 0x40005013,
++
++  RISCVI_ADD = 0x00000033,
++  RISCVI_SUB = 0x40000033,
++  RISCVI_SLL = 0x00001033,
++  RISCVI_SLT = 0x00002033,
++  RISCVI_SLTU = 0x00003033,
++  RISCVI_XOR = 0x00004033,
++  RISCVI_SRL = 0x00005033,
++  RISCVI_SRA = 0x40005033,
++  RISCVI_OR = 0x00006033,
++  RISCVI_AND = 0x00007033,
++
++  RISCVI_LB = 0x00000003,
++  RISCVI_LH = 0x00001003,
++  RISCVI_LW = 0x00002003,
++  RISCVI_LBU = 0x00004003,
++  RISCVI_LHU = 0x00005003,
++  RISCVI_SB = 0x00000023,
++  RISCVI_SH = 0x00001023,
++  RISCVI_SW = 0x00002023,
++
++  RISCVI_BEQ = 0x00000063,
++  RISCVI_BNE = 0x00001063,
++  RISCVI_BLT = 0x00004063,
++  RISCVI_BGE = 0x00005063,
++  RISCVI_BLTU = 0x00006063,
++  RISCVI_BGEU = 0x00007063,
++
++  RISCVI_ECALL = 0x00000073,
++  RISCVI_EBREAK = 0x00100073,
++
++  RISCVI_NOP = 0x00000013,
++  RISCVI_MV = 0x00000013,
++  RISCVI_NOT = 0xfff04013,
++  RISCVI_NEG = 0x40000033,
++  RISCVI_RET = 0x00008067,
++  RISCVI_ZEXT_B = 0x0ff07013,
++
++#if LJ_TARGET_RISCV64
++  RISCVI_LWU = 0x00006003,	/* LOAD opcode, funct3 = 110. */
++  RISCVI_LD = 0x00003003,
++  RISCVI_SD = 0x00003023,
++
++  RISCVI_ADDIW = 0x0000001b,
++
++  RISCVI_SLLIW = 0x0000101b,
++  RISCVI_SRLIW = 0x0000501b,
++  RISCVI_SRAIW = 0x4000501b,
++
++  RISCVI_ADDW = 0x0000003b,
++  RISCVI_SUBW = 0x4000003b,
++  RISCVI_SLLW = 0x0000103b,
++  RISCVI_SRLW = 0x0000503b,
++  RISCVI_SRAW = 0x4000503b,
++
++  RISCVI_NEGW = 0x4000003b,
++  RISCVI_SEXT_W = 0x0000001b,
++#endif
++
++  /* --- RVM --- */
++  RISCVI_MUL = 0x02000033,
++  RISCVI_MULH = 0x02001033,
++  RISCVI_MULHSU = 0x02002033,
++  RISCVI_MULHU = 0x02003033,
++  RISCVI_DIV = 0x02004033,
++  RISCVI_DIVU = 0x02005033,
++  RISCVI_REM = 0x02006033,
++  RISCVI_REMU = 0x02007033,
++#if LJ_TARGET_RISCV64
++  RISCVI_MULW = 0x0200003b,
++  RISCVI_DIVW = 0x0200403b,
++  RISCVI_DIVUW = 0x0200503b,
++  RISCVI_REMW = 0x0200603b,
++  RISCVI_REMUW = 0x0200703b,
++#endif
++
++  /* --- RVF --- */
++  RISCVI_FLW = 0x00002007,
++  RISCVI_FSW = 0x00002027,
++
++  RISCVI_FMADD_S = 0x00000043,
++  RISCVI_FMSUB_S = 0x00000047,
++  RISCVI_FNMSUB_S = 0x0000004b,
++  RISCVI_FNMADD_S = 0x0000004f,
++
++  RISCVI_FADD_S = 0x00000053,
++  RISCVI_FSUB_S = 0x08000053,
++  RISCVI_FMUL_S = 0x10000053,
++  RISCVI_FDIV_S = 0x18000053,
++  RISCVI_FSQRT_S = 0x58000053,
++
++  RISCVI_FSGNJ_S = 0x20000053,
++  RISCVI_FSGNJN_S = 0x20001053,
++  RISCVI_FSGNJX_S = 0x20002053,
++
++  RISCVI_FMIN_S = 0x28000053,
++  RISCVI_FMAX_S = 0x28001053,
++
++  RISCVI_FCVT_W_S = 0xc0000053,
++  RISCVI_FCVT_WU_S = 0xc0100053,
++
++  RISCVI_FMV_X_W = 0xe0000053,
++
++  RISCVI_FEQ_S = 0xa0002053,
++  RISCVI_FLT_S = 0xa0001053,
++  RISCVI_FLE_S = 0xa0000053,
++
++  RISCVI_FCLASS_S = 0xe0001053,
++
++  RISCVI_FCVT_S_W = 0xd0000053,
++  RISCVI_FCVT_S_WU = 0xd0100053,
++  RISCVI_FMV_W_X = 0xf0000053,	/* OP-FP opcode (0x53), funct7 = 1111000. */
++
++  RISCVI_FMV_S = 0x20000053,
++  RISCVI_FNEG_S = 0x20001053,
++  RISCVI_FABS_S = 0x20002053,
++#if LJ_TARGET_RISCV64
++  RISCVI_FCVT_L_S = 0xc0200053,
++  RISCVI_FCVT_LU_S = 0xc0300053,
++  RISCVI_FCVT_S_L = 0xd0200053,
++  RISCVI_FCVT_S_LU = 0xd0300053,
++#endif
++
++  /* --- RVD --- */
++  RISCVI_FLD = 0x00003007,
++  RISCVI_FSD = 0x00003027,
++
++  RISCVI_FMADD_D = 0x02000043,
++  RISCVI_FMSUB_D = 0x02000047,
++  RISCVI_FNMSUB_D = 0x0200004b,
++  RISCVI_FNMADD_D = 0x0200004f,
++
++  RISCVI_FADD_D = 0x02000053,
++  RISCVI_FSUB_D = 0x0a000053,
++  RISCVI_FMUL_D = 0x12000053,
++  RISCVI_FDIV_D = 0x1a000053,
++  RISCVI_FSQRT_D = 0x5a000053,
++
++  RISCVI_FSGNJ_D = 0x22000053,
++  RISCVI_FSGNJN_D = 0x22001053,
++  RISCVI_FSGNJX_D = 0x22002053,
++
++  RISCVI_FMIN_D = 0x2a000053,
++  RISCVI_FMAX_D = 0x2a001053,
++
++  RISCVI_FCVT_S_D = 0x40100053,
++  RISCVI_FCVT_D_S = 0x42000053,
++
++  RISCVI_FEQ_D = 0xa2002053,
++  RISCVI_FLT_D = 0xa2001053,
++  RISCVI_FLE_D = 0xa2000053,
++
++  RISCVI_FCLASS_D = 0xe2001053,
++
++  RISCVI_FCVT_W_D = 0xc2000053,
++  RISCVI_FCVT_WU_D = 0xc2100053,
++  RISCVI_FCVT_D_W = 0xd2000053,
++  RISCVI_FCVT_D_WU = 0xd2100053,
++
++  RISCVI_FMV_D = 0x22000053,
++  RISCVI_FNEG_D = 0x22001053,
++  RISCVI_FABS_D = 0x22002053,
++#if LJ_TARGET_RISCV64
++  RISCVI_FCVT_L_D = 0xc2200053,
++  RISCVI_FCVT_LU_D = 0xc2300053,
++  RISCVI_FMV_X_D = 0xe2000053,
++  RISCVI_FCVT_D_L = 0xd2200053,
++  RISCVI_FCVT_D_LU = 0xd2300053,
++  RISCVI_FMV_D_X = 0xf2000053,
++#endif
++
++  /* --- Zifencei --- */
++  RISCVI_FENCE = 0x0000000f,
++  RISCVI_FENCE_I = 0x0000100f,
++
++  /* --- Zicsr --- */
++  RISCVI_CSRRW = 0x00001073,
++  RISCVI_CSRRS = 0x00002073,
++  RISCVI_CSRRC = 0x00003073,
++  RISCVI_CSRRWI = 0x00005073,
++  RISCVI_CSRRSI = 0x00006073,
++  RISCVI_CSRRCI = 0x00007073,
++
++  /* --- RVB --- */
++  /* Zba */
++  RISCVI_SH1ADD = 0x20002033,
++  RISCVI_SH2ADD = 0x20004033,
++  RISCVI_SH3ADD = 0x20006033,
++#if LJ_TARGET_RISCV64
++  RISCVI_ADD_UW = 0x0800003b,
++
++  RISCVI_SH1ADD_UW = 0x2000203b,
++  RISCVI_SH2ADD_UW = 0x2000403b,
++  RISCVI_SH3ADD_UW = 0x2000603b,
++
++  RISCVI_SLLI_UW = 0x0800101b,
++
++  RISCVI_ZEXT_W = 0x0800003b,
++#endif
++  /* Zbb */
++  RISCVI_ANDN = 0x40007033,
++  RISCVI_ORN = 0x40006033,
++  RISCVI_XNOR = 0x40004033,
++
++  RISCVI_CLZ = 0x60001013,
++  RISCVI_CTZ = 0x60101013,
++
++  RISCVI_CPOP = 0x60201013,
++
++  RISCVI_MAX = 0x0a006033,
++  RISCVI_MAXU = 0x0a007033,
++  RISCVI_MIN = 0x0a004033,
++  RISCVI_MINU = 0x0a005033,
++
++  RISCVI_SEXT_B = 0x60401013,
++  RISCVI_SEXT_H = 0x60501013,
++#if LJ_TARGET_RISCV32
++  RISCVI_ZEXT_H = 0x08004033,
++#elif LJ_TARGET_RISCV64
++  RISCVI_ZEXT_H = 0x0800403b,
++#endif
++
++  RISCVI_ROL = 0x60001033,
++  RISCVI_ROR = 0x60005033,
++  RISCVI_RORI = 0x60005013,
++
++  RISCVI_ORC_B = 0x28705013,
++
++#if LJ_TARGET_RISCV32
++  RISCVI_REV8 = 0x69805013,
++#elif LJ_TARGET_RISCV64
++  RISCVI_REV8 = 0x6b805013,
++
++  RISCVI_CLZW = 0x6000101b,
++  RISCVI_CTZW = 0x6010101b,
++
++  RISCVI_CPOPW = 0x6020101b,
++
++  RISCVI_ROLW = 0x6000103b,
++  RISCVI_RORIW = 0x6000501b,
++  RISCVI_RORW = 0x6000503b,
++#endif
++  /* NYI: Zbc, Zbs */
++
++  /* TBD: RVV?, RVP?, RVJ? */
++
++  /* --- XThead* --- */
++  /* XTHeadBa */
++  RISCVI_TH_ADDSL = 0x0000100b,
++
++  /* XTHeadBb */
++  RISCVI_TH_SRRI = 0x1000100b,
++#if LJ_TARGET_RISCV64
++  RISCVI_TH_SRRIW = 0x1400100b,
++#endif
++  RISCVI_TH_EXT = 0x0000200b,
++  RISCVI_TH_EXTU = 0x0000300b,
++  RISCVI_TH_FF0 = 0x8400100b,
++  RISCVI_TH_FF1 = 0x8600100b,
++  RISCVI_TH_REV = 0x8200100b,
++#if LJ_TARGET_RISCV64
++  RISCVI_TH_REVW = 0x9000100b,
++#endif
++  RISCVI_TH_TSTNBZ = 0x8000100b,
++
++  /* XTHeadBs */
++  RISCVI_TH_TST = 0x8800100b,
++
++  /* XTHeadCondMov */
++  RISCVI_TH_MVEQZ = 0x4000100b,
++  RISCVI_TH_MVNEZ = 0x4200100b,
++
++  /* XTHeadMac */
++  RISCVI_TH_MULA = 0x2000100b,
++  RISCVI_TH_MULAH = 0x2800100b,
++#if LJ_TARGET_RISCV64
++  RISCVI_TH_MULAW = 0x2400100b,
++#endif
++  RISCVI_TH_MULS = 0x2200100b,
++  RISCVI_TH_MULSH = 0x2a00100b,
++  RISCVI_TH_MULSW = 0x2600100b,
++
++  /* NYI: XTHeadMemIdx, XTHeadFMemIdx, XTHeadMemPair */
++} RISCVIns;
++
++typedef enum RISCVRM {
++  RISCVRM_RNE = 0,
++  RISCVRM_RTZ = 1,
++  RISCVRM_RDN = 2,
++  RISCVRM_RUP = 3,
++  RISCVRM_RMM = 4,
++  RISCVRM_DYN = 7,
++} RISCVRM;
++
++#endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_x86.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_target_x86.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_target_x86.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Definitions for x86 and x64 CPUs.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TARGET_X86_H
+@@ -38,10 +38,9 @@ enum {
+   RID_RET = RID_EAX,
+ #if LJ_64
+   RID_FPRET = RID_XMM0,
+-#else
++#endif
+   RID_RETLO = RID_EAX,
+   RID_RETHI = RID_EDX,
+-#endif
+ 
+   /* These definitions must match with the *.dasc file(s): */
+   RID_BASE = RID_EDX,		/* Interpreter BASE. */
+@@ -117,8 +116,8 @@ enum {
+ 
+ #if LJ_64
+ /* Prefer the low 8 regs of each type to reduce REX prefixes. */
+-#undef rset_picktop
+-#define rset_picktop(rs)	(lj_fls(lj_bswap(rs)) ^ 0x18)
++#undef rset_picktop_
++#define rset_picktop_(rs)	(lj_fls(lj_bswap(rs)) ^ 0x18)
+ #endif
+ 
+ /* -- Spill slots --------------------------------------------------------- */
+@@ -165,6 +164,8 @@ typedef struct {
+ #define EXITSTUB_SPACING	(2+2)
+ #define EXITSTUBS_PER_GROUP	32
+ 
++#define EXITTRACE_VMSTATE	1	/* g->vmstate has traceno on exit. */
++
+ /* -- x86 ModRM operand encoding ------------------------------------------ */
+ 
+ typedef enum {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_trace.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_trace.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_trace.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_trace_c
+@@ -153,6 +153,9 @@ static void trace_save(jit_State *J, GCt
+   newwhite(J2G(J), T);
+   T->gct = ~LJ_TTRACE;
+   T->ir = (IRIns *)p - J->cur.nk;  /* The IR has already been copied above. */
++#if LJ_ABI_PAUTH
++  T->mcauth = lj_ptr_sign((ASMFunction)T->mcode, T);
++#endif
+   p += szins;
+   TRACE_APPENDVEC(snap, nsnap, SnapShot)
+   TRACE_APPENDVEC(snapmap, nsnapmap, SnapEntry)
+@@ -215,8 +218,8 @@ static void trace_unpatch(jit_State *J,
+     break;
+   case BC_JITERL:
+   case BC_JLOOP:
+-    lj_assertJ(op == BC_ITERL || op == BC_LOOP || bc_isret(op),
+-	       "bad original bytecode %d", op);
++    lj_assertJ(op == BC_ITERL || op == BC_ITERN || op == BC_LOOP ||
++	       bc_isret(op), "bad original bytecode %d", op);
+     *pc = T->startins;
+     break;
+   case BC_JMP:
+@@ -373,8 +376,13 @@ void lj_trace_freestate(global_State *g)
+ /* Blacklist a bytecode instruction. */
+ static void blacklist_pc(GCproto *pt, BCIns *pc)
+ {
+-  setbc_op(pc, (int)bc_op(*pc)+(int)BC_ILOOP-(int)BC_LOOP);
+-  pt->flags |= PROTO_ILOOP;
++  if (bc_op(*pc) == BC_ITERN) {
++    setbc_op(pc, BC_ITERC);
++    setbc_op(pc+1+bc_j(pc[1]), BC_JMP);
++  } else {
++    setbc_op(pc, (int)bc_op(*pc)+(int)BC_ILOOP-(int)BC_LOOP);
++    pt->flags |= PROTO_ILOOP;
++  }
+ }
+ 
+ /* Penalize a bytecode instruction. */
+@@ -411,7 +419,7 @@ static void trace_start(jit_State *J)
+   TraceNo traceno;
+ 
+   if ((J->pt->flags & PROTO_NOJIT)) {  /* JIT disabled for this proto? */
+-    if (J->parent == 0 && J->exitno == 0) {
++    if (J->parent == 0 && J->exitno == 0 && bc_op(*J->pc) != BC_ITERN) {
+       /* Lazy bytecode patching to disable hotcount events. */
+       lj_assertJ(bc_op(*J->pc) == BC_FORL || bc_op(*J->pc) == BC_ITERL ||
+ 		 bc_op(*J->pc) == BC_LOOP || bc_op(*J->pc) == BC_FUNCF,
+@@ -423,6 +431,12 @@ static void trace_start(jit_State *J)
+     return;
+   }
+ 
++  /* Ensuring forward progress for BC_ITERN can trigger hotcount again. */
++  if (!J->parent && bc_op(*J->pc) == BC_JLOOP) {  /* Already compiled. */
++    J->state = LJ_TRACE_IDLE;  /* Silently ignored. */
++    return;
++  }
++
+   /* Get a new trace number. */
+   traceno = trace_findfree(J);
+   if (LJ_UNLIKELY(traceno == 0)) {  /* No free trace? */
+@@ -496,6 +510,7 @@ static void trace_stop(jit_State *J)
+     J->cur.nextroot = pt->trace;
+     pt->trace = (TraceNo1)traceno;
+     break;
++  case BC_ITERN:
+   case BC_RET:
+   case BC_RET0:
+   case BC_RET1:
+@@ -506,7 +521,11 @@ static void trace_stop(jit_State *J)
+     lj_assertJ(J->parent != 0 && J->cur.root != 0, "not a side trace");
+     lj_asm_patchexit(J, traceref(J, J->parent), J->exitno, J->cur.mcode);
+     /* Avoid compiling a side trace twice (stack resizing uses parent exit). */
+-    traceref(J, J->parent)->snap[J->exitno].count = SNAPCOUNT_DONE;
++    {
++      SnapShot *snap = &traceref(J, J->parent)->snap[J->exitno];
++      snap->count = SNAPCOUNT_DONE;
++      if (J->cur.topslot > snap->topslot) snap->topslot = J->cur.topslot;
++    }
+     /* Add to side trace chain in root trace. */
+     {
+       GCtrace *root = traceref(J, J->cur.root);
+@@ -594,21 +613,27 @@ static int trace_abort(jit_State *J)
+     J->cur.link = 0;
+     J->cur.linktype = LJ_TRLINK_NONE;
+     lj_vmevent_send(L, TRACE,
+-      TValue *frame;
++      cTValue *bot = tvref(L->stack)+LJ_FR2;
++      cTValue *frame;
+       const BCIns *pc;
+-      GCfunc *fn;
++      BCPos pos = 0;
+       setstrV(L, L->top++, lj_str_newlit(L, "abort"));
+       setintV(L->top++, traceno);
+       /* Find original Lua function call to generate a better error message. */
+-      frame = J->L->base-1;
+-      pc = J->pc;
+-      while (!isluafunc(frame_func(frame))) {
+-	pc = (frame_iscont(frame) ? frame_contpc(frame) : frame_pc(frame)) - 1;
+-	frame = frame_prev(frame);
++      for (frame = J->L->base-1, pc = J->pc; ; frame = frame_prev(frame)) {
++	if (isluafunc(frame_func(frame))) {
++	  pos = proto_bcpos(funcproto(frame_func(frame)), pc);
++	  break;
++	} else if (frame_prev(frame) <= bot) {
++	  break;
++	} else if (frame_iscont(frame)) {
++	  pc = frame_contpc(frame) - 1;
++	} else {
++	  pc = frame_pc(frame) - 1;
++	}
+       }
+-      fn = frame_func(frame);
+-      setfuncV(L, L->top++, fn);
+-      setintV(L->top++, proto_bcpos(funcproto(fn), pc));
++      setfuncV(L, L->top++, frame_func(frame));
++      setintV(L->top++, pos);
+       copyTV(L, L->top++, restorestack(L, errobj));
+       copyTV(L, L->top++, &J->errinfo);
+     );
+@@ -651,15 +676,22 @@ static TValue *trace_state(lua_State *L,
+       J->state = LJ_TRACE_RECORD;  /* trace_start() may change state. */
+       trace_start(J);
+       lj_dispatch_update(J2G(J));
+-      break;
++      if (J->state != LJ_TRACE_RECORD_1ST)
++	break;
++      /* fallthrough */
+ 
++    case LJ_TRACE_RECORD_1ST:
++      J->state = LJ_TRACE_RECORD;
++      /* fallthrough */
+     case LJ_TRACE_RECORD:
+       trace_pendpatch(J, 0);
+       setvmstate(J2G(J), RECORD);
+       lj_vmevent_send_(L, RECORD,
+-	/* Save/restore tmptv state for trace recorder. */
++	/* Save/restore state for trace recorder. */
+ 	TValue savetv = J2G(J)->tmptv;
+ 	TValue savetv2 = J2G(J)->tmptv2;
++	TraceNo parent = J->parent;
++	ExitNo exitno = J->exitno;
+ 	setintV(L->top++, J->cur.traceno);
+ 	setfuncV(L, L->top++, J->fn);
+ 	setintV(L->top++, J->pt ? (int32_t)proto_bcpos(J->pt, J->pc) : -1);
+@@ -667,6 +699,8 @@ static TValue *trace_state(lua_State *L,
+       ,
+ 	J2G(J)->tmptv = savetv;
+ 	J2G(J)->tmptv2 = savetv2;
++	J->parent = parent;
++	J->exitno = exitno;
+       );
+       lj_record_ins(J);
+       break;
+@@ -821,7 +855,7 @@ static void trace_exit_regs(lua_State *L
+ }
+ #endif
+ 
+-#ifdef EXITSTATE_PCREG
++#if defined(EXITSTATE_PCREG) || (LJ_UNWIND_JIT && !EXITTRACE_VMSTATE)
+ /* Determine trace number from pc of exit instruction. */
+ static TraceNo trace_exit_find(jit_State *J, MCode *pc)
+ {
+@@ -843,10 +877,18 @@ int LJ_FASTCALL lj_trace_exit(jit_State
+   lua_State *L = J->L;
+   ExitState *ex = (ExitState *)exptr;
+   ExitDataCP exd;
+-  int errcode;
+-  const BCIns *pc;
++  int errcode, exitcode = J->exitcode;
++  TValue exiterr;
++  const BCIns *pc, *retpc;
+   void *cf;
+   GCtrace *T;
++
++  setnilV(&exiterr);
++  if (exitcode) {  /* Trace unwound with error code. */
++    J->exitcode = 0;
++    copyTV(L, &exiterr, L->top-1);
++  }
++
+ #ifdef EXITSTATE_PCREG
+   J->parent = trace_exit_find(J, (MCode *)(intptr_t)ex->gpr[EXITSTATE_PCREG]);
+ #endif
+@@ -866,6 +908,8 @@ int LJ_FASTCALL lj_trace_exit(jit_State
+   if (errcode)
+     return -errcode;  /* Return negated error code. */
+ 
++  if (exitcode) copyTV(L, L->top++, &exiterr);  /* Anchor the error object. */
++
+   if (!(LJ_HASPROFILE && (G(L)->hookmask & HOOK_PROFILE)))
+     lj_vmevent_send(L, TEXIT,
+       lj_state_checkstack(L, 4+RID_NUM_GPR+RID_NUM_FPR+LUA_MINSTACK);
+@@ -877,7 +921,9 @@ int LJ_FASTCALL lj_trace_exit(jit_State
+   pc = exd.pc;
+   cf = cframe_raw(L->cframe);
+   setcframe_pc(cf, pc);
+-  if (LJ_HASPROFILE && (G(L)->hookmask & HOOK_PROFILE)) {
++  if (exitcode) {
++    return -exitcode;
++  } else if (LJ_HASPROFILE && (G(L)->hookmask & HOOK_PROFILE)) {
+     /* Just exit to interpreter. */
+   } else if (G(L)->gc.state == GCSatomic || G(L)->gc.state == GCSfinalize) {
+     if (!(G(L)->hookmask & HOOK_GC))
+@@ -885,21 +931,7 @@ int LJ_FASTCALL lj_trace_exit(jit_State
+   } else {
+     trace_hotside(J, pc);
+   }
+-  if (bc_op(*pc) == BC_JLOOP) {
+-    BCIns *retpc = &traceref(J, bc_d(*pc))->startins;
+-    if (bc_isret(bc_op(*retpc))) {
+-      if (J->state == LJ_TRACE_RECORD) {
+-	J->patchins = *pc;
+-	J->patchpc = (BCIns *)pc;
+-	*J->patchpc = *retpc;
+-	J->bcskip = 1;
+-      } else {
+-	pc = retpc;
+-	setcframe_pc(cf, pc);
+-      }
+-    }
+-  }
+-  /* Return MULTRES or 0. */
++  /* Return MULTRES or 0 or -17. */
+   ERRNO_RESTORE
+   switch (bc_op(*pc)) {
+   case BC_CALLM: case BC_CALLMT:
+@@ -908,6 +940,18 @@ int LJ_FASTCALL lj_trace_exit(jit_State
+     return (int)((BCReg)(L->top - L->base) + 1 - bc_a(*pc) - bc_d(*pc));
+   case BC_TSETM:
+     return (int)((BCReg)(L->top - L->base) + 1 - bc_a(*pc));
++  case BC_JLOOP:
++    retpc = &traceref(J, bc_d(*pc))->startins;
++    if (bc_isret(bc_op(*retpc)) || bc_op(*retpc) == BC_ITERN) {
++      /* Dispatch to original ins to ensure forward progress. */
++      if (J->state != LJ_TRACE_RECORD) return -17;
++      /* Unpatch bytecode when recording. */
++      J->patchins = *pc;
++      J->patchpc = (BCIns *)pc;
++      *J->patchpc = *retpc;
++      J->bcskip = 1;
++    }
++    return 0;
+   default:
+     if (bc_op(*pc) >= BC_FUNCF)
+       return (int)((BCReg)(L->top - L->base) + 1);
+@@ -915,4 +959,41 @@ int LJ_FASTCALL lj_trace_exit(jit_State
+   }
+ }
+ 
++#if LJ_UNWIND_JIT
++/* Given an mcode address determine trace exit address for unwinding. Stores the exit number in *ep. Asserts and returns 0 if addr cannot be mapped. */
++uintptr_t LJ_FASTCALL lj_trace_unwind(jit_State *J, uintptr_t addr, ExitNo *ep)
++{
++#if EXITTRACE_VMSTATE
++  TraceNo traceno = J2G(J)->vmstate;  /* Trace number is read from the VM state. */
++#else
++  TraceNo traceno = trace_exit_find(J, (MCode *)addr);  /* Determine trace number from the mcode address. */
++#endif
++  GCtrace *T = traceref(J, traceno);
++  if (T  /* With EXITTRACE_VMSTATE, addr must also lie within T's mcode. */
++#if EXITTRACE_VMSTATE
++      && addr >= (uintptr_t)T->mcode && addr < (uintptr_t)T->mcode + T->szmcode
++#endif
++     ) {
++    SnapShot *snap = T->snap;
++    SnapNo lo = 0, exitno = T->nsnap;  /* Search the half-open range [lo, exitno). */
++    uintptr_t ofs = (uintptr_t)((MCode *)addr - T->mcode);  /* MCode units! */
++    /* Rightmost binary search for mcode offset to determine exit number. */
++    do {
++      SnapNo mid = (lo+exitno) >> 1;
++      if (ofs < snap[mid].mcofs) exitno = mid; else lo = mid + 1;
++    } while (lo < exitno);
++    exitno--;  /* Step back to the last snapshot with mcofs <= ofs. */
++    *ep = exitno;
++#ifdef EXITSTUBS_PER_GROUP
++    return (uintptr_t)exitstub_addr(J, exitno);
++#else
++    return (uintptr_t)exitstub_trace_addr(T, exitno);
++#endif
++  }
++  /* Cannot correlate addr with trace/exit. This will be fatal. */
++  lj_assertJ(0, "bad exit pc");
++  return 0;
++}
++#endif
++
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_trace.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_trace.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_trace.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace management.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_TRACE_H
+@@ -37,6 +37,9 @@ LJ_FUNC void lj_trace_ins(jit_State *J,
+ LJ_FUNCA void LJ_FASTCALL lj_trace_hot(jit_State *J, const BCIns *pc);
+ LJ_FUNCA void LJ_FASTCALL lj_trace_stitch(jit_State *J, const BCIns *pc);
+ LJ_FUNCA int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr);
++#if LJ_UNWIND_EXT
++LJ_FUNC uintptr_t LJ_FASTCALL lj_trace_unwind(jit_State *J, uintptr_t addr, ExitNo *ep);
++#endif
+ 
+ /* Signal asynchronous abort of trace or end of trace. */
+ #define lj_trace_abort(g)	(G2J(g)->state &= ~LJ_TRACE_ACTIVE)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_traceerr.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_traceerr.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_traceerr.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Trace compiler error messages.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ /* This file may be included multiple times with different TREDEF macros. */
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_udata.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_udata.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_udata.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Userdata handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_udata_c
+@@ -8,6 +8,7 @@
+ 
+ #include "lj_obj.h"
+ #include "lj_gc.h"
++#include "lj_err.h"
+ #include "lj_udata.h"
+ 
+ GCudata *lj_udata_new(lua_State *L, MSize sz, GCtab *env)
+@@ -32,3 +33,30 @@ void LJ_FASTCALL lj_udata_free(global_St
+   lj_mem_free(g, ud, sizeudata(ud));
+ }
+ 
++#if LJ_64  /* Intern lightudup(p) in the segment map; result is (segment index << LJ_LIGHTUD_BITS_LO) | lightudlo(p). */
++void * LJ_FASTCALL lj_lightud_intern(lua_State *L, void *p)
++{
++  global_State *g = G(L);
++  uint64_t u = (uint64_t)p;
++  uint32_t up = lightudup(u);
++  uint32_t *segmap = mref(g->gc.lightudseg, uint32_t);
++  MSize segnum = g->gc.lightudnum;  /* Index of the last segment in use. */
++  if (segmap) {
++    MSize seg;
++    for (seg = 0; seg <= segnum; seg++)
++      if (segmap[seg] == up)  /* Fast path. */
++	return (void *)(((uint64_t)seg << LJ_LIGHTUD_BITS_LO) | lightudlo(u));
++    segnum++;  /* Not found: take the next free segment. */
++    /* Leave last segment unused to avoid clash with ITERN key. */
++    if (segnum >= (1 << LJ_LIGHTUD_BITS_SEG)-1) lj_err_msg(L, LJ_ERR_BADLU);
++  }
++  if (!((segnum-1) & segnum) && segnum != 1) {  /* Grow when segnum is 0 or a power of two >= 2. */
++    lj_mem_reallocvec(L, segmap, segnum, segnum ? 2*segnum : 2u, uint32_t);
++    setmref(g->gc.lightudseg, segmap);
++  }
++  g->gc.lightudnum = segnum;
++  segmap[segnum] = up;
++  return (void *)(((uint64_t)segnum << LJ_LIGHTUD_BITS_LO) | lightudlo(u));
++}
++#endif
++
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_udata.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_udata.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_udata.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Userdata handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_UDATA_H
+@@ -10,5 +10,8 @@
+ 
+ LJ_FUNC GCudata *lj_udata_new(lua_State *L, MSize sz, GCtab *env);
+ LJ_FUNC void LJ_FASTCALL lj_udata_free(global_State *g, GCudata *ud);
++#if LJ_64
++LJ_FUNC void * LJ_FASTCALL lj_lightud_intern(lua_State *L, void *p);
++#endif
+ 
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vm.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_vm.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vm.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Assembler VM interface definitions.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_VM_H
+@@ -26,6 +26,9 @@ LJ_ASMF void lj_vm_unwind_ff_eh(void);
+ #if LJ_TARGET_X86ORX64
+ LJ_ASMF void lj_vm_unwind_rethrow(void);
+ #endif
++#if LJ_TARGET_MIPS
++LJ_ASMF void lj_vm_unwind_stub(void);
++#endif
+ 
+ /* Miscellaneous functions. */
+ #if LJ_TARGET_X86ORX64
+@@ -48,10 +51,11 @@ LJ_ASMF void lj_vm_inshook(void);
+ LJ_ASMF void lj_vm_rethook(void);
+ LJ_ASMF void lj_vm_callhook(void);
+ LJ_ASMF void lj_vm_profhook(void);
++LJ_ASMF void lj_vm_IITERN(void);
+ 
+ /* Trace exit handling. */
+-LJ_ASMF void lj_vm_exit_handler(void);
+-LJ_ASMF void lj_vm_exit_interp(void);
++LJ_ASMF char lj_vm_exit_handler[];
++LJ_ASMF char lj_vm_exit_interp[];
+ 
+ /* Internal math helper functions. */
+ #if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
+@@ -79,10 +83,6 @@ LJ_ASMF int32_t LJ_FASTCALL lj_vm_modi(i
+ LJ_ASMF void lj_vm_floor_sse(void);
+ LJ_ASMF void lj_vm_ceil_sse(void);
+ LJ_ASMF void lj_vm_trunc_sse(void);
+-LJ_ASMF void lj_vm_powi_sse(void);
+-#define lj_vm_powi	NULL
+-#else
+-LJ_ASMF double lj_vm_powi(double, int32_t);
+ #endif
+ #if LJ_TARGET_PPC || LJ_TARGET_ARM64
+ #define lj_vm_trunc	trunc
+@@ -95,6 +95,7 @@ LJ_ASMF double lj_vm_trunc_sf(double);
+ #if LJ_HASFFI
+ LJ_ASMF int lj_vm_errno(void);
+ #endif
++LJ_ASMF TValue *lj_vm_next(GCtab *t, uint32_t idx);
+ #endif
+ 
+ /* Continuations for metamethods. */
+@@ -110,6 +111,6 @@ LJ_ASMF void lj_cont_stitch(void);  /* T
+ LJ_ASMF char lj_vm_asm_begin[];
+ 
+ /* Bytecode offsets are relative to lj_vm_asm_begin. */
+-#define makeasmfunc(ofs)	((ASMFunction)(lj_vm_asm_begin + (ofs)))
++#define makeasmfunc(ofs) lj_ptr_sign((ASMFunction)(lj_vm_asm_begin + (ofs)), 0)
+ 
+ #endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vmevent.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_vmevent.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vmevent.c
+@@ -1,6 +1,6 @@
+ /*
+ ** VM event handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #include <stdio.h>
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vmevent.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_vmevent.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vmevent.h
+@@ -1,6 +1,6 @@
+ /*
+ ** VM event handling.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LJ_VMEVENT_H
+@@ -24,9 +24,10 @@
+ /* VM event IDs. */
+ typedef enum {
+   VMEVENT_DEF(BC,	0x00003883),
+-  VMEVENT_DEF(TRACE,	0xb2d91467),
+-  VMEVENT_DEF(RECORD,	0x9284bf4f),
+-  VMEVENT_DEF(TEXIT,	0xb29df2b0),
++  VMEVENT_DEF(TRACE,	0x12d91467),
++  VMEVENT_DEF(RECORD,	0x1284bf4f),
++  VMEVENT_DEF(TEXIT,	0x129df2b0),
++  VMEVENT_DEF(ERRFIN,	0x12d93888),
+   LJ_VMEVENT__MAX
+ } VMEvent;
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vmmath.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lj_vmmath.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lj_vmmath.c
+@@ -1,6 +1,6 @@
+ /*
+ ** Math helper functions for assembler VM.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define lj_vmmath_c
+@@ -34,7 +34,18 @@ LJ_FUNCA double lj_wrap_pow(double x, do
+ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
+ #endif
+ 
+-/* -- Helper functions for generated machine code ------------------------- */
++/* -- Helper functions ---------------------------------------------------- */
++
++/* Computes lj_vm_floor(x/y)*y out of line. Required to prevent the C compiler from applying FMA optimizations.
++**
++** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory.
++** But the current state of C compilers is a mess in this regard.
++** Also, this function is not performance sensitive at all.
++*/
++LJ_NOINLINE static double lj_vm_floormul(double x, double y)
++{
++  return lj_vm_floor(x / y) * y;  /* The caller (IR_MOD fold) subtracts this from x. */
++}
+ 
+ double lj_vm_foldarith(double x, double y, int op)
+ {
+@@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double
+   case IR_SUB - IR_ADD: return x-y; break;
+   case IR_MUL - IR_ADD: return x*y; break;
+   case IR_DIV - IR_ADD: return x/y; break;
+-  case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
++  case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break;
+   case IR_POW - IR_ADD: return pow(x, y); break;
+   case IR_NEG - IR_ADD: return -x; break;
+   case IR_ABS - IR_ADD: return fabs(x); break;
+@@ -56,17 +67,20 @@ double lj_vm_foldarith(double x, double
+   }
+ }
+ 
+-#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS
++/* -- Helper functions for generated machine code ------------------------- */
++
++#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS \
++ || LJ_TARGET_RISCV64
+ int32_t LJ_FASTCALL lj_vm_modi(int32_t a, int32_t b)
+ {
+   uint32_t y, ua, ub;
+   /* This must be checked before using this function. */
+   lj_assertX(b != 0, "modulo with zero divisor");
+-  ua = a < 0 ? (uint32_t)-a : (uint32_t)a;
+-  ub = b < 0 ? (uint32_t)-b : (uint32_t)b;
++  ua = a < 0 ? ~(uint32_t)a+1u : (uint32_t)a;  /* |a| via negation in unsigned arithmetic. */
++  ub = b < 0 ? ~(uint32_t)b+1u : (uint32_t)b;  /* |b| via negation in unsigned arithmetic. */
+   y = ua % ub;
+   if (y != 0 && (a^b) < 0) y = y - ub;
+-  if (((int32_t)y^b) < 0) y = (uint32_t)-(int32_t)y;
++  if (((int32_t)y^b) < 0) y = ~y+1u;  /* Negate y (unsigned) if its sign differs from b. */
+   return (int32_t)y;
+ }
+ #endif
+@@ -80,40 +94,6 @@ double lj_vm_log2(double a)
+ }
+ #endif
+ 
+-#if !LJ_TARGET_X86ORX64
+-/* Unsigned x^k. */
+-static double lj_vm_powui(double x, uint32_t k)
+-{
+-  double y;
+-  lj_assertX(k != 0, "pow with zero exponent");
+-  for (; (k & 1) == 0; k >>= 1) x *= x;
+-  y = x;
+-  if ((k >>= 1) != 0) {
+-    for (;;) {
+-      x *= x;
+-      if (k == 1) break;
+-      if (k & 1) y *= x;
+-      k >>= 1;
+-    }
+-    y *= x;
+-  }
+-  return y;
+-}
+-
+-/* Signed x^k. */
+-double lj_vm_powi(double x, int32_t k)
+-{
+-  if (k > 1)
+-    return lj_vm_powui(x, (uint32_t)k);
+-  else if (k == 1)
+-    return x;
+-  else if (k == 0)
+-    return 1.0;
+-  else
+-    return 1.0 / lj_vm_powui(x, (uint32_t)-k);
+-}
+-#endif
+-
+ /* Computes fpm(x) for extended math functions. */
+ double lj_vm_foldfpm(double x, int fpm)
+ {
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/ljamalg.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/ljamalg.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/ljamalg.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT core and libraries amalgamation.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #define ljamalg_c
+@@ -39,6 +39,7 @@
+ #include "lj_strscan.c"
+ #include "lj_strfmt.c"
+ #include "lj_strfmt_num.c"
++#include "lj_serialize.c"
+ #include "lj_api.c"
+ #include "lj_profile.c"
+ #include "lj_lex.c"
+@@ -85,5 +86,6 @@
+ #include "lib_bit.c"
+ #include "lib_jit.c"
+ #include "lib_ffi.c"
++#include "lib_buffer.c"
+ #include "lib_init.c"
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/luaconf.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/luaconf.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/luaconf.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Configuration header.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef luaconf_h
+@@ -37,7 +37,7 @@
+ #endif
+ #define LUA_LROOT	"/usr/local"
+ #define LUA_LUADIR	"/lua/5.1/"
+-#define LUA_LJDIR	"/luajit-2.1.0-beta3/"
++#define LUA_LJDIR	"/luajit-2.1/"
+ 
+ #ifdef LUA_ROOT
+ #define LUA_JROOT	LUA_ROOT
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/luajit.c
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/luajit.c
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/luajit.c
+@@ -1,6 +1,6 @@
+ /*
+ ** LuaJIT frontend. Runs commands, scripts, read-eval-print (REPL) etc.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ **
+ ** Major portions taken verbatim or adapted from the Lua interpreter.
+ ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+@@ -39,6 +39,7 @@
+ 
+ static lua_State *globalL = NULL;
+ static const char *progname = LUA_PROGNAME;
++static char *empty_argv[2] = { NULL, NULL };
+ 
+ #if !LJ_TARGET_CONSOLE
+ static void lstop(lua_State *L, lua_Debug *ar)
+@@ -78,9 +79,9 @@ static void print_usage(void)
+   fflush(stderr);
+ }
+ 
+-static void l_message(const char *pname, const char *msg)
++static void l_message(const char *msg)
+ {
+-  if (pname) { fputs(pname, stderr); fputc(':', stderr); fputc(' ', stderr); }
++  if (progname) { fputs(progname, stderr); fputc(':', stderr); fputc(' ', stderr); }
+   fputs(msg, stderr); fputc('\n', stderr);
+   fflush(stderr);
+ }
+@@ -90,7 +91,7 @@ static int report(lua_State *L, int stat
+   if (status && !lua_isnil(L, -1)) {
+     const char *msg = lua_tostring(L, -1);
+     if (msg == NULL) msg = "(error object is not a string)";
+-    l_message(progname, msg);
++    l_message(msg);
+     lua_pop(L, 1);
+   }
+   return status;
+@@ -256,9 +257,8 @@ static void dotty(lua_State *L)
+       lua_getglobal(L, "print");
+       lua_insert(L, 1);
+       if (lua_pcall(L, lua_gettop(L)-1, 0, 0) != 0)
+-	l_message(progname,
+-	  lua_pushfstring(L, "error calling " LUA_QL("print") " (%s)",
+-			      lua_tostring(L, -1)));
++	l_message(lua_pushfstring(L, "error calling " LUA_QL("print") " (%s)",
++				  lua_tostring(L, -1)));
+     }
+   }
+   lua_settop(L, 0);  /* clear stack */
+@@ -310,8 +310,7 @@ static int loadjitmodule(lua_State *L)
+   lua_getfield(L, -1, "start");
+   if (lua_isnil(L, -1)) {
+   nomodule:
+-    l_message(progname,
+-	      "unknown luaJIT command or jit.* modules not installed");
++    l_message("unknown luaJIT command or jit.* modules not installed");
+     return 1;
+   }
+   lua_remove(L, -2);  /* Drop module table. */
+@@ -516,8 +515,6 @@ static int pmain(lua_State *L)
+   int argn;
+   int flags = 0;
+   globalL = L;
+-  if (argv[0] && argv[0][0]) progname = argv[0];
+-
+   LUAJIT_VERSION_SYM();  /* Linker-enforced version check. */
+ 
+   argn = collectargs(argv, &flags);
+@@ -572,9 +569,11 @@ static int pmain(lua_State *L)
+ int main(int argc, char **argv)
+ {
+   int status;
+-  lua_State *L = lua_open();
++  lua_State *L;
++  if (!argv[0]) argv = empty_argv; else if (argv[0][0]) progname = argv[0];
++  L = lua_open();
+   if (L == NULL) {
+-    l_message(argv[0], "cannot create state: not enough memory");
++    l_message("cannot create state: not enough memory");
+     return EXIT_FAILURE;
+   }
+   smain.argc = argc;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/luajit.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/luajit.h
++++ /dev/null
+@@ -1,79 +0,0 @@
+-/*
+-** LuaJIT -- a Just-In-Time Compiler for Lua. https://luajit.org/
+-**
+-** Copyright (C) 2005-2021 Mike Pall. All rights reserved.
+-**
+-** Permission is hereby granted, free of charge, to any person obtaining
+-** a copy of this software and associated documentation files (the
+-** "Software"), to deal in the Software without restriction, including
+-** without limitation the rights to use, copy, modify, merge, publish,
+-** distribute, sublicense, and/or sell copies of the Software, and to
+-** permit persons to whom the Software is furnished to do so, subject to
+-** the following conditions:
+-**
+-** The above copyright notice and this permission notice shall be
+-** included in all copies or substantial portions of the Software.
+-**
+-** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+-** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+-** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+-** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-**
+-** [ MIT license: https://www.opensource.org/licenses/mit-license.php ]
+-*/
+-
+-#ifndef _LUAJIT_H
+-#define _LUAJIT_H
+-
+-#include "lua.h"
+-
+-#define LUAJIT_VERSION		"LuaJIT 2.1.0-beta3"
+-#define LUAJIT_VERSION_NUM	20100  /* Version 2.1.0 = 02.01.00. */
+-#define LUAJIT_VERSION_SYM	luaJIT_version_2_1_0_beta3
+-#define LUAJIT_COPYRIGHT	"Copyright (C) 2005-2021 Mike Pall"
+-#define LUAJIT_URL		"https://luajit.org/"
+-
+-/* Modes for luaJIT_setmode. */
+-#define LUAJIT_MODE_MASK	0x00ff
+-
+-enum {
+-  LUAJIT_MODE_ENGINE,		/* Set mode for whole JIT engine. */
+-  LUAJIT_MODE_DEBUG,		/* Set debug mode (idx = level). */
+-
+-  LUAJIT_MODE_FUNC,		/* Change mode for a function. */
+-  LUAJIT_MODE_ALLFUNC,		/* Recurse into subroutine protos. */
+-  LUAJIT_MODE_ALLSUBFUNC,	/* Change only the subroutines. */
+-
+-  LUAJIT_MODE_TRACE,		/* Flush a compiled trace. */
+-
+-  LUAJIT_MODE_WRAPCFUNC = 0x10,	/* Set wrapper mode for C function calls. */
+-
+-  LUAJIT_MODE_MAX
+-};
+-
+-/* Flags or'ed in to the mode. */
+-#define LUAJIT_MODE_OFF		0x0000	/* Turn feature off. */
+-#define LUAJIT_MODE_ON		0x0100	/* Turn feature on. */
+-#define LUAJIT_MODE_FLUSH	0x0200	/* Flush JIT-compiled code. */
+-
+-/* LuaJIT public C API. */
+-
+-/* Control the JIT engine. */
+-LUA_API int luaJIT_setmode(lua_State *L, int idx, int mode);
+-
+-/* Low-overhead profiling API. */
+-typedef void (*luaJIT_profile_callback)(void *data, lua_State *L,
+-					int samples, int vmstate);
+-LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
+-				  luaJIT_profile_callback cb, void *data);
+-LUA_API void luaJIT_profile_stop(lua_State *L);
+-LUA_API const char *luaJIT_profile_dumpstack(lua_State *L, const char *fmt,
+-					     int depth, size_t *len);
+-
+-/* Enforce (dynamic) linker error for version mismatches. Call from main. */
+-LUA_API void LUAJIT_VERSION_SYM(void);
+-
+-#endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/luajit_rolling.h
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/luajit_rolling.h
+@@ -0,0 +1,79 @@
++/*
++** LuaJIT -- a Just-In-Time Compiler for Lua. https://luajit.org/
++**
++** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
++**
++** Permission is hereby granted, free of charge, to any person obtaining
++** a copy of this software and associated documentation files (the
++** "Software"), to deal in the Software without restriction, including
++** without limitation the rights to use, copy, modify, merge, publish,
++** distribute, sublicense, and/or sell copies of the Software, and to
++** permit persons to whom the Software is furnished to do so, subject to
++** the following conditions:
++**
++** The above copyright notice and this permission notice shall be
++** included in all copies or substantial portions of the Software.
++**
++** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++**
++** [ MIT license: https://www.opensource.org/licenses/mit-license.php ]
++*/
++
++#ifndef _LUAJIT_H
++#define _LUAJIT_H
++
++#include "lua.h"
++
++#define LUAJIT_VERSION		"LuaJIT 2.1.ROLLING"
++#define LUAJIT_VERSION_NUM	20199  /* Deprecated. */
++#define LUAJIT_VERSION_SYM	luaJIT_version_2_1_ROLLING
++#define LUAJIT_COPYRIGHT	"Copyright (C) 2005-2023 Mike Pall"
++#define LUAJIT_URL		"https://luajit.org/"
++
++/* Modes for luaJIT_setmode. */
++#define LUAJIT_MODE_MASK	0x00ff
++
++enum {
++  LUAJIT_MODE_ENGINE,		/* Set mode for whole JIT engine. */
++  LUAJIT_MODE_DEBUG,		/* Set debug mode (idx = level). */
++
++  LUAJIT_MODE_FUNC,		/* Change mode for a function. */
++  LUAJIT_MODE_ALLFUNC,		/* Recurse into subroutine protos. */
++  LUAJIT_MODE_ALLSUBFUNC,	/* Change only the subroutines. */
++
++  LUAJIT_MODE_TRACE,		/* Flush a compiled trace. */
++
++  LUAJIT_MODE_WRAPCFUNC = 0x10,	/* Set wrapper mode for C function calls. */
++
++  LUAJIT_MODE_MAX
++};
++
++/* Flags or'ed in to the mode. */
++#define LUAJIT_MODE_OFF		0x0000	/* Turn feature off. */
++#define LUAJIT_MODE_ON		0x0100	/* Turn feature on. */
++#define LUAJIT_MODE_FLUSH	0x0200	/* Flush JIT-compiled code. */
++
++/* LuaJIT public C API. */
++
++/* Control the JIT engine. */
++LUA_API int luaJIT_setmode(lua_State *L, int idx, int mode);
++
++/* Low-overhead profiling API. */
++typedef void (*luaJIT_profile_callback)(void *data, lua_State *L,
++					int samples, int vmstate);
++LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
++				  luaJIT_profile_callback cb, void *data);
++LUA_API void luaJIT_profile_stop(lua_State *L);
++LUA_API const char *luaJIT_profile_dumpstack(lua_State *L, const char *fmt,
++					     int depth, size_t *len);
++
++/* Enforce (dynamic) linker error for version mismatches. Call from main. */
++LUA_API void LUAJIT_VERSION_SYM(void);
++
++#endif
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/lualib.h
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/lualib.h
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/lualib.h
+@@ -1,6 +1,6 @@
+ /*
+ ** Standard library header.
+-** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ */
+ 
+ #ifndef _LUALIB_H
+@@ -33,6 +33,7 @@ LUALIB_API int luaopen_debug(lua_State *
+ LUALIB_API int luaopen_bit(lua_State *L);
+ LUALIB_API int luaopen_jit(lua_State *L);
+ LUALIB_API int luaopen_ffi(lua_State *L);
++LUALIB_API int luaopen_string_buffer(lua_State *L);
+ 
+ LUALIB_API void luaL_openlibs(lua_State *L);
+ 
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/msvcbuild.bat
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/msvcbuild.bat
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/msvcbuild.bat
+@@ -1,5 +1,5 @@
+ @rem Script to build LuaJIT with MSVC.
+-@rem Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++@rem Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ @rem
+ @rem Open a "Visual Studio Command Prompt" (either x86 or x64).
+ @rem Then cd to this directory and run this script. Use the following
+@@ -25,38 +25,54 @@
+ @set LJDLLNAME=lua51.dll
+ @set LJLIBNAME=lua51.lib
+ @set BUILDTYPE=release
+-@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
+ 
++@setlocal
++@call :SETHOSTVARS
+ %LJCOMPILE% host\minilua.c
+ @if errorlevel 1 goto :BAD
+ %LJLINK% /out:minilua.exe minilua.obj
+ @if errorlevel 1 goto :BAD
+ if exist minilua.exe.manifest^
+   %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe
++@endlocal
+ 
+-@set DASMFLAGS=-D WIN -D JIT -D FFI -D P64
++@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU -D P64
+ @set LJARCH=x64
+ @minilua
+-@if errorlevel 8 goto :X64
++@if errorlevel 8 goto :NO32
+ @set DASC=vm_x86.dasc
+-@set DASMFLAGS=-D WIN -D JIT -D FFI
++@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU
+ @set LJARCH=x86
+ @set LJCOMPILE=%LJCOMPILE% /arch:SSE2
++@goto :DA
++:NO32
++@if "%VSCMD_ARG_TGT_ARCH%" neq "arm64" goto :X64
++@set DASC=vm_arm64.dasc
++@set DASMTARGET=-D LUAJIT_TARGET=LUAJIT_ARCH_ARM64
++@set LJARCH=arm64
++@goto :DA
+ :X64
+-@if "%1" neq "nogc64" goto :GC64
++@if "%1" neq "nogc64" goto :DA
+ @shift
+ @set DASC=vm_x86.dasc
+ @set LJCOMPILE=%LJCOMPILE% /DLUAJIT_DISABLE_GC64
+-:GC64
++:DA
+ minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC%
+ @if errorlevel 1 goto :BAD
+ 
+-%LJCOMPILE% /I "." /I %DASMDIR% host\buildvm*.c
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
++@setlocal
++@call :SETHOSTVARS
++%LJCOMPILE% /I "." /I %DASMDIR% %DASMTARGET% host\buildvm*.c
+ @if errorlevel 1 goto :BAD
+ %LJLINK% /out:buildvm.exe buildvm*.obj
+ @if errorlevel 1 goto :BAD
+ if exist buildvm.exe.manifest^
+   %LJMT% -manifest buildvm.exe.manifest -outputresource:buildvm.exe
++@endlocal
+ 
+ buildvm -m peobj -o lj_vm.obj
+ @if errorlevel 1 goto :BAD
+@@ -116,6 +132,12 @@ if exist luajit.exe.manifest^
+ @echo === Successfully built LuaJIT for Windows/%LJARCH% ===
+ 
+ @goto :END
++:SETHOSTVARS
++@if "%VSCMD_ARG_HOST_ARCH%_%VSCMD_ARG_TGT_ARCH%" equ "x64_arm64" (
++  call "%VSINSTALLDIR%Common7\Tools\VsDevCmd.bat" -arch=%VSCMD_ARG_HOST_ARCH% -no_logo
++  echo on
++)
++@goto :END
+ :BAD
+ @echo.
+ @echo *******************************************************
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/nxbuild.bat
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/nxbuild.bat
+@@ -0,0 +1,164 @@
++@rem Script to build LuaJIT with NintendoSDK + NX Addon.
++@rem Donated to the public domain by Swyter.
++@rem
++@rem To run this script you must open a "Native Tools Command Prompt for VS".
++@rem
++@rem Either the x86 version for NX32, or x64 for the NX64 target.
++@rem This is because the pointer size of the LuaJIT host tools (buildvm.exe)
++@rem must match the cross-compiled target (32 or 64 bits).
++@rem
++@rem Then cd to this directory and run this script.
++@rem
++@rem Recommended invocation:
++@rem
++@rem nxbuild            # release build, amalgamated
++@rem nxbuild debug      # debug build, amalgamated
++@rem
++@rem Additional command-line options (not generally recommended):
++@rem
++@rem noamalg            # (after debug) non-amalgamated build
++
++@if not defined INCLUDE goto :FAIL
++@if not defined NINTENDO_SDK_ROOT goto :FAIL
++@if not defined PLATFORM goto :FAIL
++
++@rem Localize all variable changes. This must run before the dispatch below, as both branches jump past this point.
++@setlocal
++@if "%platform%" == "x86" goto :DO_NX32
++@if "%platform%" == "x64" goto :DO_NX64
++
++@echo Error: Current host platform is %platform%!
++@echo.
++@goto :FAIL
++
++:DO_NX32
++@set DASC=vm_arm.dasc
++@set DASMFLAGS= -D HFABI -D FPU
++@set DASMTARGET= -D LUAJIT_TARGET=LUAJIT_ARCH_ARM
++@set HOST_PTR_SIZE=4
++goto :BEGIN
++
++:DO_NX64
++@set DASC=vm_arm64.dasc
++@set DASMFLAGS= -D ENDIAN_LE
++@set DASMTARGET= -D LUAJIT_TARGET=LUAJIT_ARCH_ARM64
++@set HOST_PTR_SIZE=8
++
++:BEGIN
++@rem ---- Host compiler ----
++@set LJCOMPILE=cl /nologo /c /MD /O2 /W3 /wo4146 /wo4244 /D_CRT_SECURE_NO_DEPRECATE
++@set LJLINK=link /nologo
++@set LJMT=mt /nologo
++@set DASMDIR=..\dynasm
++@set DASM=%DASMDIR%\dynasm.lua
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
++
++%LJCOMPILE% host\minilua.c
++@if errorlevel 1 goto :BAD
++%LJLINK% /out:minilua.exe minilua.obj
++@if errorlevel 1 goto :BAD
++if exist minilua.exe.manifest^
++  %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe
++
++@rem Check that we have the right 32/64 bit host compiler to generate the right virtual machine files: the exit status of a bare minilua run is compared against HOST_PTR_SIZE.
++@minilua
++@if "%ERRORLEVEL%" == "%HOST_PTR_SIZE%" goto :PASSED_PTR_CHECK
++
++@echo The pointer size of the host in bytes (%errorlevel%) does not match the expected value (%HOST_PTR_SIZE%).
++@echo Check that the script is being run under the correct x86/x64 VS prompt.
++@goto :BAD
++
++:PASSED_PTR_CHECK
++@set DASMFLAGS=%DASMFLAGS% %DASMTARGET% -D LJ_TARGET_NX -D LUAJIT_OS=LUAJIT_OS_OTHER -D LUAJIT_DISABLE_JIT -D LUAJIT_DISABLE_FFI
++minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC%
++@if errorlevel 1 goto :BAD
++
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
++%LJCOMPILE% /I "." /I %DASMDIR% %DASMTARGET% -D LJ_TARGET_NX -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI host\buildvm*.c
++@if errorlevel 1 goto :BAD
++%LJLINK% /out:buildvm.exe buildvm*.obj
++@if errorlevel 1 goto :BAD
++if exist buildvm.exe.manifest^
++  %LJMT% -manifest buildvm.exe.manifest -outputresource:buildvm.exe
++
++buildvm -m elfasm -o lj_vm.s
++@if errorlevel 1 goto :BAD
++buildvm -m bcdef -o lj_bcdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m ffdef -o lj_ffdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m libdef -o lj_libdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m recdef -o lj_recdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m vmdef -o jit\vmdef.lua %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m folddef -o lj_folddef.h lj_opt_fold.c
++@if errorlevel 1 goto :BAD
++
++@rem ---- Cross compiler (all SDK paths are quoted: the SDK root may contain spaces) ----
++@if "%platform%" neq "x64" goto :NX32_CROSSBUILD
++@set LJCOMPILE="%NINTENDO_SDK_ROOT%\Compilers\NX\nx\aarch64\bin\clang" -Wall -I"%NINTENDO_SDK_ROOT%\Include" %DASMTARGET% -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLUAJIT_USE_SYSMALLOC -c
++@set LJLIB="%NINTENDO_SDK_ROOT%\Compilers\NX\nx\aarch64\bin\aarch64-nintendo-nx-elf-ar" rc
++@set TARGETLIB_SUFFIX=nx64
++"%NINTENDO_SDK_ROOT%\Compilers\NX\nx\aarch64\bin\aarch64-nintendo-nx-elf-as" -o lj_vm.o lj_vm.s
++@if errorlevel 1 goto :BAD
++goto :DEBUGCHECK
++
++:NX32_CROSSBUILD
++@set LJCOMPILE="%NINTENDO_SDK_ROOT%\Compilers\NX\nx\armv7l\bin\clang" -Wall -I"%NINTENDO_SDK_ROOT%\Include" %DASMTARGET% -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLUAJIT_USE_SYSMALLOC -c
++@set LJLIB="%NINTENDO_SDK_ROOT%\Compilers\NX\nx\armv7l\bin\armv7l-nintendo-nx-eabihf-ar" rc
++@set TARGETLIB_SUFFIX=nx32
++"%NINTENDO_SDK_ROOT%\Compilers\NX\nx\armv7l\bin\armv7l-nintendo-nx-eabihf-as" -o lj_vm.o lj_vm.s
++@if errorlevel 1 goto :BAD
++:DEBUGCHECK
++
++@if "%1" neq "debug" goto :NODEBUG
++@shift
++@set LJCOMPILE=%LJCOMPILE% -DNN_SDK_BUILD_DEBUG -g -O0
++@set TARGETLIB=libluajitD_%TARGETLIB_SUFFIX%.a
++goto :BUILD
++:NODEBUG
++@set LJCOMPILE=%LJCOMPILE% -DNN_SDK_BUILD_RELEASE -O3
++@set TARGETLIB=libluajit_%TARGETLIB_SUFFIX%.a
++:BUILD
++del %TARGETLIB%
++@set LJCOMPILE=%LJCOMPILE% -fPIC
++@if "%1" neq "noamalg" goto :AMALG
++for %%f in (lj_*.c lib_*.c) do (
++  %LJCOMPILE% %%f
++  @if errorlevel 1 goto :BAD
++)
++
++%LJLIB% %TARGETLIB% lj_*.o lib_*.o
++@if errorlevel 1 goto :BAD
++@goto :NOAMALG
++:AMALG
++%LJCOMPILE% ljamalg.c
++@if errorlevel 1 goto :BAD
++%LJLIB% %TARGETLIB% ljamalg.o lj_vm.o
++@if errorlevel 1 goto :BAD
++:NOAMALG
++
++@del *.o *.obj *.manifest minilua.exe buildvm.exe
++@echo.
++@echo === Successfully built LuaJIT for Nintendo Switch (%TARGETLIB_SUFFIX%) ===
++
++@goto :END
++:BAD
++@echo.
++@echo *******************************************************
++@echo *** Build FAILED -- Please check the error messages ***
++@echo *******************************************************
++@goto :END
++:FAIL
++@echo To run this script you must open a "Native Tools Command Prompt for VS".
++@echo.
++@echo Either the x86 version for NX32, or x64 for the NX64 target.
++@echo This is because the pointer size of the LuaJIT host tools (buildvm.exe)
++@echo must match the cross-compiled target (32 or 64 bits).
++@echo.
++@echo Keep in mind that NintendoSDK + NX Addon must be installed, too.
++:END
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/ps4build.bat
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/ps4build.bat
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/ps4build.bat
+@@ -26,7 +26,7 @@
+ @set LJMT=mt /nologo
+ @set DASMDIR=..\dynasm
+ @set DASM=%DASMDIR%\dynasm.lua
+-@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
+ @set GC64=
+ @set DASC=vm_x64.dasc
+ 
+@@ -51,7 +51,11 @@ if exist minilua.exe.manifest^
+ minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC%
+ @if errorlevel 1 goto :BAD
+ 
+-%LJCOMPILE% /I "." /I %DASMDIR% %GC64% -DLUAJIT_TARGET=LUAJIT_ARCH_X64 -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLUAJIT_NO_UNWIND host\buildvm*.c
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
++%LJCOMPILE% /I "." /I %DASMDIR% %GC64% -DLUAJIT_TARGET=LUAJIT_ARCH_X64 -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLUAJIT_USE_SYSMALLOC -DLUAJIT_NO_UNWIND host\buildvm*.c
++
+ @if errorlevel 1 goto :BAD
+ %LJLINK% /out:buildvm.exe buildvm*.obj
+ @if errorlevel 1 goto :BAD
+@@ -78,7 +82,7 @@ buildvm -m folddef -o lj_folddef.h lj_op
+ @set LJLIB="%SCE_ORBIS_SDK_DIR%\host_tools\bin\orbis-ar" rcus
+ @set INCLUDE=""
+ 
+-orbis-as -o lj_vm.o lj_vm.s
++"%SCE_ORBIS_SDK_DIR%\host_tools\bin\orbis-as" -o lj_vm.o lj_vm.s
+ 
+ @if "%1" neq "debug" goto :NODEBUG
+ @shift
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/ps5build.bat
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/ps5build.bat
+@@ -0,0 +1,126 @@
++@rem Script to build LuaJIT with the PS5 SDK.
++@rem Donated to the public domain.
++@rem
++@rem Open a "Visual Studio .NET Command Prompt" (64 bit host compiler)
++@rem or "VS20xx x64 Native Tools Command Prompt".
++@rem
++@rem Then cd to this directory and run this script.
++@rem
++@rem Recommended invocation:
++@rem
++@rem ps5build        release build, amalgamated, 64-bit GC
++@rem ps5build debug    debug build, amalgamated, 64-bit GC
++@rem
++@rem Additional command-line options (not generally recommended):
++@rem
++@rem gc32 (before debug)    32-bit GC
++@rem noamalg (after debug)  non-amalgamated build
++
++@if not defined INCLUDE goto :FAIL
++@if not defined SCE_PROSPERO_SDK_DIR goto :FAIL
++
++@setlocal
++@rem ---- Host compiler ----
++@set LJCOMPILE=cl /nologo /c /MD /O2 /W3 /D_CRT_SECURE_NO_DEPRECATE
++@set LJLINK=link /nologo
++@set LJMT=mt /nologo
++@set DASMDIR=..\dynasm
++@set DASM=%DASMDIR%\dynasm.lua
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
++@set GC64=
++@set DASC=vm_x64.dasc
++
++@if "%1" neq "gc32" goto :NOGC32
++@shift
++@set GC64=-DLUAJIT_DISABLE_GC64
++@set DASC=vm_x86.dasc
++:NOGC32
++
++%LJCOMPILE% host\minilua.c
++@if errorlevel 1 goto :BAD
++%LJLINK% /out:minilua.exe minilua.obj
++@if errorlevel 1 goto :BAD
++if exist minilua.exe.manifest^
++  %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe
++
++@rem Check for 64 bit host compiler.
++@minilua
++@if not errorlevel 8 goto :FAIL
++
++@set DASMFLAGS=-D P64 -D NO_UNWIND
++minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC%
++@if errorlevel 1 goto :BAD
++
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
++%LJCOMPILE% /I "." /I %DASMDIR% %GC64% -DLUAJIT_TARGET=LUAJIT_ARCH_X64 -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLUAJIT_NO_UNWIND host\buildvm*.c
++@if errorlevel 1 goto :BAD
++%LJLINK% /out:buildvm.exe buildvm*.obj
++@if errorlevel 1 goto :BAD
++if exist buildvm.exe.manifest^
++  %LJMT% -manifest buildvm.exe.manifest -outputresource:buildvm.exe
++
++buildvm -m elfasm -o lj_vm.s
++@if errorlevel 1 goto :BAD
++buildvm -m bcdef -o lj_bcdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m ffdef -o lj_ffdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m libdef -o lj_libdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m recdef -o lj_recdef.h %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m vmdef -o jit\vmdef.lua %ALL_LIB%
++@if errorlevel 1 goto :BAD
++buildvm -m folddef -o lj_folddef.h lj_opt_fold.c
++@if errorlevel 1 goto :BAD
++
++@rem ---- Cross compiler ----
++@set LJCOMPILE="%SCE_PROSPERO_SDK_DIR%\host_tools\bin\prospero-clang" -c -Wall -DLUAJIT_DISABLE_FFI -DLUAJIT_USE_SYSMALLOC %GC64%
++@set LJLIB="%SCE_PROSPERO_SDK_DIR%\host_tools\bin\prospero-llvm-ar" rcus
++@set INCLUDE=""
++@rem Assemble the VM and stop on failure, so a stale lj_vm.o is never archived.
++"%SCE_PROSPERO_SDK_DIR%\host_tools\bin\prospero-clang" -c -o lj_vm.o lj_vm.s
++@if errorlevel 1 goto :BAD
++@if "%1" neq "debug" goto :NODEBUG
++@shift
++@set LJCOMPILE=%LJCOMPILE% -g -O0
++@set TARGETLIB=libluajitD_ps5.a
++goto :BUILD
++:NODEBUG
++@set LJCOMPILE=%LJCOMPILE% -O2
++@set TARGETLIB=libluajit_ps5.a
++:BUILD
++del %TARGETLIB%
++@if "%1" neq "noamalg" goto :AMALG
++for %%f in (lj_*.c lib_*.c) do (
++  %LJCOMPILE% %%f
++  @if errorlevel 1 goto :BAD
++)
++
++%LJLIB% %TARGETLIB% lj_*.o lib_*.o
++@if errorlevel 1 goto :BAD
++@goto :NOAMALG
++:AMALG
++%LJCOMPILE% ljamalg.c
++@if errorlevel 1 goto :BAD
++%LJLIB% %TARGETLIB% ljamalg.o lj_vm.o
++@if errorlevel 1 goto :BAD
++:NOAMALG
++
++@del *.o *.obj *.manifest minilua.exe buildvm.exe
++@echo.
++@echo === Successfully built LuaJIT for PS5 ===
++
++@goto :END
++:BAD
++@echo.
++@echo *******************************************************
++@echo *** Build FAILED -- Please check the error messages ***
++@echo *******************************************************
++@goto :END
++:FAIL
++@echo To run this script you must open a "Visual Studio .NET Command Prompt"
++@echo (64 bit host compiler). The PS5 Prospero SDK must be installed, too.
++:END
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/psvitabuild.bat
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/psvitabuild.bat
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/psvitabuild.bat
+@@ -14,7 +14,7 @@
+ @set LJMT=mt /nologo
+ @set DASMDIR=..\dynasm
+ @set DASM=%DASMDIR%\dynasm.lua
+-@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
+ 
+ %LJCOMPILE% host\minilua.c
+ @if errorlevel 1 goto :BAD
+@@ -31,6 +31,9 @@ if exist minilua.exe.manifest^
+ minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_arm.dasc
+ @if errorlevel 1 goto :BAD
+ 
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
+ %LJCOMPILE% /I "." /I %DASMDIR% -DLUAJIT_TARGET=LUAJIT_ARCH_ARM -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLJ_TARGET_PSVITA=1 host\buildvm*.c
+ @if errorlevel 1 goto :BAD
+ %LJLINK% /out:buildvm.exe buildvm*.obj
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_arm.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_arm.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_arm.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for ARM CPUs.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |
+ |.arch arm
+ |.section code_op, code_sub
+@@ -539,13 +539,13 @@ static void build_subroutines(BuildCtx *
+   |    cmp CARG1, #1
+   |.endif
+   |   ldr PC, [CARG4, #-12]		// Restore PC from [cont|PC].
+-  |  ldr CARG3, LFUNC:CARG3->field_pc
+   |    mvn INS, #~LJ_TNIL
+   |    add CARG2, RA, RC
+   |    str INS, [CARG2, #-4]		// Ensure one valid arg.
+   |.if FFI
+   |    bls >1
+   |.endif
++  |  ldr CARG3, LFUNC:CARG3->field_pc
+   |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+   |  // BASE = base, RA = resultptr, CARG4 = meta base
+   |    bx CARG1
+@@ -1111,24 +1111,18 @@ static void build_subroutines(BuildCtx *
+   |  checktab CARG2, ->fff_fallback
+   |   strd CARG34, [BASE, NARGS8:RC]	// Set missing 2nd arg to nil.
+   |   ldr PC, [BASE, FRAME_PC]
+-  |  mov CARG2, CARG1
+-  |    str BASE, L->base		// Add frame since C call can throw.
+-  |  mov CARG1, L
+-  |    str BASE, L->top			// Dummy frame length is ok.
+-  |  add CARG3, BASE, #8
+-  |   str PC, SAVE_PC
+-  |  bl extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
+-  |  // Returns 0 at end of traversal.
++  |  add CARG2, BASE, #8
++  |  sub CARG3, BASE, #8
++  |  bl extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |  // Returns 1=found, 0=end, -1=error.
+   |  .IOS ldr BASE, L->base
+   |  cmp CRET1, #0
+-  |  mvneq CRET2, #~LJ_TNIL
+-  |  beq ->fff_restv			// End of traversal: return nil.
+-  |  ldrd CARG12, [BASE, #8]		// Copy key and value to results.
+-  |   ldrd CARG34, [BASE, #16]
+-  |    mov RC, #(2+1)*8
+-  |  strd CARG12, [BASE, #-8]
+-  |   strd CARG34, [BASE]
+-  |  b ->fff_res
++  |   mov RC, #(2+1)*8
++  |  bgt ->fff_res			// Found key/value.
++  |  bmi ->fff_fallback			// Invalid key.
++  |  // End of traversal: return nil.
++  |  mvn CRET2, #~LJ_TNIL
++  |  b ->fff_restv
+   |
+   |.ffunc_1 pairs
+   |  checktab CARG2, ->fff_fallback
+@@ -1810,7 +1804,7 @@ static void build_subroutines(BuildCtx *
+   |   str BASE, L->base
+   |   str PC, SAVE_PC
+   |   str L, SBUF:CARG1->L
+-  |  str CARG4, SBUF:CARG1->p
++  |  str CARG4, SBUF:CARG1->w
+   |  bl extern lj_buf_putstr_ .. name
+   |  bl extern lj_buf_tostr
+   |  b ->fff_resstr
+@@ -2202,8 +2196,8 @@ static void build_subroutines(BuildCtx *
+   |.if JIT
+   |  ldr L, SAVE_L
+   |1:
+-  |  cmp CARG1, #0
+-  |  blt >9				// Check for error from exit.
++  |  cmn CARG1, #LUA_ERRERR
++  |  bhs >9				// Check for error from exit.
+   |   lsl RC, CARG1, #3
+   |  ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
+   |   str RC, SAVE_MULTRES
+@@ -2219,6 +2213,8 @@ static void build_subroutines(BuildCtx *
+   |   ldr INS, [PC], #4
+   |     lsl MASKR8, MASKR8, #3		// MASKR8 = 255*8.
+   |    st_vmstate CARG4
++  |  cmn CARG1, #17			// Static dispatch?
++  |  beq >5
+   |  cmp OP, #BC_FUNCC+2		// Fast function?
+   |  bhs >4
+   |2:
+@@ -2244,9 +2240,21 @@ static void build_subroutines(BuildCtx *
+   |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+   |  b <2
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  ldr CARG1, [DISPATCH, #DISPATCH_J(trace)]
++  |  decode_RD RC, INS
++  |  ldr TRACE:CARG1, [CARG1, RC, lsl #2]
++  |  ldr INS, TRACE:CARG1->startins
++  |  decode_OP OP, INS
++  |   decode_RA8 RA, INS
++  |  add OP, DISPATCH, OP, lsl #2
++  |   decode_RD RC, INS
++  |  ldr pc, [OP, #GG_DISP2STATIC]
++  |
+   |9:  // Rethrow error from the right C frame.
++  |  rsb CARG2, CARG1, #0
+   |  mov CARG1, L
+-  |  bl extern lj_err_run		// (lua_State *L)
++  |  bl extern lj_err_trace		// (lua_State *L, int errcode)
+   |.endif
+   |
+   |//-----------------------------------------------------------------------
+@@ -2429,6 +2437,64 @@ static void build_subroutines(BuildCtx *
+   |//-- Miscellaneous functions --------------------------------------------
+   |//-----------------------------------------------------------------------
+   |
++  |.define NEXT_TAB,		TAB:CARG1
++  |.define NEXT_RES,		CARG1
++  |.define NEXT_IDX,		CARG2
++  |.define NEXT_TMP0,		CARG3
++  |.define NEXT_TMP1,		CARG4
++  |.define NEXT_LIM,		r12
++  |.define NEXT_RES_PTR,	sp
++  |.define NEXT_RES_VAL,	[sp]
++  |.define NEXT_RES_KEY_I,	[sp, #8]
++  |.define NEXT_RES_KEY_IT,	[sp, #12]
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in CRET2.
++  |->vm_next:
++  |.if JIT
++  |  ldr NEXT_TMP0, NEXT_TAB->array
++  |   ldr NEXT_LIM, NEXT_TAB->asize
++  |  add NEXT_TMP0, NEXT_TMP0, NEXT_IDX, lsl #3
++  |1:  // Traverse array part.
++  |   subs NEXT_TMP1, NEXT_IDX, NEXT_LIM
++  |   bhs >5
++  |  ldr NEXT_TMP1, [NEXT_TMP0, #4]
++  |   str NEXT_IDX, NEXT_RES_KEY_I
++  |   add NEXT_TMP0, NEXT_TMP0, #8
++  |   add NEXT_IDX, NEXT_IDX, #1
++  |  checktp NEXT_TMP1, LJ_TNIL
++  |  beq <1				// Skip holes in array part.
++  |  ldr NEXT_TMP0, [NEXT_TMP0, #-8]
++  |   mov NEXT_RES, NEXT_RES_PTR
++  |  strd NEXT_TMP0, NEXT_RES_VAL	// Stores NEXT_TMP1, too.
++  |  mvn NEXT_TMP0, #~LJ_TISNUM
++  |  str NEXT_TMP0, NEXT_RES_KEY_IT
++  |  bx lr
++  |
++  |5:  // Traverse hash part.
++  |  ldr NEXT_TMP0, NEXT_TAB->hmask
++  |   ldr NODE:NEXT_RES, NEXT_TAB->node
++  |   add NEXT_TMP1, NEXT_TMP1, NEXT_TMP1, lsl #1
++  |  add NEXT_LIM, NEXT_LIM, NEXT_TMP0
++  |   add NODE:NEXT_RES, NODE:NEXT_RES, NEXT_TMP1, lsl #3
++  |6:
++  |  cmp NEXT_IDX, NEXT_LIM
++  |  bhi >9
++  |  ldr NEXT_TMP1, NODE:NEXT_RES->val.it
++  |  checktp NEXT_TMP1, LJ_TNIL
++  |   add NEXT_IDX, NEXT_IDX, #1
++  |  bxne lr
++  |  // Skip holes in hash part.
++  |  add NEXT_RES, NEXT_RES, #sizeof(Node)
++  |  b <6
++  |
++  |9:  // End of iteration. Set the key to nil (not the value).
++  |  mvn NEXT_TMP0, #0
++  |   mov NEXT_RES, NEXT_RES_PTR
++  |  str NEXT_TMP0, NEXT_RES_KEY_IT
++  |  bx lr
++  |.endif
++  |
+   |//-----------------------------------------------------------------------
+   |//-- FFI helper functions -----------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -2505,16 +2571,16 @@ static void build_subroutines(BuildCtx *
+   |.endif
+   |  mov r11, sp
+   |  sub sp, sp, CARG1			// Readjust stack.
+-  |   subs CARG2, CARG2, #1
++  |   subs CARG2, CARG2, #4
+   |.if HFABI
+   |  vldm RB, {d0-d7}
+   |.endif
+   |    ldr RB, CCSTATE->func
+   |   bmi >2
+   |1:  // Copy stack slots.
+-  |  ldr CARG4, [CARG3, CARG2, lsl #2]
+-  |  str CARG4, [sp, CARG2, lsl #2]
+-  |  subs CARG2, CARG2, #1
++  |  ldr CARG4, [CARG3, CARG2]
++  |  str CARG4, [sp, CARG2]
++  |  subs CARG2, CARG2, #4
+   |  bpl <1
+   |2:
+   |  ldrd CARG12, CCSTATE->gpr[0]
+@@ -3919,10 +3985,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     break;
+ 
+   case BC_ITERN:
+-    |  // RA = base*8, (RB = nresults+1, RC = nargs+1 (2+1))
+     |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |  hotloop
+     |.endif
++    |->vm_IITERN:
++    |  // RA = base*8, (RB = nresults+1, RC = nargs+1 (2+1))
+     |  add RA, BASE, RA
+     |  ldr TAB:RB, [RA, #-16]
+     |  ldr CARG1, [RA, #-8]		// Get index from control var.
+@@ -3988,7 +4055,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   ins_next1
+     |   ins_next2
+     |  mov CARG1, #0
+-    |  mvn CARG2, #0x00018000
++    |  mvn CARG2, #~LJ_KEYINDEX
+     |  strd CARG1, [RA, #-8]		// Initialize control var.
+     |1:
+     |   ins_next3
+@@ -3997,9 +4064,25 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   mov OP, #BC_ITERC
+     |  strb CARG1, [PC, #-4]
+     |   sub PC, RC, #0x20000
++    |.if JIT
++    |   ldrb CARG1, [PC]
++    |   cmp CARG1, #BC_ITERN
++    |   bne >6
++    |.endif
+     |   strb OP, [PC]			// Subsumes ins_next1.
+     |   ins_next2
+     |  b <1
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  ldr CARG1, [DISPATCH, #DISPATCH_J(trace)]
++    |  ldrh CARG2, [PC, #2]
++    |  ldr TRACE:CARG1, [CARG1, CARG2, lsl #2]
++    |  // Subsumes ins_next1 and ins_next2.
++    |  ldr INS, TRACE:CARG1->startins
++    |  bfi INS, OP, #0, #8
++    |  str INS, [PC], #4
++    |  b <1
++    |.endif
+     break;
+ 
+   case BC_VARG:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_arm64.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_arm64.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_arm64.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for ARM64 CPUs.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |
+ |.arch arm64
+ |.section code_op, code_sub
+@@ -77,51 +77,94 @@
+ |.define CRET1,		x0
+ |.define CRET1w,	w0
+ |
++|//-----------------------------------------------------------------------
++|
++|// ARM64e pointer authentication codes (PAC).
++|.if PAUTH
++|.macro sp_auth; pacibsp; .endmacro
++|.macro br_auth, reg; braaz reg; .endmacro
++|.macro blr_auth, reg; blraaz reg; .endmacro
++|.macro ret_auth; retab; .endmacro
++|.else
++|.macro sp_auth; .endmacro
++|.macro br_auth, reg; br reg; .endmacro
++|.macro blr_auth, reg; blr reg; .endmacro
++|.macro ret_auth; ret; .endmacro
++|.endif
++|
++|//-----------------------------------------------------------------------
++|
+ |// Stack layout while in interpreter. Must match with lj_frame.h.
+ |
+ |.define CFRAME_SPACE,	208
+ |//----- 16 byte aligned, <-- sp entering interpreter
+-|// Unused		[sp, #204]	// 32 bit values
+-|.define SAVE_NRES,	[sp, #200]
+-|.define SAVE_ERRF,	[sp, #196]
+-|.define SAVE_MULTRES,	[sp, #192]
+-|.define TMPD,		[sp, #184]	// 64 bit values
+-|.define SAVE_L,	[sp, #176]
+-|.define SAVE_PC,	[sp, #168]
+-|.define SAVE_CFRAME,	[sp, #160]
+-|.define SAVE_FPR_,	96		// 96+8*8: 64 bit FPR saves
+-|.define SAVE_GPR_,	16		// 16+10*8: 64 bit GPR saves
+-|.define SAVE_LR,	[sp, #8]
+-|.define SAVE_FP,	[sp]
++|.define SAVE_FP_LR_,	192
++|.define SAVE_GPR_,	112		// 112+10*8: 64 bit GPR saves
++|.define SAVE_FPR_,	48		// 48+8*8: 64 bit FPR saves
++|// Unused		[sp, #44]	// 32 bit values
++|.define SAVE_NRES,	[sp, #40]
++|.define SAVE_ERRF,	[sp, #36]
++|.define SAVE_MULTRES,	[sp, #32]
++|.define TMPD,		[sp, #24]	// 64 bit values
++|.define SAVE_L,	[sp, #16]
++|.define SAVE_PC,	[sp, #8]
++|.define SAVE_CFRAME,	[sp, #0]
+ |//----- 16 byte aligned, <-- sp while in interpreter.
+ |
+-|.define TMPDofs,	#184
++|.define TMPDofs,	#24
++|
++|.if WIN
++|// Windows unwind data is suited to r1 stored first.
++|.macro stp_unwind, r1, r2, where
++|  stp r1, r2, where
++|.endmacro
++|.macro ldp_unwind, r1, r2, where
++|  ldp r1, r2, where
++|.endmacro
++|.macro ldp_unwind, r1, r2, where, post_index
++|  ldp r1, r2, where, post_index
++|.endmacro
++|.else
++|// Otherwise store r2 first for compact unwind info (OSX).
++|.macro stp_unwind, r1, r2, where
++|  stp r2, r1, where
++|.endmacro
++|.macro ldp_unwind, r1, r2, where
++|  ldp r2, r1, where
++|.endmacro
++|.macro ldp_unwind, r1, r2, where, post_index
++|  ldp r2, r1, where, post_index
++|.endmacro
++|.endif
+ |
+ |.macro save_, gpr1, gpr2, fpr1, fpr2
+-|  stp d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(fpr1-8)*8]
+-|  stp x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(gpr1-19)*8]
++|  stp_unwind d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(14-fpr1)*8]
++|  stp_unwind x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(27-gpr1)*8]
+ |.endmacro
+ |.macro rest_, gpr1, gpr2, fpr1, fpr2
+-|  ldp d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(fpr1-8)*8]
+-|  ldp x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(gpr1-19)*8]
++|  ldp_unwind d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(14-fpr1)*8]
++|  ldp_unwind x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(27-gpr1)*8]
+ |.endmacro
+ |
+ |.macro saveregs
+-|  stp fp, lr, [sp, #-CFRAME_SPACE]!
+-|  add fp, sp, #0
+-|  stp x19, x20, [sp, # SAVE_GPR_]
++|  sp_auth
++|  sub sp, sp, # CFRAME_SPACE
++|  stp fp, lr, [sp, # SAVE_FP_LR_]
++|  add fp, sp, # SAVE_FP_LR_
++|  stp_unwind x19, x20, [sp, # SAVE_GPR_+(27-19)*8]
+ |  save_ 21, 22, 8, 9
+ |  save_ 23, 24, 10, 11
+ |  save_ 25, 26, 12, 13
+ |  save_ 27, 28, 14, 15
+ |.endmacro
+ |.macro restoreregs
+-|  ldp x19, x20, [sp, # SAVE_GPR_]
++|  ldp_unwind x19, x20, [sp, # SAVE_GPR_+(27-19)*8]
+ |  rest_ 21, 22, 8, 9
+ |  rest_ 23, 24, 10, 11
+ |  rest_ 25, 26, 12, 13
+ |  rest_ 27, 28, 14, 15
+-|  ldp fp, lr, [sp], # CFRAME_SPACE
++|  ldp fp, lr, [sp, # SAVE_FP_LR_]
++|  add sp, sp, # CFRAME_SPACE
+ |.endmacro
+ |
+ |// Type definitions. Some of these are only used for documentation.
+@@ -179,7 +222,7 @@
+ |   decode_RA RA, INS
+ |  ldr TMP0, [TMP1, #GG_G2DISP]
+ |   decode_RD RC, INS
+-|  br TMP0
++|  br_auth TMP0
+ |.endmacro
+ |
+ |// Instruction footer.
+@@ -208,7 +251,7 @@
+ |   decode_RA RA, INS
+ |  ldr TMP0, [TMP1, #GG_G2DISP]
+ |   add RA, BASE, RA, lsl #3
+-|  br TMP0
++|  br_auth TMP0
+ |.endmacro
+ |
+ |.macro ins_call
+@@ -248,8 +291,17 @@
+ |  blo target
+ |.endmacro
+ |
++|.macro init_constants
++|  movn TISNIL, #0
++|  movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
++|  movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
++|.endmacro
++|
+ |.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro
+ |.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro
++|.macro mov_nil, reg; mov reg, TISNIL; .endmacro
++|.macro cmp_nil, reg; cmp reg, TISNIL; .endmacro
++|.macro add_TISNUM, dst, src; add dst, src, TISNUM; .endmacro
+ |
+ #define GL_J(field)	(GG_G2J + (int)offsetof(jit_State, field))
+ |
+@@ -355,7 +407,7 @@ static void build_subroutines(BuildCtx *
+   |
+   |->vm_leave_unw:
+   |  restoreregs
+-  |  ret
++  |  ret_auth
+   |
+   |6:
+   |  bgt >7				// Less results wanted?
+@@ -387,26 +439,26 @@ static void build_subroutines(BuildCtx *
+   |
+   |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
+   |  // (void *cframe, int errcode)
++  |  add fp, CARG1, # SAVE_FP_LR_
+   |  mov sp, CARG1
+   |  mov CRET1, CARG2
+-  |->vm_unwind_c_eh:			// Landing pad for external unwinder.
+   |  ldr L, SAVE_L
+-  |   mv_vmstate TMP0w, C
+   |  ldr GL, L->glref
++  |->vm_unwind_c_eh:			// Landing pad for external unwinder.
++  |   mv_vmstate TMP0w, C
+   |   st_vmstate TMP0w
+   |  b ->vm_leave_unw
+   |
+   |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
+   |  // (void *cframe)
+-  |  and sp, CARG1, #CFRAME_RAWMASK
+-  |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
++  |  add fp, CARG1, # SAVE_FP_LR_
++  |  mov sp, CARG1
+   |  ldr L, SAVE_L
+-  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+-  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+-  |    movn TISNIL, #0
++  |    init_constants
++  |   ldr GL, L->glref			// Setup pointer to global state.
++  |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
+   |    mov RC, #16			// 2 results: false + error message.
+   |  ldr BASE, L->base
+-  |   ldr GL, L->glref			// Setup pointer to global state.
+   |    mov_false TMP0
+   |  sub RA, BASE, #8			// Results start at BASE-8.
+   |  ldr PC, [BASE, FRAME_PC]		// Fetch PC of previous frame.
+@@ -467,11 +519,9 @@ static void build_subroutines(BuildCtx *
+   |  str L, GL->cur_L
+   |  mov RA, BASE
+   |   ldp BASE, CARG1, L->base
+-  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+-  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
++  |    init_constants
+   |  ldr PC, [BASE, FRAME_PC]
+   |     strb wzr, L->status
+-  |    movn TISNIL, #0
+   |   sub RC, CARG1, BASE
+   |  ands CARG1, PC, #FRAME_TYPE
+   |   add RC, RC, #8
+@@ -500,16 +550,15 @@ static void build_subroutines(BuildCtx *
+   |    ldr GL, L->glref			// Setup pointer to global state.
+   |     mov BASE, CARG2
+   |   str CARG1, SAVE_PC		// Any value outside of bytecode is ok.
+-  |  str RC, SAVE_CFRAME
+-  |  str fp, L->cframe			// Add our C frame to cframe chain.
++  |  add TMP0, sp, #0
++  |   str RC, SAVE_CFRAME
++  |  str TMP0, L->cframe		// Add our C frame to cframe chain.
+   |
+   |3:  // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype).
+   |  str L, GL->cur_L
+   |  ldp RB, CARG1, L->base		// RB = old base (for vmeta_call).
+-  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+-  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+   |  add PC, PC, BASE
+-  |    movn TISNIL, #0
++  |    init_constants
+   |  sub PC, PC, RB			// PC = frame delta + frame type
+   |   sub NARGS8:RC, CARG1, BASE
+   |    st_vmstate ST_INTERP
+@@ -536,10 +585,11 @@ static void build_subroutines(BuildCtx *
+   |   sub RA, RA, RB			// Compute -savestack(L, L->top).
+   |   str RAw, SAVE_NRES		// Neg. delta means cframe w/o frame.
+   |  str wzr, SAVE_ERRF			// No error function.
+-  |  str RC, SAVE_CFRAME
+-  |  str fp, L->cframe			// Add our C frame to cframe chain.
++  |  add TMP0, sp, #0
++  |   str RC, SAVE_CFRAME
++  |  str TMP0, L->cframe		// Add our C frame to cframe chain.
+   |    str L, GL->cur_L
+-  |  blr CARG4			// (lua_State *L, lua_CFunction func, void *ud)
++  |  blr_auth CARG4		// (lua_State *L, lua_CFunction func, void *ud)
+   |  mov BASE, CRET1
+   |   mov PC, #FRAME_CP
+   |  cbnz BASE, <3			// Else continue with the call.
+@@ -562,15 +612,15 @@ static void build_subroutines(BuildCtx *
+   |    cmp CARG1, #1
+   |.endif
+   |   ldr PC, [CARG4, #-24]		// Restore PC from [cont|PC].
+-  |  ldr CARG3, LFUNC:CARG3->pc
+   |    add TMP0, RA, RC
+   |    str TISNIL, [TMP0, #-8]		// Ensure one valid arg.
+   |.if FFI
+   |    bls >1
+   |.endif
++  |  ldr CARG3, LFUNC:CARG3->pc
+   |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+   |  // BASE = base, RA = resultptr, CARG4 = meta base
+-  |    br CARG1
++  |    br_auth CARG1
+   |
+   |.if FFI
+   |1:
+@@ -617,7 +667,7 @@ static void build_subroutines(BuildCtx *
+   |  b >1
+   |
+   |->vmeta_tgetb:			// RB = table, RC = index
+-  |  add RC, RC, TISNUM
++  |  add_TISNUM RC, RC
+   |   add CARG2, BASE, RB, lsl #3
+   |   add CARG3, sp, TMPDofs
+   |  str RC, TMPD
+@@ -652,7 +702,7 @@ static void build_subroutines(BuildCtx *
+   |  sxtw CARG2, TMP1w
+   |  bl extern lj_tab_getinth		// (GCtab *t, int32_t key)
+   |  // Returns cTValue * or NULL.
+-  |  mov TMP0, TISNIL
++  |  mov_nil TMP0
+   |  cbz CRET1, ->BC_TGETR_Z
+   |  ldr TMP0, [CRET1]
+   |  b ->BC_TGETR_Z
+@@ -675,7 +725,7 @@ static void build_subroutines(BuildCtx *
+   |  b >1
+   |
+   |->vmeta_tsetb:			// RB = table, RC = index
+-  |  add RC, RC, TISNUM
++  |  add_TISNUM RC, RC
+   |   add CARG2, BASE, RB, lsl #3
+   |   add CARG3, sp, TMPDofs
+   |  str RC, TMPD
+@@ -989,7 +1039,7 @@ static void build_subroutines(BuildCtx *
+   |1:  // Field metatable must be at same offset for GCtab and GCudata!
+   |  ldr TAB:RB, TAB:CARG1->metatable
+   |2:
+-  |   mov CARG1, TISNIL
++  |   mov_nil CARG1
+   |   ldr STR:RC, GL->gcroot[GCROOT_MMNAME+MM_metatable]
+   |  cbz TAB:RB, ->fff_restv
+   |  ldr TMP1w, TAB:RB->hmask
+@@ -1011,7 +1061,7 @@ static void build_subroutines(BuildCtx *
+   |  movk CARG1, #(LJ_TTAB>>1)&0xffff, lsl #48
+   |  b ->fff_restv
+   |5:
+-  |  cmp TMP0, TISNIL
++  |  cmp_nil TMP0
+   |  bne ->fff_restv
+   |  b <4
+   |
+@@ -1086,21 +1136,19 @@ static void build_subroutines(BuildCtx *
+   |//-- Base library: iterators -------------------------------------------
+   |
+   |.ffunc_1 next
+-  |  checktp CARG2, CARG1, LJ_TTAB, ->fff_fallback
++  |  checktp CARG1, LJ_TTAB, ->fff_fallback
+   |  str TISNIL, [BASE, NARGS8:RC]	// Set missing 2nd arg to nil.
+   |  ldr PC, [BASE, FRAME_PC]
+-  |   stp BASE, BASE, L->base		// Add frame since C call can throw.
+-  |  mov CARG1, L
+-  |  add CARG3, BASE, #8
+-  |   str PC, SAVE_PC
+-  |  bl extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
+-  |  // Returns 0 at end of traversal.
++  |  add CARG2, BASE, #8
++  |  sub CARG3, BASE, #16
++  |  bl extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |  // Returns 1=found, 0=end, -1=error.
++  |   mov RC, #(2+1)*8
++  |  tbnz CRET1w, #31, ->fff_fallback	// Invalid key.
++  |  cbnz CRET1, ->fff_res		// Found key/value.
++  |  // End of traversal: return nil.
+   |  str TISNIL, [BASE, #-16]
+-  |  cbz CRET1, ->fff_res1		// End of traversal: return nil.
+-  |  ldp CARG1, CARG2, [BASE, #8]	// Copy key and value to results.
+-  |    mov RC, #(2+1)*8
+-  |  stp CARG1, CARG2, [BASE, #-16]
+-  |  b ->fff_res
++  |  b ->fff_res1
+   |
+   |.ffunc_1 pairs
+   |  checktp TMP1, CARG1, LJ_TTAB, ->fff_fallback
+@@ -1113,8 +1161,8 @@ static void build_subroutines(BuildCtx *
+   |  cbnz TAB:CARG2, ->fff_fallback
+ #endif
+   |  mov RC, #(3+1)*8
+-  |  stp CARG1, TISNIL, [BASE, #-8]
+-  |   str CFUNC:CARG4, [BASE, #-16]
++  |  stp CFUNC:CARG4, CARG1, [BASE, #-16]
++  |   str TISNIL, [BASE]
+   |  b ->fff_res
+   |
+   |.ffunc_2 ipairs_aux
+@@ -1126,14 +1174,14 @@ static void build_subroutines(BuildCtx *
+   |  add CARG2w, CARG2w, #1
+   |  cmp CARG2w, TMP1w
+   |    ldr PC, [BASE, FRAME_PC]
+-  |     add TMP2, CARG2, TISNUM
++  |     add_TISNUM TMP2, CARG2
+   |   mov RC, #(0+1)*8
+   |     str TMP2, [BASE, #-16]
+   |  bhs >2				// Not in array part?
+   |  ldr TMP0, [CARG3, CARG2, lsl #3]
+   |1:
+   |   mov TMP1, #(2+1)*8
+-  |   cmp TMP0, TISNIL
++  |   cmp_nil TMP0
+   |  str TMP0, [BASE, #-8]
+   |   csel RC, RC, TMP1, eq
+   |  b ->fff_res
+@@ -1156,16 +1204,17 @@ static void build_subroutines(BuildCtx *
+   |  cbnz TAB:CARG2, ->fff_fallback
+ #endif
+   |  mov RC, #(3+1)*8
+-  |  stp CARG1, TISNUM, [BASE, #-8]
+-  |   str CFUNC:CARG4, [BASE, #-16]
++  |  stp CFUNC:CARG4, CARG1, [BASE, #-16]
++  |   str TISNUM, [BASE]
+   |  b ->fff_res
+   |
+   |//-- Base library: catch errors ----------------------------------------
+   |
+   |.ffunc pcall
++  |   cmp NARGS8:RC, #8
+   |  ldrb TMP0w, GL->hookmask
+-  |   subs NARGS8:RC, NARGS8:RC, #8
+   |   blo ->fff_fallback
++  |   sub NARGS8:RC, NARGS8:RC, #8
+   |    mov RB, BASE
+   |    add BASE, BASE, #16
+   |  ubfx TMP0w, TMP0w, #HOOK_ACTIVE_SHIFT, #1
+@@ -1346,7 +1395,7 @@ static void build_subroutines(BuildCtx *
+   |  eor CARG2w, CARG1w, CARG1w, asr #31
+   |   movz CARG3, #0x41e0, lsl #48	// 2^31.
+   |  subs CARG1w, CARG2w, CARG1w, asr #31
+-  |   add CARG1, CARG1, TISNUM
++  |   add_TISNUM CARG1, CARG1
+   |  csel CARG1, CARG1, CARG3, pl
+   |  // Fallthrough.
+   |
+@@ -1437,7 +1486,7 @@ static void build_subroutines(BuildCtx *
+   |    ldr PC, [BASE, FRAME_PC]
+   |  str d0, [BASE, #-16]
+   |    mov RC, #(2+1)*8
+-  |   add CARG2, CARG2, TISNUM
++  |   add_TISNUM CARG2, CARG2
+   |   str CARG2, [BASE, #-8]
+   |  b ->fff_res
+   |
+@@ -1503,7 +1552,7 @@ static void build_subroutines(BuildCtx *
+   |  bne ->fff_fallback
+   |  ldrb TMP0w, STR:CARG1[1]		// Access is always ok (NUL at end).
+   |   ldr CARG3w, STR:CARG1->len
+-  |  add TMP0, TMP0, TISNUM
++  |  add_TISNUM TMP0, TMP0
+   |  str TMP0, [BASE, #-16]
+   |  mov RC, #(0+1)*8
+   |   cbz CARG3, ->fff_res
+@@ -1589,7 +1638,7 @@ static void build_subroutines(BuildCtx *
+   |   str BASE, L->base
+   |   str PC, SAVE_PC
+   |   str L, GL->tmpbuf.L
+-  |  str TMP0, GL->tmpbuf.p
++  |  str TMP0, GL->tmpbuf.w
+   |  bl extern lj_buf_putstr_ .. name
+   |  bl extern lj_buf_tostr
+   |  b ->fff_resstr
+@@ -1649,17 +1698,17 @@ static void build_subroutines(BuildCtx *
+   |.ffunc_bit tobit
+   |  mov TMP0w, CARG1w
+   |9:  // Label reused by .ffunc_bit_op users.
+-  |  add CARG1, TMP0, TISNUM
++  |  add_TISNUM CARG1, TMP0
+   |  b ->fff_restv
+   |
+   |.ffunc_bit bswap
+   |  rev TMP0w, CARG1w
+-  |  add CARG1, TMP0, TISNUM
++  |  add_TISNUM CARG1, TMP0
+   |  b ->fff_restv
+   |
+   |.ffunc_bit bnot
+   |  mvn TMP0w, CARG1w
+-  |  add CARG1, TMP0, TISNUM
++  |  add_TISNUM CARG1, TMP0
+   |  b ->fff_restv
+   |
+   |.macro .ffunc_bit_sh, name, ins, shmod
+@@ -1680,7 +1729,7 @@ static void build_subroutines(BuildCtx *
+   |  checkint CARG1, ->vm_tobit_fb
+   |2:
+   |  ins TMP0w, CARG1w, TMP1w
+-  |  add CARG1, TMP0, TISNUM
++  |  add_TISNUM CARG1, TMP0
+   |  b ->fff_restv
+   |.endmacro
+   |
+@@ -1705,7 +1754,7 @@ static void build_subroutines(BuildCtx *
+   |  cmp TMP1, TMP2
+   |   mov CARG1, L
+   |  bhi >5				// Need to grow stack.
+-  |   blr CARG3				// (lua_State *L)
++  |   blr_auth CARG3			// (lua_State *L)
+   |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
+   |   ldr BASE, L->base
+   |  cmp CRET1w, #0
+@@ -1741,6 +1790,7 @@ static void build_subroutines(BuildCtx *
+   |
+   |->fff_gcstep:			// Call GC step function.
+   |  // BASE = new base, RC = nargs*8
++  |  sp_auth
+   |   add CARG2, BASE, NARGS8:RC	// Calculate L->top.
+   |  mov RA, lr
+   |   stp BASE, CARG2, L->base
+@@ -1752,7 +1802,7 @@ static void build_subroutines(BuildCtx *
+   |  mov lr, RA				// Help return address predictor.
+   |  sub NARGS8:RC, CARG2, BASE		// Calculate nargs*8.
+   |   and CFUNC:CARG3, CARG3, #LJ_GCVMASK
+-  |  ret
++  |  ret_auth
+   |
+   |//-----------------------------------------------------------------------
+   |//-- Special dispatch targets -------------------------------------------
+@@ -1779,7 +1829,7 @@ static void build_subroutines(BuildCtx *
+   |  tbz TMP2w, #HOOK_ACTIVE_SHIFT, >1	// Hook already active?
+   |5:  // Re-dispatch to static ins.
+   |  ldr TMP0, [TMP1, #GG_G2DISP+GG_DISP2STATIC]
+-  |  br TMP0
++  |  br_auth TMP0
+   |
+   |->vm_inshook:			// Dispatch target for instr/line hooks.
+   |  ldrb TMP2w, GL->hookmask
+@@ -1805,7 +1855,7 @@ static void build_subroutines(BuildCtx *
+   |   decode_RA RA, INS
+   |  ldr TMP0, [TMP1, #GG_G2DISP+GG_DISP2STATIC]
+   |   decode_RD RC, INS
+-  |  br TMP0
++  |  br_auth TMP0
+   |
+   |->cont_hook:				// Continue from hook yield.
+   |  ldr CARG1, [CARG4, #-40]
+@@ -1855,7 +1905,7 @@ static void build_subroutines(BuildCtx *
+   |  sub NARGS8:RC, TMP1, BASE
+   |   ldr INSw, [PC, #-4]
+   |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+-  |  br CRET1
++  |  br_auth CRET1
+   |
+   |->cont_stitch:			// Trace stitching.
+   |.if JIT
+@@ -1868,8 +1918,7 @@ static void build_subroutines(BuildCtx *
+   |    and CARG3, CARG3, #LJ_GCVMASK
+   |   beq >2
+   |1:  // Move results down.
+-  |  ldr CARG1, [RA]
+-  |    add RA, RA, #8
++  |  ldr CARG1, [RA], #8
+   |   subs RB, RB, #8
+   |  str CARG1, [BASE, RC, lsl #3]
+   |    add RC, RC, #1
+@@ -1984,13 +2033,11 @@ static void build_subroutines(BuildCtx *
+   |.if JIT
+   |  ldr L, SAVE_L
+   |1:
+-  |  cmp CARG1w, #0
+-  |  blt >9				// Check for error from exit.
+-  |   lsl RC, CARG1, #3
++  |   init_constants
++  |  cmn CARG1w, #LUA_ERRERR
++  |  bhs >9				// Check for error from exit.
+   |  ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
+-  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+-  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+-  |    movn TISNIL, #0
++  |   lsl RC, CARG1, #3
+   |  and LFUNC:CARG2, CARG2, #LJ_GCVMASK
+   |   str RCw, SAVE_MULTRES
+   |   str BASE, L->base
+@@ -2002,6 +2049,8 @@ static void build_subroutines(BuildCtx *
+   |  ldrb RBw, [PC, # OFS_OP]
+   |   ldr INSw, [PC], #4
+   |    st_vmstate CARG4w
++  |  cmn CARG1w, #17			// Static dispatch?
++  |  beq >5
+   |  cmp RBw, #BC_FUNCC+2		// Fast function?
+   |   add TMP1, GL, INS, uxtb #3
+   |  bhs >4
+@@ -2012,13 +2061,13 @@ static void build_subroutines(BuildCtx *
+   |   decode_RA RA, INS
+   |   lsr TMP0, INS, #16
+   |   csel RC, TMP0, RC, lo
+-  |   blo >5
++  |   blo >3
+   |   ldr CARG3, [BASE, FRAME_FUNC]
+   |   sub RC, RC, #8
+   |   add RA, BASE, RA, lsl #3	// Yes: RA = BASE+framesize*8, RC = nargs*8
+   |   and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+-  |5:
+-  |  br RB
++  |3:
++  |  br_auth RB
+   |
+   |4:  // Check frame below fast function.
+   |  ldr CARG1, [BASE, FRAME_PC]
+@@ -2034,9 +2083,21 @@ static void build_subroutines(BuildCtx *
+   |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+   |  b <2
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  ldr RA, [GL, #GL_J(trace)]
++  |  decode_RD RC, INS
++  |  ldr TRACE:RA, [RA, RC, lsl #3]
++  |  ldr INSw, TRACE:RA->startins
++  |  add TMP0, GL, INS, uxtb #3
++  |   decode_RA RA, INS
++  |  ldr RB, [TMP0, #GG_G2DISP+GG_DISP2STATIC]
++  |   decode_RD RC, INS
++  |  br_auth RB
++  |
+   |9:  // Rethrow error from the right C frame.
++  |  neg CARG2w, CARG1w
+   |  mov CARG1, L
+-  |  bl extern lj_err_run		// (lua_State *L)
++  |  bl extern lj_err_trace		// (lua_State *L, int errcode)
+   |.endif
+   |
+   |//-----------------------------------------------------------------------
+@@ -2065,12 +2126,69 @@ static void build_subroutines(BuildCtx *
+   |//-- Miscellaneous functions --------------------------------------------
+   |//-----------------------------------------------------------------------
+   |
++  |.define NEXT_TAB,		TAB:CARG1
++  |.define NEXT_RES,		CARG1
++  |.define NEXT_IDX,		CARG2w
++  |.define NEXT_LIM,		CARG3w
++  |.define NEXT_TMP0,		TMP0
++  |.define NEXT_TMP0w,		TMP0w
++  |.define NEXT_TMP1,		TMP1
++  |.define NEXT_TMP1w,		TMP1w
++  |.define NEXT_RES_PTR,	sp
++  |.define NEXT_RES_VAL,	[sp]
++  |.define NEXT_RES_KEY,	[sp, #8]
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in CRET2w.
++  |->vm_next:
++  |.if JIT
++  |  ldr NEXT_LIM, NEXT_TAB->asize
++  |   ldr NEXT_TMP1, NEXT_TAB->array
++  |1:  // Traverse array part.
++  |  subs NEXT_TMP0w, NEXT_IDX, NEXT_LIM
++  |  bhs >5				// Index points after array part?
++  |  ldr NEXT_TMP0, [NEXT_TMP1, NEXT_IDX, uxtw #3]
++  |  cmn NEXT_TMP0, #-LJ_TNIL
++  |   cinc NEXT_IDX, NEXT_IDX, eq
++  |  beq <1				// Skip holes in array part.
++  |  str NEXT_TMP0, NEXT_RES_VAL
++  |   movz NEXT_TMP0w, #(LJ_TISNUM>>1)&0xffff, lsl #16
++  |   stp NEXT_IDX, NEXT_TMP0w, NEXT_RES_KEY
++  |  add NEXT_IDX, NEXT_IDX, #1
++  |  mov NEXT_RES, NEXT_RES_PTR
++  |4:
++  |  ret
++  |
++  |5:  // Traverse hash part.
++  |  ldr NEXT_TMP1w, NEXT_TAB->hmask
++  |   ldr NODE:NEXT_RES, NEXT_TAB->node
++  |   add NEXT_TMP0w, NEXT_TMP0w, NEXT_TMP0w, lsl #1
++  |  add NEXT_LIM, NEXT_LIM, NEXT_TMP1w
++  |   add NODE:NEXT_RES, NODE:NEXT_RES, NEXT_TMP0w, uxtw #3
++  |6:
++  |  cmp NEXT_IDX, NEXT_LIM
++  |  bhi >9
++  |  ldr NEXT_TMP0, NODE:NEXT_RES->val
++  |  cmn NEXT_TMP0, #-LJ_TNIL
++  |   add NEXT_IDX, NEXT_IDX, #1
++  |  bne <4
++  |  // Skip holes in hash part.
++  |  add NODE:NEXT_RES, NODE:NEXT_RES, #sizeof(Node)
++  |  b <6
++  |
++  |9:  // End of iteration. Set the key to nil (not the value).
++  |  movn NEXT_TMP0, #0
++  |  str NEXT_TMP0, NEXT_RES_KEY
++  |  mov NEXT_RES, NEXT_RES_PTR
++  |  ret
++  |.endif
++  |
+   |//-----------------------------------------------------------------------
+   |//-- FFI helper functions -----------------------------------------------
+   |//-----------------------------------------------------------------------
+   |
+   |// Handler for callback functions.
+-  |// Saveregs already performed. Callback slot number in [sp], g in r12.
++  |// Saveregs already performed. Callback slot number in w9, g in x10.
+   |->vm_ffi_callback:
+   |.if FFI
+   |.type CTSTATE, CTState, PC
+@@ -2094,9 +2212,7 @@ static void build_subroutines(BuildCtx *
+   |  bl extern lj_ccallback_enter	// (CTState *cts, void *cf)
+   |  // Returns lua_State *.
+   |  ldp BASE, RC, L:CRET1->base
+-  |   movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+-  |   movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+-  |   movn TISNIL, #0
++  |   init_constants
+   |   mov L, CRET1
+   |  ldr LFUNC:CARG3, [BASE, FRAME_FUNC]
+   |  sub RC, RC, BASE
+@@ -2122,21 +2238,22 @@ static void build_subroutines(BuildCtx *
+   |  // Caveat: needs special frame unwinding, see below.
+   |.if FFI
+   |  .type CCSTATE, CCallState, x19
+-  |  stp fp, lr, [sp, #-32]!
+-  |  add fp, sp, #0
+-  |  str CCSTATE, [sp, #16]
++  |  sp_auth
++  |  stp_unwind CCSTATE, x20, [sp, #-32]!
++  |  stp fp, lr, [sp, #16]
++  |  add fp, sp, #16
+   |  mov CCSTATE, x0
+   |  ldr TMP0w, CCSTATE:x0->spadj
+   |   ldrb TMP1w, CCSTATE->nsp
+   |    add TMP2, CCSTATE, #offsetof(CCallState, stack)
+-  |   subs TMP1, TMP1, #1
++  |   subs TMP1, TMP1, #8
+   |    ldr TMP3, CCSTATE->func
+-  |  sub sp, fp, TMP0
++  |  sub sp, sp, TMP0
+   |   bmi >2
+   |1:  // Copy stack slots
+-  |  ldr TMP0, [TMP2, TMP1, lsl #3]
+-  |  str TMP0, [sp, TMP1, lsl #3]
+-  |  subs TMP1, TMP1, #1
++  |  ldr TMP0, [TMP2, TMP1]
++  |  str TMP0, [sp, TMP1]
++  |  subs TMP1, TMP1, #8
+   |  bpl <1
+   |2:
+   |  ldp x0, x1, CCSTATE->gpr[0]
+@@ -2148,14 +2265,14 @@ static void build_subroutines(BuildCtx *
+   |  ldp x6, x7, CCSTATE->gpr[6]
+   |   ldp d6, d7, CCSTATE->fpr[6]
+   |  ldr x8, CCSTATE->retp
+-  |  blr TMP3
+-  |  mov sp, fp
++  |  blr_auth TMP3
++  |  sub sp, fp, #16
+   |  stp x0, x1, CCSTATE->gpr[0]
+   |   stp d0, d1, CCSTATE->fpr[0]
+   |   stp d2, d3, CCSTATE->fpr[2]
+-  |  ldr CCSTATE, [sp, #16]
+-  |  ldp fp, lr, [sp], #32
+-  |  ret
++  |  ldp fp, lr, [sp, #16]
++  |  ldp_unwind CCSTATE, x20, [sp], #32
++  |  ret_auth
+   |.endif
+   |// Note: vm_ffi_call must be the last function in this object file!
+   |
+@@ -2474,7 +2591,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  bne >5
+     |  negs TMP0w, TMP0w
+     |   movz CARG3, #0x41e0, lsl #48	// 2^31.
+-    |   add TMP0, TMP0, TISNUM
++    |   add_TISNUM TMP0, TMP0
+     |  csel TMP0, TMP0, CARG3, vc
+     |5:
+     |  str TMP0, [BASE, RA, lsl #3]
+@@ -2489,7 +2606,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  bne >2
+     |  ldr CARG1w, STR:CARG1->len
+     |1:
+-    |  add CARG1, CARG1, TISNUM
++    |  add_TISNUM CARG1, CARG1
+     |  str CARG1, [BASE, RA, lsl #3]
+     |  ins_next
+     |
+@@ -2576,7 +2693,9 @@ static void build_ins(BuildCtx *ctx, BCO
+     |.macro ins_arithmod, res, reg1, reg2
+     |  fdiv d2, reg1, reg2
+     |  frintm d2, d2
+-    |  fmsub res, d2, reg2, reg1
++    |  // Cannot use fmsub, because FMA is not enabled by default.
++    |  fmul d2, d2, reg2
++    |  fsub res, reg1, d2
+     |.endmacro
+     |
+     |.macro ins_arithdn, intins, fpins
+@@ -2595,7 +2714,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  intins CARG1w, CARG1w, CARG2w
+     |  ins_arithfallback bvs
+     |.endif
+-    |  add CARG1, CARG1, TISNUM
++    |  add_TISNUM CARG1, CARG1
+     |  str CARG1, [BASE, RA, lsl #3]
+     |4:
+     |  ins_next
+@@ -2688,7 +2807,7 @@ static void build_ins(BuildCtx *ctx, BCO
+   case BC_KSHORT:
+     |  // RA = dst, RC = int16_literal
+     |  sxth RCw, RCw
+-    |  add TMP0, RC, TISNUM
++    |  add_TISNUM TMP0, RC
+     |  str TMP0, [BASE, RA, lsl #3]
+     |  ins_next
+     break;
+@@ -2884,7 +3003,7 @@ static void build_ins(BuildCtx *ctx, BCO
+   case BC_GGET:
+     |  // RA = dst, RC = str_const (~)
+   case BC_GSET:
+-    |  // RA = dst, RC = str_const (~)
++    |  // RA = src, RC = str_const (~)
+     |  ldr LFUNC:CARG1, [BASE, FRAME_FUNC]
+     |   mvn RC, RC
+     |  and LFUNC:CARG1, CARG1, #LJ_GCVMASK
+@@ -2911,7 +3030,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   cmp TMP1w, CARG1w		// In array part?
+     |   bhs ->vmeta_tgetv
+     |  ldr TMP0, [CARG3]
+-    |  cmp TMP0, TISNIL
++    |  cmp_nil TMP0
+     |  beq >5
+     |1:
+     |  str TMP0, [BASE, RA, lsl #3]
+@@ -2954,7 +3073,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   ldr NODE:CARG3, NODE:CARG3->next
+     |  cmp CARG1, CARG4
+     |  bne >4
+-    |  cmp TMP0, TISNIL
++    |  cmp_nil TMP0
+     |  beq >5
+     |3:
+     |  str TMP0, [BASE, RA, lsl #3]
+@@ -2963,7 +3082,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |4:  // Follow hash chain.
+     |  cbnz NODE:CARG3, <1
+     |  // End of hash chain: key not found, nil result.
+-    |   mov TMP0, TISNIL
++    |   mov_nil TMP0
+     |
+     |5:  // Check for __index if table value is nil.
+     |  ldr TAB:CARG1, TAB:CARG2->metatable
+@@ -2984,7 +3103,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   cmp RCw, CARG1w			// In array part?
+     |   bhs ->vmeta_tgetb
+     |  ldr TMP0, [CARG3]
+-    |  cmp TMP0, TISNIL
++    |  cmp_nil TMP0
+     |  beq >5
+     |1:
+     |  str TMP0, [BASE, RA, lsl #3]
+@@ -3031,7 +3150,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  ldr TMP1, [CARG3]
+     |   ldr TMP0, [BASE, RA, lsl #3]
+     |    ldrb TMP2w, TAB:CARG2->marked
+-    |  cmp TMP1, TISNIL			// Previous value is nil?
++    |  cmp_nil TMP1			// Previous value is nil?
+     |  beq >5
+     |1:
+     |   str TMP0, [CARG3]
+@@ -3083,7 +3202,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  cmp CARG1, CARG4
+     |  bne >5
+     |   ldr TMP0, [BASE, RA, lsl #3]
+-    |  cmp TMP1, TISNIL			// Previous value is nil?
++    |  cmp_nil TMP1			// Previous value is nil?
+     |  beq >4
+     |2:
+     |   str TMP0, NODE:CARG3->val
+@@ -3142,7 +3261,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  ldr TMP1, [CARG3]
+     |   ldr TMP0, [BASE, RA, lsl #3]
+     |    ldrb TMP2w, TAB:CARG2->marked
+-    |  cmp TMP1, TISNIL			// Previous value is nil?
++    |  cmp_nil TMP1			// Previous value is nil?
+     |  beq >5
+     |1:
+     |   str TMP0, [CARG3]
+@@ -3241,9 +3360,8 @@ static void build_ins(BuildCtx *ctx, BCO
+     |->BC_CALL_Z:
+     |  mov RB, BASE			// Save old BASE for vmeta_call.
+     |  add BASE, BASE, RA, lsl #3
+-    |  ldr CARG3, [BASE]
++    |  ldr CARG3, [BASE], #16
+     |   sub NARGS8:RC, NARGS8:RC, #8
+-    |   add BASE, BASE, #16
+     |  checkfunc CARG3, ->vmeta_call
+     |  ins_call
+     break;
+@@ -3259,9 +3377,8 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  // RA = base, (RB = 0,) RC = (nargs+1)*8
+     |->BC_CALLT1_Z:
+     |  add RA, BASE, RA, lsl #3
+-    |  ldr TMP1, [RA]
++    |  ldr TMP1, [RA], #16
+     |   sub NARGS8:RC, NARGS8:RC, #8
+-    |   add RA, RA, #16
+     |  checktp CARG3, TMP1, LJ_TFUNC, ->vmeta_callt
+     |  ldr PC, [BASE, FRAME_PC]
+     |->BC_CALLT2_Z:
+@@ -3321,10 +3438,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     break;
+ 
+   case BC_ITERN:
+-    |  // RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+     |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |  hotloop
+     |.endif
++    |->vm_IITERN:
++    |  // RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+     |  add RA, BASE, RA, lsl #3
+     |  ldr TAB:RB, [RA, #-16]
+     |    ldrh TMP3w, [PC, # OFS_RD]
+@@ -3340,10 +3458,10 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   add CARG3, CARG2, CARG1, lsl #3
+     |  bhs >5				// Index points after array part?
+     |   ldr TMP0, [CARG3]
+-    |   cmp TMP0, TISNIL
++    |   cmp_nil TMP0
+     |   cinc CARG1, CARG1, eq		// Skip holes in array part.
+     |   beq <1
+-    |   add CARG1, CARG1, TISNUM
++    |   add_TISNUM CARG1, CARG1
+     |   stp CARG1, TMP0, [RA]
+     |    add CARG1, CARG1, #1
+     |3:
+@@ -3361,7 +3479,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   add NODE:CARG3, NODE:RB, CARG1, lsl #3  // node = tab->node + idx*3*8
+     |  bhi <4
+     |  ldp TMP0, CARG1, NODE:CARG3->val
+-    |  cmp TMP0, TISNIL
++    |  cmp_nil TMP0
+     |   add RC, RC, #1
+     |  beq <6				// Skip holes in hash part.
+     |  stp CARG1, TMP0, [RA]
+@@ -3379,11 +3497,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  checkfunc CFUNC:CARG1, >5
+     |   asr TMP0, TAB:CARG3, #47
+     |  ldrb TMP1w, CFUNC:CARG1->ffid
+-    |   cmn TMP0, #-LJ_TTAB
+-    |   ccmp CARG4, TISNIL, #0, eq
++    |   cmp_nil CARG4
++    |   ccmn TMP0, #-LJ_TTAB, #0, eq
+     |  ccmp TMP1w, #FF_next_N, #0, eq
+     |  bne >5
+-    |  mov TMP0w, #0xfffe7fff
++    |  mov TMP0w, #0xfffe7fff		// LJ_KEYINDEX
+     |  lsl TMP0, TMP0, #32
+     |  str TMP0, [RA, #-8]		// Initialize control var.
+     |1:
+@@ -3391,11 +3509,28 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  ins_next
+     |
+     |5:  // Despecialize bytecode if any of the checks fail.
++    |.if JIT
++    |  ldrb TMP2w, [RC, # OFS_OP]
++    |.endif
+     |  mov TMP0, #BC_JMP
+     |   mov TMP1, #BC_ITERC
+     |  strb TMP0w, [PC, #-4+OFS_OP]
++    |.if JIT
++    |  cmp TMP2w, #BC_ITERN
++    |  bne >6
++    |.endif
+     |   strb TMP1w, [RC, # OFS_OP]
+     |  b <1
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  ldr RA, [GL, #GL_J(trace)]
++    |  ldrh TMP2w, [RC, # OFS_RD]
++    |  ldr TRACE:RA, [RA, TMP2, lsl #3]
++    |  ldr TMP2w, TRACE:RA->startins
++    |  bfxil TMP2w, TMP1w, #0, #8
++    |  str TMP2w, [RC]
++    |  b <1
++    |.endif
+     break;
+ 
+   case BC_VARG:
+@@ -3403,51 +3538,51 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   and RC, RC, #255
+     |  // RA = base, RB = (nresults+1), RC = numparams
+     |  ldr TMP1, [BASE, FRAME_PC]
+-    |  add RC, BASE, RC, lsl #3
+-    |   add RA, BASE, RA, lsl #3
+-    |  add RC, RC, #FRAME_VARG
+-    |   add TMP2, RA, RB, lsl #3
+-    |  sub RC, RC, TMP1			// RC = vbase
+-    |  // Note: RC may now be even _above_ BASE if nargs was < numparams.
++    |  add TMP0, BASE, RC, lsl #3
++    |   add RC, BASE, RA, lsl #3	// RC = destination
++    |  add TMP0, TMP0, #FRAME_VARG
++    |   add TMP2, RC, RB, lsl #3
++    |  sub RA, TMP0, TMP1		// RA = vbase
++    |  // Note: RA may now be even _above_ BASE if nargs was < numparams.
+     |   sub TMP3, BASE, #16		// TMP3 = vtop
+     |  cbz RB, >5
+     |   sub TMP2, TMP2, #16
+     |1:  // Copy vararg slots to destination slots.
+-    |  cmp RC, TMP3
+-    |  ldr TMP0, [RC], #8
+-    |  csel TMP0, TMP0, TISNIL, lo
+-    |   cmp RA, TMP2
+-    |  str TMP0, [RA], #8
++    |  cmp RA, TMP3
++    |  ldr TMP0, [RA], #8
++    |  csinv TMP0, TMP0, xzr, lo	// TISNIL = ~xzr
++    |   cmp RC, TMP2
++    |  str TMP0, [RC], #8
+     |   blo <1
+     |2:
+     |  ins_next
+     |
+     |5:  // Copy all varargs.
+     |  ldr TMP0, L->maxstack
+-    |   subs TMP2, TMP3, RC
++    |   subs TMP2, TMP3, RA
+     |   csel RB, xzr, TMP2, le		// MULTRES = (max(vtop-vbase,0)+1)*8
+     |   add RB, RB, #8
+-    |  add TMP1, RA, TMP2
++    |  add TMP1, RC, TMP2
+     |   str RBw, SAVE_MULTRES
+     |   ble <2				// Nothing to copy.
+     |  cmp TMP1, TMP0
+     |  bhi >7
+     |6:
+-    |  ldr TMP0, [RC], #8
+-    |  str TMP0, [RA], #8
+-    |  cmp RC, TMP3
++    |  ldr TMP0, [RA], #8
++    |  str TMP0, [RC], #8
++    |  cmp RA, TMP3
+     |  blo <6
+     |  b <2
+     |
+     |7:  // Grow stack for varargs.
+     |  lsr CARG2, TMP2, #3
+-    |   stp BASE, RA, L->base
++    |   stp BASE, RC, L->base
+     |  mov CARG1, L
+-    |  sub RC, RC, BASE			// Need delta, because BASE may change.
++    |  sub RA, RA, BASE			// Need delta, because BASE may change.
+     |   str PC, SAVE_PC
+     |  bl extern lj_state_growstack	// (lua_State *L, int n)
+-    |  ldp BASE, RA, L->base
+-    |  add RC, BASE, RC
++    |  ldp BASE, RC, L->base
++    |  add RA, BASE, RA
+     |  sub TMP3, BASE, #16
+     |  b <6
+     break;
+@@ -3591,7 +3726,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     } else {
+       |  adds CARG1w, CARG1w, CARG3w
+       |  bvs >2
+-      |   add TMP0, CARG1, TISNUM
++      |   add_TISNUM TMP0, CARG1
+       |  tbnz CARG3w, #31, >4
+       |  cmp CARG1w, CARG2w
+     }
+@@ -3670,7 +3805,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  // RA = base, RC = target
+     |  ldr CARG1, [BASE, RA, lsl #3]
+     |   add TMP1, BASE, RA, lsl #3
+-    |  cmp CARG1, TISNIL
++    |  cmp_nil CARG1
+     |  beq >1				// Stop if iterator returned nil.
+     if (op == BC_JITERL) {
+       |  str CARG1, [TMP1, #-8]
+@@ -3703,15 +3838,22 @@ static void build_ins(BuildCtx *ctx, BCO
+     |.if JIT
+     |  // RA = base (ignored), RC = traceno
+     |  ldr CARG1, [GL, #GL_J(trace)]
+-    |   mov CARG2w, #0  // Traces on ARM64 don't store the trace #, so use 0.
++    |   st_vmstate wzr  // Traces on ARM64 don't store the trace #, so use 0.
+     |  ldr TRACE:RC, [CARG1, RC, lsl #3]
+-    |   st_vmstate CARG2w
++    |.if PAUTH
++    |  ldr RA, TRACE:RC->mcauth
++    |.else
+     |  ldr RA, TRACE:RC->mcode
++    |.endif
+     |   str BASE, GL->jit_base
+     |   str L, GL->tmpbuf.L
+     |  sub sp, sp, #16	// See SPS_FIXED. Avoids sp adjust in every root trace.
++    |.if PAUTH
++    |  braa RA, RC
++    |.else
+     |  br RA
+     |.endif
++    |.endif
+     break;
+ 
+   case BC_JMP:
+@@ -3772,6 +3914,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   add TMP2, BASE, RC
+     |   add LFUNC:CARG3, CARG3, TMP0, lsl #47
+     |  add RA, RA, RC
++    |  sub CARG1, CARG1, #8
+     |   add TMP0, RC, #16+FRAME_VARG
+     |   str LFUNC:CARG3, [TMP2], #8	// Store (tagged) copy of LFUNC.
+     |    ldr KBASE, [PC, #-4+PC2PROTO(k)]
+@@ -3821,7 +3964,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  mov CARG1, L
+     |   bhi ->vm_growstack_c		// Need to grow stack.
+     |    st_vmstate TMP0w
+-    |  blr CARG4			// (lua_State *L [, lua_CFunction f])
++    |  blr_auth CARG4			// (lua_State *L [, lua_CFunction f])
+     |  // Returns nresults.
+     |  ldp BASE, TMP1, L->base
+     |    str L, GL->cur_L
+@@ -3860,7 +4003,7 @@ static int build_backend(BuildCtx *ctx)
+ static void emit_asm_debug(BuildCtx *ctx)
+ {
+   int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);
+-  int i, cf = CFRAME_SIZE >> 3;
++  int i;
+   switch (ctx->mode) {
+   case BUILD_elfasm:
+     fprintf(ctx->fp, "\t.section .debug_frame,\"\",%%progbits\n");
+@@ -3874,7 +4017,7 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.uleb128 0x1\n"
+ 	"\t.sleb128 -8\n"
+ 	"\t.byte 30\n"				/* Return address is in lr. */
+-	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
++	"\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 16\n"	/* def_cfa fp 16 */
+ 	"\t.align 3\n"
+ 	".LECIE0:\n\n");
+     fprintf(ctx->fp,
+@@ -3884,15 +4027,14 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.long .Lframe0\n"
+ 	"\t.quad .Lbegin\n"
+ 	"\t.quad %d\n"
+-	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
+-	"\t.byte 0x9d\n\t.uleb128 %d\n"		/* offset fp */
+-	"\t.byte 0x9e\n\t.uleb128 %d\n",	/* offset lr */
+-	fcofs, CFRAME_SIZE, cf, cf-1);
++	"\t.byte 0x9e\n\t.uleb128 1\n"		/* offset lr */
++	"\t.byte 0x9d\n\t.uleb128 2\n",		/* offset fp */
++	fcofs);
+     for (i = 19; i <= 28; i++)  /* offset x19-x28 */
+-      fprintf(ctx->fp, "\t.byte 0x%x\n\t.uleb128 %d\n", 0x80+i, cf-i+17);
++      fprintf(ctx->fp, "\t.byte 0x%x\n\t.uleb128 %d\n", 0x80+i, i+(3-19));
+     for (i = 8; i <= 15; i++)  /* offset d8-d15 */
+       fprintf(ctx->fp, "\t.byte 5\n\t.uleb128 0x%x\n\t.uleb128 %d\n",
+-	      64+i, cf-i-4);
++	      64+i, i+(3+(28-19+1)-8));
+     fprintf(ctx->fp,
+ 	"\t.align 3\n"
+ 	".LEFDE0:\n\n");
+@@ -3904,13 +4046,14 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.long .Lframe0\n"
+ 	"\t.quad lj_vm_ffi_call\n"
+ 	"\t.quad %d\n"
+-	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
+-	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
+-	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */
+-	"\t.byte 0x93\n\t.uleb128 2\n"		/* offset x19 */
++	"\t.byte 0x9e\n\t.uleb128 1\n"		/* offset lr */
++	"\t.byte 0x9d\n\t.uleb128 2\n"		/* offset fp */
++	"\t.byte 0x93\n\t.uleb128 3\n"		/* offset x19 */
++	"\t.byte 0x94\n\t.uleb128 4\n"		/* offset x20 */
+ 	"\t.align 3\n"
+ 	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
+ #endif
++#if !LJ_NO_UNWIND
+     fprintf(ctx->fp, "\t.section .eh_frame,\"a\",%%progbits\n");
+     fprintf(ctx->fp,
+ 	".Lframe1:\n"
+@@ -3926,7 +4069,7 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+ 	"\t.long lj_err_unwind_dwarf-.\n"
+ 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+-	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
++	"\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 16\n"	/* def_cfa fp 16 */
+ 	"\t.align 3\n"
+ 	".LECIE1:\n\n");
+     fprintf(ctx->fp,
+@@ -3937,15 +4080,14 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.long .Lbegin-.\n"
+ 	"\t.long %d\n"
+ 	"\t.uleb128 0\n"			/* augmentation length */
+-	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
+-	"\t.byte 0x9d\n\t.uleb128 %d\n"		/* offset fp */
+-	"\t.byte 0x9e\n\t.uleb128 %d\n",	/* offset lr */
+-	fcofs, CFRAME_SIZE, cf, cf-1);
++	"\t.byte 0x9e\n\t.uleb128 1\n"		/* offset lr */
++	"\t.byte 0x9d\n\t.uleb128 2\n",		/* offset fp */
++	fcofs);
+     for (i = 19; i <= 28; i++)  /* offset x19-x28 */
+-      fprintf(ctx->fp, "\t.byte 0x%x\n\t.uleb128 %d\n", 0x80+i, cf-i+17);
++      fprintf(ctx->fp, "\t.byte 0x%x\n\t.uleb128 %d\n", 0x80+i, i+(3-19));
+     for (i = 8; i <= 15; i++)  /* offset d8-d15 */
+       fprintf(ctx->fp, "\t.byte 5\n\t.uleb128 0x%x\n\t.uleb128 %d\n",
+-	      64+i, cf-i-4);
++	      64+i, i+(3+(28-19+1)-8));
+     fprintf(ctx->fp,
+ 	"\t.align 3\n"
+ 	".LEFDE2:\n\n");
+@@ -3962,7 +4104,7 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.byte 30\n"				/* Return address is in lr. */
+ 	"\t.uleb128 1\n"			/* augmentation length */
+ 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+-	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
++	"\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 16\n"	/* def_cfa fp 16 */
+ 	"\t.align 3\n"
+ 	".LECIE2:\n\n");
+     fprintf(ctx->fp,
+@@ -3973,14 +4115,107 @@ static void emit_asm_debug(BuildCtx *ctx
+ 	"\t.long lj_vm_ffi_call-.\n"
+ 	"\t.long %d\n"
+ 	"\t.uleb128 0\n"			/* augmentation length */
+-	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
+-	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
+-	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */
+-	"\t.byte 0x93\n\t.uleb128 2\n"		/* offset x19 */
++	"\t.byte 0x9e\n\t.uleb128 1\n"		/* offset lr */
++	"\t.byte 0x9d\n\t.uleb128 2\n"		/* offset fp */
++	"\t.byte 0x93\n\t.uleb128 3\n"		/* offset x19 */
++	"\t.byte 0x94\n\t.uleb128 4\n"		/* offset x20 */
+ 	"\t.align 3\n"
+ 	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
+ #endif
++#endif
+     break;
++#if !LJ_NO_UNWIND
++  case BUILD_machasm: {	/* Emit __eh_frame unwind tables for the machasm build mode. */
++#if LJ_HASFFI
++    int fcsize = 0;	/* Size of _lj_vm_ffi_call; stays 0 if that symbol is not found. */
++#endif
++    int j;	/* Index into ctx->sym[]. */
++    fprintf(ctx->fp, "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support\n");
++    fprintf(ctx->fp,
++	"EH_frame1:\n"
++	"\t.set L$set$x,LECIEX-LSCIEX\n"	/* CIE length */
++	"\t.long L$set$x\n"
++	"LSCIEX:\n"
++	"\t.long 0\n"				/* CIE id */
++	"\t.byte 0x1\n"				/* CIE version */
++	"\t.ascii \"zPR\\0\"\n"			/* augmentation string */
++	"\t.uleb128 0x1\n"			/* code alignment factor */
++	"\t.sleb128 -8\n"			/* data alignment factor */
++	"\t.byte 30\n"				/* Return address is in lr. */
++	"\t.uleb128 6\n"			/* augmentation length */
++	"\t.byte 0x9b\n"			/* indirect|pcrel|sdata4 */
++	"\t.long _lj_err_unwind_dwarf@GOT-.\n"	/* personality routine */
++	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
++	"\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 16\n"	/* def_cfa fp 16 */
++	"\t.align 3\n"
++	"LECIEX:\n\n");
++    for (j = 0; j < ctx->nsym; j++) {	/* One FDE per non-empty symbol. */
++      const char *name = ctx->sym[j].name;
++      int32_t size = ctx->sym[j+1].ofs - ctx->sym[j].ofs;	/* NOTE(review): reads sym[nsym] on the last pass; presumably a sentinel entry -- confirm. */
++      if (size == 0) continue;	/* Nothing to describe for zero-sized symbols. */
++#if LJ_HASFFI
++      if (!strcmp(name, "_lj_vm_ffi_call")) { fcsize = size; continue; }	/* Gets its own CIE/FDE below. */
++#endif
++      fprintf(ctx->fp,
++	"LSFDE%d:\n"
++	"\t.set L$set$%d,LEFDE%d-LASFDE%d\n"	/* FDE length */
++	"\t.long L$set$%d\n"
++	"LASFDE%d:\n"
++	"\t.long LASFDE%d-EH_frame1\n"		/* CIE pointer */
++	"\t.long %s-.\n"			/* PC begin: the symbol, pc-relative */
++	"\t.long %d\n"				/* PC range: the symbol size */
++	"\t.uleb128 0\n"			/* augmentation length */
++	"\t.byte 0x9e\n\t.uleb128 1\n"		/* offset lr */
++	"\t.byte 0x9d\n\t.uleb128 2\n",		/* offset fp */
++	j, j, j, j, j, j, j, name, size);
++      for (i = 19; i <= 28; i++)  /* offset x19-x28 */
++	fprintf(ctx->fp, "\t.byte 0x%x\n\t.uleb128 %d\n", 0x80+i, i+(3-19));	/* 0x80+reg = DW_CFA_offset; slots 3..12 */
++      for (i = 8; i <= 15; i++)  /* offset d8-d15 */
++	fprintf(ctx->fp, "\t.byte 5\n\t.uleb128 0x%x\n\t.uleb128 %d\n",
++		64+i, i+(3+(28-19+1)-8));	/* 5 = DW_CFA_offset_extended; slots 13..20 */
++      fprintf(ctx->fp,
++	"\t.align 3\n"
++	"LEFDE%d:\n\n", j);
++    }
++#if LJ_HASFFI
++    if (fcsize) {	/* Separate CIE without a personality routine, plus one FDE, for _lj_vm_ffi_call. */
++      fprintf(ctx->fp,
++	"EH_frame2:\n"
++	"\t.set L$set$y,LECIEY-LSCIEY\n"	/* CIE length */
++	"\t.long L$set$y\n"
++	"LSCIEY:\n"
++	"\t.long 0\n"				/* CIE id */
++	"\t.byte 0x1\n"				/* CIE version */
++	"\t.ascii \"zR\\0\"\n"			/* augmentation string */
++	"\t.uleb128 0x1\n"			/* code alignment factor */
++	"\t.sleb128 -8\n"			/* data alignment factor */
++	"\t.byte 30\n"				/* Return address is in lr. */
++	"\t.uleb128 1\n"			/* augmentation length */
++	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
++	"\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 16\n"	/* def_cfa fp 16 */
++	"\t.align 3\n"
++	"LECIEY:\n\n");
++      fprintf(ctx->fp,
++	"LSFDEY:\n"
++	"\t.set L$set$yy,LEFDEY-LASFDEY\n"	/* FDE length */
++	"\t.long L$set$yy\n"
++	"LASFDEY:\n"
++	"\t.long LASFDEY-EH_frame2\n"		/* CIE pointer */
++	"\t.long _lj_vm_ffi_call-.\n"		/* PC begin, pc-relative */
++	"\t.long %d\n"				/* PC range: fcsize */
++	"\t.uleb128 0\n"			/* augmentation length */
++	"\t.byte 0x9e\n\t.uleb128 1\n"		/* offset lr */
++	"\t.byte 0x9d\n\t.uleb128 2\n"		/* offset fp */
++	"\t.byte 0x93\n\t.uleb128 3\n"		/* offset x19 */
++	"\t.byte 0x94\n\t.uleb128 4\n"		/* offset x20 */
++	"\t.align 3\n"
++	"LEFDEY:\n\n", fcsize);
++    }
++#endif
++    fprintf(ctx->fp, ".subsections_via_symbols\n");
++    }
++    break;
++#endif
+   default:
+     break;
+   }
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_mips.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_mips.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_mips.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for MIPS CPUs.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |//
+ |// MIPS soft-float support contributed by Djordje Kovacevic and
+ |// Stefan Pejic from RT-RK.com, sponsored by Cisco Systems, Inc.
+@@ -190,7 +190,7 @@
+ |//-----------------------------------------------------------------------
+ |
+ |// Trap for not-yet-implemented parts.
+-|.macro NYI; .long 0xf0f0f0f0; .endmacro
++|.macro NYI; .long 0xec1cf0f0; .endmacro
+ |
+ |// Macros to mark delay slots.
+ |.macro ., a; a; .endmacro
+@@ -501,6 +501,10 @@ static void build_subroutines(BuildCtx *
+   |  b ->vm_returnc
+   |.  li RD, 16				// 2 results: false + error message.
+   |
++  |->vm_unwind_stub:			// Jump to exit stub from unwinder.
++  |  jr CARG1
++  |.  move ra, CARG2
++  |
+   |//-----------------------------------------------------------------------
+   |//-- Grow stack for calls -----------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -669,11 +673,11 @@ static void build_subroutines(BuildCtx *
+   |.endif
+   |     lw PC, -16+HI(RB)		// Restore PC from [cont|PC].
+   |   addu TMP2, RA, RD
+-  |    lw TMP1, LFUNC:TMP1->pc
+   |.if FFI
+   |  bnez AT, >1
+   |.endif
+   |.  sw TISNIL, -8+HI(TMP2)		// Ensure one valid arg.
++  |    lw TMP1, LFUNC:TMP1->pc
+   |  // BASE = base, RA = resultptr, RB = meta base
+   |  jr TMP0				// Jump to continuation.
+   |.  lw KBASE, PC2PROTO(k)(TMP1)
+@@ -1258,35 +1262,27 @@ static void build_subroutines(BuildCtx *
+   |//-- Base library: iterators -------------------------------------------
+   |
+   |.ffunc next
+-  |  lw CARG1, HI(BASE)
+-  |   lw TAB:CARG2, LO(BASE)
++  |  lw CARG2, HI(BASE)
++  |   lw TAB:CARG1, LO(BASE)
+   |  beqz NARGS8:RC, ->fff_fallback
+   |.  addu TMP2, BASE, NARGS8:RC
+   |  li AT, LJ_TTAB
+   |   sw TISNIL, HI(TMP2)		// Set missing 2nd arg to nil.
+-  |  bne CARG1, AT, ->fff_fallback
++  |  bne CARG2, AT, ->fff_fallback
+   |.  lw PC, FRAME_PC(BASE)
+   |  load_got lj_tab_next
+-  |   sw BASE, L->base			// Add frame since C call can throw.
+-  |   sw BASE, L->top			// Dummy frame length is ok.
+-  |  addiu CARG3, BASE, 8
+-  |   sw PC, SAVE_PC
+-  |  call_intern lj_tab_next		// (lua_State *L, GCtab *t, TValue *key)
+-  |.  move CARG1, L
+-  |  // Returns 0 at end of traversal.
++  |  addiu CARG2, BASE, 8
++  |  call_intern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |.  addiu CARG3, BASE, -8
++  |  // Returns 1=found, 0=end, -1=error.
++  |   addiu RA, BASE, -8
++  |  bgtz CRET1, ->fff_res		// Found key/value.
++  |.  li RD, (2+1)*8
+   |  beqz CRET1, ->fff_restv		// End of traversal: return nil.
+   |.  li SFARG1HI, LJ_TNIL
+-  |  lw TMP0, 8+HI(BASE)
+-  |   lw TMP1, 8+LO(BASE)
+-  |    addiu RA, BASE, -8
+-  |  lw TMP2, 16+HI(BASE)
+-  |   lw TMP3, 16+LO(BASE)
+-  |  sw TMP0, HI(RA)
+-  |   sw TMP1, LO(RA)
+-  |  sw TMP2, 8+HI(RA)
+-  |   sw TMP3, 8+LO(RA)
+-  |  b ->fff_res
+-  |.  li RD, (2+1)*8
++  |   lw CFUNC:RB, FRAME_FUNC(BASE)
++  |  b ->fff_fallback			// Invalid key.
++  |.  li RC, 2*8
+   |
+   |.ffunc_1 pairs
+   |  li AT, LJ_TTAB
+@@ -1967,7 +1963,7 @@ static void build_subroutines(BuildCtx *
+   |  lw TMP0, SBUF:CARG1->b
+   |   sw L, SBUF:CARG1->L
+   |   sw BASE, L->base
+-  |  sw TMP0, SBUF:CARG1->p
++  |  sw TMP0, SBUF:CARG1->w
+   |  call_intern extern lj_buf_putstr_ .. name
+   |.  sw PC, SAVE_PC
+   |  load_got lj_buf_tostr
+@@ -2470,7 +2466,8 @@ static void build_subroutines(BuildCtx *
+   |   addiu DISPATCH, JGL, -GG_DISP2G-32768
+   |  sw BASE, L->base
+   |1:
+-  |  bltz CRET1, >9			// Check for error from exit.
++  |  sltiu TMP0, CRET1, -LUA_ERRERR	// Check for error from exit.
++  |  beqz TMP0, >9
+   |.  lw LFUNC:RB, FRAME_FUNC(BASE)
+   |    .FPU lui TMP3, 0x59c0			// TOBIT = 2^52 + 2^51 (float).
+   |  sll MULTRES, CRET1, 3
+@@ -2484,14 +2481,16 @@ static void build_subroutines(BuildCtx *
+   |    .FPU cvt.d.s TOBIT, TOBIT
+   |  // Modified copy of ins_next which handles function header dispatch, too.
+   |  lw INS, 0(PC)
+-  |   addiu PC, PC, 4
++  |  addiu CRET1, CRET1, 17		// Static dispatch?
+   |    // Assumes TISNIL == ~LJ_VMST_INTERP == -1
+   |    sw TISNIL, DISPATCH_GL(vmstate)(DISPATCH)
++  |   decode_RD8a RD, INS
++  |  beqz CRET1, >5
++  |.  addiu PC, PC, 4
+   |  decode_OP4a TMP1, INS
+   |  decode_OP4b TMP1
+-  |    sltiu TMP2, TMP1, BC_FUNCF*4
+   |  addu TMP0, DISPATCH, TMP1
+-  |   decode_RD8a RD, INS
++  |    sltiu TMP2, TMP1, BC_FUNCF*4
+   |  lw AT, 0(TMP0)
+   |   decode_RA8a RA, INS
+   |    beqz TMP2, >2
+@@ -2519,9 +2518,26 @@ static void build_subroutines(BuildCtx *
+   |  jr AT
+   |.  addu RA, RA, BASE
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  lw TMP0, DISPATCH_J(trace)(DISPATCH)
++  |  decode_RD4b RD
++  |  addu TMP0, TMP0, RD
++  |  lw TRACE:TMP2, 0(TMP0)
++  |  lw INS, TRACE:TMP2->startins
++  |  decode_OP4a TMP1, INS
++  |  decode_OP4b TMP1
++  |  addu TMP0, DISPATCH, TMP1
++  |   decode_RD8a RD, INS
++  |  lw AT, GG_DISP2STATIC(TMP0)
++  |   decode_RA8a RA, INS
++  |   decode_RD8b RD
++  |  jr AT
++  |.  decode_RA8b RA
++  |
+   |9:  // Rethrow error from the right C frame.
+-  |  load_got lj_err_run
+-  |  call_intern lj_err_run		// (lua_State *L)
++  |  load_got lj_err_trace
++  |  sub CARG2, r0, CRET1
++  |  call_intern lj_err_trace		// (lua_State *L, int errcode)
+   |.  move CARG1, L
+   |.endif
+   |
+@@ -2801,6 +2817,73 @@ static void build_subroutines(BuildCtx *
+   |//-- Miscellaneous functions --------------------------------------------
+   |//-----------------------------------------------------------------------
+   |
++  |.define NEXT_TAB,		TAB:CARG1	// Arg 1: table to traverse.
++  |.define NEXT_IDX,		CARG2	// Arg 2: combined array+hash index.
++  |.define NEXT_ASIZE,		CARG3	// Loaded from NEXT_TAB->asize.
++  |.define NEXT_NIL,		CARG4	// Holds LJ_TNIL for the tag compares.
++  |.define NEXT_TMP0,		r12
++  |.define NEXT_TMP1,		r13
++  |.define NEXT_TMP2,		r14
++  |.define NEXT_RES_VK,		CRET1	// Result: pointer to a value/key pair.
++  |.define NEXT_RES_IDX,	CRET2	// Result: next index.
++  |.define NEXT_RES_PTR,	sp	// Array-part results are built in place at sp.
++  |.define NEXT_RES_VAL_I,	0(sp)	// Value: payload word.
++  |.define NEXT_RES_VAL_IT,	4(sp)	// Value: type tag word.
++  |.define NEXT_RES_KEY_I,	8(sp)	// Key: payload word.
++  |.define NEXT_RES_KEY_IT,	12(sp)	// Key: type tag word.
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in CRET2. CRET1 = sp (array part, end) or the Node (hash part).
++  |->vm_next:
++  |.if JIT and ENDIAN_LE	// Body is only assembled for little-endian JIT builds.
++  |   lw NEXT_ASIZE, NEXT_TAB->asize
++  |  lw NEXT_TMP0, NEXT_TAB->array
++  |    li NEXT_NIL, LJ_TNIL
++  |1:  // Traverse array part.
++  |   sltu AT, NEXT_IDX, NEXT_ASIZE	// AT = (idx < asize).
++  |    sll NEXT_TMP1, NEXT_IDX, 3	// idx*8: byte offset of the slot.
++  |   beqz AT, >5	// idx >= asize: continue in the hash part.
++  |.   addu NEXT_TMP1, NEXT_TMP0, NEXT_TMP1	// Delay slot: address of array slot idx.
++  |  lw NEXT_TMP2, 4(NEXT_TMP1)	// Word at +4 of the slot; compared against nil below.
++  |   sw NEXT_IDX, NEXT_RES_KEY_I	// Key payload = current idx (before the increment).
++  |  beq NEXT_TMP2, NEXT_NIL, <1	// Nil slot: try the next one.
++  |.  addiu NEXT_IDX, NEXT_IDX, 1	// Delay slot: idx++ on both paths.
++  |    lw NEXT_TMP0, 0(NEXT_TMP1)	// Found: reuse NEXT_TMP0 for the value payload.
++  |   li AT, LJ_TISNUM
++  |  sw NEXT_TMP2, NEXT_RES_VAL_IT
++  |   sw AT, NEXT_RES_KEY_IT	// Key tag = LJ_TISNUM.
++  |    sw NEXT_TMP0, NEXT_RES_VAL_I
++  |  move NEXT_RES_VK, NEXT_RES_PTR	// Return pointer to the pair stored at sp.
++  |  jr ra
++  |.  move NEXT_RES_IDX, NEXT_IDX	// Delay slot: return the incremented idx.
++  |
++  |5:  // Traverse hash part.
++  |  subu NEXT_RES_IDX, NEXT_IDX, NEXT_ASIZE	// Hash index = idx - asize.
++  |   lw NODE:NEXT_RES_VK, NEXT_TAB->node
++  |    sll NEXT_TMP2, NEXT_RES_IDX, 5	// hidx*32.
++  |  lw NEXT_TMP0, NEXT_TAB->hmask
++  |    sll AT, NEXT_RES_IDX, 3	// hidx*8.
++  |    subu AT, NEXT_TMP2, AT	// hidx*24; presumably hidx*sizeof(Node) -- confirm.
++  |   addu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, AT	// Address of node hidx.
++  |6:
++  |  sltu AT, NEXT_TMP0, NEXT_RES_IDX	// AT = (hmask < hidx).
++  |  bnez AT, >8	// Past the last node: end of iteration.
++  |.  nop
++  |  lw NEXT_TMP2, NODE:NEXT_RES_VK->val.it
++  |  bne NEXT_TMP2, NEXT_NIL, >9	// Non-nil value: return this node.
++  |.  addiu NEXT_RES_IDX, NEXT_RES_IDX, 1	// Delay slot: hidx++ on both paths.
++  |  // Skip holes in hash part.
++  |  b <6
++  |.  addiu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, sizeof(Node)
++  |
++  |8:  // End of iteration. Set the key to nil (not the value).
++  |  sw NEXT_NIL, NEXT_RES_KEY_IT
++  |  move NEXT_RES_VK, NEXT_RES_PTR
++  |9:
++  |  jr ra
++  |.  addu NEXT_RES_IDX, NEXT_RES_IDX, NEXT_ASIZE	// Delay slot: convert hash index back to combined idx.
++  |.endif
++  |
++  |
+   |//-----------------------------------------------------------------------
+   |//-- FFI helper functions -----------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -2868,7 +2951,6 @@ static void build_subroutines(BuildCtx *
+   |  move TMP2, sp
+   |  subu sp, sp, TMP1
+   |  sw ra, -4(TMP2)
+-  |   sll CARG2, CARG2, 2
+   |  sw r16, -8(TMP2)
+   |  sw CCSTATE, -12(TMP2)
+   |  move r16, TMP2
+@@ -4524,10 +4606,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     break;
+ 
+   case BC_ITERN:
+-    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
+-    |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |.if JIT and ENDIAN_LE
++    |  hotloop
+     |.endif
++    |->vm_IITERN:
++    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
+     |  addu RA, BASE, RA
+     |  lw TAB:RB, -16+LO(RA)
+     |  lw RC, -8+LO(RA)			// Get index from control var.
+@@ -4606,9 +4689,9 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  addiu CARG2, CARG2, -FF_next_N
+     |  or CARG2, CARG2, CARG3
+     |  bnez CARG2, >5
+-    |.  lui TMP1, 0xfffe
++    |.  lui TMP1, (LJ_KEYINDEX >> 16)
+     |  addu PC, TMP0, TMP2
+-    |  ori TMP1, TMP1, 0x7fff
++    |  ori TMP1, TMP1, (LJ_KEYINDEX & 0xffff)
+     |  sw r0, -8+LO(RA)			// Initialize control var.
+     |  sw TMP1, -8+HI(RA)
+     |1:
+@@ -4617,9 +4700,28 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  li TMP3, BC_JMP
+     |   li TMP1, BC_ITERC
+     |  sb TMP3, -4+OFS_OP(PC)
+-    |    addu PC, TMP0, TMP2
++    |  addu PC, TMP0, TMP2
++    |.if JIT
++    |  lb TMP0, OFS_OP(PC)
++    |  li AT, BC_ITERN
++    |  bne TMP0, AT, >6
++    |.  lhu TMP2, OFS_RD(PC)
++    |.endif
+     |  b <1
+     |.  sb TMP1, OFS_OP(PC)
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  lw TMP0, DISPATCH_J(trace)(DISPATCH)
++    |   sll TMP2, TMP2, 2
++    |  addu TMP0, TMP0, TMP2
++    |  lw TRACE:TMP2, 0(TMP0)
++    |  lw TMP0, TRACE:TMP2->startins
++    |   li AT, -256
++    |  and TMP0, TMP0, AT
++    |  or TMP0, TMP0, TMP1
++    |  b <1
++    |.  sw TMP0, 0(PC)
++    |.endif
+     break;
+ 
+   case BC_VARG:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_mips64.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_mips64.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_mips64.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for MIPS64 CPUs.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |//
+ |// Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+ |// Sponsored by Cisco Systems, Inc.
+@@ -193,7 +193,7 @@
+ |//-----------------------------------------------------------------------
+ |
+ |// Trap for not-yet-implemented parts.
+-|.macro NYI; .long 0xf0f0f0f0; .endmacro
++|.macro NYI; .long 0xec1cf0f0; .endmacro
+ |
+ |// Macros to mark delay slots.
+ |.macro ., a; a; .endmacro
+@@ -556,6 +556,10 @@ static void build_subroutines(BuildCtx *
+   |  b ->vm_returnc
+   |.  li RD, 16				// 2 results: false + error message.
+   |
++  |->vm_unwind_stub:			// Jump to exit stub from unwinder.
++  |  jr CARG1
++  |.  move ra, CARG2
++  |
+   |//-----------------------------------------------------------------------
+   |//-- Grow stack for calls -----------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -724,11 +728,11 @@ static void build_subroutines(BuildCtx *
+   |     ld PC, -24(RB)			// Restore PC from [cont|PC].
+   |    cleartp LFUNC:TMP1
+   |   daddu TMP2, RA, RD
+-  |    ld TMP1, LFUNC:TMP1->pc
+   |.if FFI
+   |  bnez AT, >1
+   |.endif
+   |.  sd TISNIL, -8(TMP2)		// Ensure one valid arg.
++  |    ld TMP1, LFUNC:TMP1->pc
+   |  // BASE = base, RA = resultptr, RB = meta base
+   |  jr TMP0				// Jump to continuation.
+   |.  ld KBASE, PC2PROTO(k)(TMP1)
+@@ -1318,27 +1322,24 @@ static void build_subroutines(BuildCtx *
+   |//-- Base library: iterators -------------------------------------------
+   |
+   |.ffunc_1 next
+-  |  checktp CARG2, CARG1, -LJ_TTAB, ->fff_fallback
++  |  checktp CARG1, -LJ_TTAB, ->fff_fallback
+   |  daddu TMP2, BASE, NARGS8:RC
+   |  sd TISNIL, 0(TMP2)			// Set missing 2nd arg to nil.
+-  |  ld PC, FRAME_PC(BASE)
+   |  load_got lj_tab_next
+-  |   sd BASE, L->base			// Add frame since C call can throw.
+-  |   sd BASE, L->top			// Dummy frame length is ok.
+-  |  daddiu CARG3, BASE, 8
+-  |   sd PC, SAVE_PC
+-  |  call_intern lj_tab_next		// (lua_State *L, GCtab *t, TValue *key)
+-  |.  move CARG1, L
+-  |  // Returns 0 at end of traversal.
++  |  ld PC, FRAME_PC(BASE)
++  |  daddiu CARG2, BASE, 8
++  |  call_intern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |.  daddiu CARG3, BASE, -16
++  |  // Returns 1=found, 0=end, -1=error.
++  |   daddiu RA, BASE, -16
++  |  bgtz CRET1, ->fff_res		// Found key/value.
++  |.  li RD, (2+1)*8
+   |  beqz CRET1, ->fff_restv		// End of traversal: return nil.
+   |.  move CARG1, TISNIL
+-  |  ld TMP0, 8(BASE)
+-  |    daddiu RA, BASE, -16
+-  |  ld TMP2, 16(BASE)
+-  |  sd TMP0, 0(RA)
+-  |  sd TMP2, 8(RA)
+-  |  b ->fff_res
+-  |.  li RD, (2+1)*8
++  |   ld CFUNC:RB, FRAME_FUNC(BASE)
++  |   cleartp CFUNC:RB
++  |  b ->fff_fallback			// Invalid key.
++  |.  li RC, 2*8
+   |
+   |.ffunc_1 pairs
+   |  checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback
+@@ -2037,7 +2038,7 @@ static void build_subroutines(BuildCtx *
+   |  ld TMP0, SBUF:CARG1->b
+   |   sd L, SBUF:CARG1->L
+   |   sd BASE, L->base
+-  |  sd TMP0, SBUF:CARG1->p
++  |  sd TMP0, SBUF:CARG1->w
+   |  call_intern extern lj_buf_putstr_ .. name
+   |.  sd PC, SAVE_PC
+   |  load_got lj_buf_tostr
+@@ -2570,7 +2571,8 @@ static void build_subroutines(BuildCtx *
+   |   daddiu DISPATCH, JGL, -GG_DISP2G-32768
+   |  sd BASE, L->base
+   |1:
+-  |  bltz CRET1, >9			// Check for error from exit.
++  |  sltiu TMP0, CRET1, -LUA_ERRERR	// Check for error from exit.
++  |  beqz TMP0, >9
+   |.  ld LFUNC:RB, FRAME_FUNC(BASE)
+   |    .FPU lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
+   |  dsll MULTRES, CRET1, 3
+@@ -2585,14 +2587,16 @@ static void build_subroutines(BuildCtx *
+   |    .FPU cvt.d.s TOBIT, TOBIT
+   |  // Modified copy of ins_next which handles function header dispatch, too.
+   |  lw INS, 0(PC)
+-  |   daddiu PC, PC, 4
++  |  addiu CRET1, CRET1, 17		// Static dispatch?
+   |    // Assumes TISNIL == ~LJ_VMST_INTERP == -1
+   |    sw TISNIL, DISPATCH_GL(vmstate)(DISPATCH)
++  |   decode_RD8a RD, INS
++  |  beqz CRET1, >5
++  |.  daddiu PC, PC, 4
+   |  decode_OP8a TMP1, INS
+   |  decode_OP8b TMP1
+-  |    sltiu TMP2, TMP1, BC_FUNCF*8
+   |  daddu TMP0, DISPATCH, TMP1
+-  |   decode_RD8a RD, INS
++  |    sltiu TMP2, TMP1, BC_FUNCF*8
+   |  ld AT, 0(TMP0)
+   |   decode_RA8a RA, INS
+   |    beqz TMP2, >2
+@@ -2621,9 +2625,26 @@ static void build_subroutines(BuildCtx *
+   |  jr AT
+   |.  daddu RA, RA, BASE
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  ld TMP0, DISPATCH_J(trace)(DISPATCH)
++  |  decode_RD8b RD
++  |  daddu TMP0, TMP0, RD
++  |  ld TRACE:TMP2, 0(TMP0)
++  |  lw INS, TRACE:TMP2->startins
++  |  decode_OP8a TMP1, INS
++  |  decode_OP8b TMP1
++  |  daddu TMP0, DISPATCH, TMP1
++  |   decode_RD8a RD, INS
++  |  ld AT, GG_DISP2STATIC(TMP0)
++  |   decode_RA8a RA, INS
++  |   decode_RD8b RD
++  |  jr AT
++  |.  decode_RA8b RA
++  |
+   |9:  // Rethrow error from the right C frame.
+-  |  load_got lj_err_run
+-  |  call_intern lj_err_run		// (lua_State *L)
++  |  load_got lj_err_trace
++  |  sub CARG2, r0, CRET1
++  |  call_intern lj_err_trace		// (lua_State *L, int errcode)
+   |.  move CARG1, L
+   |.endif
+   |
+@@ -2902,6 +2923,70 @@ static void build_subroutines(BuildCtx *
+   |//-- Miscellaneous functions --------------------------------------------
+   |//-----------------------------------------------------------------------
+   |
++  |.define NEXT_TAB,		TAB:CARG1	// Arg 1: table to traverse.
++  |.define NEXT_IDX,		CARG2	// Arg 2: combined array+hash index.
++  |.define NEXT_ASIZE,		CARG3	// Loaded from NEXT_TAB->asize.
++  |.define NEXT_NIL,		CARG4	// Holds LJ_TNIL for the nil compares.
++  |.define NEXT_TMP0,		r12
++  |.define NEXT_TMP1,		r13
++  |.define NEXT_TMP2,		r14
++  |.define NEXT_RES_VK,		CRET1	// Result: pointer to a value/key pair.
++  |.define NEXT_RES_IDX,	CRET2	// Result: next index.
++  |.define NEXT_RES_PTR,	sp	// Array-part results are built in place at sp.
++  |.define NEXT_RES_VAL,	0(sp)	// Value slot (64 bit).
++  |.define NEXT_RES_KEY,	8(sp)	// Key slot (64 bit).
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in CRET2. CRET1 = sp (array part, end) or the Node (hash part).
++  |->vm_next:
++  |.if JIT and ENDIAN_LE	// Body is only assembled for little-endian JIT builds.
++  |   lw NEXT_ASIZE, NEXT_TAB->asize
++  |  ld NEXT_TMP0, NEXT_TAB->array
++  |    li NEXT_NIL, LJ_TNIL
++  |1:  // Traverse array part.
++  |   sltu AT, NEXT_IDX, NEXT_ASIZE	// AT = (idx < asize).
++  |    sll NEXT_TMP1, NEXT_IDX, 3	// idx*8: byte offset of the slot.
++  |   beqz AT, >5	// idx >= asize: continue in the hash part.
++  |.   daddu NEXT_TMP1, NEXT_TMP0, NEXT_TMP1	// Delay slot: address of array slot idx.
++  |   li AT, LJ_TISNUM
++  |  ld NEXT_TMP2, 0(NEXT_TMP1)	// Whole 64 bit slot; NEXT_TMP1 is free for reuse afterwards.
++  |   dsll AT, AT, 47
++  |   or NEXT_TMP1, NEXT_IDX, AT	// Key = idx | (LJ_TISNUM << 47), using idx before the increment.
++  |  beq NEXT_TMP2, NEXT_NIL, <1	// Nil slot: try the next one.
++  |.  addiu NEXT_IDX, NEXT_IDX, 1	// Delay slot: idx++ on both paths.
++  |  sd NEXT_TMP2, NEXT_RES_VAL
++  |   sd NEXT_TMP1, NEXT_RES_KEY
++  |  move NEXT_RES_VK, NEXT_RES_PTR	// Return pointer to the pair stored at sp.
++  |  jr ra
++  |.  move NEXT_RES_IDX, NEXT_IDX	// Delay slot: return the incremented idx.
++  |
++  |5:  // Traverse hash part.
++  |  subu NEXT_RES_IDX, NEXT_IDX, NEXT_ASIZE	// Hash index = idx - asize.
++  |   ld NODE:NEXT_RES_VK, NEXT_TAB->node
++  |    sll NEXT_TMP2, NEXT_RES_IDX, 5	// hidx*32.
++  |  lw NEXT_TMP0, NEXT_TAB->hmask
++  |    sll AT, NEXT_RES_IDX, 3	// hidx*8.
++  |    subu AT, NEXT_TMP2, AT	// hidx*24; presumably hidx*sizeof(Node) -- confirm.
++  |   daddu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, AT	// Address of node hidx.
++  |6:
++  |  sltu AT, NEXT_TMP0, NEXT_RES_IDX	// AT = (hmask < hidx).
++  |  bnez AT, >8	// Past the last node: end of iteration.
++  |.  nop
++  |  ld NEXT_TMP2, NODE:NEXT_RES_VK->val
++  |  bne NEXT_TMP2, NEXT_NIL, >9	// Non-nil value: return this node.
++  |.  addiu NEXT_RES_IDX, NEXT_RES_IDX, 1	// Delay slot: hidx++ on both paths.
++  |  // Skip holes in hash part.
++  |  b <6
++  |.  daddiu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, sizeof(Node)
++  |
++  |8:  // End of iteration. Set the key to nil (not the value).
++  |  sd NEXT_NIL, NEXT_RES_KEY
++  |  move NEXT_RES_VK, NEXT_RES_PTR
++  |9:
++  |  jr ra
++  |.  addu NEXT_RES_IDX, NEXT_RES_IDX, NEXT_ASIZE	// Delay slot: convert hash index back to combined idx.
++  |.endif
++  |
+   |//-----------------------------------------------------------------------
+   |//-- FFI helper functions -----------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -2980,7 +3065,6 @@ static void build_subroutines(BuildCtx *
+   |  move TMP2, sp
+   |  dsubu sp, sp, TMP1
+   |  sd ra, -8(TMP2)
+-  |   sll CARG2, CARG2, 3
+   |  sd r16, -16(TMP2)
+   |  sd CCSTATE, -24(TMP2)
+   |  move r16, TMP2
+@@ -4698,10 +4782,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     break;
+ 
+   case BC_ITERN:
+-    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
+-    |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |.if JIT and ENDIAN_LE
++    |  hotloop
+     |.endif
++    |->vm_IITERN:
++    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
+     |  daddu RA, BASE, RA
+     |  ld TAB:RB, -16(RA)
+     |   lw RC, -8+LO(RA)		// Get index from control var.
+@@ -4722,11 +4807,10 @@ static void build_ins(BuildCtx *ctx, BCO
+     |.  addiu RC, RC, 1
+     |   sd TMP2, 0(RA)
+     |  sd CARG1, 8(RA)
+-    |   or TMP0, RC, CARG3
+     |     lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
+     |     decode_RD4b RD
+     |     daddu RD, RD, TMP3
+-    |   sw TMP0, -8+LO(RA)		// Update control var.
++    |   sw RC, -8+LO(RA)		// Update control var.
+     |     daddu PC, PC, RD
+     |3:
+     |  ins_next
+@@ -4776,9 +4860,9 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  daddiu TMP1, TMP1, -FF_next_N
+     |  or AT, AT, TMP1
+     |  bnez AT, >5
+-    |.  lui TMP1, 0xfffe
++    |.  lui TMP1, (LJ_KEYINDEX >> 16)
+     |  daddu PC, TMP0, TMP2
+-    |  ori TMP1, TMP1, 0x7fff
++    |  ori TMP1, TMP1, (LJ_KEYINDEX & 0xffff)
+     |  dsll TMP1, TMP1, 32
+     |  sd TMP1, -8(RA)
+     |1:
+@@ -4788,8 +4872,27 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   li TMP1, BC_ITERC
+     |  sb TMP3, -4+OFS_OP(PC)
+     |   daddu PC, TMP0, TMP2
++    |.if JIT
++    |  lb TMP0, OFS_OP(PC)
++    |  li AT, BC_ITERN
++    |  bne TMP0, AT, >6
++    |.  lhu TMP2, OFS_RD(PC)
++    |.endif
+     |  b <1
+     |.  sb TMP1, OFS_OP(PC)
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  ld TMP0, DISPATCH_J(trace)(DISPATCH)
++    |   sll TMP2, TMP2, 3
++    |  daddu TMP0, TMP0, TMP2
++    |  ld TRACE:TMP2, 0(TMP0)
++    |  lw TMP0, TRACE:TMP2->startins
++    |   li AT, -256
++    |  and TMP0, TMP0, AT
++    |  or TMP0, TMP0, TMP1
++    |  b <1
++    |.  sw TMP0, 0(PC)
++    |.endif
+     break;
+ 
+   case BC_VARG:
+@@ -5293,6 +5396,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   settp LFUNC:RB, TMP0
+     |  daddu TMP0, RA, RC
+     |   sd LFUNC:RB, 0(TMP1)		// Store (tagged) copy of LFUNC.
++    |  daddiu TMP2, TMP2, -8
+     |   daddiu TMP3, RC, 16+FRAME_VARG
+     |  sltu AT, TMP0, TMP2
+     |    ld KBASE, -4+PC2PROTO(k)(PC)
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_ppc.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_ppc.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_ppc.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for PowerPC 32 bit or 32on64 bit mode.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |
+ |.arch ppc
+ |.section code_op, code_sub
+@@ -859,11 +859,11 @@ static void build_subroutines(BuildCtx *
+   |.endif
+   |     lwz PC, -16(RB)			// Restore PC from [cont|PC].
+   |   subi TMP2, RD, 8
+-  |    lwz TMP1, LFUNC:TMP1->pc
+   |   stwx TISNIL, RA, TMP2		// Ensure one valid arg.
+   |.if FFI
+   |  ble >1
+   |.endif
++  |    lwz TMP1, LFUNC:TMP1->pc
+   |    lwz KBASE, PC2PROTO(k)(TMP1)
+   |  // BASE = base, RA = resultptr, RB = meta base
+   |  mtctr TMP0
+@@ -1559,43 +1559,24 @@ static void build_subroutines(BuildCtx *
+   |
+   |//-- Base library: iterators -------------------------------------------
+   |
+-  |.ffunc next
+-  |  cmplwi NARGS8:RC, 8
+-  |   lwz CARG1, 0(BASE)
+-  |    lwz TAB:CARG2, 4(BASE)
+-  |  blt ->fff_fallback
++  |.ffunc_1 next
+   |   stwx TISNIL, BASE, NARGS8:RC	// Set missing 2nd arg to nil.
+-  |  checktab CARG1
++  |  checktab CARG3
+   |   lwz PC, FRAME_PC(BASE)
+   |  bne ->fff_fallback
+-  |   stp BASE, L->base			// Add frame since C call can throw.
+-  |  mr CARG1, L
+-  |   stp BASE, L->top			// Dummy frame length is ok.
+-  |  la CARG3, 8(BASE)
+-  |   stw PC, SAVE_PC
+-  |  bl extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
+-  |  // Returns 0 at end of traversal.
+-  |  cmplwi CRET1, 0
+-  |   li CARG3, LJ_TNIL
+-  |  beq ->fff_restv			// End of traversal: return nil.
++  |  la CARG2, 8(BASE)
++  |  la CARG3, -8(BASE)
++  |  bl extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |  // Returns 1=found, 0=end, -1=error.
++  |  cmpwi CRET1, 0
+   |   la RA, -8(BASE)
+-  |.if FPU
+-  |  lfd f0, 8(BASE)			// Copy key and value to results.
+-  |  lfd f1, 16(BASE)
+-  |  stfd f0, 0(RA)
+-  |  stfd f1, 8(RA)
+-  |.else
+-  |  lwz CARG1, 8(BASE)
+-  |  lwz CARG2, 12(BASE)
+-  |  lwz CARG3, 16(BASE)
+-  |  lwz CARG4, 20(BASE)
+-  |  stw CARG1, 0(RA)
+-  |  stw CARG2, 4(RA)
+-  |  stw CARG3, 8(RA)
+-  |  stw CARG4, 12(RA)
+-  |.endif
+   |   li RD, (2+1)*8
+-  |  b ->fff_res
++  |  bgt ->fff_res			// Found key/value.
++  |   li CARG3, LJ_TNIL
++  |  beq ->fff_restv			// End of traversal: return nil.
++  |   lwz CFUNC:RB, FRAME_FUNC(BASE)
++  |   li NARGS8:RC, 2*8
++  |  b ->fff_fallback			// Invalid key.
+   |
+   |.ffunc_1 pairs
+   |  checktab CARG3
+@@ -2516,7 +2497,7 @@ static void build_subroutines(BuildCtx *
+   |  stw L, SBUF:CARG1->L
+   |  stp BASE, L->base
+   |  stw PC, SAVE_PC
+-  |   stw TMP0, SBUF:CARG1->p
++  |   stw TMP0, SBUF:CARG1->w
+   |  bl extern lj_buf_putstr_ .. name
+   |  bl extern lj_buf_tostr
+   |  b ->fff_resstr
+@@ -3034,8 +3015,9 @@ static void build_subroutines(BuildCtx *
+   |  addi DISPATCH, JGL, -GG_DISP2G-32768
+   |  stp BASE, L->base
+   |1:
+-  |  cmpwi CARG1, 0
+-  |  blt >9				// Check for error from exit.
++  |  li TMP2, -LUA_ERRERR
++  |  cmplw CARG1, TMP2
++  |  bge >9				// Check for error from exit.
+   |  lwz LFUNC:RB, FRAME_FUNC(BASE)
+   |   slwi MULTRES, CARG1, 3
+   |    li TMP2, 0
+@@ -3060,6 +3042,8 @@ static void build_subroutines(BuildCtx *
+   |   addi PC, PC, 4
+   |    // Assumes TISNIL == ~LJ_VMST_INTERP == -1.
+   |    stw TISNIL, DISPATCH_GL(vmstate)(DISPATCH)
++  |  cmpwi CARG1, -17			// Static dispatch?
++  |  beq >5
+   |  decode_OPP TMP1, INS
+   |   decode_RA8 RA, INS
+   |  lpx TMP0, DISPATCH, TMP1
+@@ -3089,9 +3073,25 @@ static void build_subroutines(BuildCtx *
+   |   add RA, RA, BASE
+   |  bctr
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  lwz TMP1, DISPATCH_J(trace)(DISPATCH)
++  |  decode_RD4 RD, INS
++  |  lwzx TRACE:TMP1, TMP1, RD
++  |  lwz INS, TRACE:TMP1->startins
++  |  decode_OPP TMP1, INS
++  |  addi TMP1, TMP1, GG_DISP2STATIC
++  |  lpx TMP0, DISPATCH, TMP1
++  |  mtctr TMP0
++  |   decode_RB8 RB, INS
++  |   decode_RD8 RD, INS
++  |   decode_RA8 RA, INS
++  |   decode_RC8 RC, INS
++  |  bctr
++  |
+   |9:  // Rethrow error from the right C frame.
++  |  neg CARG2, CARG1
+   |  mr CARG1, L
+-  |  bl extern lj_err_run		// (lua_State *L)
++  |  bl extern lj_err_trace		// (lua_State *L, int errcode)
+   |.endif
+   |
+   |//-----------------------------------------------------------------------
+@@ -3181,6 +3181,11 @@ static void build_subroutines(BuildCtx *
+   |  blr
+   |.endif
+   |
++  |->vm_next:
++  |.if JIT
++  |  NYI  // On big-endian.
++  |.endif
++  |
+   |//-----------------------------------------------------------------------
+   |//-- FFI helper functions -----------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -3264,14 +3269,13 @@ static void build_subroutines(BuildCtx *
+   |    stw TMP0, 4(sp)
+   |   cmpwi cr1, CARG3, 0
+   |  mr TMP2, sp
+-  |   addic. CARG2, CARG2, -1
++  |   addic. CARG2, CARG2, -4
+   |  stwux sp, sp, TMP1
+   |   crnot 4*cr1+eq, 4*cr1+eq		// For vararg calls.
+   |  stw r14, -4(TMP2)
+   |  stw CCSTATE, -8(TMP2)
+   |  mr r14, TMP2
+   |  la TMP1, CCSTATE->stack
+-  |   slwi CARG2, CARG2, 2
+   |   blty >2
+   |  la TMP2, 8(sp)
+   |1:
+@@ -4100,8 +4104,8 @@ static void build_ins(BuildCtx *ctx, BCO
+     |.macro addo32., y, a, b
+     |  // Need to check overflow for (a<<32) + (b<<32).
+     |  rldicr TMP0, a, 32, 31
+-    |  rldicr TMP3, b, 32, 31
+-    |  addo. TMP0, TMP0, TMP3
++    |  rldicr TMP1, b, 32, 31
++    |  addo. TMP0, TMP0, TMP1
+     |  add y, a, b
+     |.endmacro
+     |  ins_arith addo32., fadd, __adddf3
+@@ -4114,8 +4118,8 @@ static void build_ins(BuildCtx *ctx, BCO
+     |.macro subo32., y, a, b
+     |  // Need to check overflow for (a<<32) - (b<<32).
+     |  rldicr TMP0, a, 32, 31
+-    |  rldicr TMP3, b, 32, 31
+-    |  subo. TMP0, TMP0, TMP3
++    |  rldicr TMP1, b, 32, 31
++    |  subo. TMP0, TMP0, TMP1
+     |  sub y, a, b
+     |.endmacro
+     |  ins_arith subo32., fsub, __subdf3
+@@ -5130,8 +5134,9 @@ static void build_ins(BuildCtx *ctx, BCO
+   case BC_ITERN:
+     |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
+     |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |  // NYI on big-endian
+     |.endif
++    |->vm_IITERN:
+     |  add RA, BASE, RA
+     |  lwz TAB:RB, -12(RA)
+     |  lwz RC, -4(RA)			// Get index from control var.
+@@ -5250,8 +5255,8 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  crand 4*cr0+eq, 4*cr0+eq, 4*cr7+eq
+     |    add TMP3, PC, TMP0
+     |  bne cr0, >5
+-    |  lus TMP1, 0xfffe
+-    |  ori TMP1, TMP1, 0x7fff
++    |  lus TMP1, (LJ_KEYINDEX >> 16)
++    |  ori TMP1, TMP1, (LJ_KEYINDEX & 0xffff)
+     |  stw ZERO, -4(RA)			// Initialize control var.
+     |  stw TMP1, -8(RA)
+     |    addis PC, TMP3, -(BCBIAS_J*4 >> 16)
+@@ -5262,6 +5267,7 @@ static void build_ins(BuildCtx *ctx, BCO
+     |   li TMP1, BC_ITERC
+     |  stb TMP0, -1(PC)
+     |    addis PC, TMP3, -(BCBIAS_J*4 >> 16)
++    |  // NYI on big-endian: unpatch JLOOP.
+     |   stb TMP1, 3(PC)
+     |  b <1
+     break;
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_riscv64.dasc
+===================================================================
+--- /dev/null
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_riscv64.dasc
+@@ -0,0 +1,4853 @@
++|// Low-level VM code for RISC-V 64 CPUs.
++|// Bytecode interpreter, fast functions and helper functions.
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
++|//
++|// Contributed by gns from PLCT Lab, ISCAS.
++|// Sponsored by PLCT Lab, ISCAS.
++|
++|.arch riscv64
++|.section code_op, code_sub
++|
++|.actionlist build_actionlist
++|.globals GLOB_
++|.globalnames globnames
++|.externnames extnames
++|
++|// Note: The ragged indentation of the instructions is intentional.
++|//       The starting columns indicate data dependencies.
++|
++|//-----------------------------------------------------------------------
++|
++|// Fixed register assignments for the interpreter.
++|// Don't use: x0 = 0, x1 = ra, x2 = sp, x3 = gp, x4 = tp
++|// ABI register names given in the comments below follow the RISC-V psABI.
++|
++|// The following must be C callee-save (but BASE is often refetched).
++|.define BASE,		x18	// Base of current Lua stack frame.
++|.define KBASE,		x19	// Constants of current Lua function.
++|.define PC,		x20	// Next PC.
++|.define GLREG,		x21	// Global state.
++|.define DISPATCH,	x22	// Opcode dispatch table.
++|.define LREG,		x23	// Register holding lua_State (also in SAVE_L).
++|.define MULTRES,	x24	// Size of multi-result: (nresults+1)*8.
++|
++|// Constants for type-comparisons, stores and conversions. C callee-save.
++|.define TISNIL,	x8	// s0/fp.
++|.define TISNUM,	x25	// s9.
++|.define TOBIT,		f27	// 2^52 + 2^51.
++|
++|// The following temporaries are not saved across C calls, except for RA.
++|.define RA,		x9	// Callee-save.
++|.define RB,		x14	// a4; same register as CARG5.
++|.define RC,		x15	// a5; same register as CARG6.
++|.define RD,		x16	// a6; same register as CARG7.
++|.define INS,		x17	// a7; same register as CARG8.
++|
++|.define TMP0,		x6	// t1.
++|.define TMP1,		x7	// t2.
++|.define TMP2,		x28	// t3.
++|.define TMP3,		x29	// t4.
++|.define TMP4,		x30	// t5.
++|
++|// RISC-V lp64d calling convention.
++|.define CFUNCADDR,	x5	// t0; presumably holds the C function address for calls -- confirm.
++|.define CARG1,		x10	// a0.
++|.define CARG2,		x11	// a1.
++|.define CARG3,		x12	// a2.
++|.define CARG4,		x13	// a3.
++|.define CARG5,		x14	// a4; same register as RB.
++|.define CARG6,		x15	// a5; same register as RC.
++|.define CARG7,		x16	// a6; same register as RD.
++|.define CARG8,		x17	// a7; same register as INS.
++|
++|.define CRET1,		x10	// a0; same register as CARG1.
++|.define CRET2,		x11	// a1; same register as CARG2.
++|
++|.define FARG1,		f10	// fa0.
++|.define FARG2,		f11	// fa1.
++|.define FARG3,		f12	// fa2.
++|.define FARG4,		f13	// fa3.
++|.define FARG5,		f14	// fa4.
++|.define FARG6,		f15	// fa5.
++|.define FARG7,		f16	// fa6.
++|.define FARG8,		f17	// fa7.
++|
++|.define FRET1,		f10	// fa0; same register as FARG1.
++|.define FRET2,		f11	// fa1; same register as FARG2.
++|
++|.define FTMP0,		f0	// ft0.
++|.define FTMP1,		f1	// ft1.
++|.define FTMP2,		f2	// ft2.
++|.define FTMP3,		f3	// ft3.
++|.define FTMP4,		f4	// ft4.
++|
++|// Stack layout while in interpreter. Must match with lj_frame.h.
++|// RISC-V 64 lp64d.
++|
++|.define CFRAME_SPACE,	256	// Delta for sp.
++|
++|//----- 16 byte aligned, <-- sp entering interpreter
++|.define SAVE_ERRF,	252	// 32 bit values.
++|.define SAVE_NRES,	248
++|.define SAVE_CFRAME,	240	// 64 bit values.
++|.define SAVE_L,	232
++|.define SAVE_PC,	224
++|//----- 16 byte aligned
++|// Padding		216
++|.define SAVE_GPR_,	112	// .. 112+13*8: 64 bit GPR saves.
++|.define SAVE_FPR_,	16	// .. 16+12*8: 64 bit FPR saves.
++|
++|
++|.define TMPD,		0
++|//----- 16 byte aligned
++|
++|.define TMPD_OFS,	0
++|
++|//-----------------------------------------------------------------------
++|
++|.macro saveregs
++|  addi sp, sp, -CFRAME_SPACE
++|  fsd f27, SAVE_FPR_+11*8(sp)
++|  fsd f26, SAVE_FPR_+10*8(sp)
++|  fsd f25, SAVE_FPR_+9*8(sp)
++|  fsd f24, SAVE_FPR_+8*8(sp)
++|  fsd f23, SAVE_FPR_+7*8(sp)
++|  fsd f22, SAVE_FPR_+6*8(sp)
++|  fsd f21, SAVE_FPR_+5*8(sp)
++|  fsd f20, SAVE_FPR_+4*8(sp)
++|  fsd f19, SAVE_FPR_+3*8(sp)
++|  fsd f18, SAVE_FPR_+2*8(sp)
++|  fsd f9,  SAVE_FPR_+1*8(sp)
++|  fsd f8,  SAVE_FPR_+0*8(sp)
++|  sd ra,  SAVE_GPR_+12*8(sp)
++|  sd x27, SAVE_GPR_+11*8(sp)
++|  sd x26, SAVE_GPR_+10*8(sp)
++|  sd x25, SAVE_GPR_+9*8(sp)
++|  sd x24, SAVE_GPR_+8*8(sp)
++|  sd x23, SAVE_GPR_+7*8(sp)
++|  sd x22, SAVE_GPR_+6*8(sp)
++|  sd x21, SAVE_GPR_+5*8(sp)
++|  sd x20, SAVE_GPR_+4*8(sp)
++|  sd x19, SAVE_GPR_+3*8(sp)
++|  sd x18, SAVE_GPR_+2*8(sp)
++|  sd x9,  SAVE_GPR_+1*8(sp)
++|  sd x8,  SAVE_GPR_+0*8(sp)
++|.endmacro
++|
++|.macro restoreregs_ret
++|  ld ra,  SAVE_GPR_+12*8(sp)
++|  ld x27, SAVE_GPR_+11*8(sp)
++|  ld x26, SAVE_GPR_+10*8(sp)
++|  ld x25, SAVE_GPR_+9*8(sp)
++|  ld x24, SAVE_GPR_+8*8(sp)
++|  ld x23, SAVE_GPR_+7*8(sp)
++|  ld x22, SAVE_GPR_+6*8(sp)
++|  ld x21, SAVE_GPR_+5*8(sp)
++|  ld x20, SAVE_GPR_+4*8(sp)
++|  ld x19, SAVE_GPR_+3*8(sp)
++|  ld x18, SAVE_GPR_+2*8(sp)
++|  ld x9,  SAVE_GPR_+1*8(sp)
++|  ld x8,  SAVE_GPR_+0*8(sp)
++|  fld f27, SAVE_FPR_+11*8(sp)
++|  fld f26, SAVE_FPR_+10*8(sp)
++|  fld f25, SAVE_FPR_+9*8(sp)
++|  fld f24, SAVE_FPR_+8*8(sp)
++|  fld f23, SAVE_FPR_+7*8(sp)
++|  fld f22, SAVE_FPR_+6*8(sp)
++|  fld f21, SAVE_FPR_+5*8(sp)
++|  fld f20, SAVE_FPR_+4*8(sp)
++|  fld f19, SAVE_FPR_+3*8(sp)
++|  fld f18, SAVE_FPR_+2*8(sp)
++|  fld f9,  SAVE_FPR_+1*8(sp)
++|  fld f8,  SAVE_FPR_+0*8(sp)
++|  addi sp, sp, CFRAME_SPACE
++|  ret
++|.endmacro
++|
++|//-----------------------------------------------------------------------
++|
+|// Pseudo-instruction macros. Each bx* macro is an inverted branch over "j tgt".
+|// Be careful with local label 9 around them: every expansion defines its own 9!
+|.macro bxeq, a, b, tgt  // Jump to tgt if a == b.
+|  bne a, b, >9  // Inverted test: skip the jump when a != b.
+|  j tgt
+|9:
+|.endmacro
++|
++|.macro bxne, a, b, tgt
++|  beq a, b, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxlt, a, b, tgt
++|  bge a, b, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxge, a, b, tgt
++|  blt a, b, >9
++|  j tgt
++|9:
++|.endmacro
++|
+|.macro bxgt, a, b, tgt  // Jump to tgt if a > b (signed compare).
+|  bge b, a, >9  // Operands swapped: skip the jump when b >= a.
+|  j tgt
+|9:
+|.endmacro
++|
++|.macro bxle, a, b, tgt
++|  blt b, a, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxltu, a, b, tgt
++|  bgeu a, b, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxgeu, a, b, tgt
++|  bltu a, b, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxgtu, a, b, tgt
++|  bgeu b, a, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxleu, a, b, tgt
++|  bltu b, a, >9
++|  j tgt
++|9:
++|.endmacro
++|
++|.macro bxeqz, a, tgt
++|  bxeq a, x0, tgt
++|.endmacro
++|
++|.macro bxnez, a, tgt
++|  bxne a, x0, tgt
++|.endmacro
++|
+|.macro bxlez, a, tgt  // Jump to tgt if a <= 0 (signed compare).
+|  bxge x0, a, tgt  // Written as 0 >= a.
+|.endmacro
++|
++|.macro bxgez, a, tgt
++|  bxge a, x0, tgt
++|.endmacro
++|
++|.macro bxltz, a, tgt
++|  bxlt a, x0, tgt
++|.endmacro
++|
+|.macro bxgtz, a, tgt  // Jump to tgt if a > 0 (signed compare).
+|  bxlt x0, a, tgt  // Written as 0 < a.
+|.endmacro
++|
+|.macro lxi, a, b  // a = low 20 bits of b, sign-extended.
+|  lui a, (b)&0xfffff  // Place the 20 bit value in bits 12..31.
+|  srai a, a, 12  // The arithmetic shift back down keeps the sign.
+|.endmacro
++|
+|.macro lzi, a, b  // a = low 20 bits of b, zero-extended.
+|  lui a, (b)&0xfffff  // RV64 lui sign-extends bit 31 into bits 32..63 ...
+|  srliw a, a, 12  // ... so shift only the low 32 bits to drop those copies.
+|.endmacro
++|
+|.macro addxi, a, b, c  // a = b + (low 20 bits of c, sign-extended).
+|  lui x31, (c)&0xfffff  // Clobbers x31 as scratch.
+|  srai x31, x31, 12
+|  add a, x31, b
+|.endmacro
++|
++|.macro sext.b, a, b
++|  slli a, b, 56
++|  srai a, a, 56
++|.endmacro
++|
++|.macro sext.h, a, b
++|  slli a, b, 48
++|  srai a, a, 48
++|.endmacro
++|
++|.macro zext.h, a, b
++|  slli a, b, 48
++|  srli a, a, 48
++|.endmacro
++|
++|.macro zext.w, a, b
++|  slli a, b, 32
++|  srli a, a, 32
++|.endmacro
++|
+|.macro bfextri, a, b, c, d  // a = bits d..c of b (c = highest bit), zero-extended.
+|  slli a, b, (63-c)  // Move bit c up to bit 63, discarding the bits above it.
+|  srli a, a, (d+63-c)  // Move bit d down to bit 0.
+|.endmacro
++|
++|//-----------------------------------------------------------------------
++|
++|// Type definitions. Some of these are only used for documentation.
++|.type L,		lua_State,	LREG
++|.type GL,		global_State,	GLREG
++|.type TVALUE,		TValue
++|.type GCOBJ,		GCobj
++|.type STR,		GCstr
++|.type TAB,		GCtab
++|.type LFUNC,		GCfuncL
++|.type CFUNC,		GCfuncC
++|.type PROTO,		GCproto
++|.type UPVAL,		GCupval
++|.type NODE,		Node
++|.type NARGS8,		int
++|.type TRACE,		GCtrace
++|.type SBUF,		SBuf
++|
++|//-----------------------------------------------------------------------
++|
++|// Trap for not-yet-implemented parts.
++|.macro NYI; .long 0x00100073; .endmacro
++|
++|//-----------------------------------------------------------------------
++|
++|// Access to frame relative to BASE.
++|.define FRAME_PC,	-8
++|.define FRAME_FUNC,	-16
++|
++|//-----------------------------------------------------------------------
++|
++|// Endian-specific defines. RISC-V only has little endian ABI for now.
++|.define OFS_RD,	2
++|.define OFS_RA,	1
++|.define OFS_OP,	0
++|
++|// Instruction decode.
++|.macro decode_OP1, dst, ins; andi dst, ins, 0xff; .endmacro
++|.macro decode_BC4b, dst; slliw dst, dst, 2; .endmacro
++|.macro decode_BC8b, dst; slliw dst, dst, 3; .endmacro
++|.macro decode_RX8b, dst; andi dst, dst, 0x7f8; .endmacro
++|
++|.macro decode_OP8a, dst, ins; decode_OP1 dst, ins; .endmacro
++|.macro decode_OP8b, dst; decode_BC8b dst; .endmacro
++|.macro decode_RA8a, dst, ins; srliw dst, ins, 5; .endmacro
++|.macro decode_RA8b, dst; decode_RX8b dst; .endmacro
++|.macro decode_RB8a, dst, ins; srliw dst, ins, 21; .endmacro
++|.macro decode_RB8b, dst; decode_RX8b dst; .endmacro
++|.macro decode_RC8a, dst, ins; srliw dst, ins, 13; .endmacro
++|.macro decode_RC8b, dst; decode_RX8b dst; .endmacro
++|.macro decode_RD8a, dst, ins; srliw dst, ins, 16; .endmacro
++|.macro decode_RD4b, dst; decode_BC4b dst; .endmacro
++|.macro decode_RD8b, dst; decode_BC8b dst; .endmacro
++|.macro decode_RDtoRC8, dst, src; andi dst, src, 0x7f8; .endmacro
++|
++|.macro decode_OP8, dst, ins; decode_OP1 dst, ins; decode_BC8b dst; .endmacro
++|.macro decode_RA8, dst, ins; decode_RA8a dst, ins; decode_RA8b dst; .endmacro
++|.macro decode_RB8, dst, ins; decode_RB8a dst, ins; decode_RB8b dst; .endmacro
++|.macro decode_RC8, dst, ins; decode_RC8a dst, ins; decode_RC8b dst; .endmacro
++|.macro decode_RD8, dst, ins; decode_RD8a dst, ins; decode_RD8b dst; .endmacro
++|
++|// Instruction fetch.
++|.macro ins_NEXT1
++|  lw INS, 0(PC)
++|   addi PC, PC, 4
++|.endmacro
++|// Instruction decode+dispatch.
++|.macro ins_NEXT2
++|  decode_OP8 TMP1, INS
++|  add TMP0, DISPATCH, TMP1
++|   decode_RD8a RD, INS
++|  ld TMP4, 0(TMP0)
++|   decode_RA8a RA, INS
++|   decode_RD8b RD
++|   decode_RA8b RA
++|  jr TMP4
++|.endmacro
++|.macro ins_NEXT
++|  ins_NEXT1
++|  ins_NEXT2
++|.endmacro
++|
++|// Instruction footer.
++|.if 1
++|  // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
++|  .define ins_next, ins_NEXT
++|  .define ins_next_, ins_NEXT
++|  .define ins_next1, ins_NEXT1
++|  .define ins_next2, ins_NEXT2
++|.else
++|  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
++|  // Affects only certain kinds of benchmarks (and only with -j off).
++|  .macro ins_next
++|    j ->ins_next
++|  .endmacro
++|  .macro ins_next1
++|  .endmacro
++|  .macro ins_next2
++|    j ->ins_next
++|  .endmacro
++|  .macro ins_next_
++|  ->ins_next:
++|    ins_NEXT
++|  .endmacro
++|.endif
++|
++|// Call decode and dispatch.
++|.macro ins_callt
++|  // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC
++|  ld PC, LFUNC:RB->pc
++|  lw INS, 0(PC)
++|   addi PC, PC, 4
++|  decode_OP8 TMP1, INS
++|   decode_RA8 RA, INS
++|  add TMP0, DISPATCH, TMP1
++|  ld TMP0, 0(TMP0)
++|   add RA, RA, BASE
++|  jr TMP0
++|.endmacro
++|
++|.macro ins_call
++|  // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, PC = caller PC
++|  sd PC, FRAME_PC(BASE)
++|  ins_callt
++|.endmacro
++|
++|//-----------------------------------------------------------------------
++|
+|.macro branch_RD  // PC += RD/2 - bias. Clobbers TMP0 and TMP4.
+|  srliw TMP0, RD, 1  // Assumes RD = D*8 as left by ins_NEXT2, giving D*4 here.
+|  lui TMP4, (-(BCBIAS_J*4 >> 12)) & 0xfffff  // TMP4 = -((BCBIAS_J*4 >> 12) << 12).
+|  addw TMP0, TMP0, TMP4
+|  add PC, PC, TMP0
+|.endmacro
++|
++|// Assumes J is relative to GL. Some J members might be out of range though.
++#define GL_J(field)	(GG_G2J + (int)offsetof(jit_State, field))
++|
++#define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
++|
+|.macro call_intern, curfunc, func  // PC-relative call of func. Clobbers CFUNCADDR.
+|->curfunc .. _pcrel_ .. func:  // Global label on the auipc; %pcrel_lo below names it with an lj_ prefix.
+|  auipc CFUNCADDR, extern %pcrel_hi(func)
+|  jalr CFUNCADDR, extern %pcrel_lo(lj_ .. curfunc .. _pcrel_ .. func)
+|.endmacro
++|.macro call_extern, func
++|  call extern func
++|  empty
++|.endmacro
++|
++|// Set current VM state. Uses TMP0.
++|.macro li_vmstate, st; li TMP0, ~LJ_VMST_..st; .endmacro
++|.macro st_vmstate; sw TMP0, GL->vmstate; .endmacro
++|
+|.macro hotcheck, delta, target  // Count down a counter picked by PC; go to target below 0.
+|  srli TMP1, PC, 1
+|  andi TMP1, TMP1, 126  // Even byte offset 0..126, taken from PC bits 2..7.
+|  add TMP1, TMP1, DISPATCH
+|  lhu TMP2, GG_DISP2HOT(TMP1)  // 16 bit counter, loaded zero-extended.
+|  addiw TMP2, TMP2, -delta
+|  sh TMP2, GG_DISP2HOT(TMP1)
+|  bxltz TMP2, target  // Counter dropped below zero. Clobbers TMP1 and TMP2.
+|.endmacro
++|
++|.macro hotloop
++|  hotcheck HOTCOUNT_LOOP, ->vm_hotloop
++|.endmacro
++|
++|.macro hotcall
++|  hotcheck HOTCOUNT_CALL, ->vm_hotcall
++|.endmacro
++|
++|// Move table write barrier back. Overwrites mark and tmp.
++|.macro barrierback, tab, mark, tmp, target
++|  ld tmp, GL->gc.grayagain
++|  andi mark, mark, ~LJ_GC_BLACK & 255		// black2gray(tab)
++|  sd tab, GL->gc.grayagain
++|  sb mark, tab->marked
++|  sd tmp, tab->gclist
++|  j target
++|.endmacro
++|
++|// Clear type tag. Isolate lowest 64-17=47 bits of reg.
++|.macro cleartp, reg; slli reg, reg, 17; srli reg, reg, 17; .endmacro
++|.macro cleartp, dst, reg; slli dst, reg, 17; srli dst, dst, 17; .endmacro
++|
++|// Set type tag: Merge 17 type bits into bits [47, 63] of dst.
++|.macro settp_a, dst; cleartp dst; .endmacro
++|.macro settp_a, dst, src; cleartp dst, src; .endmacro
+|.macro settp_b, dst, tp;
+|  slli x31, tp, 47  // Move the tag up to bits 47..63. Clobbers x31.
+|  or dst, dst, x31  // Only yields tp if bits 47..63 of dst are already clear.
+|.endmacro
+|.macro settp_b, dst, src, tp;
+|  slli x31, tp, 47  // Move the tag up to bits 47..63. Clobbers x31.
+|  or dst, src, x31  // Only yields tp if bits 47..63 of src are already clear.
+|.endmacro
++|.macro settp, dst, tp; settp_a dst; settp_b dst, tp; .endmacro
++|.macro settp, dst, src, tp; settp_a dst, src; settp_b dst, dst, tp; .endmacro
++|
++|// Extract (negative) type tag.
++|.macro gettp, dst, src; srai dst, src, 47; .endmacro
++|
++|// Macros to check the TValue type and extract the GCobj. Branch on failure.
+|.macro checktp, reg, tp, target  // tp is the negated tag, e.g. -LJ_TSTR. Clobbers TMP4.
+|  gettp TMP4, reg
+|  addi TMP4, TMP4, tp  // Zero iff the tag of reg matches.
+|  cleartp reg  // reg is reduced to its low 47 bits even on a mismatch.
+|  bxnez TMP4, target
+|.endmacro
++|.macro checktp, dst, reg, tp, target
++|  gettp TMP4, reg
++|  addi TMP4, TMP4, tp
++|  cleartp dst, reg
++|  bxnez TMP4, target
++|.endmacro
++|.macro checkstr, reg, target; checktp reg, -LJ_TSTR, target; .endmacro
++|.macro checktab, reg, target; checktp reg, -LJ_TTAB, target; .endmacro
++|.macro checkfunc, reg, target; checktp reg, -LJ_TFUNC, target; .endmacro
++|.macro checkint, reg, target
++|  gettp TMP4, reg
++|  bxne TMP4, TISNUM, target
++|.endmacro
+|.macro checknum, reg, target  // Go to target unless tag < LJ_TISNUM. Clobbers TMP4.
+|  gettp TMP4, reg
+|  sltiu TMP4, TMP4, LJ_TISNUM  // TMP4 = 1 if the tag is below LJ_TISNUM (unsigned).
+|  bxeqz TMP4, target
+|.endmacro
++|
+|.macro mov_false, reg  // reg = ~(1 << 47).
+|  li reg, 0x001
+|  slli reg, reg, 47
+|  not reg, reg  // All bits set except bit 47.
+|.endmacro
+|.macro mov_true, reg  // reg = ~(1 << 48).
+|  li reg, 0x001
+|  slli reg, reg, 48
+|  not reg, reg  // All bits set except bit 48.
+|.endmacro
++|
++|//-----------------------------------------------------------------------
++
++/* Generate subroutines used by opcodes and other parts of the VM. */
++/* The .code_sub section should be last to help static branch prediction. */
++static void build_subroutines(BuildCtx *ctx)
++{
++  |.code_sub
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Return handling ----------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_returnp:
++  |  // See vm_return. Also: TMP2 = previous base.
++  |  andi TMP0, PC, FRAME_P
++  |
++  |  // Return from pcall or xpcall fast func.
++  |  mov_true TMP1
++  |  bxeqz TMP0, ->cont_dispatch
++  |  ld PC, FRAME_PC(TMP2)		// Fetch PC of previous frame.
++  |  mv BASE, TMP2			// Restore caller base.
++  |  // Prepending may overwrite the pcall frame, so do it at the end.
++  |  sd TMP1, -8(RA)			// Prepend true to results.
++  |  addi RA, RA, -8
++  |
++  |->vm_returnc:
++  |  addiw RD, RD, 8			// RD = (nresults+1)*8.
++  |  andi TMP0, PC, FRAME_TYPE
++  |  li CRET1, LUA_YIELD
++  |  bxeqz RD, ->vm_unwind_c_eh
++  |  mv MULTRES, RD
++  |  bxeqz TMP0, ->BC_RET_Z		// Handle regular return to Lua.
++  |
++  |->vm_return:
++  |  // BASE = base, RA = resultptr, RD/MULTRES = (nresults+1)*8, PC = return
++  |  // TMP0 = PC & FRAME_TYPE
++  |  andi TMP2, PC, ~FRAME_TYPEP
++  |  xori TMP0, TMP0, FRAME_C
++  |  sub TMP2, BASE, TMP2		// TMP2 = previous base.
++  |  bxnez TMP0, ->vm_returnp
++  |
++  |  addiw TMP1, RD, -8
++  |  sd TMP2, L->base
++  |  li_vmstate C
++  |  lw TMP2, SAVE_NRES(sp)
++  |  addi BASE, BASE, -16
++  |  st_vmstate
++  |  slliw TMP2, TMP2, 3
++  |  beqz TMP1, >2
++  |1:
++  |  addiw TMP1, TMP1, -8
++  |  ld CRET1, 0(RA)
++  |  addi RA, RA, 8
++  |  sd CRET1, 0(BASE)
++  |  addi BASE, BASE, 8
++  |  bnez TMP1, <1
++  |
++  |2:
++  |  bne TMP2, RD, >6
++  |3:
++  |  sd BASE, L->top			// Store new top.
++  |
++  |->vm_leave_cp:
++  |  ld TMP0, SAVE_CFRAME(sp)		// Restore previous C frame.
++  |  mv CRET1, x0			// Ok return status for vm_pcall.
++  |  sd TMP0, L->cframe
++  |
++  |->vm_leave_unw:
++  |  restoreregs_ret
++  |
++  |6:
++  |  ld TMP1, L->maxstack
++  |  blt TMP2, RD, >7
++  |  // More results wanted. Check stack size and fill up results with nil.
++  |  bge BASE, TMP1, >9
++  |  sd TISNIL, 0(BASE)
++  |  addiw RD, RD, 8
++  |  addi BASE, BASE, 8
++  |  j <2
++  |
++  |7:  // Less results wanted.
++  |  subw TMP0, RD, TMP2
++  |  sub TMP0, BASE, TMP0		// Either keep top or shrink it.
++  |  beqz TMP2, >8
++  |  mv BASE, TMP0 	// LUA_MULTRET+1 case
++  |8:
++  |  j <3
++  |
++  |9:  // Corner case: need to grow stack for filling up results.
++  |  // This can happen if:
++  |  // - A C function grows the stack (a lot).
++  |  // - The GC shrinks the stack in between.
++  |  // - A return back from a lua_call() with (high) nresults adjustment.
++  |
++  |  sd BASE, L->top			// Save current top held in BASE (yes).
++  |   mv MULTRES, RD
++  |  srliw CARG2, TMP2, 3
++  |  mv CARG1, L
++  |  call_intern vm_leave_unw, lj_state_growstack		// (lua_State *L, int n)
++  |    lw TMP2, SAVE_NRES(sp)
++  |  ld BASE, L->top			// Need the (realloced) L->top in BASE.
++  |   mv RD, MULTRES
++  |   slliw TMP2, TMP2, 3
++  |  j <2
++  |
++  |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
++  |  // (void *cframe, int errcode)
++  |  mv sp, CARG1
++  |  mv CRET1, CARG2
++  |->vm_unwind_c_eh:			// Landing pad for external unwinder.
++  |  ld L, SAVE_L(sp)
++  |   li TMP0, ~LJ_VMST_C
++  |  ld GL, L->glref
++  |  sw TMP0, GL->vmstate
++  |  j ->vm_leave_unw
++  |
++  |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
++  |  // (void *cframe)
++  |  andi sp, CARG1, CFRAME_RAWMASK
++  |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
++  |  ld L, SAVE_L(sp)
++  |  lui TMP3, 0x43380		// TOBIT = Hiword of 2^52 + 2^51 (double).
++  |  li TISNIL, LJ_TNIL
++  |  li TISNUM, LJ_TISNUM
++  |  ld BASE, L->base
++  |  ld GL, L->glref			// Setup pointer to global state.
++  |  slli TMP3, TMP3, 32
++  |  mov_false TMP1
++  |    li_vmstate INTERP
++  |  ld PC, FRAME_PC(BASE)		// Fetch PC of previous frame.
++  |    fmv.d.x TOBIT, TMP3
++  |  addi RA, BASE, -8		// Results start at BASE-8.
++  |  addxi DISPATCH, GL, GG_G2DISP
++  |  sd TMP1, 0(RA)			// Prepend false to error message.
++  |    st_vmstate
++  |  li RD, 16			// 2 results: false + error message.
++  |  j ->vm_returnc
++  |
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Grow stack for calls -----------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_growstack_c:			// Grow stack for C function.
++  |  li CARG2, LUA_MINSTACK
++  |  j >2
++  |
++  |->vm_growstack_l:			// Grow stack for Lua function.
++  |  // BASE = new base, RA = BASE+framesize*8, RC = nargs*8, PC = first PC
++  |  add RC, BASE, RC
++  |   sub RA, RA, BASE
++  |  sd BASE, L->base
++  |   addi PC, PC, 4			// Must point after first instruction.
++  |  sd RC, L->top
++  |   srliw CARG2, RA, 3
++  |2:
++  |  // L->base = new base, L->top = top
++  |  sd PC, SAVE_PC(sp)
++  |  mv CARG1, L
++  |  call_intern vm_growstack_l, lj_state_growstack	// (lua_State *L, int n)
++  |  ld BASE, L->base
++  |  ld RC, L->top
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)
++  |  sub RC, RC, BASE
++  |  cleartp LFUNC:RB
++  |  // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC
++  |  ins_callt				// Just retry the call.
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Entry points into the assembler VM ---------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_resume:				// Setup C frame and resume thread.
++  |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
++  |  saveregs
++  |  mv L, CARG1
++  |    ld GL, L->glref		// Setup pointer to global state.
++  |  mv BASE, CARG2
++  |    lbu TMP1, L->status
++  |   sd L, SAVE_L(sp)
++  |  li PC, FRAME_CP
++  |  addi TMP0, sp, CFRAME_RESUME
++  |    addxi DISPATCH, GL, GG_G2DISP
++  |   sw x0, SAVE_NRES(sp)
++  |   sw x0, SAVE_ERRF(sp)
++  |   sd CARG1, SAVE_PC(sp)			// Any value outside of bytecode is ok.
++  |   sd x0, SAVE_CFRAME(sp)
++  |   sd TMP0, L->cframe
++  |    beqz TMP1, >3
++  |
++  |  // Resume after yield (like a return).
++  |  sd L, GL->cur_L
++  |  mv RA, BASE
++  |   ld BASE, L->base
++  |   ld TMP1, L->top
++  |  ld PC, FRAME_PC(BASE)
++  |     lui TMP3, 0x43380		// TOBIT = Hiword of 2^52 + 2^51 (double).
++  |   sub RD, TMP1, BASE
++  |     slli TMP3, TMP3, 32
++  |    sb x0, L->status
++  |     fmv.d.x TOBIT, TMP3
++  |    li_vmstate INTERP
++  |   addi RD, RD, 8
++  |    st_vmstate
++  |   mv MULTRES, RD
++  |  andi TMP0, PC, FRAME_TYPE
++  |   li TISNIL, LJ_TNIL
++  |   li TISNUM, LJ_TISNUM
++  |  bxeqz TMP0, ->BC_RET_Z
++  |  j ->vm_return
++  |
++  |->vm_pcall:				// Setup protected C frame and enter VM.
++  |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
++  |  saveregs
++  |  sw CARG4, SAVE_ERRF(sp)
++  |  li PC, FRAME_CP
++  |  j >1
++  |
++  |->vm_call:				// Setup C frame and enter VM.
++  |  // (lua_State *L, TValue *base, int nres1)
++  |  saveregs
++  |  li PC, FRAME_C
++  |
++  |1:  // Entry point for vm_pcall above (PC = ftype).
++  |  ld TMP1, L:CARG1->cframe
++  |    mv L, CARG1
++  |   sw CARG3, SAVE_NRES(sp)
++  |    ld GL, L->glref		// Setup pointer to global state.
++  |   sd CARG1, SAVE_L(sp)
++  |     mv BASE, CARG2
++  |    addxi DISPATCH, GL, GG_G2DISP
++  |   sd CARG1, SAVE_PC(sp)		// Any value outside of bytecode is ok.
++  |  sd TMP1, SAVE_CFRAME(sp)
++  |  sd sp, L->cframe			// Add our C frame to cframe chain.
++  |
++  |3:  // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype).
++  |  sd L, GL->cur_L
++  |  ld TMP2, L->base			// TMP2 = old base (used in vmeta_call).
++  |     lui TMP3, 0x43380		// TOBIT = Hiword of 2^52 + 2^51 (double).
++  |   ld TMP1, L->top
++  |     slli TMP3, TMP3, 32
++  |  add PC, PC, BASE
++  |   sub NARGS8:RC, TMP1, BASE
++  |     li TISNUM, LJ_TISNUM
++  |  sub PC, PC, TMP2			// PC = frame delta + frame type
++  |     fmv.d.x TOBIT, TMP3
++  |    li_vmstate INTERP
++  |     li TISNIL, LJ_TNIL
++  |    st_vmstate
++  |
++  |->vm_call_dispatch:
++  |  // TMP2 = old base, BASE = new base, RC = nargs*8, PC = caller PC
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)
++  |  checkfunc LFUNC:RB, ->vmeta_call
++  |
++  |->vm_call_dispatch_f:
++  |  ins_call
++  |  // BASE = new base, RB = func, RC = nargs*8, PC = caller PC
++  |
++  |->vm_cpcall:				// Setup protected C frame, call C.
++  |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
++  |  saveregs
++  |  mv L, CARG1
++  |   ld TMP0, L:CARG1->stack
++  |  sd CARG1, SAVE_L(sp)
++  |   ld TMP1, L->top
++  |     ld GL, L->glref		// Setup pointer to global state.
++  |  sd CARG1, SAVE_PC(sp)		// Any value outside of bytecode is ok.
++  |   sub TMP0, TMP0, TMP1		// Compute -savestack(L, L->top).
++  |    ld TMP1, L->cframe
++  |     addxi DISPATCH, GL, GG_G2DISP
++  |   sw TMP0, SAVE_NRES(sp)		// Neg. delta means cframe w/o frame.
++  |  sw x0, SAVE_ERRF(sp)		// No error function.
++  |    sd TMP1, SAVE_CFRAME(sp)
++  |    sd sp, L->cframe			// Add our C frame to cframe chain.
++  |      sd L, GL->cur_L
++  |  jalr CARG4			// (lua_State *L, lua_CFunction func, void *ud)
++  |  mv BASE, CRET1
++  |  li PC, FRAME_CP
++  |  bnez CRET1, <3			// Else continue with the call.
++  |  j ->vm_leave_cp			// No base? Just remove C frame.
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Metamethod handling ------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |//-- Continuation dispatch ----------------------------------------------
++  |
++  |->cont_dispatch:
++  |  // BASE = meta base, RA = resultptr, RD = (nresults+1)*8
++  |  ld TMP0, -32(BASE)		// Continuation.
++  |   mv RB, BASE
++  |   mv BASE, TMP2			// Restore caller BASE.
++  |    ld LFUNC:TMP1, FRAME_FUNC(TMP2)
++  |     ld PC, -24(RB)			// Restore PC from [cont|PC].
++  |.if FFI
++  |  sltiu TMP3, TMP0, 2
++  |.endif
++  |    cleartp LFUNC:TMP1
++  |   add TMP2, RA, RD
++  |  ld TMP1, LFUNC:TMP1->pc
++  |  sd TISNIL, -8(TMP2)               // Ensure one valid arg.
++  |.if FFI
++  |  bnez TMP3, >1
++  |.endif
++  |  // BASE = base, RA = resultptr, RB = meta base
++  |  ld KBASE, PC2PROTO(k)(TMP1)
++  |  jr TMP0				// Jump to continuation.
++  |
++  |.if FFI
++  |1:
++  |  addi TMP1, RB, -32
++  |  bxnez TMP0, ->cont_ffi_callback	// cont = 1: return from FFI callback.
++  |  // cont = 0: tailcall from C function.
++  |  sub RC, TMP1, BASE
++  |  j ->vm_call_tail
++  |.endif
++  |
++  |->cont_cat:				// RA = resultptr, RB = meta base
++  |  lw INS, -4(PC)
++  |   addi CARG2, RB, -32
++  |  ld TMP0, 0(RA)
++  |  decode_RB8 MULTRES, INS
++  |   decode_RA8 RA, INS
++  |  add TMP1, BASE, MULTRES
++  |   sd BASE, L->base
++  |   sub CARG3, CARG2, TMP1
++  |  sd TMP0, 0(CARG2)
++  |  bxne TMP1, CARG2, ->BC_CAT_Z
++  |  add RA, BASE, RA
++  |  sd TMP0, 0(RA)
++  |  j ->cont_nop
++  |
++  |//-- Table indexing metamethods -----------------------------------------
++  |
++  |->vmeta_tgets1:
++  |  addi CARG3, GL, offsetof(global_State, tmptv)
++  |  li TMP0, LJ_TSTR
++  |  settp STR:RC, TMP0
++  |  sd STR:RC, 0(CARG3)
++  |  j >1
++  |
++  |->vmeta_tgets:
++  |  addi CARG2, GL, offsetof(global_State, tmptv)
++  |   addi CARG3, GL, offsetof(global_State, tmptv2)
++  |  li TMP0, LJ_TTAB
++  |   li TMP1, LJ_TSTR
++  |  settp TAB:RB, TMP0
++  |   settp STR:RC, TMP1
++  |  sd TAB:RB, 0(CARG2)
++  |   sd STR:RC, 0(CARG3)
++  |  j >1
++  |
++  |->vmeta_tgetb:			// TMP0 = index
++  |  addi CARG3, GL, offsetof(global_State, tmptv)
++  |  settp TMP0, TISNUM
++  |  sd TMP0, 0(CARG3)
++  |
++  |->vmeta_tgetv:
++  |1:
++  |  sd BASE, L->base
++  |  mv CARG1, L
++  |  sd PC, SAVE_PC(sp)
++  |  // (lua_State *L, TValue *o, TValue *k)
++  |  call_intern vmeta_tgetv, lj_meta_tget
++  |  // Returns TValue * (finished) or NULL (metamethod).
++  |  beqz CRET1, >3
++  |  ld TMP0, 0(CRET1)
++  |  ins_next1
++  |  sd TMP0, 0(RA)
++  |  ins_next2
++  |
++  |3:  // Call __index metamethod.
++  |  // BASE = base, L->top = new base, stack = cont/func/t/k
++  |  addi TMP1, BASE, -FRAME_CONT
++  |  li NARGS8:RC, 16		// 2 args for func(t, k).
++  |  ld BASE, L->top
++  |  sd PC, -24(BASE)			// [cont|PC]
++  |   sub PC, BASE, TMP1
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
++  |  cleartp LFUNC:RB
++  |  j ->vm_call_dispatch_f
++  |
++  |->vmeta_tgetr:
++  |  call_intern vmeta_tgetr, lj_tab_getinth	// (GCtab *t, int32_t key)
++  |  // Returns cTValue * or NULL.
++  |  mv TMP1, TISNIL
++  |  bxeqz CRET1, ->BC_TGETR_Z
++  |  ld TMP1, 0(CRET1)
++  |  j ->BC_TGETR_Z
++  |
++  |//-----------------------------------------------------------------------
++  |
+  |->vmeta_tsets1:
+  |  addi CARG3, GL, offsetof(global_State, tmptv)  // CARG3 = address of GL->tmptv.
+  |  li TMP0, LJ_TSTR
+  |  settp STR:RC, TMP0  // Put the string tag back on RC.
+  |  sd STR:RC, 0(CARG3)  // Store the tagged key in tmptv.
+  |  j >1  // Continue at label 1 in vmeta_tsetv.
++  |
++  |->vmeta_tsets:
++  |  addi CARG2, GL, offsetof(global_State, tmptv)
++  |   addi CARG3, GL, offsetof(global_State, tmptv2)
++  |  li TMP0, LJ_TTAB
++  |   li TMP1, LJ_TSTR
++  |  settp TAB:RB, TMP0
++  |   settp STR:RC, TMP1
++  |  sd TAB:RB, 0(CARG2)
++  |   sd STR:RC, 0(CARG3)
++  |  j >1
++  |
++  |->vmeta_tsetb:			// TMP0 = index
++  |  addi CARG3, GL, offsetof(global_State, tmptv)
++  |  settp TMP0, TISNUM
++  |  sd TMP0, 0(CARG3)
++  |
++  |->vmeta_tsetv:
++  |1:
++  |  sd BASE, L->base
++  |  mv CARG1, L
++  |  sd PC, SAVE_PC(sp)
++  |  // (lua_State *L, TValue *o, TValue *k)
++  |  call_intern vmeta_tsetv, lj_meta_tset
++  |  // Returns TValue * (finished) or NULL (metamethod).
++  |  ld TMP2, 0(RA)
++  |  beqz CRET1, >3
++  |  ins_next1
++  |  // NOBARRIER: lj_meta_tset ensures the table is not black.
++  |  sd TMP2, 0(CRET1)
++  |  ins_next2
++  |
++  |3:  // Call __newindex metamethod.
++  |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
++  |  addi TMP1, BASE, -FRAME_CONT
++  |  ld BASE, L->top
++  |  sd PC, -24(BASE)			// [cont|PC]
++  |   sub PC, BASE, TMP1
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
++  |  li NARGS8:RC, 24		// 3 args for func(t, k, v)
++  |  cleartp LFUNC:RB
++  |  sd TMP2, 16(BASE)		// Copy value to third argument.
++  |  j ->vm_call_dispatch_f
++  |
++  |->vmeta_tsetr:
++  |  sd BASE, L->base
++  |  mv CARG1, L
++  |  sd PC, SAVE_PC(sp)
++  |  // (lua_State *L, GCtab *t, int32_t key)
++  |  call_intern vmeta_tsetr, lj_tab_setinth
++  |  // Returns TValue *.
++  |  j ->BC_TSETR_Z
++  |
++  |//-- Comparison metamethods ---------------------------------------------
++  |
+  |->vmeta_comp:
+  |  // RA/RD point to o1/o2.
+  |  mv CARG2, RA
+  |  mv CARG3, RD
+  |  addi PC, PC, -4
+  |  sd BASE, L->base
+  |  mv CARG1, L
+  |  decode_OP1 CARG4, INS  // op = opcode byte of the current instruction.
+  |  sd PC, SAVE_PC(sp)
+  |  // (lua_State *L, TValue *o1, *o2, int op)
+  |  call_intern vmeta_comp, lj_meta_comp
+  |  // Returns 0/1 or TValue * (metamethod).
+  |3:  // Also entered from vmeta_equal and vmeta_equal_cd.
+  |  sltiu TMP1, CRET1, 2  // TMP1 = 1 if the result is 0 or 1.
+  |  bxeqz TMP1, ->vmeta_binop  // Anything else is a metamethod base.
+  |   negw TMP2, CRET1  // TMP2 = 0 for result 0, -1 (all ones) for result 1.
+  |4:  // Also entered from cont_condt and cont_condf with TMP2 = mask.
+  |  lhu RD, OFS_RD(PC)  // 16 bit D operand of the instruction at PC.
+  |   addi PC, PC, 4
+  |   lui TMP1, (-(BCBIAS_J*4 >> 12)) & 0xfffff
+  |  slliw RD, RD, 2
+  |  addw RD, RD, TMP1
+  |  and RD, RD, TMP2  // Zero the offset when the mask is 0.
+  |  add PC, PC, RD
++  |->cont_nop:
++  |  ins_next
++  |
++  |->cont_ra:				// RA = resultptr
++  |  lbu TMP1, -4+OFS_RA(PC)
++  |   ld TMP2, 0(RA)
++  |  slliw TMP1, TMP1, 3
++  |  add TMP1, BASE, TMP1
++  |   sd TMP2, 0(TMP1)
++  |  j ->cont_nop
++  |
++  |->cont_condt:			// RA = resultptr
++  |  ld TMP0, 0(RA)
++  |  gettp TMP0, TMP0
++  |  sltiu TMP1, TMP0, LJ_TISTRUECOND
++  |  negw TMP2, TMP1		// Branch if result is true.
++  |  j <4
++  |
++  |->cont_condf:			// RA = resultptr
++  |  ld TMP0, 0(RA)
++  |  gettp TMP0, TMP0
++  |  sltiu TMP1, TMP0, LJ_TISTRUECOND
++  |  addiw TMP2, TMP1, -1		// Branch if result is false.
++  |  j <4
++  |
++  |->vmeta_equal:
++  |  // CARG1/CARG2 point to o1/o2. TMP0 is set to 0/1.
++  |   cleartp LFUNC:CARG3, CARG2
++  |  cleartp LFUNC:CARG2, CARG1
++  |    mv CARG4, TMP0
++  |  addi PC, PC, -4
++  |   sd BASE, L->base
++  |   mv CARG1, L
++  |   sd PC, SAVE_PC(sp)
++  |  // (lua_State *L, GCobj *o1, *o2, int ne)
++  |  call_intern vmeta_equal, lj_meta_equal
++  |  // Returns 0/1 or TValue * (metamethod).
++  |  j <3
++  |
++  |->vmeta_equal_cd:
++  |.if FFI
++  |  addi PC, PC, -4
++  |  mv CARG1, L
++  |  mv CARG2, INS
++  |  sd BASE, L->base
++  |  sd PC, SAVE_PC(sp)
++  |  call_intern vmeta_equal_cd, lj_meta_equal_cd	// (lua_State *L, BCIns op)
++  |  // Returns 0/1 or TValue * (metamethod).
++  |  j <3
++  |.endif
++  |
+  |->vmeta_istype:
+  |  addi PC, PC, -4
+  |   sd BASE, L->base
+  |   mv CARG1, L
+  |   srliw CARG2, RA, 3  // Scaled operand (RA*8) back to a slot number.
+  |   srliw CARG3, RD, 3  // Likewise for the type operand.
+  |  sd PC, SAVE_PC(sp)
+  |  // (lua_State *L, BCReg ra, BCReg tp)
+  |  call_intern vmeta_istype, lj_meta_istype
+  |  j ->cont_nop
++  |
++  |//-- Arithmetic metamethods ---------------------------------------------
++  |
++  |->vmeta_unm:
++  |  mv RC, RB
++  |
++  |->vmeta_arith:
++  |  mv CARG1, L
++  |   sd BASE, L->base
++  |  mv CARG2, RA
++  |   sd PC, SAVE_PC(sp)
++  |  mv CARG3, RB
++  |  mv CARG4, RC
++  |  decode_OP1 CARG5, INS
++  |  // (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
++  |  call_intern vmeta_arith, lj_meta_arith
++  |  // Returns NULL (finished) or TValue * (metamethod).
++  |  bxeqz CRET1, ->cont_nop
++  |
++  |  // Call metamethod for binary op.
++  |->vmeta_binop:
++  |  // BASE = old base, CRET1 = new base, stack = cont/func/o1/o2
++  |  sub TMP1, CRET1, BASE
++  |   sd PC, -24(CRET1)			// [cont|PC]
++  |   mv TMP2, BASE
++  |  addi PC, TMP1, FRAME_CONT
++  |   mv BASE, CRET1
++  |  li NARGS8:RC, 16                  // 2 args for func(o1, o2).
++  |  j ->vm_call_dispatch
++  |
++  |->vmeta_len:
++  |  // CARG2 already set by BC_LEN.
++#if LJ_52
++  |  mv MULTRES, CARG1
++#endif
++  |   sd BASE, L->base
++  |   mv CARG1, L
++  |   sd PC, SAVE_PC(sp)
++  |  call_intern vmeta_len, lj_meta_len	// (lua_State *L, TValue *o)
++  |  // Returns NULL (retry) or TValue * (metamethod base).
++#if LJ_52
++  |  bxnez CRET1, ->vmeta_binop		// Binop call for compatibility.
++  |  mv CARG1, MULTRES
++  |  j ->BC_LEN_Z
++#else
++  |  j ->vmeta_binop			// Binop call for compatibility.
++#endif
++  |
++  |//-- Call metamethod ----------------------------------------------------
++  |
++  |->vmeta_call:			// Resolve and call __call metamethod.
++  |  // TMP2 = old base, BASE = new base, RC = nargs*8
++  |  mv CARG1, L
++  |   sd TMP2, L->base			// This is the callers base!
++  |  addi CARG2, BASE, -16
++  |   sd PC, SAVE_PC(sp)
++  |  add CARG3, BASE, RC
++  |   mv MULTRES, NARGS8:RC
++  |  // (lua_State *L, TValue *func, TValue *top)
++  |  call_intern vmeta_call, lj_meta_call
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
++  |   addi NARGS8:RC, MULTRES, 8	// Got one more argument now.
++  |  cleartp LFUNC:RB
++  |  ins_call
++  |
++  |->vmeta_callt:			// Resolve __call for BC_CALLT.
++  |  // BASE = old base, RA = new base, RC = nargs*8
++  |  mv CARG1, L
++  |   sd BASE, L->base
++  |  addi CARG2, RA, -16
++  |   sd PC, SAVE_PC(sp)
++  |  add CARG3, RA, RC
++  |   mv MULTRES, NARGS8:RC
++  |  // (lua_State *L, TValue *func, TValue *top)
++  |  call_intern vmeta_callt, lj_meta_call
++  |   ld RB, FRAME_FUNC(RA)		// Guaranteed to be a function here.
++  |  ld TMP1, FRAME_PC(BASE)
++  |  addi NARGS8:RC, MULTRES, 8	// Got one more argument now.
++  |  cleartp LFUNC:CARG3, RB
++  |  j ->BC_CALLT_Z
++  |
++  |//-- Argument coercion for 'for' statement ------------------------------
++  |
++  |->vmeta_for:
++  |  mv CARG1, L
++  |   sd BASE, L->base
++  |  mv CARG2, RA
++  |   sd PC, SAVE_PC(sp)
++  |  mv MULTRES, INS
++  |  call_intern vmeta_for, lj_meta_for	// (lua_State *L, TValue *base)
++  |.if JIT
++  |  decode_OP1 TMP0, MULTRES
++  |  li TMP1, BC_JFORI
++  |.endif
++  |  decode_RA8 RA, MULTRES
++  |   decode_RD8 RD, MULTRES
++  |.if JIT
++  |  bxeq TMP0, TMP1, =>BC_JFORI
++  |.endif
++  |  j =>BC_FORI
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Fast functions -----------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |.macro .ffunc, name
++  |->ff_ .. name:
++  |.endmacro
++  |
++  |.macro .ffunc_1, name
++  |->ff_ .. name:
++  |  ld CARG1, 0(BASE)
++  |  bxeqz NARGS8:RC, ->fff_fallback
++  |.endmacro
++  |
++  |.macro .ffunc_2, name
++  |->ff_ .. name:
++  |  sltiu TMP0, NARGS8:RC, 16
++  |  ld CARG1, 0(BASE)
++  |  ld CARG2, 8(BASE)
++  |  bxnez TMP0, ->fff_fallback
++  |.endmacro
++  |
++  |.macro .ffunc_n, name
++  |->ff_ .. name:
++  |  ld CARG1, 0(BASE)
++  |  fld FARG1, 0(BASE)
++  |  bxeqz NARGS8:RC, ->fff_fallback
++  |  checknum CARG1, ->fff_fallback
++  |.endmacro
++  |
++  |.macro .ffunc_nn, name
++  |->ff_ .. name:
++  |  ld CARG1, 0(BASE)
++  |    sltiu TMP0, NARGS8:RC, 16
++  |   ld CARG2, 8(BASE)
++  |  bxnez TMP0, ->fff_fallback
++  |  gettp TMP1, CARG1
++  |   gettp TMP2, CARG2
++  |  sltiu TMP1, TMP1, LJ_TISNUM
++  |   sltiu TMP2, TMP2, LJ_TISNUM
++  |  fld FARG1, 0(BASE)
++  |  and TMP1, TMP1, TMP2
++  |   fld FARG2, 8(BASE)
++  |  bxeqz TMP1, ->fff_fallback
++  |.endmacro
++  |
++  |// Inlined GC threshold check.
++  |.macro ffgccheck			// Call ->fff_gcstep unless gc.total < gc.threshold.
++  |   ld TMP0, GL->gc.total
++  |   ld TMP1, GL->gc.threshold
++  |  bltu TMP0, TMP1, >1			// Below threshold (unsigned): skip the step.
++  |  jal ->fff_gcstep			// Links through ra. Overwrites TMP0/TMP1.
++  |1:
++  |.endmacro
++  |
++  |//-- Base library: checks -----------------------------------------------
++  |.ffunc_1 assert			// Inline path: return all arguments unchanged.
++  |  gettp TMP1, CARG1
++  |  sltiu TMP1, TMP1, LJ_TISTRUECOND	// 1 iff tag < LJ_TISTRUECOND (unsigned).
++  |  addi RA, BASE, -16			// Results start two slots below BASE.
++  |  bxeqz TMP1, ->fff_fallback		// Otherwise let the C handler deal with it.
++  |  ld PC, FRAME_PC(BASE)
++  |  addiw RD, NARGS8:RC, 8		// Compute (nresults+1)*8.
++  |  addi TMP1, BASE, 8			// Cursor: 2nd argument.
++  |  add TMP2, RA, RD			// Address of the last argument slot.
++  |  sd CARG1, 0(RA)			// 1st argument becomes 1st result.
++  |  bne BASE, TMP2, >1
++  |  j ->fff_res		// Done if exactly 1 argument.
++  |1:					// Move remaining arguments down by 16 bytes.
++  |  ld TMP0, 0(TMP1)
++  |  sd TMP0, -16(TMP1)
++  |  mv TMP3, TMP1			// Remember the slot just copied.
++  |  addi TMP1, TMP1, 8
++  |  bne TMP3, TMP2, <1			// Loop until the last argument slot was copied.
++  |  j ->fff_res
++  |
++  |.ffunc_1 type			// Result = upvalue of this C function, indexed by tag.
++  |  gettp TMP0, CARG1
++  |  not TMP3, TMP0			// Default index = ~tag.
++  |  bltu TISNUM, TMP0, >1		// Tag above TISNUM (unsigned): keep ~tag.
++  |  li TMP3, ~LJ_TISNUM			// Otherwise index with ~LJ_TISNUM.
++  |1:
++  |  slli TMP3, TMP3, 3			// Scale index to 8 byte slots.
++  |  add TMP3, CFUNC:RB, TMP3
++  |  ld CARG1, CFUNC:TMP3->upvalue	// Load the indexed upvalue as the result.
++  |  j ->fff_restv
++  |
++  |//-- Base library: getters and setters ---------------------------------
++  |
++  |.ffunc_1 getmetatable
++  |  gettp TMP2, CARG1
++  |  addi TMP0, TMP2, -LJ_TTAB
++  |  addi TMP1, TMP2, -LJ_TUDATA
++  |  snez TMP0, TMP0
++  |  neg TMP0, TMP0
++  |  and TMP0, TMP0, TMP1
++  |  cleartp TAB:CARG1
++  |  bnez TMP0, >6
++  |1:  // Field metatable must be at same offset for GCtab and GCudata!
++  |  ld TAB:RB, TAB:CARG1->metatable
++  |2:
++  |   ld STR:RC, GL->gcroot[GCROOT_MMNAME+MM_metatable]
++  |  li CARG1, LJ_TNIL
++  |  bxeqz TAB:RB, ->fff_restv
++  |  lw TMP0, TAB:RB->hmask
++  |   lw TMP1, STR:RC->sid
++  |    ld NODE:TMP2, TAB:RB->node
++  |  and TMP1, TMP1, TMP0		// idx = str->sid & tab->hmask
++  |  slli TMP0, TMP1, 5
++  |  slli TMP1, TMP1, 3
++  |  sub TMP1, TMP0, TMP1
++  |  add NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
++  |  li CARG4, LJ_TSTR
++  |  settp STR:RC, CARG4		// Tagged key to look for.
++  |3:  // Rearranged logic, because we expect _not_ to find the key.
++  |  ld TMP0, NODE:TMP2->key
++  |   ld CARG1, NODE:TMP2->val
++  |    ld NODE:TMP2, NODE:TMP2->next
++  |  li TMP3, LJ_TTAB
++  |  beq RC, TMP0, >5
++  |  bnez NODE:TMP2, <3
++  |4:
++  |  settp CARG1, RB, TMP3
++  |  j ->fff_restv			// Not found, keep default result.
++  |5:
++  |  bxne CARG1, TISNIL, ->fff_restv
++  |  j <4				// Ditto for nil value.
++  |
++  |6:
++  |  sltiu TMP3, TMP2, LJ_TISNUM
++  |  neg TMP3, TMP3
++  |  and TMP0, TISNUM, TMP3
++  |  not TMP3, TMP3
++  |  and TMP2, TMP2, TMP3
++  |  or TMP2, TMP2, TMP0
++  |  slli TMP2, TMP2, 3
++  |   sub TMP0, GL, TMP2
++  |   ld TAB:RB, (offsetof(global_State, gcroot[GCROOT_BASEMT])-8)(TMP0)
++  |  j <2
++  |
++  |.ffunc_2 setmetatable
++  |  // Fast path: no mt for table yet and not clearing the mt.
++  |  checktp TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++  |  gettp TMP3, CARG2
++  |   ld TAB:TMP0, TAB:TMP1->metatable
++  |   lbu TMP2, TAB:TMP1->marked
++  |  addi TMP3, TMP3, -LJ_TTAB
++  |   cleartp TAB:CARG2
++  |  or TMP3, TMP3, TAB:TMP0
++  |  bxnez TMP3, ->fff_fallback
++  |  andi TMP3, TMP2, LJ_GC_BLACK		// isblack(table)
++  |  sd TAB:CARG2, TAB:TMP1->metatable
++  |  bxeqz TMP3, ->fff_restv
++  |  barrierback TAB:TMP1, TMP2, TMP0, ->fff_restv
++  |
++  |.ffunc rawget
++  |  ld CARG2, 0(BASE)
++  |  sltiu TMP0, NARGS8:RC, 16
++  |  gettp TMP1, CARG2
++  |   cleartp CARG2
++  |  addi TMP1, TMP1, -LJ_TTAB
++  |  or TMP0, TMP0, TMP1
++  |  addi CARG3, BASE, 8
++  |  bxnez TMP0, ->fff_fallback
++  |  mv CARG1, L
++  |  call_intern ff_rawget, lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
++  |  // Returns cTValue *.
++  |  ld CARG1, 0(CRET1)
++  |  j ->fff_restv
++  |
++  |//-- Base library: conversions ------------------------------------------
++  |
++  |.ffunc tonumber
++  |  // Only handles the number case inline (without a base argument).
++  |  ld CARG1, 0(BASE)
++  |  xori TMP0, NARGS8:RC, 8		// TMP0 = 0 iff exactly one argument.
++  |  gettp TMP1, CARG1
++  |  sltu TMP1, TISNUM, TMP1		// TMP1 = 1 iff tag > TISNUM (unsigned).
++  |  or TMP0, TMP0, TMP1
++  |  bxnez TMP0, ->fff_fallback		// Argument count != 1 or CARG1 is not a number.
++  |  j ->fff_restv			// Return the argument itself.
++  |
++  |.ffunc_1 tostring
++  |  // Only handles the string or number case inline.
++  |  gettp TMP0, CARG1
++  |  addi TMP1, TMP0, -LJ_TSTR
++  |  // A __tostring method in the string base metatable is ignored.
++  |  bxeqz TMP1, ->fff_restv	// String key?
++  |  // Handle numbers inline, unless a number base metatable is present.
++  |   ld TMP1, GL->gcroot[GCROOT_BASEMT_NUM]
++  |  sltu TMP0, TISNUM, TMP0
++  |  sd BASE, L->base			// Add frame since C call can throw.
++  |  or TMP0, TMP0, TMP1
++  |  bxnez TMP0, ->fff_fallback
++  |  sd PC, SAVE_PC(sp)		// Redundant (but a defined value).
++  |  ffgccheck
++  |  mv CARG1, L
++  |  mv CARG2, BASE
++  |  call_intern ff_tostring, lj_strfmt_number	// (lua_State *L, cTValue *o)
++  |  // Returns GCstr *.
++  |  li TMP1, LJ_TSTR
++  |//  ld BASE, L->base
++  |  settp CARG1, TMP1
++  |  j ->fff_restv
++  |
++  |//-- Base library: iterators -------------------------------------------
++  |
++  |.ffunc_1 next
++  |  checktp CARG1, -LJ_TTAB, ->fff_fallback
++  |  add TMP0, BASE, NARGS8:RC
++  |  ld PC, FRAME_PC(BASE)
++  |  sd TISNIL, 0(TMP0)		// Set missing 2nd arg to nil.
++  |  addi CARG2, BASE, 8
++  |  addi CARG3, BASE, -16
++  |  call_intern ff_next, lj_tab_next	// (GCtab *t, cTValue *key, TValue *o)
++  |  // Returns 1=found, 0=end, -1=error.
++  |//  addi RA, BASE, -16
++  |  li RD, (2+1)*8
++  |  bxgtz CRET1, ->fff_res		// Found key/value.
++  |  mv TMP1, CRET1
++  |  mv CARG1, TISNIL
++  |  bxeqz TMP1, ->fff_restv		// End of traversal: return nil.
++  |   ld CFUNC:RB, FRAME_FUNC(BASE)
++  |  li RC, 2*8
++  |   cleartp CFUNC:RB
++  |  j ->fff_fallback			// Invalid key.
++  |
++  |.ffunc_1 pairs
++  |  checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++  |  ld PC, FRAME_PC(BASE)
++#if LJ_52
++  |  ld TAB:TMP2, TAB:TMP1->metatable
++  |  ld TMP0, CFUNC:RB->upvalue[0]
++  |  addi RA, BASE, -16
++  |  bxnez TAB:TMP2, ->fff_fallback
++#else
++  |  ld TMP0, CFUNC:RB->upvalue[0]
++  |  addi RA, BASE, -16
++#endif
++  |  sd TISNIL, 0(BASE)
++  |   sd CARG1, -8(BASE)
++  |    sd TMP0, 0(RA)
++  |  li RD, (3+1)*8
++  |  j ->fff_res
++  |
++  |.ffunc_2 ipairs_aux
++  |  checktab CARG1, ->fff_fallback
++  |   checkint CARG2, ->fff_fallback
++  |  lw TMP0, TAB:CARG1->asize
++  |   ld TMP1, TAB:CARG1->array
++  |    ld PC, FRAME_PC(BASE)
++  |  sext.w TMP2, CARG2
++  |  addiw TMP2, TMP2, 1
++  |  sltu TMP3, TMP2, TMP0
++  |    addi RA, BASE, -16
++  |   zext.w TMP0, TMP2
++  |   settp_b TMP0, TISNUM
++  |  sd TMP0, 0(RA)
++  |  beqz TMP3, >2			// Not in array part?
++  |  slli TMP3, TMP2, 3
++  |  add TMP3, TMP1, TMP3
++  |  ld TMP1, 0(TMP3)
++  |1:
++  |  li RD, (0+1)*8
++  |  bxeq TMP1, TISNIL, ->fff_res	// End of iteration, return 0 results.
++  |  sd TMP1, -8(BASE)
++  |  li RD, (2+1)*8
++  |  j ->fff_res
++  |2:  // Check for empty hash part first. Otherwise call C function.
++  |  lw TMP0, TAB:CARG1->hmask
++  |  li RD, (0+1)*8
++  |  bxeqz TMP0, ->fff_res
++  |  mv CARG2, TMP2
++  |  call_intern ff_ipairs_aux, lj_tab_getinth	// (GCtab *t, int32_t key)
++  |  // Returns cTValue * or NULL.
++  |  li RD, (0+1)*8
++  |  bxeqz CRET1, ->fff_res
++  |  ld TMP1, 0(CRET1)
++  |  j <1
++  |
++  |.ffunc_1 ipairs
++  |  checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++  |  ld PC, FRAME_PC(BASE)
++#if LJ_52
++  |  ld TAB:TMP2, TAB:TMP1->metatable
++#endif
++  |  ld CFUNC:TMP0, CFUNC:RB->upvalue[0]
++  |  addi RA, BASE, -16
++#if LJ_52
++  |  bxnez TAB:TMP2, ->fff_fallback
++#endif
++  |  slli TMP1, TISNUM, 47
++  |  sd CARG1, -8(BASE)
++  |   sd TMP1, 0(BASE)
++  |    sd CFUNC:TMP0, 0(RA)
++  |  li RD, (3+1)*8
++  |  j ->fff_res
++  |
++  |//-- Base library: catch errors ----------------------------------------
++  |
++  |.ffunc pcall				// Set up a FRAME_PCALL frame and dispatch the call.
++  |  addi NARGS8:RC, NARGS8:RC, -8	// One argument slot less for the callee.
++  |   lbu TMP3, GL->hookmask
++  |   mv TMP2, BASE			// TMP2 = old base.
++  |  bxltz NARGS8:RC, ->fff_fallback	// No arguments at all.
++  |   addi BASE, BASE, 16			// New base is two slots up.
++  |  // Remember active hook before pcall.
++  |  srliw TMP3, TMP3, HOOK_ACTIVE_SHIFT
++  |  andi TMP3, TMP3, 1			// TMP3 = hook-active bit (0 or 1).
++  |  addi PC, TMP3, 16+FRAME_PCALL	// PC = frame delta + frame type + hook bit.
++  |  bxeqz NARGS8:RC, ->vm_call_dispatch	// Nothing to move.
++  |1:					// Also the target of xpcall's 'bnez NARGS8:RC, <1'.
++  |   add TMP0, BASE, NARGS8:RC		// TMP0 = one past the last new argument slot.
++  |2:					// Move arguments up by one slot, last one first.
++  |  ld TMP1, -16(TMP0)
++  |  sd TMP1, -8(TMP0)
++  |  addi TMP0, TMP0, -8
++  |  bne TMP0, BASE, <2
++  |  j ->vm_call_dispatch
++  |
++  |.ffunc xpcall			// Like pcall, but with the two leading args swapped.
++  |  addi NARGS8:TMP0, NARGS8:RC, -16	// Keep RC intact for the fallback.
++  |  ld CARG1, 0(BASE)
++  |   ld CARG2, 8(BASE)
++  |     lbu TMP1, GL->hookmask		// Hook mask lives in TMP1 (TMP2 is reused below).
++  |    bxltz NARGS8:TMP0, ->fff_fallback	// Fewer than two arguments.
++  |  gettp TMP2, CARG2
++  |  addi TMP2, TMP2, -LJ_TFUNC
++  |  bxnez TMP2, ->fff_fallback		// Traceback must be a function.
++  |   mv TMP2, BASE			// TMP2 = old base.
++  |  mv NARGS8:RC, NARGS8:TMP0
++  |   addi BASE, BASE, 24			// New base is three slots up.
++  |  // Remember active hook before pcall.
++  |  srliw TMP3, TMP1, HOOK_ACTIVE_SHIFT	// Must read TMP1: TMP3 is not set up here.
++  |   sd CARG2, 0(TMP2)			// Swap function and traceback.
++  |  andi TMP3, TMP3, 1			// TMP3 = hook-active bit (0 or 1).
++  |   sd CARG1, 8(TMP2)
++  |  addi PC, TMP3, 24+FRAME_PCALL	// PC = frame delta + frame type + hook bit.
++  |  bnez NARGS8:RC, <1			// Shares the argument move loop of pcall (label 1).
++  |  j ->vm_call_dispatch
++  |
++  |//-- Coroutine library --------------------------------------------------
++  |
++  |.macro coroutine_resume_wrap, resume
++  |.if resume
++  |.ffunc_1 coroutine_resume
++  |  checktp CARG1, CARG1, -LJ_TTHREAD, ->fff_fallback
++  |.else
++  |.ffunc coroutine_wrap_aux
++  |  ld L:CARG1, CFUNC:RB->upvalue[0].gcr
++  |  cleartp L:CARG1
++  |.endif
++  |  lbu TMP0, L:CARG1->status
++  |   ld TMP1, L:CARG1->cframe
++  |    ld CARG2, L:CARG1->top
++  |    ld TMP2, L:CARG1->base
++  |  addiw CARG4, TMP0, -LUA_YIELD
++  |    add CARG3, CARG2, TMP0
++  |   addi TMP3, CARG2, 8
++  |  seqz TMP4, CARG4
++  |  neg TMP4, TMP4
++  |  and CARG2, CARG2, TMP4
++  |  not TMP4, TMP4
++  |  and TMP3, TMP3, TMP4
++  |   or CARG2, CARG2, TMP3
++  |  bxgtz CARG4, ->fff_fallback		// st > LUA_YIELD?
++  |   xor TMP2, TMP2, CARG3
++  |   or CARG4, TMP2, TMP0
++  |  bxnez TMP1, ->fff_fallback		// cframe != 0?
++  |  ld TMP0, L:CARG1->maxstack
++  |   ld PC, FRAME_PC(BASE)
++  |  bxeqz CARG4, ->fff_fallback		// base == top && st == 0?
++  |  add TMP2, CARG2, NARGS8:RC
++  |  sd BASE, L->base
++  |  sd PC, SAVE_PC(sp)
++  |  bxltu TMP0, TMP2, ->fff_fallback		// Stack overflow?
++  |1:
++  |.if resume
++  |  addi BASE, BASE, 8		// Keep resumed thread in stack for GC.
++  |  addi NARGS8:RC, NARGS8:RC, -8
++  |  addi TMP2, TMP2, -8
++  |.endif
++  |  sd TMP2, L:CARG1->top
++  |  sd BASE, L->top
++  |  add TMP1, BASE, NARGS8:RC
++  |  mv CARG3, CARG2
++  |2:  // Move args to coroutine.
++  |   ld TMP0, 0(BASE)
++  |  sltu TMP3, BASE, TMP1
++  |   addi BASE, BASE, 8
++  |  beqz TMP3, >3
++  |   sd TMP0, 0(CARG3)
++  |   addi CARG3, CARG3, 8
++  |  j <2
++  |3:
++  |   mv L:RA, L:CARG1
++  |  jal ->vm_resume			// (lua_State *L, TValue *base, 0, 0)
++  |  // Returns thread status.
++  |4:
++  |  ld TMP2, L:RA->base
++  |   sltiu TMP1, CRET1, LUA_YIELD+1
++  |  ld TMP3, L:RA->top
++  |    li_vmstate INTERP
++  |  ld BASE, L->base
++  |     sd L, GL->cur_L
++  |    st_vmstate
++  |  sub RD, TMP3, TMP2
++  |   beqz TMP1, >8
++  |  ld TMP0, L->maxstack
++  |   add TMP1, BASE, RD
++  |  beqz RD, >6			// No results?
++  |   add TMP3, TMP2, RD
++  |  bltu TMP0, TMP1, >9		// Need to grow stack?
++  |  sd TMP2, L:RA->top		// Clear coroutine stack.
++  |  mv TMP1, BASE
++  |5:  // Move results from coroutine.
++  |  ld TMP0, 0(TMP2)
++  |  addi TMP2, TMP2, 8
++  |  sd TMP0, 0(TMP1)
++  |  addi TMP1, TMP1, 8
++  |  bltu TMP2, TMP3, <5
++  |6:
++  |.if resume
++  |  mov_true TMP1
++  |  addi RD, RD, 16
++  |7:
++  |  sd TMP1, -8(BASE)	// Prepend true/false to results.
++  |   addi RA, BASE, -8
++  |.else
++  |  mv RA, BASE
++  |  addi RD, RD, 8
++  |.endif
++  |  andi TMP0, PC, FRAME_TYPE
++  |  sd PC, SAVE_PC(sp)
++  |   mv MULTRES, RD
++  |//  bxeqz TMP0, ->BC_RET_Z	// Local label 9 in use
++  |  bnez TMP0, >6
++  |  j ->BC_RET_Z
++  |6:
++  |  j ->vm_return
++  |
++  |8:  // Coroutine returned with error (at co->top-1).
++  |.if resume
++  |  addi TMP3, TMP3, -8
++  |   mov_false TMP1
++  |   li RD, (2+1)*8
++  |   ld TMP0, 0(TMP3)
++  |  sd TMP3, L:RA->top		// Remove error from coroutine stack.
++  |  sd TMP0, 0(BASE)			// Copy error message.
++  |  j <7
++  |.else
++  |  mv CARG1, L
++  |  mv CARG2, L:RA
++  |  // (lua_State *L, lua_State *co)
++  |  call_intern ff_coroutine_wrap_aux, lj_ffh_coroutine_wrap_err
++  |.endif
++  |
++  |9:  // Handle stack expansion on return from yield.
++  |  mv CARG1, L
++  |  srliw CARG2, RD, 3
++  |  // (lua_State *L, int n)
++  |.if resume
++  |  call_intern ff_coroutine_resume, lj_state_growstack
++  |.else
++  |  call_intern ff_coroutine_wrap_aux, lj_state_growstack
++  |.endif
++  |  mv CRET1, x0
++  |  j <4
++  |.endmacro
++  |
++  |  coroutine_resume_wrap 1		// coroutine.resume
++  |  coroutine_resume_wrap 0		// coroutine.wrap
++  |
++  |.ffunc coroutine_yield
++  |  ld TMP0, L->cframe
++  |   add TMP1, BASE, NARGS8:RC
++  |    li CRET1, LUA_YIELD
++  |   sd BASE, L->base
++  |  andi TMP0, TMP0, CFRAME_RESUME
++  |   sd TMP1, L->top
++  |  bxeqz TMP0, ->fff_fallback
++  |  sd x0, L->cframe
++  |    sb CRET1, L->status
++  |  j ->vm_leave_unw
++  |
++  |//-- Math library -------------------------------------------------------
++  |
++  |.macro math_round, func
++  |->ff_math_ .. func:
++  |  ld CARG1, 0(BASE)
++  |   gettp TMP0, CARG1
++  |  bxeqz NARGS8:RC, ->fff_fallback
++  |  bxeq TMP0, TISNUM, ->fff_restv
++  |   fld FARG1, 0(BASE)
++  |  bxgeu TMP0, TISNUM, ->fff_fallback
++  |  jal ->vm_ .. func
++  |  j ->fff_resn
++  |.endmacro
++  |
++  |  math_round floor
++  |  math_round ceil
++  |
++  |.ffunc_1 math_abs
++  |  gettp CARG2, CARG1
++  |  addi TMP2, CARG2, -LJ_TISNUM	// TMP2 = 0 iff the argument is an integer.
++  |   sext.w TMP1, CARG1			// Integer payload, sign-extended.
++  |  bnez TMP2, >1
++  |  sraiw TMP0, TMP1, 31			// Extract sign. int
++  |  xor TMP1, TMP1, TMP0
++  |  sub CARG1, TMP1, TMP0		// (x ^ sign) - sign, computed in 64 bits.
++  |  slli TMP3, CARG1, 32			// Move bit 31 of the result into the sign bit.
++  |   settp CARG1, TISNUM
++  |  bxgez TMP3, ->fff_restv		// Bit 31 clear: result fits, return tagged integer.
++  |  lui CARG1, 0x41e00		// 2^31 as a double.
++  |  slli CARG1, CARG1, 32
++  |  j ->fff_restv
++  |1:					// Not an integer.
++  |  sltiu TMP2, CARG2, LJ_TISNUM		// 1 iff tag < LJ_TISNUM (unsigned).
++  |  slli CARG1, CARG1, 1			// Shift out bit 63 ...
++  |  srli CARG1, CARG1, 1			// ... and shift a zero back in.
++  |  bxeqz TMP2, ->fff_fallback		// Tag not below LJ_TISNUM: leave it to the fallback.
++  |// fallthrough
++  |
++  |->fff_restv:			// Return one TValue held in a register.
++  |  // CARG1 = TValue result.
++  |  ld PC, FRAME_PC(BASE)
++  |  sd CARG1, -16(BASE)			// Store it to the first result slot.
++  |->fff_res1:
++  |  // Result at BASE-16 already stored, PC = return.
++  |  li RD, (1+1)*8
++  |->fff_res:
++  |  // Results start at BASE-16, RD = (nresults+1)*8, PC = return.
++  |  andi TMP0, PC, FRAME_TYPE
++  |   mv MULTRES, RD
++  |  addi RA, BASE, -16			// RA is always recomputed here.
++  |  bxnez TMP0, ->vm_return		// Non-zero frame type bits: use the generic return.
++  |  lw INS, -4(PC)			// Instruction word just before the return PC.
++  |  decode_RB8 RB, INS
++  |5:
++  |  bltu RD, RB, >6			// More results expected?
++  |  decode_RA8a TMP0, INS
++  |  ins_next1
++  |  decode_RA8b TMP0
++  |  // Adjust BASE. KBASE is assumed to be set for the calling frame.
++  |  sub BASE, RA, TMP0
++  |  ins_next2
++  |
++  |6:  // Fill up results with nil.
++  |  add TMP1, RA, RD
++  |   addi RD, RD, 8
++  |   sd TISNIL, -8(TMP1)			// Nil goes to the slot at RA + old RD - 8.
++  |  j <5
++  |
++  |.macro math_extern, func
++  |  .ffunc_n math_ .. func
++  |  call_extern func
++  |  j ->fff_resn
++  |.endmacro
++  |
++  |.macro math_extern2, func
++  |  .ffunc_nn math_ .. func
++  |  call_extern func
++  |  j ->fff_resn
++  |.endmacro
++  |
++  |.ffunc_n math_sqrt
++  |  fsqrt.d FRET1, FARG1
++  |->fff_resn:
++  |  ld PC, FRAME_PC(BASE)
++  |  fsd FRET1, -16(BASE)
++  |  j ->fff_res1
++  |
++  |.ffunc math_log
++  |  li TMP1, 8
++  |   ld CARG1, 0(BASE)
++  |   fld FARG1, 0(BASE)
++  |  bxne NARGS8:RC, TMP1, ->fff_fallback		// Need exactly 1 argument.
++  |  checknum CARG1, ->fff_fallback
++  |  call_extern log
++  |  j ->fff_resn
++  |
++  |  math_extern log10
++  |  math_extern exp
++  |  math_extern sin
++  |  math_extern cos
++  |  math_extern tan
++  |  math_extern asin
++  |  math_extern acos
++  |  math_extern atan
++  |  math_extern sinh
++  |  math_extern cosh
++  |  math_extern tanh
++  |  math_extern2 pow
++  |  math_extern2 atan2
++  |  math_extern2 fmod
++  |
++  |.ffunc_2 math_ldexp
++  |  checknum CARG1, ->fff_fallback
++  |  checkint CARG2, ->fff_fallback
++  |   fld FARG1, 0(BASE)
++  |   lw CARG1, 8(BASE)
++  |  call_extern ldexp			// (double x, int exp)
++  |  j ->fff_resn
++  |
++  |.ffunc_n math_frexp
++  |   ld PC, FRAME_PC(BASE)
++  |  addi CARG1, GL, offsetof(global_State, tmptv)
++  |  call_extern frexp
++  |    lw TMP1, GL->tmptv
++  |   fcvt.d.w FARG2, TMP1
++  |  fsd FRET1, -16(BASE)
++  |   fsd FARG2, -8(BASE)
++  |  li RD, (2+1)*8
++  |  j ->fff_res
++  |
++  |.ffunc_n math_modf
++  |   addi CARG1, BASE, -16
++  |   ld PC, FRAME_PC(BASE)
++  |  call_extern modf
++  |  fsd FRET1, -8(BASE)
++  |   li RD, (2+1)*8
++  |  j ->fff_res
++  |
++  |.macro math_minmax, name, ismax, fpins	// ismax: 0 = min, 1 = max. fpins: FP min/max ins.
++  |  .ffunc_1 name
++  |  add TMP3, BASE, NARGS8:RC		// TMP3 = end of arguments.
++  |   addi TMP2, BASE, 8			// TMP2 = cursor, starts at the 2nd argument.
++  |  checkint CARG1, >4			// RB stays untouched: fff_fallback reads CFUNC:RB->f.
++  |1:  // Handle integers.
++  |   ld CARG2, 0(TMP2)
++  |  bxeq TMP2, TMP3, ->fff_restv		// All arguments consumed: return CARG1.
++  |   sext.w CARG1, CARG1
++  |  checkint CARG2, >3			// NOTE(review): assumes checkint/checknum keep TMP2/TMP3 -- confirm.
++  |   sext.w CARG2, CARG2
++  |   slt TMP0, CARG1, CARG2		// TMP0 = 1 iff CARG1 < CARG2 (signed).
++  |.if ismax
++  |   addi TMP1, TMP0, -1		// Mask: all ones iff CARG1 >= CARG2.
++  |.else
++  |   neg TMP1, TMP0			// Mask: all ones iff CARG1 < CARG2.
++  |.endif
++  | and CARG1, CARG1, TMP1		// Branchless select: keep CARG1 under the mask ...
++  |  not TMP1, TMP1
++  |  and CARG2, CARG2, TMP1		// ... or CARG2 under the inverted mask.
++  |   or CARG1, CARG1, CARG2
++  |  addi TMP2, TMP2, 8
++  |   zext.w CARG1, CARG1
++  |   settp_b CARG1, TISNUM		// Re-tag the running result as integer.
++  |  j <1
++  |3:  // Convert intermediate result to number and continue below.
++  |   fcvt.d.w FARG1, CARG1
++  |  checknum CARG2, ->fff_fallback
++  |   fld FARG2, 0(TMP2)
++  |  j >6
++  |
++  |4:					// First argument is not an integer.
++  |  fld FARG1, 0(BASE)
++  |  checknum CARG1, ->fff_fallback
++  |5:  // Handle numbers.
++  |  ld CARG2, 0(TMP2)
++  |  fld FARG2, 0(TMP2)
++  |   bxgeu TMP2, TMP3, ->fff_resn		// All arguments consumed: return FARG1.
++  |  checknum CARG2, >7
++  |6:
++  |  fpins FARG1, FARG1, FARG2
++  |   addi TMP2, TMP2, 8
++  |  j <5
++  |7:  // Convert integer to number and continue above.
++  |  checkint CARG2, ->fff_fallback
++  |   fcvt.d.w FARG2, CARG2
++  |  j <6
++  |.endmacro
++  |
++  |  math_minmax math_min, 0, fmin.d
++  |  math_minmax math_max, 1, fmax.d
++  |
++  |//-- String library -----------------------------------------------------
++  |
++  |.ffunc string_byte			// Only handle the 1-arg case here.
++  |  ld CARG1, 0(BASE)
++  |  gettp TMP0, CARG1
++  |  xori TMP1, NARGS8:RC, 8		// 0 iff exactly one argument.
++  |  addi TMP0, TMP0, -LJ_TSTR		// 0 iff the argument is a string.
++  |  or TMP1, TMP1, TMP0
++  |   cleartp STR:CARG1
++  |  bxnez TMP1, ->fff_fallback		// Need exactly 1 string argument.
++  |  lw TMP0, STR:CARG1->len
++  |    ld PC, FRAME_PC(BASE)
++  |  snez RD, TMP0			// 1 result for a non-empty string, else 0.
++  |   lbu TMP2, STR:CARG1[1]		// Access is always ok (NUL at end).
++  |  addiw RD, RD, 1
++  |  slliw RD, RD, 3			// RD = ((str->len != 0)+1)*8
++  |  settp_b TMP2, TISNUM			// Tag the byte value as an integer.
++  |   sd TMP2, -16(BASE)			// Store to the first result slot.
++  |  j ->fff_res
++  |
++  |.ffunc string_char			// Only handle the 1-arg case here.
++  |  ffgccheck
++  |  ld CARG1, 0(BASE)
++  |  gettp TMP0, CARG1
++  |  xori TMP1, NARGS8:RC, 8		// Need exactly 1 argument.
++  |  addi TMP0, TMP0, -LJ_TISNUM	// Integer.
++  |  li TMP2, 255
++  |   sext.w CARG1, CARG1
++  |  or TMP1, TMP1, TMP0
++  |   sltu TMP2, TMP2, CARG1		// !(255 < n).
++  |   or TMP1, TMP1, TMP2
++  |   li CARG3, 1
++  |  bxnez TMP1, ->fff_fallback
++  |  addi CARG2, sp, TMPD_OFS
++  |  sb CARG1, TMPD(sp)
++  |->fff_newstr:
++  |  sd BASE, L->base
++  |  sd PC, SAVE_PC(sp)
++  |  mv CARG1, L
++  |  // (lua_State *L, const char *str, size_t l)
++  |  call_intern fff_newstr, lj_str_new
++  |  // Returns GCstr *.
++  |  ld BASE, L->base
++  |->fff_resstr:
++  |  li TMP1, LJ_TSTR
++  |  settp CRET1, TMP1
++  |  j ->fff_restv
++  |
++  |.ffunc string_sub
++  |  ffgccheck
++  |  ld CARG1, 0(BASE)
++  |  ld CARG2, 8(BASE)
++  |  ld CARG3, 16(BASE)
++  |  addi TMP0, NARGS8:RC, -16
++  |   gettp TMP1, CARG1
++  |  bxltz TMP0, ->fff_fallback
++  |  cleartp STR:CARG1, CARG1
++  |   li CARG4, -1
++  |  beqz TMP0, >1
++  |   sext.w CARG4, CARG3
++  |  checkint CARG3, ->fff_fallback
++  |1:
++  |  checkint CARG2, ->fff_fallback
++  |  addi TMP0, TMP1, -LJ_TSTR
++  |   sext.w CARG3, CARG2
++  |  bxnez TMP0, ->fff_fallback
++  |  lw CARG2, STR:CARG1->len
++  |  // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end
++  |  addiw TMP0, CARG2, 1
++  |  bgez CARG4, >2
++  |  addw CARG4, CARG4, TMP0		// if (end < 0) end += len+1
++  |2:
++  |  bgez CARG3, >3
++  |  addw CARG3, CARG3, TMP0		// if (start < 0) start += len+1
++  |3:
++  |  bgez CARG4, >4
++  |  mv CARG4, x0			// if (end < 0) end = 0
++  |4:
++  |  bgtz CARG3, >5
++  |   li CARG3, 1		// if (start < 1) start = 1
++  |5:
++  |  ble CARG4, CARG2, >6
++  |  mv CARG4, CARG2		// if (end > len) end = len
++  |6:
++  |   add CARG2, STR:CARG1, CARG3
++  |  sub CARG3, CARG4, CARG3		// len = end - start
++  |   addi CARG2, CARG2, sizeof(GCstr)-1
++  |   addiw CARG3, CARG3, 1             // len += 1
++  |  bxgez CARG3, ->fff_newstr
++  |->fff_emptystr:  // Return empty string.
++  |  li TMP1, LJ_TSTR
++  |  addi STR:CARG1, GL, offsetof(global_State, strempty)
++  |   settp CARG1, TMP1
++  |  j ->fff_restv
++  |
++  |.macro ffstring_op, name
++  |  .ffunc string_ .. name
++  |  ffgccheck
++  |   ld CARG2, 0(BASE)
++  |  bxeqz NARGS8:RC, ->fff_fallback
++  |  checkstr STR:CARG2, ->fff_fallback
++  |  addi SBUF:CARG1, GL, offsetof(global_State, tmpbuf)
++  |  ld TMP0, SBUF:CARG1->b
++  |   sd L, SBUF:CARG1->L
++  |   sd BASE, L->base
++  |  sd TMP0, SBUF:CARG1->w
++  |   sd PC, SAVE_PC(sp)
++  |  call_intern ff_string_ .. name, lj_buf_putstr_ .. name
++  |//  mv SBUF:CARG1, SBUF:CRET1
++  |  call_intern ff_string_ .. name, lj_buf_tostr
++  |   ld BASE, L->base
++  |  j ->fff_resstr
++  |.endmacro
++  |
++  |ffstring_op reverse
++  |ffstring_op lower
++  |ffstring_op upper
++  |
++  |//-- Bit library --------------------------------------------------------
++  |
++  |->vm_tobit_fb:			// Called via jal. Converts the number at 0(BASE).
++  |  fld FARG1, 0(BASE)
++  |  bxeqz TMP1, ->fff_fallback		// TMP1 is set by the caller: 0 = not a number.
++  |   fadd.d FARG1, FARG1, TOBIT		// Add the TOBIT constant held in a FP register.
++  |  fmv.x.w CRET1, FARG1			// Take the low 32 bits of the sum.
++  |  zext.w CRET1, CRET1
++  |  ret
++  |
++  |.macro .ffunc_bit, name
++  |  .ffunc_1 bit_..name
++  |  gettp TMP0, CARG1
++  |   zext.w CRET1, CARG1
++  |  beq TMP0, TISNUM, >1
++  |   sltiu TMP1, TMP0, LJ_TISNUM
++  |  jal ->vm_tobit_fb
++  |1:
++  |.endmacro
++  |
++  |.macro .ffunc_bit_op, name, bins
++  |  .ffunc_bit name
++  |  addi TMP2, BASE, 8
++  |  add TMP3, BASE, NARGS8:RC
++  |1:
++  |   ld TMP1, 0(TMP2)
++  |  bxeq TMP2, TMP3, ->fff_resi
++  |  gettp TMP0, TMP1
++  |   addi TMP2, TMP2, 8
++  |  bne TMP0, TISNUM, >2
++  |  zext.w TMP1, TMP1
++  |   bins CRET1, CRET1, TMP1
++  |  j <1
++  |2:
++  |   fld FARG1, -8(TMP2)
++  |  sltiu TMP0, TMP0, LJ_TISNUM
++  |   fadd.d FARG1, FARG1, TOBIT
++  |  bxeqz TMP0, ->fff_fallback
++  |  fmv.x.w TMP1, FARG1
++  |  zext.w TMP1, TMP1
++  |   bins CRET1, CRET1, TMP1
++  |  j <1
++  |.endmacro
++  |
++  |.ffunc_bit_op band, and
++  |.ffunc_bit_op bor, or
++  |.ffunc_bit_op bxor, xor
++  |
++  |.ffunc_bit bswap			// Reverse the four bytes of the 32 bit value in CARG1.
++  |  srliw CARG2, CARG1, 8
++  |   lui CARG3, 16			// 0x10000 ...
++  |   addiw CARG3, CARG3, -256		// ... - 0x100 = 0xff00.
++  |  and CARG2, CARG2, CARG3		// Byte 2 moved to byte 1.
++  |   srliw CARG3, CARG1, 24		// Byte 3 moved to byte 0.
++  |  or CARG2, CARG2, CARG3
++  |   slli CARG3, CARG1, 8
++  |    lui CARG4, 0x00ff0		// 0x00ff0000.
++  |   and CARG3, CARG3, CARG4		// Byte 1 moved to byte 2.
++  |  slli CARG1, CARG1, 24		// Byte 0 moved to byte 3 (plus bits above bit 31).
++  |  or CARG1, CARG1, CARG3
++  |  or CARG1, CARG1, CARG2
++  |  slli CARG1, CARG1, 32		// Drop everything above bit 31 ...
++  |  srli CARG1, CARG1, 32		// ... leaving a zero-extended 32 bit result.
++  |  j ->fff_resi
++  |
++  |.ffunc_bit tobit
++  |->fff_resi:
++  |  settp CARG1, TISNUM	// CARG1 = CRET1
++  |  j ->fff_restv
++  |
++  |.ffunc_bit bnot
++  |  not CRET1, CRET1
++  |  zext.w CRET1, CRET1
++  |  j ->fff_resi
++  |
++  |.macro .ffunc_bit_sh, name, shins
++  |  .ffunc_2 bit_..name
++  |  gettp TMP0, CARG1
++  |  beq TMP0, TISNUM, >1
++  |   sltiu TMP1, TMP0, LJ_TISNUM
++  |  jal ->vm_tobit_fb
++  |//  mv CARG1, CRET1		// CARG1 = CRET1
++  |1:
++  |  gettp TMP0, CARG2
++  |   zext.w CARG2, CARG2
++  |  bxne TMP0, TISNUM, ->fff_fallback
++  |  sext.w CARG1, CARG1
++  |  shins CRET1, CARG1, CARG2
++  |   zext.w CRET1, CRET1
++  |  j ->fff_resi
++  |.endmacro
++  |
++  |.ffunc_bit_sh lshift, sllw
++  |.ffunc_bit_sh rshift, srlw
++  |.ffunc_bit_sh arshift, sraw
++  |
++  |.macro .ffunc_bit_rot, name, rotinsa, rotinsb
++  |  .ffunc_2 bit_..name
++  |  gettp TMP0, CARG1
++  |  beq TMP0, TISNUM, >1
++  |   sltiu TMP1, TMP0, LJ_TISNUM
++  |  jal ->vm_tobit_fb
++  |//  mv CARG1, CRET1		// CARG1 = CRET1
++  |1:
++  |  gettp TMP0, CARG2
++  |   zext.w CARG2, CARG2
++  |  bxne TMP0, TISNUM, ->fff_fallback
++  |  sext.w CARG1, CARG1
++  |  neg TMP2, CARG2
++  |  rotinsa TMP1, CARG1, CARG2
++  |  rotinsb TMP0, CARG1, TMP2
++  |  or CRET1, TMP0, TMP1
++  |   zext.w CRET1, CRET1
++  |  j ->fff_resi
++  |.endmacro
++  |
++  |.ffunc_bit_rot rol, sllw, srlw
++  |.ffunc_bit_rot ror, srlw, sllw
++  |
++  |//-----------------------------------------------------------------------
++  |
++  |->fff_fallback:			// Call fast function fallback handler.
++  |  // BASE = new base, RB = CFUNC, RC = nargs*8
++  |   ld PC, FRAME_PC(BASE)		// Fallback may overwrite PC.
++  |  ld CARG3, CFUNC:RB->f		// Handler address. Callers must not have clobbered RB.
++  |    add TMP1, BASE, NARGS8:RC
++  |  sd BASE, L->base
++  |    addi TMP0, TMP1, 8*LUA_MINSTACK	// Stack top needed by the handler.
++  |     ld TMP2, L->maxstack
++  |   sd PC, SAVE_PC(sp)			// Redundant (but a defined value).
++  |    sd TMP1, L->top
++  |   mv CARG1, L
++  |  bltu TMP2, TMP0, >5			// Need to grow stack.
++  |  jalr CARG3				// (lua_State *L)
++  |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
++  |  ld BASE, L->base
++  |   slliw RD, CRET1, 3			// RD = (nresults+1)*8 for the positive case.
++  |  bxgtz CRET1, ->fff_res		// Returned nresults+1?
++  |1:  // Returned 0 or -1: retry fast path.
++  |   ld LFUNC:RB, FRAME_FUNC(BASE)	// Reload the function from the frame.
++  |  ld TMP0, L->top
++  |   sub NARGS8:RC, TMP0, BASE		// Recompute nargs*8 from L->top.
++  |   cleartp LFUNC:RB
++  |  bxnez CRET1, ->vm_call_tail		// Returned -1?
++  |  ins_callt				// Returned 0: retry fast path.
++  |
++  |// Reconstruct previous base for vmeta_call during tailcall.
++  |->vm_call_tail:
++  |  andi TMP0, PC, FRAME_TYPE
++  |   andi TMP1, PC, ~FRAME_TYPEP	// TODO
++  |  bnez TMP0, >3			// Non-zero type bits: delta is PC with low bits masked.
++  |  lbu TMP1, OFS_RA(PC)			// Else take the byte at OFS_RA(PC) ...
++  |  slliw TMP1, TMP1, 3			// ... scale it to bytes ...
++  |  addiw TMP1, TMP1, 16			// ... and add 16.
++  |3:
++  |   sub TMP2, BASE, TMP1			// TMP2 = previous base.
++  |  j ->vm_call_dispatch		// Resolve again for tailcall.
++  |
++  |5:  // Grow stack for fallback handler.
++  |  li CARG2, LUA_MINSTACK
++  |   mv CARG1, L
++  |  call_intern vm_call_tail, lj_state_growstack	// (lua_State *L, int n)
++  |  ld BASE, L->base
++  |   mv CRET1, x0		// Set zero-flag to force retry.
++  |  j <1
++  |
++  |->fff_gcstep:			// Call GC step function.
++  |  // BASE = new base, RC = nargs*8
++  |  mv MULTRES, ra			// Park the return address across the C call.
++  |  add TMP0, BASE, NARGS8:RC	// Calculate L->top.
++  |   sd BASE, L->base
++  |   sd PC, SAVE_PC(sp)		// Redundant (but a defined value).
++  |   mv CARG1, L
++  |  sd TMP0, L->top
++  |  call_intern fff_gc_step, lj_gc_step	// (lua_State *L)
++  |   ld BASE, L->base			// Reload BASE from L after the call.
++  |  mv ra, MULTRES			// Help return address predictor.
++  |    ld TMP0, L->top
++  |  ld CFUNC:RB, FRAME_FUNC(BASE)	// Re-establish RB = CFUNC for the caller.
++  |  cleartp CFUNC:RB
++  |   sub NARGS8:RC, TMP0, BASE		// Re-establish RC = nargs*8 from L->top.
++  |  ret
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Special dispatch targets -------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_record:				// Dispatch target for recording phase.
++  |.if JIT
++  |  lbu TMP3, GL->hookmask
++  |  andi TMP1, TMP3, HOOK_VMEVENT	// No recording while in vmevent.
++  |  bnez TMP1, >5
++  |  // Decrement the hookcount for consistency, but always do the call.
++  |  lw TMP2, GL->hookcount
++  |  andi TMP1, TMP3, HOOK_ACTIVE
++  |  bnez TMP1, >1
++  |  addiw TMP2, TMP2, -1
++  |  andi TMP1, TMP3, LUA_MASKLINE|LUA_MASKCOUNT
++  |  beqz TMP1, >1
++  |  sw TMP2, GL->hookcount
++  |  j >1
++  |.endif
++  |
++  |->vm_rethook:			// Dispatch target for return hooks.
++  |   lbu TMP3, GL->hookmask
++  |  andi TMP1, TMP3, HOOK_ACTIVE		// Hook already active?
++  |  beqz TMP1, >1
++  |5:  // Re-dispatch to static ins.
++  |   ld TMP1, GG_DISP2STATIC(TMP0)	// Assumes TMP0 holds DISPATCH+OP*4.
++  |  jr TMP1
++  |
++  |->vm_inshook:			// Dispatch target for instr/line hooks.
++  |  lbu TMP3, GL->hookmask
++  |  lw TMP2, GL->hookcount
++  |  andi TMP1, TMP3, HOOK_ACTIVE		// Hook already active?
++  |  bnez TMP1, <5
++  |   andi TMP1, TMP3, LUA_MASKLINE|LUA_MASKCOUNT
++  |   addiw TMP2, TMP2, -1
++  |  beqz TMP1, <5
++  |   sw TMP2, GL->hookcount
++  |  beqz TMP2, >1
++  |  andi TMP1, TMP3, LUA_MASKLINE
++  |  beqz TMP1, <5
++  |1:
++  |   sw MULTRES, TMPD(sp)
++  |  mv CARG2, PC
++  |   sd BASE, L->base
++  |   mv CARG1, L
++  |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
++  |  call_intern vm_inshook, lj_dispatch_ins	// (lua_State *L, const BCIns *pc)
++  |3:
++  |  ld BASE, L->base
++  |4:  // Re-dispatch to static ins.
++  |  lw INS, -4(PC)
++  |  decode_OP8 TMP1, INS
++  |  add TMP0, DISPATCH, TMP1
++  |   decode_RD8a RD, INS
++  |  ld TMP1, GG_DISP2STATIC(TMP0)
++  |   decode_RA8 RA, INS
++  |   decode_RD8b RD
++  |  jr TMP1
++  |
++  |->cont_hook:				// Continue from hook yield.
++  |  addi PC, PC, 4
++  |   lw MULTRES, -24(RB)		// Restore MULTRES for *M ins.
++  |  j <4
++  |
++  |->vm_hotloop:			// Hot loop counter underflow.
++  |.if JIT
++  |  ld LFUNC:TMP1, FRAME_FUNC(BASE)
++  |  addi CARG1, GL, GG_G2J
++  |  cleartp LFUNC:TMP1
++  |  sd PC, SAVE_PC(sp)
++  |  ld TMP1, LFUNC:TMP1->pc
++  |  mv CARG2, PC
++  |  sd L, (offsetof(jit_State, L))(CARG1)
++  |  lbu TMP1, PC2PROTO(framesize)(TMP1)
++  |  sd BASE, L->base
++  |  slli TMP1, TMP1, 3
++  |  add TMP1, BASE, TMP1
++  |  sd TMP1, L->top
++  |  call_intern vm_hotloop, lj_trace_hot	// (jit_State *J, const BCIns *pc)
++  |  j <3
++  |.endif
++  |
++  |
++  |->vm_callhook:			// Dispatch target for call hooks.
++  |  mv CARG2, PC
++  |.if JIT
++  |  j >1
++  |.endif
++  |
++  |->vm_hotcall:			// Hot call counter underflow.
++  |.if JIT
++  |  ori CARG2, PC, 1
++  |1:
++  |.endif
++  |  add TMP0, BASE, RC
++  |  sd PC, SAVE_PC(sp)
++  |  sd BASE, L->base
++  |  sub RA, RA, BASE
++  |  sd TMP0, L->top
++  |  mv CARG1, L
++  |  call_intern vm_hotcall, lj_dispatch_call	// (lua_State *L, const BCIns *pc)
++  |  // Returns ASMFunction.
++  |  ld BASE, L->base
++  |  ld TMP0, L->top
++  |  sd x0, SAVE_PC(sp)		// Invalidate for subsequent line hook.
++  |  add RA, BASE, RA
++  |  sub NARGS8:RC, TMP0, BASE
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)
++  |  cleartp LFUNC:RB
++  |  lw INS, -4(PC)
++  |  jr CRET1
++  |
++  |->cont_stitch:			// Trace stitching.
++  |.if JIT
++  |  // RA = resultptr, RB = meta base
++  |  lw INS, -4(PC)
++  |  ld TRACE:TMP2, -40(RB)		// Save previous trace.
++  |  decode_RA8 RC, INS
++  |  addi TMP1, MULTRES, -8
++  |  cleartp TRACE:TMP2
++  |  add RC, BASE, RC			// Call base.
++  |  beqz TMP1, >2
++  |1:  // Move results down.
++  |  ld CARG1, 0(RA)
++  |  addi TMP1, TMP1, -8
++  |  addi RA, RA, 8
++  |  sd CARG1, 0(RC)
++  |  addi RC, RC, 8
++  |  bnez TMP1, <1
++  |2:
++  |  decode_RA8 RA, INS
++  |  decode_RB8 RB, INS
++  |  add RA, RA, RB
++  |  add RA, BASE, RA
++  |3:
++  |  bltu RC, RA, >8			// More results wanted?
++  |
++  |  lhu TMP3, TRACE:TMP2->traceno
++  |  lhu RD, TRACE:TMP2->link
++  |  bxeq RD, TMP3, ->cont_nop		// Blacklisted.
++  |  slliw RD, RD, 3
++  |  bxnez RD, =>BC_JLOOP		// Jump to stitched trace.
++  |
++  |  // Stitch a new trace to the previous trace.
++  |  addi CARG1, GL, GG_G2J
++  |  // addi CARG2, CARG1, 1		// We don't care what's on the verge.
++  |  addi CARG2, CARG1, 2047		// jit_State too large.
++  |  sw TMP3, (offsetof(jit_State, exitno)-2047)(CARG2)
++  |  sd L, (offsetof(jit_State, L)-2047)(CARG2)
++  |  sd BASE, L->base
++  |  mv CARG2, PC
++  |  // (jit_State *J, const BCIns *pc)
++  |  call_intern cont_stitch, lj_dispatch_stitch
++  |  ld BASE, L->base
++  |  j ->cont_nop
++  |
++  |8:
++  |  sd TISNIL, 0(RC)
++  |  addi RC, RC, 8
++  |  j <3
++  |.endif
++  |
++  |->vm_profhook:			// Dispatch target for profiler hook.
++#if LJ_HASPROFILE
++  |   mv CARG1, L
++  |  mv CARG2, PC
++  |   sd BASE, L->base
++  |   sw MULTRES, TMPD(sp)
++  |  // (lua_State *L, const BCIns *pc)
++  |  call_intern vm_profhook, lj_dispatch_profile
++  |  // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction.
++  |  addi PC, PC, -4
++  |   ld BASE, L->base
++  |  j ->cont_nop
++#endif
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Trace exit handler -------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |.macro savex_, a, b
++  |  fsd f..a, a*8(sp)
++  |  fsd f..b, b*8(sp)
++  |  sd x..a, 32*8+a*8(sp)
++  |  sd x..b, 32*8+b*8(sp)
++  |.endmacro
++  |
++  |->vm_exit_handler:
++  |.if JIT
++  |  addi sp, sp, -(32*8+32*8)
++  |  savex_ 0, 5
++  |  savex_ 6, 7
++  |  savex_ 8, 9
++  |  savex_ 10, 11
++  |  savex_ 12, 13
++  |  savex_ 14, 15
++  |  savex_ 16, 17
++  |  savex_ 18, 19
++  |  savex_ 20, 21
++  |  savex_ 22, 23
++  |  savex_ 24, 25
++  |  savex_ 26, 27
++  |  savex_ 28, 29
++  |  savex_ 30, 31
++  |  fsd f1, 1*8(sp)
++  |  fsd f2, 2*8(sp)
++  |  fsd f3, 3*8(sp)
++  |  fsd f4, 4*8(sp)
++  |  sd x0, 32*8+1*8(sp)		// Clear RID_TMP.
++  |  ld TMP1, 32*8+32*8(sp)			// Load exit pc.
++  |   addi TMP2, sp, 32*8+32*8		// Recompute original value of sp.
++  |  addxi DISPATCH, GL, GG_G2DISP
++  |   sd TMP2, 32*8+2*8(sp)		// Store sp in RID_SP
++  |  addi CARG1, GL, GG_G2J
++  |    li_vmstate EXIT
++  |  // addi CARG2, CARG1, 1		// We don't care what's on the verge.
++  |  addi CARG2, CARG1, 2047		// jit_State too large.
++  |  sub TMP1, TMP1, ra
++  |   lw TMP2, 0(ra)			// Load trace number.
++  |    st_vmstate
++  |  srli TMP1, TMP1, 2
++  |  ld L, GL->cur_L
++  |  ld BASE, GL->jit_base
++  |  srli TMP2, TMP2, 12
++  |  addi TMP1, TMP1, -2
++  |  sd L, (offsetof(jit_State, L)-2047)(CARG2)
++  |  sw TMP2, (offsetof(jit_State, parent)-2047)(CARG2)	// Store trace number.
++  |  sd BASE, L->base
++  |  sw TMP1, (offsetof(jit_State, exitno)-2047)(CARG2)	// Store exit number.
++  |  sd x0, GL->jit_base
++  |  mv CARG2, sp
++  |  call_intern vm_exit_handler, lj_trace_exit	// (jit_State *J, ExitState *ex)
++  |  // Returns MULTRES (unscaled) or negated error code.
++  |  ld TMP1, L->cframe
++  |  ld BASE, L->base
++  |  andi sp, TMP1, CFRAME_RAWMASK
++  |  ld PC, SAVE_PC(sp)		// Get SAVE_PC.
++  |  sd L, SAVE_L(sp)			// Set SAVE_L (on-trace resume/yield).
++  |  j >1
++  |.endif
++  |
++  |->vm_exit_interp:
++  |.if JIT
++  |  // CRET1 = MULTRES or negated error code, BASE, PC and JGL set.
++  |  ld L, SAVE_L(sp)
++  |  addxi DISPATCH, GL, GG_G2DISP
++  |  sd BASE, L->base
++  |1:
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)
++  |  sltiu TMP0, CRET1, -LUA_ERRERR  // Check for error from exit.
++  |  beqz TMP0, >9
++  |  lui TMP3, 0x43380		// TOBIT = Hiword of 2^52 + 2^51 (double).
++  |  slli MULTRES, CRET1, 3
++  |  cleartp LFUNC:RB
++  |  sw MULTRES, TMPD(sp)
++  |  li TISNIL, LJ_TNIL
++  |  li TISNUM, LJ_TISNUM		// Setup type comparison constants.
++  |  slli TMP3, TMP3, 32
++  |  ld TMP1, LFUNC:RB->pc
++  |  sd x0, GL->jit_base
++  |  ld KBASE, PC2PROTO(k)(TMP1)
++  |  fmv.d.x TOBIT, TMP3
++  |  // Modified copy of ins_next which handles function header dispatch, too.
++  |  lw INS, 0(PC)
++  |   addi PC, PC, 4
++  |  addiw CRET1, CRET1, 17		// Static dispatch?
++  |  // Assumes TISNIL == ~LJ_VMST_INTERP == -1
++  |  sw TISNIL, GL->vmstate
++  |   decode_RD8a RD, INS
++  |  beqz CRET1, >5
++  |  decode_OP8 TMP1, INS
++  |  add TMP0, DISPATCH, TMP1
++  |    sltiu TMP2, TMP1, BC_FUNCF*8
++  |  ld TMP3, 0(TMP0)
++  |   decode_RA8 RA, INS
++  |    beqz TMP2, >2
++  |   decode_RD8b RD
++  |  jr TMP3
++  |2:
++  |  sltiu TMP2, TMP1, (BC_FUNCC+2)*8	// Fast function?
++  |  ld TMP1, FRAME_PC(BASE)
++  |  bnez TMP2, >3
++  |  // Check frame below fast function.
++  |  andi TMP0, TMP1, FRAME_TYPE
++  |  bnez TMP0, >3			// Trace stitching continuation?
++  |  // Otherwise set KBASE for Lua function below fast function.
++  |  lw TMP2, -4(TMP1)
++  |  decode_RA8 TMP0, TMP2
++  |  sub TMP1, BASE, TMP0
++  |  ld LFUNC:TMP2, -32(TMP1)
++  |  cleartp LFUNC:TMP2
++  |  ld TMP1, LFUNC:TMP2->pc
++  |  ld KBASE, PC2PROTO(k)(TMP1)
++  |3:
++  |  addi RC, MULTRES, -8
++  |  add RA, RA, BASE
++  |  jr TMP3
++  |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  ld TMP0, GL_J(trace)(GL)
++  |  decode_RD8b RD
++  |  add TMP0, TMP0, RD
++  |  ld TRACE:TMP2, 0(TMP0)
++  |  lw INS, TRACE:TMP2->startins
++  |  decode_OP8 TMP1, INS
++  |  add TMP0, DISPATCH, TMP1
++  |   decode_RD8a RD, INS
++  |  ld TMP3, GG_DISP2STATIC(TMP0)
++  |   decode_RA8a RA, INS
++  |   decode_RD8b RD
++  |   decode_RA8b RA
++  |  jr TMP3
++  |
++  |9:  // Rethrow error from the right C frame.
++  |  negw CARG2, CRET1
++  |  mv CARG1, L
++  |  call_intern vm_exit_interp, lj_err_trace	// (lua_State *L, int errcode)
++  |.endif
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Math helper functions ----------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |
++  |// Hard-float round to integer. Input x in FARG1; 'func' is floor, ceil or trunc.
++  |// Modifies TMP0, TMP1, FARG1, FARG5, FTMP1, FTMP3, FTMP4
++  |.macro vm_round_hf, func
++  |  lui TMP0, 0x43300		// Hiword of 2^52 (double).
++  |  slli TMP0, TMP0, 32
++  |  fmv.d.x FARG5, TMP0		// FARG5 = 2^52.
++  |  fabs.d FTMP4, FARG1		// |x|
++  |   fmv.x.d TMP1, FARG1		// Raw bits of x, used below for its sign.
++  |  flt.d TMP0, FTMP4, FARG5		// TMP0 = (|x| < 2^52).
++  |  fadd.d FTMP3, FTMP4, FARG5		// (|x| + 2^52) - 2^52
++  |  fsub.d FTMP3, FTMP3, FARG5
++  |  beqz TMP0, >5			// Truncate only if |x| < 2^52.
++  |  sltz TMP1, TMP1			// TMP1 = 1 if the sign bit of x is set.
++  |.if "func" == "ceil"
++  |  lui TMP0, 0xbff00	// Hiword of -1 (double).
++  |.else
++  |  lui TMP0, 0x3ff00	// Hiword of +1 (double).
++  |.endif
++  |.if "func" == "trunc"
++  |  slli TMP0, TMP0, 32
++  |  fmv.d.x FARG5, TMP0		// FARG5 = +1.0 (non-ceil constant from above).
++  |  flt.d TMP0, FTMP4, FRET1	// |x| < result? NOTE(review): rounded |x| is in FTMP3; confirm FRET1 is the intended operand.
++  |  fsub.d FTMP4, FTMP3, FARG5		// Candidate: rounded |x| minus 1.
++  |  beqz TMP0, >1
++  |  fmv.d FTMP1, FTMP4		// Compare was true: use the adjusted value.
++  |  j >2
++  |1:
++  |  fmv.d FTMP1, FTMP3		// Compare was false: keep rounded |x|.
++  |2:
++  |  fneg.d FTMP4, FTMP1
++  |  beqz TMP1, >3			// x not negative: keep the magnitude.
++  |  fmv.d FTMP3, FTMP4		// x negative: negated magnitude. Result is left in FTMP3.
++  |  j >4
++  |3:
++  |  fmv.d FTMP3, FTMP1		// Result is left in FTMP3 (not FRET1) in the trunc variant.
++  |4:
++  |  ret
++  |.else
++  |  fneg.d FTMP4, FTMP3
++  |  slli TMP0, TMP0, 32
++  |  fmv.d.x FARG5, TMP0		// FARG5 = -1.0 for ceil, +1.0 for floor.
++  |  beqz TMP1, >1			// Give the rounded |x| the sign of x.
++  |  fmv.d FTMP1, FTMP4
++  |  j >2
++  |1:
++  |  fmv.d FTMP1, FTMP3
++  |2:
++  |.if "func" == "ceil"
++  |  flt.d TMP0, FTMP1, FARG1	// x > result?
++  |.else
++  |  flt.d TMP0, FARG1, FTMP1	// x < result?
++  |.endif
++  |  beqz TMP0, >3
++  |  fsub.d FTMP4, FTMP1, FARG5		// If yes, subtract +-1.
++  |  fmv.d FRET1, FTMP4
++  |  j >4
++  |3:
++  |  fmv.d FRET1, FTMP1		// No correction needed: return the signed rounded value.
++  |4:
++  |  ret
++  |.endif
++  |5:
++  |  fmv.d FTMP3, FARG1		// |x| not below 2^52: copy x through unchanged into FTMP3.
++  |  ret
++  |.endmacro
++  |
++  |
++  |->vm_floor:
++  |  vm_round_hf floor
++  |->vm_ceil:
++  |  vm_round_hf ceil
++  |->vm_trunc:
++  |.if JIT
++  |  vm_round_hf trunc
++  |.endif
++  |
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Miscellaneous functions --------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |.define NEXT_TAB,            TAB:CARG1
++  |.define NEXT_IDX,            CARG2
++  |.define NEXT_ASIZE,          CARG3
++  |.define NEXT_NIL,            CARG4
++  |.define NEXT_TMP0,           TMP0
++  |.define NEXT_TMP1,           TMP1
++  |.define NEXT_TMP2,           TMP2
++  |.define NEXT_RES_VK,         CRET1
++  |.define NEXT_RES_IDX,        CRET2
++  |.define NEXT_RES_PTR,        sp
++  |.define NEXT_RES_VAL,        0(sp)
++  |.define NEXT_RES_KEY,        8(sp)
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in CRET2.
++  |->vm_next:
++  |.if JIT
++  |  lw NEXT_ASIZE, NEXT_TAB->asize
++  |  ld NEXT_TMP0, NEXT_TAB->array
++  |  li NEXT_NIL, LJ_TNIL
++  |1:  // Traverse array part.
++  |  bgeu NEXT_IDX, NEXT_ASIZE, >5		// idx >= asize: continue in the hash part.
++  |  slliw NEXT_TMP1, NEXT_IDX, 3		// idx*8: byte offset of the array slot.
++  |  add NEXT_TMP1, NEXT_TMP0, NEXT_TMP1
++  |  li TMP3, LJ_TISNUM
++  |  ld NEXT_TMP2, 0(NEXT_TMP1)		// Value of array slot idx.
++  |  slli TMP3, TMP3, 47
++  |  or NEXT_TMP1, NEXT_IDX, TMP3		// Key = idx | (LJ_TISNUM << 47).
++  |  addiw NEXT_IDX, NEXT_IDX, 1
++  |  beq NEXT_TMP2, NEXT_NIL, <1		// Skip nil slots in the array part.
++  |  sd NEXT_TMP2, NEXT_RES_VAL		// Store value at 0(sp) and key at 8(sp).
++  |  sd NEXT_TMP1, NEXT_RES_KEY
++  |  mv NEXT_RES_VK, NEXT_RES_PTR		// Return sp as the pointer to the val/key pair.
++  |  mv NEXT_RES_IDX, NEXT_IDX
++  |  ret
++  |
++  |5:  // Traverse hash part.
++  |  subw NEXT_RES_IDX, NEXT_IDX, NEXT_ASIZE	// Hash index = idx - asize.
++  |  lw NEXT_TMP0, NEXT_TAB->hmask
++  |  ld NODE:NEXT_RES_VK, NEXT_TAB->node
++  |  slliw NEXT_TMP2, NEXT_RES_IDX, 5
++  |  slliw TMP3, NEXT_RES_IDX, 3
++  |  subw TMP3, NEXT_TMP2, TMP3		// hidx*32 - hidx*8 = hidx*24; presumably sizeof(Node) -- confirm.
++  |  add NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, TMP3
++  |6:
++  |  bltu NEXT_TMP0, NEXT_RES_IDX, >8		// hidx > hmask: no more nodes.
++  |  ld NEXT_TMP2, NODE:NEXT_RES_VK->val
++  |  addiw NEXT_RES_IDX, NEXT_RES_IDX, 1
++  |  bne NEXT_TMP2, NEXT_NIL, >9		// Non-nil value: return this node pointer.
++  |  // Skip holes in hash part.
++  |  addi NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, sizeof(Node)
++  |  j <6
++  |
++  |8:  // End of iteration. Set the key to nil (not the value).
++  |  sd NEXT_NIL, NEXT_RES_KEY
++  |  mv NEXT_RES_VK, NEXT_RES_PTR
++  |9:
++  |  addw NEXT_RES_IDX, NEXT_RES_IDX, NEXT_ASIZE	// Convert hash index back to a combined index.
++  |  ret
++  |.endif
++  |
++  |//-----------------------------------------------------------------------
++  |//-- FFI helper functions -----------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |// Handler for callback functions. Callback slot number in x5, g in x7.
++  |->vm_ffi_callback:
++  |.if FFI
++  |.type CTSTATE, CTState, PC
++  |  saveregs
++  |  ld CTSTATE, GL:x7->ctype_state
++  |  mv GL, x7
++  |  addxi DISPATCH, x7, GG_G2DISP
++  |  srli x5, x5, 12
++  |  sw x5, CTSTATE->cb.slot
++  |  sd CARG1, CTSTATE->cb.gpr[0]
++  |  fsd FARG1, CTSTATE->cb.fpr[0]
++  |  sd CARG2, CTSTATE->cb.gpr[1]
++  |  fsd FARG2, CTSTATE->cb.fpr[1]
++  |  sd CARG3, CTSTATE->cb.gpr[2]
++  |  fsd FARG3, CTSTATE->cb.fpr[2]
++  |  sd CARG4, CTSTATE->cb.gpr[3]
++  |  fsd FARG4, CTSTATE->cb.fpr[3]
++  |  sd CARG5, CTSTATE->cb.gpr[4]
++  |  fsd FARG5, CTSTATE->cb.fpr[4]
++  |  sd CARG6, CTSTATE->cb.gpr[5]
++  |  fsd FARG6, CTSTATE->cb.fpr[5]
++  |  sd CARG7, CTSTATE->cb.gpr[6]
++  |  fsd FARG7, CTSTATE->cb.fpr[6]
++  |  sd CARG8, CTSTATE->cb.gpr[7]
++  |  fsd FARG8, CTSTATE->cb.fpr[7]
++  |  addi TMP0, sp, CFRAME_SPACE
++  |  sd TMP0, CTSTATE->cb.stack
++  |  sd x0, SAVE_PC(sp)			// Any value outside of bytecode is ok.
++  |  mv CARG1, CTSTATE
++  |  mv CARG2, sp
++  |  call_intern vm_ffi_callback, lj_ccallback_enter	// (CTState *cts, void *cf)
++  |  // Returns lua_State *.
++  |  ld BASE, L:CRET1->base
++  |  ld RC, L:CRET1->top
++  |  mv L, CRET1
++  |  lui TMP3, 0x43380			// TOBIT = Hiword of 2^52 + 2^51 (double).
++  |  ld LFUNC:RB, FRAME_FUNC(BASE)
++  |  li TISNIL, LJ_TNIL
++  |  li TISNUM, LJ_TISNUM
++  |  slli TMP3, TMP3, 32
++  |  li_vmstate INTERP
++  |  subw RC, RC, BASE
++  |  cleartp LFUNC:RB
++  |  st_vmstate
++  |  fmv.d.x TOBIT, TMP3
++  |  ins_callt
++  |.endif
++  |
++  |->cont_ffi_callback:				// Return from FFI callback.
++  |.if FFI
++  |  ld CTSTATE, GL->ctype_state
++  |  sd BASE, L->base
++  |  sd RB, L->top
++  |  sd L, CTSTATE->L
++  |  mv CARG1, CTSTATE
++  |  mv CARG2, RA
++  |  // (CTState *cts, TValue *o)
++  |  call_intern cont_ffi_callback, lj_ccallback_leave
++  |  fld FRET1, CTSTATE->cb.fpr[0]
++  |  ld CRET1, CTSTATE->cb.gpr[0]
++  |  fld FRET2, CTSTATE->cb.fpr[1]
++  |  ld CRET2, CTSTATE->cb.gpr[1]
++  |  j ->vm_leave_unw
++  |.endif
++  |
++  |->vm_ffi_call:			// Call C function via FFI.
++  |  // Caveat: needs special frame unwinding, see below.
++  |.if FFI
++  |  .type CCSTATE, CCallState, CARG1
++  |  lw TMP1, CCSTATE->spadj
++  |  lbu CARG2, CCSTATE->nsp
++  |  lbu CARG3, CCSTATE->nfpr
++  |  mv TMP2, sp				// Remember the incoming sp.
++  |  sub sp, sp, TMP1			// Lower sp by spadj.
++  |  sd ra, -8(TMP2)			// Save ra, x18 and CCSTATE just below the incoming sp.
++  |  sd x18, -16(TMP2)
++  |  sd CCSTATE, -24(TMP2)
++  |  mv x18, TMP2				// x18 = incoming sp; used to find the saves after the call.
++  |  addi TMP1, CCSTATE, offsetof(CCallState, stack)	// Source: CCSTATE->stack.
++  |  mv TMP2, sp				// Destination: new stack area.
++  |  add TMP3, TMP1, CARG2			// End of source = stack + nsp.
++  |  beqz CARG2, >2			// nsp == 0: nothing to copy.
++  |1:
++  |  ld TMP0, 0(TMP1)			// Copy stack arguments, 8 bytes per iteration.
++  |  addi TMP1, TMP1, 8
++  |  sd TMP0, 0(TMP2)
++  |  addi TMP2, TMP2, 8
++  |  bltu TMP1, TMP3, <1
++  |2:
++  |  beqz CARG3, >3			// nfpr == 0: skip loading the FP argument registers.
++  |  fld FARG1, CCSTATE->fpr[0]
++  |  fld FARG2, CCSTATE->fpr[1]
++  |  fld FARG3, CCSTATE->fpr[2]
++  |  fld FARG4, CCSTATE->fpr[3]
++  |  fld FARG5, CCSTATE->fpr[4]
++  |  fld FARG6, CCSTATE->fpr[5]
++  |  fld FARG7, CCSTATE->fpr[6]
++  |  fld FARG8, CCSTATE->fpr[7]
++  |3:
++  |  ld CFUNCADDR, CCSTATE->func
++  |  ld CARG2, CCSTATE->gpr[1]
++  |  ld CARG3, CCSTATE->gpr[2]
++  |  ld CARG4, CCSTATE->gpr[3]
++  |  ld CARG5, CCSTATE->gpr[4]
++  |  ld CARG6, CCSTATE->gpr[5]
++  |  ld CARG7, CCSTATE->gpr[6]
++  |  ld CARG8, CCSTATE->gpr[7]
++  |  ld CARG1, CCSTATE->gpr[0]		// Do this last, since CCSTATE is CARG1.
++  |  jalr CFUNCADDR
++  |  ld CCSTATE:TMP1, -24(x18)		// Reload the saved CCSTATE pointer.
++  |  ld TMP0, -16(x18)			// Saved x18 (restored last, x18 is still needed).
++  |  ld ra, -8(x18)
++  |  sd CRET1, CCSTATE:TMP1->gpr[0]		// Store integer and FP return values back.
++  |  sd CRET2, CCSTATE:TMP1->gpr[1]
++  |  fsd FRET1, CCSTATE:TMP1->fpr[0]
++  |  fsd FRET2, CCSTATE:TMP1->fpr[1]
++  |  mv sp, x18				// Restore the incoming sp.
++  |  mv x18, TMP0				// Restore x18.
++  |  ret
++  |.endif
++  |// Note: vm_ffi_call must be the last function in this object file!
++  |
++  |//-----------------------------------------------------------------------
++}
++
++/* Generate the code for a single instruction. */
++static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++{
++  int vk = 0;
++  |=>defop:
++
++  switch (op) {
++
++  /* -- Comparison ops ---------------------------------------------------- */
++
++  /* Remember: all ops branch for a true comparison, fall through otherwise. */
++
++  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
++    |  // RA = src1*8, RD = src2*8, JMP with RD = target
++    |  add RA, BASE, RA
++    |  add RD, BASE, RD
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  ld CARG1, 0(RA)
++      |   ld CARG2, 0(RD)
++      |  gettp CARG3, CARG1
++      |   gettp CARG4, CARG2
++    } else {
++      |  ld CARG2, 0(RA)
++      |   ld CARG1, 0(RD)
++      |  gettp CARG3, CARG2
++      |   gettp CARG4, CARG1
++    }
++    |  lhu TMP2, OFS_RD(PC)		// TMP2=jump
++    |   addi PC, PC, 4
++    |  bne CARG3, TISNUM, >2
++    |  decode_BC4b TMP2
++    |   bne CARG4, TISNUM, >5
++    |  sext.w CARG1, CARG1
++    |  sext.w CARG2, CARG2
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  slt TMP1, CARG1, CARG2
++    |  addw TMP2, TMP2, TMP3		// TMP2=(jump-0x8000)<<2
++    if (op == BC_ISLT || op == BC_ISGT) {
++      |  neg TMP1, TMP1
++    } else {
++      |  addi TMP1, TMP1, -1
++    }
++    |  and TMP2, TMP2, TMP1
++    |1:
++    |  add PC, PC, TMP2
++    |  ins_next
++    |
++    |2:  // RA is not an integer.
++    |  sltiu TMP1, CARG3, LJ_TISNUM
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  bxeqz TMP1, ->vmeta_comp
++    |  sltiu TMP1, CARG4, LJ_TISNUM
++    |  decode_BC4b TMP2
++    |  beqz TMP1, >4
++    |  fmv.d.x FTMP0, CARG1
++    |  fmv.d.x FTMP2, CARG2
++    |3:  // RA and RD are both numbers.
++    |  addw TMP2, TMP2, TMP3
++    if (op == BC_ISLT) {
++      |  flt.d TMP3, FTMP0, FTMP2
++      |  neg TMP3, TMP3
++    } else if (op == BC_ISGE) {
++      |  flt.d TMP3, FTMP0, FTMP2
++      |  addi TMP3, TMP3, -1
++    } else if (op == BC_ISLE) {
++      |  fle.d TMP3, FTMP2, FTMP0
++      |  neg TMP3, TMP3
++    } else if (op == BC_ISGT) {
++      |  fle.d TMP3, FTMP2, FTMP0
++      |  addi TMP3, TMP3, -1
++    }
++    |  and TMP2, TMP2, TMP3
++    |  j <1
++    |
++    |4:  // RA is a number, RD is not a number.
++    |  // RA is a number, RD is an integer. Convert RD to a number.
++    |  bxne CARG4, TISNUM, ->vmeta_comp
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  fcvt.d.w FTMP2, CARG2
++      |  fmv.d.x FTMP0, CARG1
++    } else {
++      |  fcvt.d.w FTMP0, CARG1
++      |  fmv.d.x FTMP2, CARG2
++    }
++    |  j <3
++    |
++    |5:  // RA is an integer, RD is not an integer
++    |  sltiu TMP1, CARG4, LJ_TISNUM
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  bxeqz TMP1, ->vmeta_comp
++    |  // RA is an integer, RD is a number. Convert RA to a number.
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  fcvt.d.w FTMP0, CARG1
++      |  fmv.d.x FTMP2, CARG2
++    } else {
++      |  fcvt.d.w FTMP2, CARG2
++      |  fmv.d.x FTMP0, CARG1
++    }
++    |  j <3
++    break;
++
++  case BC_ISEQV: case BC_ISNEV:
++    vk = op == BC_ISEQV;
++    |  // RA = src1*8, RD = src2*8, JMP with RD = target
++    |  add RA, BASE, RA
++    |   add RD, BASE, RD
++    |    addi PC, PC, 4
++    |  ld CARG1, 0(RA)
++    |   ld CARG2, 0(RD)
++    |    lhu TMP2, -4+OFS_RD(PC)		// Jump operand; -4 because PC was already advanced.
++    |  gettp CARG3, CARG1
++    |   gettp CARG4, CARG2
++    |  sltu TMP0, TISNUM, CARG3		// TMP0=1 if tag of RA is above TISNUM (unsigned).
++    |   sltu TMP1, TISNUM, CARG4		// TMP1=1 if tag of RD is above TISNUM (unsigned).
++    |  or TMP0, TMP0, TMP1
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    if (vk) {
++      |  beqz TMP0, ->BC_ISEQN_Z		// Neither tag above TISNUM: reuse the ISEQN code.
++    } else {
++      |  beqz TMP0, ->BC_ISNEN_Z		// Neither tag above TISNUM: reuse the ISNEN code.
++    }
++    |// Either or both types are not numbers.
++    |.if FFI
++    |  // Check if RA or RD is a cdata.
++    |  xori TMP0, CARG3, LJ_TCDATA
++    |  xori TMP1, CARG4, LJ_TCDATA
++    |  and TMP0, TMP0, TMP1
++    |  bxeqz TMP0, ->vmeta_equal_cd
++    |.endif
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4 (same constant as loaded above)
++    |  decode_BC4b TMP2
++    |  addw TMP2, TMP2, TMP3		// (jump-0x8000)<<2
++    |  bne CARG1, CARG2, >2
++    |  // Tag and value are equal.
++    if (vk) {
++      |->BC_ISEQV_Z:
++      |  add PC, PC, TMP2
++    }
++    |1:
++    |  ins_next
++    |
++    |2:  // Check if the tags are the same and it's a table or userdata.
++    |  xor TMP3, CARG3, CARG4			// Same type?
++    |  sltiu TMP0, CARG3, LJ_TISTABUD+1		// Table or userdata? TMP0=1
++    |  beqz TMP3, >3
++    |  mv TMP0, x0		// Types differ: force TMP0=0. At 3: TMP0=1 only for same-type table/userdata.
++    |3:
++    |  cleartp TAB:TMP1, CARG1
++    if (vk) {
++      |  beqz TMP0, <1
++    } else {
++      |  beqz TMP0, ->BC_ISEQV_Z  // Reuse code from opposite instruction.
++    }
++    |  // Different tables or userdatas. Need to check __eq metamethod.
++    |  // Field metatable must be at same offset for GCtab and GCudata!
++    |  ld TAB:TMP3, TAB:TMP1->metatable
++    if (vk) {
++      |  beqz TAB:TMP3, <1		// No metatable?
++      |  lbu TMP3, TAB:TMP3->nomm
++      |  andi TMP3, TMP3, 1<<MM_eq
++      |  li TMP0, 0		// ne = 0
++      |  bnez TMP3, <1			// Or 'no __eq' flag set?
++    } else {
++      |  beqz TAB:TMP3,->BC_ISEQV_Z	// No metatable?
++      |  lbu TMP3, TAB:TMP3->nomm
++      |  andi TMP3, TMP3, 1<<MM_eq
++      |  li TMP0, 1		// ne = 1
++      |  bnez TMP3, ->BC_ISEQV_Z	// Or 'no __eq' flag set?
++    }
++    |  j ->vmeta_equal			// Handle __eq metamethod.
++    break;
++
++  case BC_ISEQS: case BC_ISNES:
++    vk = op == BC_ISEQS;
++    |  // RA = src*8, RD = str_const*8 (~), JMP with RD = target
++    |  add RA, BASE, RA
++    |   addi PC, PC, 4
++    |  ld CARG1, 0(RA)
++    |   sub RD, KBASE, RD
++    |    lhu TMP2, -4+OFS_RD(PC)		// Jump operand; -4 because PC was already advanced.
++    |   ld CARG2, -8(RD)		// KBASE-8-str_const*8
++    |.if FFI
++    |  gettp CARG3, CARG1
++    |  li TMP1, LJ_TCDATA
++    |.endif
++    |  li TMP0, LJ_TSTR
++    |   decode_BC4b TMP2
++    |   settp CARG2, TMP0
++    |   lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |.if FFI
++    |  bxeq CARG3, TMP1, ->vmeta_equal_cd
++    |.endif
++    |  xor TMP0, CARG1, CARG2		// TMP0=0: A==D; TMP0!=0: A!=D
++    |   addw TMP2, TMP2, TMP3		// Biased jump offset.
++    if (vk) {
++      |  seqz TMP4, TMP0
++    } else {
++      |  snez TMP4, TMP0
++    }
++    |  neg TMP4, TMP4			// TMP4 = -1 if the branch is taken, else 0.
++    |  and TMP2, TMP2, TMP4		// Zero the offset when the branch is not taken.
++    |  add PC, PC, TMP2
++    |  ins_next
++    break;
++
++  case BC_ISEQN: case BC_ISNEN:
++    vk = op == BC_ISEQN;
++    |  // RA = src*8, RD = num_const*8, JMP with RD = target
++    |  add RA, BASE, RA
++    |   add RD, KBASE, RD
++    |  ld CARG1, 0(RA)
++    |   ld CARG2, 0(RD)
++    |    lhu TMP2, OFS_RD(PC)
++    |  gettp CARG3, CARG1
++    |   gettp CARG4, CARG2
++    |    addi PC, PC, 4
++    |    lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    if (vk) {
++      |->BC_ISEQN_Z:
++    } else {
++      |->BC_ISNEN_Z:
++    }
++    |  decode_BC4b TMP2
++    |  bne CARG3, TISNUM, >4
++    |  addw TMP2, TMP2, TMP3
++    |  bne CARG4, TISNUM, >6
++    |  xor TMP0, CARG1, CARG2		// TMP0=0: A==D; TMP0!=0: A!=D
++    |1:
++    if (vk) {
++      |  seqz TMP4, TMP0
++      |  neg TMP4, TMP4
++      |  and TMP2, TMP2, TMP4
++      |  add PC, PC, TMP2
++      |2:
++    } else {
++      |  snez TMP4, TMP0
++      |  neg TMP4, TMP4
++      |  and TMP2, TMP2, TMP4
++      |2:
++      |  add PC, PC, TMP2
++    }
++    |3:
++    |  ins_next
++    |
++    |4:  // RA is not an integer.
++    |    addw TMP2, TMP2, TMP3
++    |.if FFI
++    |  bgeu CARG3, TISNUM, >7
++    |.else
++    |  bgeu CARG3, TISNUM, <2
++    |.endif
++    |  fmv.d.x FTMP0, CARG1
++    |   fmv.d.x FTMP2, CARG2
++    |  bne CARG4, TISNUM, >5
++    |// RA is a number, RD is an integer.
++    |  fcvt.d.w FTMP2, CARG2
++    |
++    |5:  // RA and RD are both numbers.
++    |  feq.d TMP0, FTMP0, FTMP2
++    |  seqz TMP0, TMP0
++    |  j <1
++    |
++    |6: // RA is an integer, RD is a number.
++    |.if FFI
++    |  bgeu CARG4, TISNUM, >8
++    |.else
++    |  bgeu CARG4, TISNUM, <2
++    |.endif
++    |  fcvt.d.w FTMP0, CARG1
++    |   fmv.d.x FTMP2, CARG2
++    |  j <5
++    |
++    |.if FFI
++    |7:	// RA not int, not number
++    |  li TMP0, LJ_TCDATA
++    |  bne CARG3, TMP0, <2
++    |  j ->vmeta_equal_cd
++    |
++    |8:	// RD not int, not number
++    |  li TMP0, LJ_TCDATA
++    |  bne CARG4, TMP0, <2
++    |  j ->vmeta_equal_cd
++    |.endif
++    break;
++
++  case BC_ISEQP: case BC_ISNEP:
++    vk = op == BC_ISEQP;
++    |  // RA = src*8, RD = primitive_type*8 (~), JMP with RD = target
++    |  add RA, BASE, RA
++    |   srliw TMP0, RD, 3
++    |  ld TMP1, 0(RA)
++    |   not TMP0, TMP0		// ~TMP0: ~0 ~1 ~2
++    |    lhu TMP2, OFS_RD(PC)		// TMP2: RD in next INS, branch target
++    |  gettp TMP1, TMP1
++    |    addi PC, PC, 4
++    |   xor TMP0, TMP1, TMP0		// TMP0=0 A=D; TMP0!=0 A!=D
++    |.if FFI
++    |  li TMP3, LJ_TCDATA
++    |  bxeq TMP1, TMP3, ->vmeta_equal_cd
++    |.endif
++    |  decode_BC4b TMP2
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  addw TMP2, TMP2, TMP3		// TMP2=(jump-0x8000)<<2
++    if (vk) {
++      |  seqz TMP4, TMP0
++    } else {
++      |  snez TMP4, TMP0
++    }
++    |  neg TMP4, TMP4
++    |  and TMP2, TMP2, TMP4
++    |  add PC, PC, TMP2
++    |  ins_next
++    break;
++
++  /* -- Unary test and copy ops ------------------------------------------- */
++
++  case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
++    |  // RA = dst*8 or unused, RD = src*8, JMP with RD = target
++    |  add RD, BASE, RD
++    |   lhu TMP2, OFS_RD(PC)
++    |  ld TMP0, 0(RD)
++    |   addi PC, PC, 4
++    |  gettp TMP0, TMP0
++    |  add RA, BASE, RA
++    |  sltiu TMP0, TMP0, LJ_TISTRUECOND		// TMP0=1 true; TMP0=0 false
++    |  decode_BC4b TMP2
++    |  lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  ld CRET1, 0(RD)
++    |  addw TMP2, TMP2, TMP3		// (jump-0x8000)<<2
++    if (op == BC_IST || op == BC_ISTC) {
++      |  beqz TMP0, >1
++      if (op == BC_ISTC) {
++        |  sd CRET1, 0(RA)
++      }
++    } else {
++      |  bnez TMP0, >1
++      if (op == BC_ISFC) {
++	|  sd CRET1, 0(RA)
++      }
++    }
++    |  add PC, PC, TMP2
++    |1:
++    |  ins_next
++    break;
++
++  case BC_ISTYPE:
++    |  // RA = src*8, RD = -type*8
++    |  add TMP0, BASE, RA
++    |  srliw TMP1, RD, 3
++    |  ld TMP0, 0(TMP0)
++    |  gettp TMP0, TMP0
++    |  add TMP0, TMP0, TMP1		// if itype of RA == type, then TMP0=0
++    |  bxnez TMP0, ->vmeta_istype
++    |  ins_next
++    break;
++  case BC_ISNUM:
++    |  // RA = src*8, RD = -(TISNUM-1)*8
++    |  add TMP0, BASE, RA
++    |  ld TMP0, 0(TMP0)
++    |  checknum TMP0, ->vmeta_istype
++    |  ins_next
++    break;
++
++  /* -- Unary ops --------------------------------------------------------- */
++
++  case BC_MOV:
++    |  // RA = dst*8, RD = src*8
++    |  add RD, BASE, RD
++    |   add RA, BASE, RA
++    |  ld TMP0, 0(RD)
++    |  ins_next1
++    |  sd TMP0, 0(RA)
++    |  ins_next2
++    break;
++  case BC_NOT:
++    |  // RA = dst*8, RD = src*8
++    |  add RD, BASE, RD
++    |   add RA, BASE, RA
++    |  ld TMP0, 0(RD)
++    |   li TMP1, LJ_TTRUE
++    |  ins_next1
++    |  gettp TMP0, TMP0
++    |  sltu TMP0, TMP1, TMP0
++    |  addiw TMP0, TMP0, 1
++    |  slli TMP0, TMP0, 47
++    |  not TMP0, TMP0
++    |   sd TMP0, 0(RA)
++    |  ins_next2
++    break;
++  case BC_UNM:
++    |  // RA = dst*8, RD = src*8
++    |  add RB, BASE, RD
++    |  add RA, BASE, RA
++    |  ld TMP0, 0(RB)
++    |  lui TMP1, 0x80000
++    |  gettp CARG3, TMP0
++    |  bne CARG3, TISNUM, >1
++    |  negw TMP0, TMP0
++    |  bxeq TMP0, TMP1, ->vmeta_unm      // Meta handler deals with -2^31.
++    |  zext.w TMP0, TMP0
++    |  settp_b TMP0, TISNUM
++    |  j >2
++    |1:
++    |  sltiu TMP3, CARG3, LJ_TISNUM
++    |   slli TMP1, TMP1, 32
++    |  bxeqz TMP3, ->vmeta_unm
++    |   xor TMP0, TMP0, TMP1     // sign => ~sign
++    |2:
++    |   sd TMP0, 0(RA)
++    |  ins_next
++    break;
++  case BC_LEN:
++    |  // RA = dst*8, RD = src*8
++    |  add CARG2, BASE, RD
++    |  ld TMP0, 0(CARG2)
++    |   add RA, BASE, RA
++    |  gettp TMP1, TMP0
++    |  addi TMP2, TMP1, -LJ_TSTR
++    |   cleartp STR:CARG1, TMP0
++    |  bnez TMP2, >2
++    |   lwu CARG1, STR:CARG1->len
++    |1:
++    |  settp_b CARG1, TISNUM
++    |  sd CARG1, 0(RA)
++    |  ins_next
++    |2:
++    |  addi TMP2, TMP1, -LJ_TTAB
++    |  bxnez TMP2, ->vmeta_len
++#if LJ_52
++    |  ld TAB:TMP2, TAB:CARG1->metatable
++    |  bnez TAB:TMP2, >9
++    |3:
++#endif
++    |->BC_LEN_Z:
++    |  call_intern BC_LEN, lj_tab_len		// (GCtab *t)
++    |  // Returns uint32_t (but less than 2^31).
++    |  j <1
++#if LJ_52
++    |9:
++    |  lbu TMP0, TAB:TMP2->nomm
++    |  andi TMP0, TMP0, 1<<MM_len
++    |  bnez TMP0, <3			// 'no __len' flag set: done.
++    |  j ->vmeta_len
++#endif
++    break;
++
++  /* -- Binary ops -------------------------------------------------------- */
++
++    |.macro fpmod, a, b, c
++    |  fdiv.d FARG1, b, c
++    |  jal ->vm_floor		// floor(b/c)
++    |  fmul.d a, FRET1, c
++    |  fsub.d a, b, a		// b - floor(b/c)*c
++    |.endmacro
++    |
++    |.macro ins_arithpre
++    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
++    |  // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8
++    ||if (vk == 1) {
++    |   // RA = dst*8, RB = num_const*8, RC = src1*8
++    |   decode_RB8 RC, INS
++    |   decode_RDtoRC8 RB, RD
++    ||} else {
++    |   // RA = dst*8, RB = src1*8, RC = num_const*8
++    |   decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    ||}
++    ||switch (vk) {
++    ||case 0:			// suffix is VN
++    |   add RB, BASE, RB
++    |   add RC, KBASE, RC
++    ||  break;
++    ||case 1:			// suffix is NV
++    |   add RC, BASE, RC
++    |   add RB, KBASE, RB
++    ||  break;
++    ||default:			// CAT or suffix is VV
++    |   add RB, BASE, RB
++    |   add RC, BASE, RC
++    ||  break;
++    ||}
++    |.endmacro
++    |
++    |.macro ins_arithfp, fpins, itype1, itype2
++    |  fld FTMP0, 0(RB)
++    |  sltu itype1, itype1, TISNUM
++    |  sltu itype2, itype2, TISNUM
++    |  fld FTMP2, 0(RC)
++    |  and itype1, itype1, itype2
++    |  add RA, BASE, RA
++    |  bxeqz itype1, ->vmeta_arith
++    |  fpins FRET1, FTMP0, FTMP2
++    |  ins_next1
++    |  fsd FRET1, 0(RA)
++    |  ins_next2
++    |.endmacro
++    |
++    |.macro ins_arithead, itype1, itype2, tval1, tval2 
++    |  ld tval1, 0(RB)
++    |  ld tval2, 0(RC)
++    |  // Check for two integers.
++    |  gettp itype1, tval1
++    |  gettp itype2, tval2
++    |.endmacro
++    |
++    |.macro ins_arithdn, intins, fpins
++    |  ins_arithpre
++    |  ins_arithead TMP0, TMP1, CARG1, CARG2
++    |  bne TMP0, TISNUM, >1
++    |  bne TMP1, TISNUM, >1
++    |  sext.w CARG3, CARG1
++    |  sext.w CARG4, CARG2
++    |.if "intins" == "addw"
++    |  intins CRET1, CARG3, CARG4
++    |  xor TMP1, CRET1, CARG3		// ((y^a) & (y^b)) < 0: overflow.
++    |  xor TMP2, CRET1, CARG4
++    |  and TMP1, TMP1, TMP2
++    |  add RA, BASE, RA
++    |  bxltz TMP1, ->vmeta_arith
++    |.elif "intins" == "subw"
++    |  intins CRET1, CARG3, CARG4
++    |  xor TMP1, CRET1, CARG3		// ((y^a) & (a^b)) < 0: overflow.
++    |  xor TMP2, CARG3, CARG4
++    |  and TMP1, TMP1, TMP2
++    |  add RA, BASE, RA
++    |  bxltz TMP1, ->vmeta_arith
++    |.elif "intins" == "mulw"
++    |  mul TMP2, CARG3, CARG4
++    |  add RA, BASE, RA
++    |  sext.w CRET1, TMP2
++    |  bxne CRET1, TMP2, ->vmeta_arith		// 63-32bit not all 0 or 1: overflow.
++    |.endif
++    |  zext.w CRET1, CRET1
++    |  settp_b CRET1, TISNUM
++    |  sd CRET1, 0(RA)
++    |  ins_next
++    |1:  // Check for two numbers.
++    |  ins_arithfp, fpins, TMP0, TMP1
++    |.endmacro
++    |  // ins_arithdiv: division has no integer fast path here; after decoding and tag extraction it always goes through ins_arithfp.
++    |.macro ins_arithdiv, fpins
++    |  ins_arithpre
++    |  ins_arithead TMP0, TMP1, CARG1, CARG2
++    |  ins_arithfp, fpins, TMP0, TMP1
++    |.endmacro
++    |  // ins_arithmod: modulo. Two TISNUM-tagged operands with a non-zero divisor are handled by calling lj_vm_modi; other operands go to ins_arithfp with fpins. BC is passed through to call_intern.
++    |.macro ins_arithmod, fpins, BC
++    |  ins_arithpre
++    |  ins_arithead TMP0, TMP1, CARG1, CARG2
++    |  bne TMP0, TISNUM, >1
++    |  bne TMP1, TISNUM, >1
++    |  sext.w CARG1, CARG1		// Sign-extend the low 32 bits of each operand in place.
++    |  sext.w CARG2, CARG2
++    |  add RA, BASE, RA
++    |  bxeqz CARG2, ->vmeta_arith		// Zero divisor: leave via ->vmeta_arith.
++    |  call_intern BC, lj_vm_modi
++    |  zext.w CRET1, CRET1		// Clear the upper 32 bits before tagging.
++    |  settp_b CRET1, TISNUM
++    |  sd CRET1, 0(RA)
++    |  ins_next
++    |1:  // At least one operand is not tagged TISNUM: check for two numbers.
++    |  ins_arithfp, fpins, TMP0, TMP1
++    |.endmacro
++    
++  case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
++    |  ins_arithdn addw, fadd.d		// Integer path: addw with overflow check. Number path: fadd.d.
++    break;
++  case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
++    |  ins_arithdn subw, fsub.d		// Integer path: subw with overflow check. Number path: fsub.d.
++    break;
++  case BC_MULVN: case BC_MULNV: case BC_MULVV:
++    |  ins_arithdn mulw, fmul.d		// Integer path: 64-bit mul with 32-bit overflow check. Number path: fmul.d.
++    break;
++  case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
++    |  ins_arithdiv fdiv.d		// Number path only.
++    break;
++  case BC_MODVN:
++    |  ins_arithmod fpmod, BC_MODVN		// The opcode is forwarded to call_intern inside the macro.
++    break;
++  case BC_MODNV:
++    |  ins_arithmod fpmod, BC_MODNV
++    break;
++  case BC_MODVV:
++    |  ins_arithmod fpmod, BC_MODVV
++    break;
++  case BC_POW:
++    |  ins_arithpre		// BC_POW has no integer fast path: both operands must pass the tag test below, then pow is called on them as doubles.
++    |  ld CARG1, 0(RB)
++    |   ld CARG2, 0(RC)
++    |  gettp TMP0, CARG1
++    |   gettp TMP1, CARG2
++    |  sltiu TMP0, TMP0, LJ_TISNUM		// 1 iff the tag is (unsigned) below LJ_TISNUM.
++    |   sltiu TMP1, TMP1, LJ_TISNUM
++    |  and TMP0, TMP0, TMP1
++    |   add RA, BASE, RA
++    |  bxeqz TMP0, ->vmeta_arith		// At least one test failed: leave via ->vmeta_arith.
++    |  fld FARG1, 0(RB)		// Reload both operands into FP argument registers.
++    |  fld FARG2, 0(RC)
++    |  call_extern pow
++    |  ins_next1
++    |  fsd FRET1, 0(RA)		// Store the result at BASE+RA.
++    |  ins_next2
++    break;
++
++  case BC_CAT:
++    |  // RA = dst*8, RB = src_start*8, RC = src_end*8
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  sub CARG3, RC, RB		// (src_end - src_start)*8; scaled down to a slot count at BC_CAT_Z.
++    |   sd BASE, L->base
++    |  add CARG2, BASE, RC		// top = address of the src_end slot.
++    |  mv MULTRES, RB		// Keep src_start*8 in MULTRES; RB is rebuilt from it after the call.
++    |->BC_CAT_Z:
++    |  srliw CARG3, CARG3, 3
++    |   sd PC, SAVE_PC(sp)
++    |   mv CARG1, L
++    |  call_intern BC_CAT, lj_meta_cat	// (lua_State *L, TValue *top, int left)
++    |  // Returns NULL (finished) or TValue * (metamethod).
++    |   ld BASE, L->base		// Reload BASE from L->base after the call.
++    |  bxnez CRET1, ->vmeta_binop
++    |  add RB, BASE, MULTRES
++    |  ld TMP0, 0(RB)		// Finished: copy the value in the src_start slot ...
++    |   add RA, BASE, RA
++    |  sd TMP0, 0(RA)		// ... to the dst slot.
++    |  ins_next
++    break;
++
++  /* -- Constant ops ------------------------------------------------------ */
++
++  case BC_KSTR:
++    |  // RA = dst*8, RD = str_const*8 (~)
++    |  sub TMP1, KBASE, RD		// Constants of this kind are indexed downwards from KBASE.
++    |   li TMP2, LJ_TSTR
++    |  ld TMP0, -8(TMP1)		// KBASE-8-str_const*8
++    |  add RA, BASE, RA
++    |   settp TMP0, TMP2		// Tag the loaded pointer with LJ_TSTR.
++    |  sd TMP0, 0(RA)
++    |  ins_next
++    break;
++  case BC_KCDATA:
++    |.if FFI
++    |  // RA = dst*8, RD = cdata_const*8 (~). Code is emitted only when FFI is enabled; otherwise the case is empty.
++    |  sub TMP1, KBASE, RD
++    |  ld TMP0, -8(TMP1)		// KBASE-8-cdata_const*8
++    |   li TMP2, LJ_TCDATA
++    |  add RA, BASE, RA
++    |   settp TMP0, TMP2		// Tag the loaded pointer with LJ_TCDATA.
++    |  sd TMP0, 0(RA)
++    |  ins_next
++    |.endif
++    break;
++  case BC_KSHORT:
++    |  // RA = dst*8, RD = int16_literal*8
++    |   sraiw RD, INS, 16		// Take the signed literal directly from the upper 16 bits of INS.
++    |  add RA, BASE, RA
++    |   zext.w RD, RD		// Clear the upper 32 bits before tagging.
++    |  ins_next1
++    |   settp_b RD, TISNUM
++    |   sd RD, 0(RA)
++    |  ins_next2
++    break;
++  case BC_KNUM:
++    |  // RA = dst*8, RD = num_const*8
++    |  add RD, KBASE, RD		// Constants of this kind are indexed upwards from KBASE.
++    |   add RA, BASE, RA
++    |  ld TMP0, 0(RD)		// Copy the 8-byte constant unchanged.
++    |  ins_next1
++    |  sd TMP0, 0(RA)
++    |  ins_next2
++    break;
++  case BC_KPRI:
++    |  // RA = dst*8, RD = primitive_type*8 (~)
++    |   add RA, BASE, RA
++    |  slli TMP0, RD, 44	// 44+3: RD is already scaled by 8, so the type lands at bit 47.
++    |  not TMP0, TMP0		// Complement to form the stored value.
++    |  ins_next1
++    |   sd TMP0, 0(RA)
++    |  ins_next2
++    break;
++  case BC_KNIL:
++    |  // RA = base*8, RD = end*8. Stores TISNIL to every slot from base through end inclusive.
++    |  add RA, BASE, RA
++    |  sd TISNIL, 0(RA)		// First slot is written unconditionally.
++    |   addi RA, RA, 8
++    |  add RD, BASE, RD
++    |1:
++    |  sd TISNIL, 0(RA)		// The loop body runs at least once, so two slots are always written.
++    |  slt TMP0, RA, RD		// Continue while the slot just written is below the end slot.
++    |   addi RA, RA, 8
++    |  bnez TMP0, <1
++    |  ins_next
++    break;
++
++  /* -- Upvalue and function ops ------------------------------------------ */
++
++  case BC_UGET:
++    |  // RA = dst*8, RD = uvnum*8
++    |  ld LFUNC:TMP0, FRAME_FUNC(BASE)
++    |   add RA, BASE, RA
++    |  cleartp LFUNC:TMP0
++    |  add RD, RD, LFUNC:TMP0		// func + uvnum*8, so ->uvptr below addresses entry uvnum.
++    |  ld UPVAL:TMP0, LFUNC:RD->uvptr
++    |  ld TMP1, UPVAL:TMP0->v		// Pointer to the upvalue's current value slot.
++    |  ld TMP2, 0(TMP1)
++    |  ins_next1
++    |   sd TMP2, 0(RA)
++    |  ins_next2
++    break;
++  case BC_USETV:
++    |  // RA = uvnum*8, RD = src*8
++    |  ld LFUNC:TMP0, FRAME_FUNC(BASE)
++    |   add RD, BASE, RD
++    |  cleartp LFUNC:TMP0
++    |  add RA, RA, LFUNC:TMP0		// func + uvnum*8, so ->uvptr below addresses entry uvnum.
++    |  ld UPVAL:TMP0, LFUNC:RA->uvptr
++    |   ld CRET1, 0(RD)		// Value to store.
++    |  lbu TMP3, UPVAL:TMP0->marked
++    |   ld CARG2, UPVAL:TMP0->v		// Destination slot; also the 2nd argument if the barrier is called.
++    |  andi TMP3, TMP3, LJ_GC_BLACK	// isblack(uv)
++    |  lbu TMP0, UPVAL:TMP0->closed
++    |   gettp TMP2, CRET1
++    |   sd CRET1, 0(CARG2)		// The store happens before any barrier check.
++    |  or TMP3, TMP3, TMP0		// Combine the black bit with the closed byte.
++    |  li TMP0, LJ_GC_BLACK|1
++    |   addi TMP2, TMP2, -(LJ_TNUMX+1)		// Bias the tag for the range test at label 2.
++    |  beq TMP3, TMP0, >2			// Upvalue is closed and black?
++    |1:
++    |  ins_next
++    |
++    |2:  // Check if new value is collectable.
++    |  sltiu TMP0, TMP2, LJ_TISGCV - (LJ_TNUMX+1)
++    |   cleartp GCOBJ:CRET1, CRET1
++    |  beqz TMP0, <1			// tvisgcv(v)
++    |  lbu TMP3, GCOBJ:CRET1->gch.marked
++    |  andi TMP3, TMP3, LJ_GC_WHITES	// iswhite(v)
++    |  beqz TMP3, <1
++    |  // Crossed a write barrier. Move the barrier forward.
++    |  mv CARG1, GL
++    |  call_intern BC_USETV, lj_gc_barrieruv	// (global_State *g, TValue *tv)
++    |  j <1
++    break;
++  case BC_USETS:
++    |  // RA = uvnum*8, RD = str_const*8 (~)
++    |  ld LFUNC:TMP0, FRAME_FUNC(BASE)
++    |   sub TMP1, KBASE, RD
++    |  cleartp LFUNC:TMP0
++    |  add RA, RA, LFUNC:TMP0		// func + uvnum*8, so ->uvptr below addresses entry uvnum.
++    |  ld UPVAL:TMP0, LFUNC:RA->uvptr
++    |   ld STR:TMP1, -8(TMP1)		// KBASE-8-str_const*8
++    |  lbu TMP2, UPVAL:TMP0->marked
++    |   ld CARG2, UPVAL:TMP0->v		// Destination slot; also the 2nd argument if the barrier is called.
++    |   lbu TMP3, STR:TMP1->marked		// Read the string's mark byte before TMP1 is tagged.
++    |  andi TMP4, TMP2, LJ_GC_BLACK	// isblack(uv)
++    |   lbu TMP2, UPVAL:TMP0->closed
++    |   li TMP0, LJ_TSTR
++    |   settp TMP1, TMP0
++    |  sd TMP1, 0(CARG2)		// The store happens before any barrier check.
++    |   bnez TMP4, >2
++    |1:
++    |  ins_next
++    |
++    |2:  // Check if string is white and ensure upvalue is closed.
++    |  beqz TMP2, <1			// Upvalue not closed: done.
++    |   andi TMP0, TMP3, LJ_GC_WHITES     // iswhite(str)
++    |  beqz TMP0, <1
++    |  // Crossed a write barrier. Move the barrier forward.
++    |  mv CARG1, GL
++    |  call_intern BC_USETS, lj_gc_barrieruv	// (global_State *g, TValue *tv)
++    |  j <1
++    break;
++  case BC_USETN:
++    |  // RA = uvnum*8, RD = num_const*8. No write-barrier code is emitted for this case.
++    |  ld LFUNC:TMP0, FRAME_FUNC(BASE)
++    |   add RD, KBASE, RD
++    |  cleartp LFUNC:TMP0
++    |  add TMP0, RA, LFUNC:TMP0		// func + uvnum*8, so ->uvptr below addresses entry uvnum.
++    |  ld UPVAL:TMP0, LFUNC:TMP0->uvptr
++    |   ld TMP1, 0(RD)		// The 8-byte constant, copied unchanged.
++    |  ld TMP0, UPVAL:TMP0->v
++    |   sd TMP1, 0(TMP0)
++    |  ins_next
++    break;
++  case BC_USETP:
++    |  // RA = uvnum*8, RD = primitive_type*8 (~). No write-barrier code is emitted for this case.
++    |  ld LFUNC:TMP0, FRAME_FUNC(BASE)
++    |   slli TMP2, RD, 44		// RD is already scaled by 8, so the type lands at bit 47.
++    |  cleartp LFUNC:TMP0
++    |  add TMP0, RA, LFUNC:TMP0		// func + uvnum*8, so ->uvptr below addresses entry uvnum.
++    |   not TMP2, TMP2		// Complement to form the stored value.
++    |  ld UPVAL:TMP0, LFUNC:TMP0->uvptr
++    |  ld TMP1, UPVAL:TMP0->v
++    |   sd TMP2, 0(TMP1)
++    |  ins_next
++    break;
++
++  case BC_UCLO:
++    |  // RA = level*8, RD = target
++    |  ld TMP2, L->openupval
++    |  branch_RD			// Do this first since RD is not saved.
++    |   sd BASE, L->base
++    |   mv CARG1, L
++    |  beqz TMP2, >1			// L->openupval is zero: skip the call.
++    |   add CARG2, BASE, RA
++    |  call_intern BC_UCLO, lj_func_closeuv	// (lua_State *L, TValue *level)
++    |  ld BASE, L->base		// Reload BASE from L->base after the call.
++    |1:
++    |  ins_next
++    break;
++
++  case BC_FNEW:
++    |  // RA = dst*8, RD = proto_const*8 (~) (holding function prototype)
++    |  sub TMP1, KBASE, RD
++    |  ld CARG3, FRAME_FUNC(BASE)		// Current function becomes the parent argument.
++    |   ld CARG2, -8(TMP1)		// KBASE-8-proto_const*8
++    |    sd BASE, L->base
++    |    sd PC, SAVE_PC(sp)
++    |  cleartp CARG3
++    |   mv CARG1, L
++    |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
++    |  call_intern BC_FNEW, lj_func_newL_gc
++    |  // Returns GCfuncL *.
++    |   li TMP0, LJ_TFUNC
++    |  ld BASE, L->base		// Reload BASE from L->base after the call.
++    |   settp CRET1, TMP0		// Tag the returned pointer with LJ_TFUNC.
++    |  add RA, BASE, RA
++    |   sd CRET1, 0(RA)
++    |  ins_next
++    break;
++
++  /* -- Table ops --------------------------------------------------------- */
++
++  case BC_TNEW:
++  case BC_TDUP:
++    |  // RA = dst*8, RD = (hbits|asize)*8 | tab_const*8 (~)
++    |  ld TMP0, GL->gc.total
++    |  ld TMP1, GL->gc.threshold
++    |   sd BASE, L->base
++    |   sd PC, SAVE_PC(sp)
++    |  bgeu TMP0, TMP1, >5		// gc.total >= gc.threshold: run a GC step first (label 5).
++    |1:
++    if (op == BC_TNEW) {
++      |  srliw CARG2, RD, 3
++      |  andi CARG2, CARG2, 0x7ff		// asize = low 11 bits of the operand.
++      |  lzi TMP0, 0x801
++      |  addiw TMP2, CARG2, -0x7ff
++      |   srliw CARG3, RD, 14		// hbits = remaining bits above asize.
++      |  seqz TMP4, TMP2
++      |  neg TMP4, TMP4			// TMP4 = all ones iff asize == 0x7ff, else zero.
++      |  and TMP0, TMP0, TMP4
++      |  not TMP4, TMP4
++      |  and CARG2, CARG2, TMP4
++      |  or CARG2, CARG2, TMP0		// Branchless select: asize = (asize == 0x7ff) ? 0x801 : asize.
++      |   mv CARG1, L
++      |  // (lua_State *L, int32_t asize, uint32_t hbits)
++      |  call_intern BC_TNEW, lj_tab_new
++      |  // Returns Table *.
++    } else {
++      |  sub TMP1, KBASE, RD
++      |  mv CARG1, L
++      |   ld CARG2, -8(TMP1)		// KBASE-8-tab_const*8
++      |  call_intern BC_TDUP, lj_tab_dup		// (lua_State *L, Table *kt)
++      |  // Returns Table *.
++    }
++    |   li TMP0, LJ_TTAB
++    |  ld BASE, L->base		// Reload BASE from L->base after the call.
++    |  ins_next1
++    |   settp CRET1, TMP0		// Tag the returned pointer with LJ_TTAB.
++    |  add RA, BASE, RA
++    |   sd CRET1, 0(RA)
++    |  ins_next2
++    |5:  // GC threshold reached: run a GC step, then retry at label 1.
++    |  mv MULTRES, RD		// Keep RD in MULTRES across the call.
++    |   mv CARG1, L
++    if (op == BC_TNEW) {
++      |  call_intern BC_TNEW, lj_gc_step_fixtop	// (lua_State *L)
++    } else {
++      |  call_intern BC_TDUP, lj_gc_step_fixtop	// (lua_State *L)
++    }
++    |   mv RD, MULTRES
++    |  j <1
++    break;
++
++  case BC_GGET:
++    |  // RA = dst*8, RD = str_const*8 (~)
++  case BC_GSET:
++    |  // RA = src*8, RD = str_const*8 (~). Both cases load the function's env table and the string key, then continue in BC_TGETS_Z / BC_TSETS_Z.
++    |  ld LFUNC:TMP0, FRAME_FUNC(BASE)
++    |   sub TMP1, KBASE, RD
++    |   ld STR:RC, -8(TMP1)	// KBASE-8-str_const*8
++    |  cleartp LFUNC:TMP0
++    |  ld TAB:RB, LFUNC:TMP0->env
++    |  add RA, BASE, RA
++    if (op == BC_GGET) {
++      |  j ->BC_TGETS_Z
++    } else {
++      |  j ->BC_TSETS_Z
++    }
++    break;
++
++  case BC_TGETV:
++    |  // RA = dst*8, RB = table*8, RC = key*8
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add CARG2, BASE, RB
++    |   add CARG3, BASE, RC
++    |  ld TAB:RB, 0(CARG2)
++    |   ld TMP2, 0(CARG3)		// Key TValue.
++    |   add RA, BASE, RA
++    |  checktab TAB:RB, ->vmeta_tgetv
++    |   gettp TMP3, TMP2
++    |   lw TMP0, TAB:RB->asize
++    |  bne TMP3, TISNUM, >5		// Integer key?
++    |  sext.w TMP2, TMP2
++    |   ld TMP1, TAB:RB->array
++    |  bxgeu TMP2, TMP0, ->vmeta_tgetv	// Integer key and in array part?
++    |   slliw TMP2, TMP2, 3
++    |   add TMP2, TMP1, TMP2		// Address of array[key].
++    |   ld CRET1, 0(TMP2)
++    |  beq CRET1, TISNIL, >2
++    |1:
++    |   sd CRET1, 0(RA)
++    |  ins_next
++    |
++    |2:  // Check for __index if table value is nil.
++    |  ld TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  lbu TMP0, TAB:TMP2->nomm
++    |  andi TMP0, TMP0, 1<<MM_index
++    |  bnez TMP0, <1			// 'no __index' flag set: done.
++    |  j ->vmeta_tgetv
++    |
++    |5:  // Not an integer key: a string key continues in BC_TGETS_Z, anything else goes to ->vmeta_tgetv.
++    |  li TMP0, LJ_TSTR
++    |   cleartp RC, TMP2
++    |  bxne TMP3, TMP0, ->vmeta_tgetv	// String key?
++    |  j ->BC_TGETS_Z
++    break;
++  case BC_TGETS:
++    |  // RA = dst*8, RB = table*8, RC = str_const*8 (~)
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add CARG2, BASE, RB
++    |   sub CARG3, KBASE, RC
++    |  ld TAB:RB, 0(CARG2)
++    |  add RA, BASE, RA
++    |   ld STR:RC, -8(CARG3)		// KBASE-8-str_const*8
++    |  checktab TAB:RB, ->vmeta_tgets1
++    |->BC_TGETS_Z:
++    |  // TAB:RB = GCtab *, STR:RC = GCstr *, RA = dst*8
++    |  lw TMP0, TAB:RB->hmask
++    |   lw TMP1, STR:RC->sid
++    |    ld NODE:TMP2, TAB:RB->node
++    |  and TMP1, TMP1, TMP0		// idx = str->sid & tab->hmask
++    |  slliw TMP0, TMP1, 5
++    |  slliw TMP1, TMP1, 3
++    |  subw TMP1, TMP0, TMP1		// idx*32 - idx*8 = idx*24.
++    |   li TMP3, LJ_TSTR
++    |  add NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
++    |   settp STR:RC, TMP3		// Tagged key to look for.
++    |1:
++    |  ld CARG1, NODE:TMP2->key
++    |   ld CARG2, NODE:TMP2->val
++    |    ld NODE:TMP1, NODE:TMP2->next
++    |   ld TAB:TMP3, TAB:RB->metatable
++    |  bne CARG1, RC, >4		// Key mismatch: follow the chain.
++    |  beq CARG2, TISNIL, >5		// Key found, but nil value?
++    |3:
++    |   sd CARG2, 0(RA)
++    |  ins_next
++    |
++    |4:  // Follow hash chain.
++    |   mv NODE:TMP2, NODE:TMP1
++    |  bnez NODE:TMP1, <1
++    |  // End of hash chain: key not found, nil result. Falls through to label 5.
++    |
++    |5:  // Check for __index if table value is nil.
++    |   mv CARG2, TISNIL
++    |  beqz TAB:TMP3, <3		// No metatable: done.
++    |  lbu TMP0, TAB:TMP3->nomm
++    |  andi TMP0, TMP0, 1<<MM_index
++    |  bnez TMP0, <3			// 'no __index' flag set: done.
++    |  j ->vmeta_tgets
++    break;
++  case BC_TGETB:
++    |  // RA = dst*8, RB = table*8, RC = index*8
++    |  decode_RB8 RB, INS
++    |  add CARG2, BASE, RB
++    |   decode_RDtoRC8 RC, RD
++    |  ld TAB:RB, 0(CARG2)
++    |   add RA, BASE, RA
++    |  srliw TMP0, RC, 3		// Unscaled index, used for the bounds check.
++    |  checktab TAB:RB, ->vmeta_tgetb
++    |  lw TMP1, TAB:RB->asize
++    |   ld TMP2, TAB:RB->array
++    |  bxgeu TMP0, TMP1, ->vmeta_tgetb		// Index not in array part.
++    |   add RC, TMP2, RC		// RC is already scaled by 8: address of array[index].
++    |   ld CRET1, 0(RC)
++    |  beq CRET1, TISNIL, >5
++    |1:
++    |   sd CRET1, 0(RA)
++    |  ins_next
++    |
++    |5:  // Check for __index if table value is nil.
++    |  ld TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  lbu TMP1, TAB:TMP2->nomm
++    |  andi TMP1, TMP1, 1<<MM_index
++    |  bnez TMP1, <1			// 'no __index' flag set: done.
++    |  j ->vmeta_tgetb			// Caveat: preserve TMP0 and CARG2!
++    break;
++  case BC_TGETR:
++    |  // RA = dst*8, RB = table*8, RC = key*8. No type checks and no metatable lookup are emitted for this case.
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add RB, BASE, RB
++    |   add RC, BASE, RC
++    |  ld TAB:CARG1, 0(RB)
++    |   lw CARG2, 0(RC)		// Only the low 32 bits of the key slot are read.
++    |    add RA, BASE, RA
++    |  cleartp TAB:CARG1
++    |  lw TMP0, TAB:CARG1->asize
++    |   ld TMP1, TAB:CARG1->array
++    |  bxgeu CARG2, TMP0, ->vmeta_tgetr		// In array part?
++    |   slliw TMP2, CARG2, 3
++    |   add TMP3, TMP1, TMP2		// Address of array[key].
++    |   ld TMP1, 0(TMP3)
++    |->BC_TGETR_Z:  // Expects the value in TMP1 and the destination address in RA.
++    |  ins_next1
++    |   sd TMP1, 0(RA)
++    |  ins_next2
++    break;
++
++  case BC_TSETV:
++    |  // RA = src*8, RB = table*8, RC = key*8
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add CARG2, BASE, RB
++    |   add CARG3, BASE, RC
++    |  ld TAB:RB, 0(CARG2)
++    |   ld TMP2, 0(CARG3)		// Key TValue.
++    |  add RA, BASE, RA
++    |  checktab TAB:RB, ->vmeta_tsetv
++    |   sext.w RC, TMP2		// Low 32 bits of the key, used if it is an integer.
++    |  checkint TMP2, >5
++    |  lw TMP0, TAB:RB->asize
++    |   ld TMP1, TAB:RB->array
++    |  bxgeu RC, TMP0, ->vmeta_tsetv		// Integer key and in array part?
++    |   slliw TMP2, RC, 3
++    |  add TMP1, TMP1, TMP2		// Address of array[key].
++    |   lbu TMP3, TAB:RB->marked
++    |  ld TMP0, 0(TMP1)		// Previous value in the slot.
++    |   ld CRET1, 0(RA)		// Value to store.
++    |  beq TMP0, TISNIL, >3
++    |1:
++    |   andi TMP2, TMP3, LJ_GC_BLACK	// isblack(table)
++    |   sd CRET1, 0(TMP1)
++    |  bnez TMP2, >7
++    |2:
++    |  ins_next
++    |
++    |3:  // Check for __newindex if previous value is nil.
++    |  ld TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  lbu TMP2, TAB:TMP2->nomm
++    |  andi TMP2, TMP2, 1<<MM_newindex
++    |  bnez TMP2, <1			// 'no __newindex' flag set: done.
++    |  j ->vmeta_tsetv
++    |5:  // Not an integer key.
++    |  gettp TMP0, TMP2
++    |  addi TMP0, TMP0, -LJ_TSTR
++    |  bxnez TMP0, ->vmeta_tsetv		// Not a string key either: leave via ->vmeta_tsetv.
++    |  cleartp STR:RC, TMP2
++    |  j ->BC_TSETS_Z			// String key: continue in BC_TSETS_Z.
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:RB, TMP3, TMP0, <2
++    break;
++  case BC_TSETS:
++    |  // RA = src*8, RB = table*8, RC = str_const*8 (~)
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add CARG2, BASE, RB
++    |   sub CARG3, KBASE, RC
++    |    ld TAB:RB, 0(CARG2)
++    |   ld RC, -8(CARG3)		// KBASE-8-str_const*8
++    |  add RA, BASE, RA
++    |   cleartp STR:RC
++    |  checktab TAB:RB, ->vmeta_tsets1
++    |->BC_TSETS_Z:
++    |  // TAB:RB = GCtab *, STR:RC = GCstr *, RA = BASE+src*8
++    |  lw TMP0, TAB:RB->hmask
++    |   lw TMP1, STR:RC->sid
++    |    ld NODE:TMP2, TAB:RB->node
++    |   sb x0, TAB:RB->nomm		// Clear metamethod cache.
++    |  and TMP1, TMP1, TMP0		// idx = str->sid & tab->hmask
++    |  slliw TMP0, TMP1, 5
++    |  slliw TMP1, TMP1, 3
++    |  subw TMP1, TMP0, TMP1		// idx*32 - idx*8 = idx*24.
++    |   li TMP3, LJ_TSTR
++    |  add NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
++    |   settp STR:RC, TMP3		// Tagged key to look for.
++    |  fld FTMP0, 0(RA)		// Value to store, held in FTMP0 until one of the fsd stores below.
++    |1:
++    |  ld TMP0, NODE:TMP2->key
++    |   ld CARG2, NODE:TMP2->val
++    |    ld NODE:TMP1, NODE:TMP2->next
++    |     lbu TMP3, TAB:RB->marked
++    |  bne TMP0, RC, >5		// Key mismatch: follow the chain.
++    |    ld TAB:TMP0, TAB:RB->metatable
++    |   beq CARG2, TISNIL, >4		// Key found, but nil value?
++    |2:
++    |  andi TMP3, TMP3, LJ_GC_BLACK	// isblack(table)
++    |   fsd FTMP0, NODE:TMP2->val
++    |  bnez TMP3, >7
++    |3:
++    |  ins_next
++    |
++    |4:  // Check for __newindex if previous value is nil.
++    |  beqz TAB:TMP0, <2		// No metatable: done.
++    |  lbu TMP0, TAB:TMP0->nomm
++    |  andi TMP0, TMP0, 1<<MM_newindex
++    |  bnez TMP0, <2			// 'no __newindex' flag set: done.
++    |  j ->vmeta_tsets
++    |
++    |5:  // Follow hash chain.
++    |   mv NODE:TMP2, NODE:TMP1
++    |  bnez NODE:TMP1, <1
++    |  // End of hash chain: key not found, add a new one
++    |
++    |  // But check for __newindex first.
++    |  ld TAB:TMP2, TAB:RB->metatable
++    |   addi CARG3, GL, offsetof(global_State, tmptv)		// Address of g->tmptv; the tagged key is stored there at label 6.
++    |  beqz TAB:TMP2, >6		// No metatable: continue.
++    |  lbu TMP0, TAB:TMP2->nomm
++    |  andi TMP0, TMP0, 1<<MM_newindex
++    |  bxeqz TMP0, ->vmeta_tsets		// 'no __newindex' flag NOT set: check.
++    |6:
++    |  sd RC, 0(CARG3)		// Store the tagged key so it can be passed by address.
++    |   sd BASE, L->base
++    |  mv CARG2, TAB:RB
++    |   sd PC, SAVE_PC(sp)
++    |   mv CARG1, L
++    |  // (lua_State *L, GCtab *t, TValue *k)
++    |  call_intern BC_TSETS, lj_tab_newkey
++    |  // Returns TValue *.
++    |  ld BASE, L->base		// Reload BASE from L->base after the call.
++    |   fsd FTMP0, 0(CRET1)		// Store the value into the returned slot.
++    |  j <3				// No 2nd write barrier needed.
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:RB, TMP3, TMP0, <3
++    break;
++  case BC_TSETB:
++    |  // RA = src*8, RB = table*8, RC = index*8
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add CARG2, BASE, RB
++    |   add RA, BASE, RA
++    |  ld TAB:RB, 0(CARG2)
++    |  srliw TMP0, RC, 3		// Unscaled index, used for the bounds check.
++    |  checktab RB, ->vmeta_tsetb
++    |  lw TMP1, TAB:RB->asize
++    |   ld TMP2, TAB:RB->array
++    |  bxgeu TMP0, TMP1, ->vmeta_tsetb		// Index not in array part.
++    |   add RC, TMP2, RC		// RC is already scaled by 8: address of array[index].
++    |  ld TMP1, 0(RC)		// Previous value in the slot.
++    |   lbu TMP3, TAB:RB->marked
++    |  beq TMP1, TISNIL, >5
++    |1:
++    |   ld CRET1, 0(RA)		// Value to store.
++    |  andi TMP1, TMP3, LJ_GC_BLACK	// isblack(table)
++    |    sd CRET1, 0(RC)
++    |  bnez TMP1, >7
++    |2:
++    |  ins_next
++    |
++    |5:  // Check for __newindex if previous value is nil.
++    |  ld TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  lbu TMP1, TAB:TMP2->nomm
++    |  andi TMP1, TMP1, 1<<MM_newindex
++    |  bnez TMP1, <1			// 'no __newindex' flag set: done.
++    |  j ->vmeta_tsetb	// Caveat: preserve TMP0 and CARG2!
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:RB, TMP3, TMP0, <2
++    break;
++  case BC_TSETR:
++    |  // RA = src*8, RB = table*8, RC = key*8. No type checks and no metatable lookup are emitted for this case.
++    |  decode_RB8 RB, INS
++    |   decode_RDtoRC8 RC, RD
++    |  add CARG1, BASE, RB
++    |   add CARG3, BASE, RC
++    |  ld TAB:CARG2, 0(CARG1)
++    |   lw CARG3, 0(CARG3)		// Only the low 32 bits of the key slot are read.
++    |  cleartp TAB:CARG2
++    |  lbu TMP3, TAB:CARG2->marked
++    |   lw TMP0, TAB:CARG2->asize
++    |    ld TMP1, TAB:CARG2->array
++    |  andi TMP2, TMP3, LJ_GC_BLACK	// isblack(table)
++    |   add RA, BASE, RA
++    |  bnez TMP2, >7			// The barrier check runs before the store here.
++    |2:
++    |  bxgeu CARG3, TMP0, ->vmeta_tsetr		// In array part?
++    |   slliw TMP2, CARG3, 3
++    |   add CRET1, TMP1, TMP2		// Address of array[key].
++    |->BC_TSETR_Z:  // Expects the destination slot address in CRET1 and the source address in RA.
++    |  ld TMP1, 0(RA)
++    |  ins_next1
++    |  sd TMP1, 0(CRET1)
++    |  ins_next2
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:CARG2, TMP3, CRET1, <2
++    break;
++
++  case BC_TSETM:
++    |  // RA = base*8 (table at base-1), RD = num_const*8 (start index)
++    |  add RA, BASE, RA
++    |1:
++    |   add TMP3, KBASE, RD
++    |  ld TAB:CARG2, -8(RA)		// Guaranteed to be a table.
++    |    addiw TMP0, MULTRES, -8		// Byte size of the slots to copy.
++    |   lw TMP3, 0(TMP3)		// Integer constant is in lo-word.
++    |   srliw CARG3, TMP0, 3		// Number of slots to copy.
++    |    beqz TMP0, >4			// Nothing to copy?
++    |  cleartp TAB:CARG2
++    |  addw CARG3, CARG3, TMP3		// Start index + slot count; also the nasize argument at label 5.
++    |  lw TMP2, TAB:CARG2->asize
++    |   slliw TMP1, TMP3, 3
++    |    lbu TMP3, TAB:CARG2->marked
++    |   ld CARG1, TAB:CARG2->array
++    |  bltu TMP2, CARG3, >5		// asize too small: resize first.
++    |   add TMP2, RA, TMP0		// End of the source slots.
++    |   add TMP1, TMP1, CARG1		// Address of array[start index].
++    |  andi TMP0, TMP3, LJ_GC_BLACK	// isblack(table)
++    |3:  // Copy result slots to table.
++    |   ld CRET1, 0(RA)
++    |    addi RA, RA, 8
++    |   sd CRET1, 0(TMP1)
++    |    addi TMP1, TMP1, 8
++    |  bltu RA, TMP2, <3
++    |  bnez TMP0, >7
++    |4:
++    |  ins_next
++    |
++    |5:  // Need to resize array part.
++    |   sd BASE, L->base
++    |   sd PC, SAVE_PC(sp)
++    |  mv BASE, RD			// Keep RD in BASE across the call.
++    |   mv CARG1, L
++    |  // (lua_State *L, GCtab *t, int nasize)
++    |  call_intern BC_TSETM, lj_tab_reasize
++    |  // Must not reallocate the stack.
++    |  mv RD, BASE
++    |   ld BASE, L->base        // Reload BASE for lack of a saved register.
++    |  j <1
++    |
++    |7:  // Possible table write barrier for any value. Skip valiswhite check.
++    |  barrierback TAB:CARG2, TMP3, TMP0, <4
++    break;
++
++  /* -- Calls and vararg handling ----------------------------------------- */
++
++  case BC_CALLM:
++    |  // RA = base*8, (RB = (nresults+1)*8,) RC = extra_nargs*8
++    |  decode_RDtoRC8 NARGS8:RC, RD
++    |   addw NARGS8:RC, NARGS8:RC, MULTRES		// Add MULTRES to the extra argument count.
++    |  j ->BC_CALL_Z			// Continue in the shared BC_CALL code.
++    break;
++  case BC_CALL:
++    |  // RA = base*8, (RB = (nresults+1)*8,) RC = (nargs+1)*8
++    |  decode_RDtoRC8 NARGS8:RC, RD
++    |->BC_CALL_Z:
++    |  mv TMP2, BASE			// Keep the old BASE in TMP2.
++    |  add BASE, BASE, RA
++    |   ld LFUNC:RB, 0(BASE)		// The callable is in the slot at base.
++    |   addi BASE, BASE, 16		// New BASE is two slots above the callable.
++    |  addiw NARGS8:RC, NARGS8:RC, -8		// (nargs+1)*8 -> nargs*8.
++    |  checkfunc RB, ->vmeta_call
++    |  ins_call
++    break;
++
++  case BC_CALLMT:
++    |  // RA = base*8, (RB = 0,) RC = extra_nargs*8
++    |  addw NARGS8:RD, NARGS8:RD, MULTRES		// Add MULTRES to the count held in RD.
++    |  j ->BC_CALLT_Z1			// Continue in the shared BC_CALLT code.
++    break;
++  case BC_CALLT:
++    |  // RA = base*8, (RB = 0,) RC = (nargs+1)*8
++    |->BC_CALLT_Z1:
++    |  add RA, BASE, RA
++    |  ld LFUNC:RB, 0(RA)		// The callable is in the slot at base.
++    |   mv NARGS8:RC, RD
++    |    ld TMP1, FRAME_PC(BASE)	// Frame link/PC of the current frame.
++    |   addi RA, RA, 16		// First argument is two slots above the callable.
++    |  addiw NARGS8:RC, NARGS8:RC, -8		// (nargs+1)*8 -> nargs*8.
++    |  checktp CARG3, LFUNC:RB, -LJ_TFUNC, ->vmeta_callt
++    |->BC_CALLT_Z:
++    |  andi TMP0, TMP1, FRAME_TYPE	// Caveat: preserve TMP0 until the 'or'.
++    |   lbu TMP3, LFUNC:CARG3->ffid
++    |   xori TMP2, TMP1, FRAME_VARG
++    |  bnez TMP0, >7			// Non-zero frame type bits: handle at label 7.
++    |1:
++    |  sd LFUNC:RB, FRAME_FUNC(BASE)		// Copy function down, but keep PC.
++    |  sltiu CARG4, TMP3, 2		// (> FF_C) Calling a fast function?
++    |  mv TMP2, BASE
++    |  mv RB, CARG3
++    |   mv TMP3, NARGS8:RC
++    |  beqz NARGS8:RC, >3		// No arguments to copy.
++    |2:  // Copy the arguments down to the current frame's base.
++    |   ld CRET1, 0(RA)
++    |    addi RA, RA, 8
++    |  addiw TMP3, TMP3, -8
++    |   sd CRET1, 0(TMP2)
++    |    addi TMP2, TMP2, 8
++    |  bnez TMP3, <2
++    |3:
++    |  or TMP0, TMP0, CARG4
++    |  beqz TMP0, >5			// Frame type bits zero and ffid >= 2: label 5.
++    |4:
++    |  ins_callt
++    |
++    |5:  // Tailcall to a fast function with a Lua frame below.
++    |  lw INS, -4(TMP1)		// Instruction preceding the return PC.
++    |  decode_RA8 RA, INS
++    |  sub TMP1, BASE, RA
++    |  ld TMP1, -32(TMP1)
++    |  cleartp LFUNC:TMP1
++    |  ld TMP1, LFUNC:TMP1->pc
++    |   ld KBASE, PC2PROTO(k)(TMP1)     // Need to prepare KBASE.
++    |  j <4
++    |
++    |7:  // Tailcall from a vararg function.
++    |  andi CARG4, TMP2, FRAME_TYPEP
++    |   sub TMP2, BASE, TMP2          // Relocate BASE down.
++    |  bnez CARG4, <1			// Vararg frame below?
++    |  mv BASE, TMP2
++    |  ld TMP1, FRAME_PC(TMP2)
++    |   andi TMP0, TMP1, FRAME_TYPE	// Recompute the frame type bits for the relocated frame.
++    |  j <1
++    break;
++
++  case BC_ITERC:
++    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 ((2+1)*8))
++    |  mv TMP2, BASE			// Save old BASE for vmeta_call.
++    |  add BASE, BASE, RA
++    |  ld RB, -24(BASE)		//A, A+1, A+2 = A-3, A-2, A-1.
++    |   ld CARG1, -16(BASE)
++    |    ld CARG2, -8(BASE)
++    |  li NARGS8:RC, 16		// Iterators get 2 arguments.
++    |  sd RB, 0(BASE)			// Copy callable.
++    |   sd CARG1, 16(BASE)		// Copy state.
++    |    sd CARG2, 24(BASE)		// Copy control var.
++    |   addi BASE, BASE, 16		// New BASE is two slots above the copied callable.
++    |  checkfunc RB, ->vmeta_call
++    |  ins_call
++    break;
++
++  case BC_ITERN:
++    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
++    |.if JIT
++    |  hotloop
++    |.endif
++    |->vm_IITERN:
++    |  add RA, BASE, RA
++    |  ld TAB:RB, -16(RA)		// Table being traversed.
++    |   lw RC, -8(RA)		// Get index from control var.
++    |  cleartp TAB:RB
++    |   addi PC, PC, 4		// Step PC past the following instruction (see the ITERL RD loads below).
++    |  lw TMP0, TAB:RB->asize
++    |   ld TMP1, TAB:RB->array
++    |  slli CARG3, TISNUM, 47		// TISNUM shifted to bit 47, OR-ed onto the index to form the key.
++    |1:  // Traverse array part.
++    |  bleu TMP0, RC, >5			// Index points after array part?
++    |   slliw TMP3, RC, 3
++    |  add TMP3, TMP1, TMP3
++    |  ld CARG1, 0(TMP3)		// array[index].
++    |     lhu RD, -4+OFS_RD(PC)		// ITERL RD
++    |   or TMP2, RC, CARG3		// Key built from the current index.
++    |   addiw RC, RC, 1
++    |  beq CARG1, TISNIL, <1		// Skip holes in array part.
++    |   sd TMP2, 0(RA)		// Store key.
++    |  sd CARG1, 8(RA)		// Store value.
++    |     lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |     decode_BC4b RD
++    |     add RD, RD, TMP3
++    |   sw RC, -8(RA)		// Update control var.
++    |     add PC, PC, RD		// Take the ITERL branch.
++    |3:
++    |  ins_next
++    |
++    |5:  // Traverse hash part.
++    |  lw TMP1, TAB:RB->hmask
++    |  subw RC, RC, TMP0		// Hash index = control index - asize.
++    |   ld TMP2, TAB:RB->node
++    |6:
++    |  bltu TMP1, RC, <3		// End of iteration? Branch to ITERL+1.
++    |   slliw TMP3, RC, 5
++    |   slliw RB, RC, 3
++    |   subw TMP3, TMP3, RB		// idx*32 - idx*8 = idx*24.
++    |  add NODE:TMP3, TMP3, TMP2	// node = tab->node + (idx*32-idx*8)
++    |  ld CARG1, 0(NODE:TMP3)		// Loaded from offset 0 of the node; treated as the value below.
++    |     lhu RD, -4+OFS_RD(PC)		// ITERL RD
++    |   addiw RC, RC, 1
++    |  beq CARG1, TISNIL, <6		// Skip holes in hash part.
++    |  ld CARG2, NODE:TMP3->key
++    |     lui TMP3, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  sd CARG1, 8(RA)		// Store value.
++    |    addw RC, RC, TMP0		// Convert back to a control index (hash index + asize).
++    |     decode_BC4b RD
++    |     addw RD, RD, TMP3
++    |  sd CARG2, 0(RA)		// Store key.
++    |     add PC, PC, RD		// Take the ITERL branch.
++    |   sw RC, -8(RA)                // Update control var.
++    |  j <3
++    break;
++
++  case BC_ISNEXT:
++    |  // RA = base*8, RD = target (points to ITERN)
++    |  add RA, BASE, RA
++    |    srliw TMP0, RD, 1
++    |  ld CFUNC:CARG1, -24(RA)		// Iterator function slot.
++    |    add TMP0, PC, TMP0
++    |   ld CARG2, -16(RA)		// State slot.
++    |   ld CARG3, -8(RA)		// Control variable slot.
++    |    lui TMP2, (-(BCBIAS_J*4 >> 12)) & 0xfffff		// -BCBIAS_J*4
++    |  checkfunc CFUNC:CARG1, >5
++    |  gettp CARG2, CARG2
++    |  addi CARG2, CARG2, -LJ_TTAB		// Zero iff the state is tagged LJ_TTAB.
++    |  lbu TMP1, CFUNC:CARG1->ffid
++    |  addi CARG3, CARG3, -LJ_TNIL		// Zero iff the control variable is nil.
++    |  or TMP3, CARG2, CARG3
++    |  addi TMP1, TMP1, -FF_next_N		// Zero iff ffid == FF_next_N.
++    |  or TMP3, TMP3, TMP1		// TMP3 == 0 only if all three checks pass.
++    |   lui TMP1, ((LJ_KEYINDEX - (((LJ_KEYINDEX & 0xfff)^0x800) - 0x800)) >> 12) & 0xfffff
++    |  bnez TMP3, >5
++    |  add PC, TMP0, TMP2		// Jump to the target.
++    |  addi TMP1, TMP1, (((LJ_KEYINDEX & 0xfff)^0x800) - 0x800)		// lui+addi pair materializes LJ_KEYINDEX.
++    |  slli TMP1, TMP1, 32		// LJ_KEYINDEX in the upper word, zero in the lower word.
++    |  sd TMP1, -8(RA)		// Initialize the control variable.
++    |1:
++    |  ins_next
++    |5:  // Despecialize bytecode if any of the checks fail.
++    |  li TMP3, BC_JMP
++    |   li TMP1, BC_ITERC
++    |  sb TMP3, -4+OFS_OP(PC)		// Patch this instruction's opcode to BC_JMP.
++    |   add PC, TMP0, TMP2		// Jump to the target.
++    |.if JIT
++    |  lb TMP0, OFS_OP(PC)
++    |  li TMP3, BC_ITERN
++    |  lhu TMP2, OFS_RD(PC)
++    |  bne TMP0, TMP3, >6		// Target opcode is not BC_ITERN: handle at label 6.
++    |.endif
++    |  sb TMP1, OFS_OP(PC)		// Patch the target opcode to BC_ITERC.
++    |  j <1
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  ld TMP0, GL_J(trace)(GL)	// Assumes J.trace in-reach relative to GL.
++    |  slliw TMP2, TMP2, 3
++    |  add TMP0, TMP0, TMP2
++    |  ld TRACE:TMP2, 0(TMP0)		// Trace entry selected by the target's RD operand.
++    |  lw TMP0, TRACE:TMP2->startins
++    |  andi TMP0, TMP0, -256		// Keep everything but the low opcode byte.
++    |  or TMP0, TMP0, TMP1		// Insert BC_ITERC as the opcode.
++    |  sw TMP0, 0(PC)
++    |  j <1
++    |.endif
++    break;
++
++  case BC_VARG:
++    |  // RA = base*8, RB = (nresults+1)*8, RC = numparams*8
++    |  ld TMP0, FRAME_PC(BASE)
++    |  decode_RDtoRC8 RC, RD
++    |   decode_RB8 RB, INS
++    |  add RC, BASE, RC
++    |   add RA, BASE, RA
++    |  addi RC, RC, FRAME_VARG
++    |   add TMP2, RA, RB
++    |  addi TMP3, BASE, -16		// TMP3 = vtop
++    |  sub RC, RC, TMP0		// RC = vbase
++    |  // Note: RC may now be even _above_ BASE if nargs was < numparams.
++    |   sub TMP1, TMP3, RC		// Byte size of the vararg area (may be <= 0).
++    |  beqz RB, >5			// Copy all varargs?
++    |  addi TMP2, TMP2, -16		// Address of the last destination slot.
++    |1:  // Copy vararg slots to destination slots.
++    |  ld CARG1, 0(RC)
++    |  sltu TMP0, RC, TMP3		// Source still below vtop?
++    |    addi RC, RC, 8
++    |  bnez TMP0, >2
++    |  mv CARG1, TISNIL		// Past the varargs: store nil instead.
++    |2:
++    |  sd CARG1, 0(RA)
++    |  sltu TMP0, RA, TMP2
++    |  addi RA, RA, 8
++    |  bnez TMP0, <1
++    |3:
++    |  ins_next
++    |
++    |5:  // Copy all varargs.
++    |  ld TMP0, L->maxstack
++    |   li MULTRES, 8		// MULTRES = (0+1)*8
++    |  blez TMP1, <3			// No vararg slots?
++    |  add TMP2, RA, TMP1		// End of the destination area.
++    |   addi MULTRES, TMP1, 8		// MULTRES = vararg byte size + 8.
++    |  bltu TMP0, TMP2, >7		// Destination would exceed L->maxstack: grow the stack.
++    |6:
++    |  ld CRET1, 0(RC)
++    |   addi RC, RC, 8
++    |  sd CRET1, 0(RA)
++    |   addi RA, RA, 8
++    |  bltu RC, TMP3, <6			// More vararg slots?
++    |  j <3
++    |
++    |7:  // Grow stack for varargs.
++    |   sd RA, L->top
++    |  sub RA, RA, BASE		// Keep RA as an offset from BASE across the call.
++    |   sd BASE, L->base
++    |  sub BASE, RC, BASE		// Need delta, because BASE may change.
++    |   sd PC, SAVE_PC(sp)
++    |  srliw CARG2, TMP1, 3		// n = number of vararg slots.
++    |   mv CARG1, L
++    |  call_intern BC_VARG, lj_state_growstack	// (lua_State *L, int n)
++    |  mv RC, BASE
++    |  ld BASE, L->base		// Reload BASE, then rebuild RA, RC and vtop from it.
++    |  add RA, BASE, RA
++    |  add RC, BASE, RC
++    |  addi TMP3, BASE, -16
++    |  j <6
++    break;
++
++  /* -- Returns ----------------------------------------------------------- */
++
++  case BC_RETM:
++    |  // RA = results*8, RD = extra_nresults*8
++    |  addw RD, RD, MULTRES		// Add MULTRES to the extra result count.
++    |  j ->BC_RET_Z1			// Continue in the shared BC_RET code.
++    break;
++
++  case BC_RET:
++    |  // RA = results*8, RD = (nresults+1)*8
++    |->BC_RET_Z1:
++    |  ld PC, FRAME_PC(BASE)
++    |   add RA, BASE, RA
++    |    mv MULTRES, RD
++    |1:
++    |  andi TMP0, PC, FRAME_TYPE
++    |   xori TMP1, PC, FRAME_VARG
++    |  bnez TMP0, ->BC_RETV_Z		// Non-zero frame type bits: non-standard return.
++    |
++    |->BC_RET_Z:
++    |  // BASE = base, RA = resultptr, RD = (nresults+1)*8, PC = return
++    |   lw INS, -4(PC)		// Instruction preceding the return PC.
++    |    addi TMP2, BASE, -16		// Destination for the first result.
++    |    addi RC, RD, -8		// nresults*8.
++    |  decode_RA8 TMP0, INS
++    |   decode_RB8 RB, INS
++    |   sub BASE, TMP2, TMP0		// New BASE, from that instruction's RA operand.
++    |   add TMP3, TMP2, RB
++    |  beqz RC, >3			// No results to copy.
++    |2:
++    |   ld CRET1, 0(RA)
++    |    addi RA, RA, 8
++    |  addi RC, RC, -8
++    |   sd CRET1, 0(TMP2)
++    |    addi TMP2, TMP2, 8
++    |  bnez RC, <2
++    |3:
++    |  addi TMP3, TMP3, -8		// End of the result slots requested by that instruction's RB operand.
++    |5:
++    |  bltu TMP2, TMP3, >6		// Fewer results copied than requested: nil-fill.
++    |   ld LFUNC:TMP1, FRAME_FUNC(BASE)
++    |  cleartp LFUNC:TMP1
++    |  ld TMP1, LFUNC:TMP1->pc
++    |  ld KBASE, PC2PROTO(k)(TMP1)		// Set up KBASE for the function being returned to.
++    |  ins_next
++    |
++    |6:  // Fill up results with nil.
++    |  sd TISNIL, 0(TMP2)
++    |   addi TMP2, TMP2, 8
++    |  j <5
++    |
++    |->BC_RETV_Z:  // Non-standard return case.
++    |  andi TMP2, TMP1, FRAME_TYPEP
++    |  bxnez TMP2, ->vm_return
++    |  // Return from vararg function: relocate BASE down.
++    |  sub BASE, BASE, TMP1
++    |   ld PC, FRAME_PC(BASE)
++    |  j <1
++    break;
++
++  case BC_RET0: case BC_RET1:
++    |  // RA = results*8, RD = (nresults+1)*8
++    |  ld PC, FRAME_PC(BASE)
++    |   add RA, BASE, RA
++    |    mv MULTRES, RD
++    |  andi TMP0, PC, FRAME_TYPE
++    |   xori TMP1, PC, FRAME_VARG
++    |  bnez TMP0, ->BC_RETV_Z
++    |  lw INS, -4(PC)
++    |   addi TMP2, BASE, -16
++    if (op == BC_RET1) {
++      |  ld CRET1, 0(RA)
++    }
++    |  decode_RB8 RB, INS
++    |   decode_RA8 RA, INS
++    |   sub BASE, TMP2, RA
++    if (op == BC_RET1) {
++      |  sd CRET1, 0(TMP2)
++    }
++    |5:
++    |  bltu RD, RB, >6
++    |   ld TMP1, FRAME_FUNC(BASE)
++    |  cleartp LFUNC:TMP1
++    |  ld TMP1, LFUNC:TMP1->pc
++    |  ins_next1
++    |  ld KBASE, PC2PROTO(k)(TMP1)
++    |  ins_next2
++    |
++    |6:  // Fill up results with nil.
++    |  addi TMP2, TMP2, 8
++    |  addi RD, RD, 8
++    if (op == BC_RET1) {
++      |  sd TISNIL, 0(TMP2)
++    } else {
++      |  sd TISNIL, -8(TMP2)
++    }
++    |  j <5
++    break;
++
++  /* -- Loops and branches ------------------------------------------------ */
++
++  case BC_FORL:
++    |.if JIT
++    |  hotloop
++    |.endif
++    |  // Fall through. Assumes BC_IFORL follows.
++    break;
++
++  case BC_JFORI:
++  case BC_JFORL:
++#if !LJ_HASJIT
++    break;
++#endif
++  case BC_FORI:
++  case BC_IFORL:
++    |  // RA = base*8, RD = target (after end of loop or start of loop)
++    vk = (op == BC_IFORL || op == BC_JFORL);
++    |  add RA, BASE, RA
++    |  ld CARG1, FORL_IDX*8(RA)		// CARG1 = IDX
++    |   ld CARG2, FORL_STEP*8(RA)		// CARG2 = STEP
++    |    ld CARG3, FORL_STOP*8(RA)		// CARG3 = STOP
++    |  gettp CARG4, CARG1
++    |   gettp CARG5, CARG2
++    |    gettp CARG6, CARG3
++    if (op != BC_JFORL) {
++      |  srliw RD, RD, 1
++      |  lui TMP2, (-(BCBIAS_J*4 >> 12)) & 0xfffff	// -BCBIAS_J<<2
++      |  add TMP2, RD, TMP2
++    }
++    |  bne CARG4, TISNUM, >3
++    |   sext.w CARG4, CARG1		// start
++    |   sext.w CARG3, CARG3		// stop
++    if (!vk) {				// init
++      |  bxne CARG6, TISNUM, ->vmeta_for
++      |  bxne CARG5, TISNUM, ->vmeta_for
++      |   bfextri TMP0, CARG2, 31, 31	// sign
++      |  slt CARG2, CARG3, CARG4
++      |  slt TMP1, CARG4, CARG3
++      |  neg TMP4, TMP0
++      |  and TMP1, TMP1, TMP4
++      |  not TMP4, TMP4
++      |  and CARG2, CARG2, TMP4
++      |  or CARG2, CARG2, TMP1		// CARG2=0: +,start <= stop or -,start >= stop
++    } else {
++      |  sext.w CARG5, CARG2		// step
++      |  addw CARG1, CARG4, CARG5	// start + step
++      |  xor TMP3, CARG1, CARG4		// y^a
++      |  xor TMP1, CARG1, CARG5		// y^b
++      |  and TMP3, TMP3, TMP1
++      |  slt TMP1, CARG1, CARG3		// start+step < stop ?
++      |  slt CARG3, CARG3, CARG1	// stop < start+step ?
++      |  sltz TMP0, CARG5		// step < 0 ?
++      |   sltz TMP3, TMP3		// ((y^a) & (y^b)) < 0: overflow.
++      |  neg TMP4, TMP0
++      |  and TMP1, TMP1, TMP4
++      |  not TMP4, TMP4
++      |  and CARG3, CARG3, TMP4
++      |  or CARG3, CARG3, TMP1
++      |  or CARG2, CARG3, TMP3		// CARG2=1: overflow; CARG2=0: continue
++      |  zext.w CARG1, CARG1
++      |  settp_b CARG1, TISNUM
++      |  sd CARG1, FORL_IDX*8(RA)
++    }
++    |1:
++    if (op == BC_FORI) {
++      |  neg TMP4, CARG2	// CARG2!=0: jump out the loop; CARG2==0: next INS
++      |  and TMP2, TMP2, TMP4
++      |  add PC, PC, TMP2
++    } else if (op == BC_JFORI) {
++      |  add PC, PC, TMP2
++      |  lhu RD, -4+OFS_RD(PC)
++    } else if (op == BC_IFORL) {
++      |  addi TMP4, CARG2, -1	// CARG2!=0: next INS; CARG2==0: jump back
++      |  and TMP2, TMP2, TMP4
++      |  add PC, PC, TMP2
++    }
++    |  ins_next1
++    |  sd CARG1, FORL_EXT*8(RA)
++    |2:
++    if (op == BC_JFORI) {
++      |  decode_RD8b RD
++      |  beqz CARG2, =>BC_JLOOP		// CARG2 == 0: execute the loop
++    } else if (op == BC_JFORL) {
++      |  beqz CARG2, =>BC_JLOOP
++    }
++    |  ins_next2
++    |
++    |3:  // FP loop.
++    |  fld FTMP0, FORL_IDX*8(RA)	// start
++    |  fld FTMP1, FORL_STOP*8(RA)	// stop
++    |  ld TMP0, FORL_STEP*8(RA)	// step
++    |  sltz CARG2, TMP0		// step < 0 ?
++    |  neg CARG2, CARG2
++    if (!vk) {
++      |  sltiu TMP3, CARG4, LJ_TISNUM	// start is number ?
++      |  sltiu TMP0, CARG5, LJ_TISNUM	// step is number ?
++      |  sltiu TMP1, CARG6, LJ_TISNUM	// stop is number ?
++      |  and TMP3, TMP3, TMP1
++      |  and TMP0, TMP0, TMP3
++      |  bxeqz TMP0, ->vmeta_for		// if start or step or stop isn't number
++      |  flt.d TMP3, FTMP0, FTMP1		// start < stop ?
++      |  flt.d TMP4, FTMP1, FTMP0		// stop < start ?
++      |  and TMP3, TMP3, CARG2
++      |  not CARG2, CARG2
++      |  and TMP4, TMP4, CARG2
++      |  or CARG2, TMP3, TMP4	// CARG2=0:+,start<stop or -,start>stop
++      |  j <1
++    } else {
++      |  fld FTMP3, FORL_STEP*8(RA)
++      |  fadd.d FTMP0, FTMP0, FTMP3		// start + step
++      |  flt.d TMP3, FTMP0, FTMP1		// start + step < stop ?
++      |  flt.d TMP4, FTMP1, FTMP0
++      |  and TMP3, TMP3, CARG2
++      |  not CARG2, CARG2
++      |  and TMP4, TMP4, CARG2
++      |  or CARG2, TMP3, TMP4
++      if (op == BC_IFORL) {
++  |  addi TMP3, CARG2, -1
++  |  and TMP2, TMP2, TMP3
++	|  add PC, PC, TMP2
++      }
++      |  fsd FTMP0, FORL_IDX*8(RA)
++      |  ins_next1
++      |  fsd FTMP0, FORL_EXT*8(RA)
++      |  j <2
++    }
++    break;
++
++  case BC_ITERL:
++    |.if JIT
++    |  hotloop
++    |.endif
++    |  // Fall through. Assumes BC_IITERL follows.
++    break;
++
++  case BC_JITERL:
++#if !LJ_HASJIT
++    break;
++#endif
++  case BC_IITERL:
++    |  // RA = base*8, RD = target
++    |  add RA, BASE, RA
++    |  ld TMP1, 0(RA)
++    |  beq TMP1, TISNIL, >1		// Stop if iterator returned nil.
++    if (op == BC_JITERL) {
++      |   sd TMP1,-8(RA)
++      |  j =>BC_JLOOP
++    } else {
++      |  branch_RD			// Otherwise save control var + branch.
++      |  sd TMP1, -8(RA)
++    }
++    |1:
++    |  ins_next
++    break;
++
++  case BC_LOOP:
++    |  // RA = base*8, RD = target (loop extent)
++    |  // Note: RA/RD is only used by trace recorder to determine scope/extent
++    |  // This opcode does NOT jump, its only purpose is to detect a hot loop.
++    |.if JIT
++    |  hotloop
++    |.endif
++    |  // Fall through. Assumes BC_ILOOP follows.
++    break;
++
++  case BC_ILOOP:
++    |  // RA = base*8, RD = target (loop extent)
++    |  ins_next
++    break;
++
++  case BC_JLOOP:
++    |.if JIT
++    |  // RA = base*8 (ignored), RD = traceno*8
++    |  ld TMP0, GL_J(trace)(GL)	// Assumes J.trace in-reach relative to GL.
++    |  add TMP0, TMP0, RD
++    |  // Traces on RISC-V don't store the trace number, so use 0.
++    |  sd x0, GL->vmstate
++    |  ld TRACE:TMP1, 0(TMP0)
++    |  sd BASE, GL->jit_base	// store Current JIT code L->base
++    |  ld TMP1, TRACE:TMP1->mcode
++    |  sd L, GL->tmpbuf.L
++    |  jr TMP1
++    |.endif
++    break;
++
++  case BC_JMP:
++    |  // RA = base*8 (only used by trace recorder), RD = target
++    |  branch_RD		// PC + (jump - 0x8000)<<2
++    |  ins_next
++    break;
++
++  /* -- Function headers -------------------------------------------------- */
++
++  case BC_FUNCF:
++    |.if JIT
++    |  hotcall
++    |.endif
++  case BC_FUNCV:  /* NYI: compiled vararg functions. */
++    |  // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow.
++    break;
++
++  case BC_JFUNCF:
++#if !LJ_HASJIT
++    break;
++#endif
++  case BC_IFUNCF:
++    |  // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8
++    |  ld TMP2, L->maxstack
++    |   lbu TMP1, -4+PC2PROTO(numparams)(PC)
++    |    ld KBASE, -4+PC2PROTO(k)(PC)
++    |  bxltu TMP2, RA, ->vm_growstack_l
++    |   slliw TMP1, TMP1, 3			// numparams*8
++    |2:
++    |  bltu NARGS8:RC, TMP1, >3		// Check for missing parameters.
++    if (op == BC_JFUNCF) {
++      |  decode_RD8 RD, INS
++      |  j =>BC_JLOOP
++    } else {
++      |  ins_next
++    }
++    |
++    |3:  // Clear missing parameters.
++    |  add TMP0, BASE, NARGS8:RC
++    |  sd TISNIL, 0(TMP0)
++    |   addiw NARGS8:RC, NARGS8:RC, 8
++    |  j <2
++    break;
++
++  case BC_JFUNCV:
++#if !LJ_HASJIT
++    break;
++#endif
++    |  NYI  // NYI: compiled vararg functions
++    break;  /* NYI: compiled vararg functions. */
++
++  case BC_IFUNCV:
++    |  // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8
++    |   li TMP0, LJ_TFUNC
++    |   add TMP1, BASE, RC
++    |  ld TMP2, L->maxstack
++    |   settp LFUNC:RB, TMP0
++    |  add TMP0, RA, RC
++    |   sd LFUNC:RB, 0(TMP1)		// Store (tagged) copy of LFUNC.
++    |  addi TMP2, TMP2, -8
++    |   addi TMP3, RC, 16+FRAME_VARG
++    |    ld KBASE, -4+PC2PROTO(k)(PC)
++    |   sd TMP3, 8(TMP1)                // Store delta + FRAME_VARG.
++    |  bxgeu TMP0, TMP2, ->vm_growstack_l
++    |  lbu TMP2, -4+PC2PROTO(numparams)(PC)
++    |   mv RA, BASE
++    |   mv RC, TMP1
++    |  ins_next1
++    |   addi BASE, TMP1, 16
++    |  beqz TMP2, >2
++    |1:
++    |  ld TMP0, 0(RA)
++    |  sltu CARG2, RA, RC			// Less args than parameters?
++    |  mv CARG1, TMP0
++    |    addi RA, RA, 8
++    |    addi TMP1, TMP1, 8
++    |    addiw TMP2, TMP2, -1
++    |  beqz CARG2, >3
++    |  neg TMP4, CARG2		// Clear old fixarg slot (help the GC).
++    |  and TMP3, TISNIL, TMP4
++    |  not TMP4, TMP4
++    |  and CARG1, CARG1, TMP4
++    |  or CARG1, CARG1, TMP3
++    |  sd CARG1, -8(RA)
++    |  sd TMP0, 8(TMP1)
++    |  bnez TMP2, <1
++    |2:
++    |  ins_next2
++    |3:
++    |  neg TMP4, CARG2		// Clear missing fixargs.
++    |  and TMP0, TMP0, TMP4
++    |  not TMP4, TMP4
++    |  and TMP3, TISNIL, TMP4
++    |  or TMP0, TMP0, TMP3
++    |  sd TMP0, 8(TMP1)
++    |  bnez TMP2, <1
++    |  j <2
++    break;
++
++  case BC_FUNCC:
++  case BC_FUNCCW:
++    |  // BASE = new base, RA = BASE+framesize*8, RB = CFUNC, RC = nargs*8
++    if (op == BC_FUNCC) {
++      |  ld CARG4, CFUNC:RB->f
++    } else {
++      |  ld CARG4, GL->wrapf
++    }
++    |  add TMP1, RA, NARGS8:RC
++    |  ld TMP2, L->maxstack
++    |   add RC, BASE, NARGS8:RC
++    |  sd BASE, L->base		// base of currently executing function
++    |   sd RC, L->top
++    |  bxgtu TMP1, TMP2, ->vm_growstack_c	// Need to grow stack.
++    |    li_vmstate C			// li TMP0, ~LJ_VMST_C
++    if (op == BC_FUNCCW) {
++      |  ld CARG2, CFUNC:RB->f
++    }
++    |   mv CARG1, L
++    |    st_vmstate			// sw TMP0, GL->vmstate
++    |  jalr CARG4		// (lua_State *L [, lua_CFunction f])
++    |  // Returns nresults.
++    |  ld BASE, L->base
++    |  ld TMP1, L->top
++    |  sd L, GL->cur_L
++    |   slliw RD, CRET1, 3
++    |    li_vmstate INTERP
++    |  ld PC, FRAME_PC(BASE)		// Fetch PC of caller.
++    |  sub RA, TMP1, RD		// RA = L->top - nresults*8
++    |    st_vmstate
++    |  j ->vm_returnc
++    break;
++
++  /* ---------------------------------------------------------------------- */
++
++  default:
++    fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
++    exit(2);
++    break;
++  }
++}
++
++static int build_backend(BuildCtx *ctx)
++{
++  int op;
++
++  dasm_growpc(Dst, BC__MAX);	/* Reserve BC__MAX pc labels (used as =>BC_* in this file). */
++
++  build_subroutines(ctx);	/* Emitted before the per-opcode code below. */
++
++  |.code_op
++  for (op = 0; op < BC__MAX; op++)	/* Visit every opcode number in order. */
++    build_ins(ctx, (BCOp)op, op);
++
++  return BC__MAX;	/* Caller receives the opcode count. */
++}
++
++/* Emit pseudo frame-info (.debug_frame and .eh_frame) for all assembler functions; only BUILD_elfasm emits anything. */
++static void emit_asm_debug(BuildCtx *ctx)
++{
++  int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);	/* Code offset of lj_vm_ffi_call. */
++  int i;
++  switch (ctx->mode) {
++  case BUILD_elfasm:
++    fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
++    fprintf(ctx->fp,
++	".Lframe0:\n"
++	"\t.4byte .LECIE0-.LSCIE0\n"
++	".LSCIE0:\n"
++	"\t.4byte 0xffffffff\n"
++	"\t.byte 0x1\n"
++	"\t.string \"\"\n"
++	"\t.uleb128 0x1\n"
++	"\t.sleb128 -4\n"
++	"\t.byte 1\n"				/* Return address is in ra. */
++	"\t.byte 0xc\n\t.uleb128 2\n\t.uleb128 0\n"	/* def_cfa sp 0 */
++	"\t.align 3\n"
++	".LECIE0:\n\n");
++    fprintf(ctx->fp,
++	".LSFDE0:\n"
++	"\t.4byte .LEFDE0-.LASFDE0\n"
++	".LASFDE0:\n"
++	"\t.4byte .Lframe0\n"
++	"\t.8byte .Lbegin\n"
++	"\t.8byte %d\n"
++	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset CFRAME_SIZE */
++	"\t.byte 0x81\n\t.uleb128 2*6\n"	/* offset ra */,
++	fcofs, CFRAME_SIZE);
++    for (i = 27; i >= 18; i--)	/* offset x27-x18 (s11-s2) */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2*(27-i+7));
++    fprintf(ctx->fp,
++	"\t.byte 0x89\n\t.uleb128 2*17\n"	/* offset x9 (s1) */
++	"\t.byte 0x88\n\t.uleb128 2*18\n"	/* offset x8 (s0/fp) */);
++    for (i = 27; i >= 18; i--)	/* offset f27-f18 (fs11-fs2) */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 2*(27-i+19));
++    fprintf(ctx->fp,
++	"\t.byte 0x89+32\n\t.uleb128 2*29\n"	/* offset f9 (fs1) */
++	"\t.byte 0x88+32\n\t.uleb128 2*30\n"	/* offset f8 (fs0) */
++	"\t.align 3\n"
++	".LEFDE0:\n\n");
++#if LJ_HASFFI
++    fprintf(ctx->fp,
++	".LSFDE1:\n"
++	"\t.4byte .LEFDE1-.LASFDE1\n"
++	".LASFDE1:\n"
++	"\t.4byte .Lframe0\n"
++	"\t.8byte lj_vm_ffi_call\n"		/* Address-sized, same as .LSFDE0. */
++	"\t.8byte %d\n"				/* Address-sized range. */
++	"\t.byte 0x81\n\t.uleb128 2*1\n"	/* offset ra */
++	"\t.byte 0x92\n\t.uleb128 2*2\n"	/* offset x18 */
++	"\t.byte 0xd\n\t.uleb128 0x12\n"	/* def_cfa_register x18 */
++	"\t.align 3\n"
++	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
++#endif
++#if !LJ_NO_UNWIND
++    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
++    fprintf(ctx->fp,
++	".Lframe1:\n"
++	"\t.4byte .LECIE1-.LSCIE1\n"
++	".LSCIE1:\n"
++	"\t.4byte 0\n"
++	"\t.byte 0x1\n"
++	"\t.string \"zPR\"\n"
++	"\t.uleb128 0x1\n"
++	"\t.sleb128 -4\n"
++	"\t.byte 1\n"				/* Return address is in ra. */
++	"\t.uleb128 6\n"			/* augmentation length */
++	"\t.byte 0x1b\n"
++	"\t.4byte lj_err_unwind_dwarf-.\n"
++	"\t.byte 0x1b\n"
++	"\t.byte 0xc\n\t.uleb128 2\n\t.uleb128 0\n"	/* def_cfa sp 0 */
++	"\t.align 2\n"
++	".LECIE1:\n\n");
++    fprintf(ctx->fp,
++	".LSFDE2:\n"
++	"\t.4byte .LEFDE2-.LASFDE2\n"
++	".LASFDE2:\n"
++	"\t.4byte .LASFDE2-.Lframe1\n"
++	"\t.4byte .Lbegin-.\n"
++	"\t.4byte %d\n"
++	"\t.uleb128 0\n"			/* augmentation length */
++	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset CFRAME_SIZE */
++	"\t.byte 0x81\n\t.uleb128 2*6\n",	/* offset ra */
++	fcofs, CFRAME_SIZE);
++    for (i = 27; i >= 18; i--)	/* offset x27-x18 (s11-s2) */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2*(27-i+7));
++    fprintf(ctx->fp,
++	"\t.byte 0x89\n\t.uleb128 2*17\n"	/* offset x9 (s1) */
++	"\t.byte 0x88\n\t.uleb128 2*18\n"	/* offset x8 (s0/fp) */);
++    for (i = 27; i >= 18; i--)	/* offset f27-f18 (fs11-fs2) */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 2*(27-i+19));
++    fprintf(ctx->fp,
++	"\t.byte 0x89+32\n\t.uleb128 2*29\n"	/* offset f9 (fs1) */
++	"\t.byte 0x88+32\n\t.uleb128 2*30\n"	/* offset f8 (fs0) */
++	"\t.align 2\n"
++	".LEFDE2:\n\n");
++#if LJ_HASFFI
++    fprintf(ctx->fp,
++	".Lframe2:\n"
++	"\t.4byte .LECIE2-.LSCIE2\n"
++	".LSCIE2:\n"
++	"\t.4byte 0\n"
++	"\t.byte 0x1\n"
++	"\t.string \"zR\"\n"
++	"\t.uleb128 0x1\n"
++	"\t.sleb128 -4\n"
++	"\t.byte 1\n"				/* Return address is in ra. */
++	"\t.uleb128 1\n"			/* augmentation length */
++	"\t.byte 0x1b\n"
++	"\t.byte 0xc\n\t.uleb128 2\n\t.uleb128 0\n"	/* def_cfa sp 0 */
++	"\t.align 2\n"
++	".LECIE2:\n\n");
++    fprintf(ctx->fp,
++	".LSFDE3:\n"
++	"\t.4byte .LEFDE3-.LASFDE3\n"
++	".LASFDE3:\n"
++	"\t.4byte .LASFDE3- .Lframe2\n"
++	"\t.4byte lj_vm_ffi_call-.\n"
++	"\t.4byte %d\n"
++	"\t.uleb128 0\n"			/* augmentation length */
++	"\t.byte 0x81\n\t.uleb128 2*1\n"	/* offset ra */
++	"\t.byte 0x92\n\t.uleb128 2*2\n"	/* offset x18 */
++	"\t.byte 0xd\n\t.uleb128 0x12\n"	/* def_cfa_register x18 */
++	"\t.align 2\n"
++	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
++#endif
++#endif
++    break;
++  default:
++    break;
++  }
++}
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_x64.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_x64.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_x64.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for x64 CPUs in LJ_GC64 mode.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |
+ |.arch x64
+ |.section code_op, code_sub
+@@ -359,9 +359,6 @@
+ |.macro sseconst_1, reg, tmp		// Synthesize 1.0.
+ |  sseconst_hi reg, tmp, 3ff00000
+ |.endmacro
+-|.macro sseconst_m1, reg, tmp		// Synthesize -1.0.
+-|  sseconst_hi reg, tmp, bff00000
+-|.endmacro
+ |.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
+ |  sseconst_hi reg, tmp, 43300000
+ |.endmacro
+@@ -1346,44 +1343,28 @@ static void build_subroutines(BuildCtx *
+   |.ffunc_1 next
+   |  je >2				// Missing 2nd arg?
+   |1:
+-  |.if X64WIN
+-  |  mov RA, [BASE]
+-  |  checktab RA, ->fff_fallback
+-  |.else
+-  |  mov CARG2, [BASE]
+-  |  checktab CARG2, ->fff_fallback
+-  |.endif
+-  |  mov L:RB, SAVE_L
+-  |  mov L:RB->base, BASE		// Add frame since C call can throw.
+-  |  mov L:RB->top, BASE		// Dummy frame length is ok.
++  |  mov CARG1, [BASE]
+   |  mov PC, [BASE-8]
++  |  checktab CARG1, ->fff_fallback
++  |  mov RB, BASE			// Save BASE.
+   |.if X64WIN
+-  |  lea CARG3, [BASE+8]
+-  |  mov CARG2, RA			// Caveat: CARG2 == BASE.
+-  |  mov CARG1, L:RB
++  |  lea CARG3, [BASE-16]
++  |  lea CARG2, [BASE+8]		// Caveat: CARG2 == BASE.
+   |.else
+-  |  lea CARG3, [BASE+8]		// Caveat: CARG3 == BASE.
+-  |  mov CARG1, L:RB
++  |  lea CARG2, [BASE+8]
++  |  lea CARG3, [BASE-16]		// Caveat: CARG3 == BASE.
+   |.endif
+-  |  mov SAVE_PC, PC			// Needed for ITERN fallback.
+-  |  call extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
+-  |  // Flag returned in eax (RD).
+-  |  mov BASE, L:RB->base
+-  |  test RDd, RDd;  jz >3		// End of traversal?
+-  |  // Copy key and value to results.
+-  |  mov RB, [BASE+8]
+-  |  mov RD, [BASE+16]
+-  |  mov [BASE-16], RB
+-  |  mov [BASE-8], RD
+-  |->fff_res2:
+-  |  mov RDd, 1+2
+-  |  jmp ->fff_res
++  |  call extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |  // 1=found, 0=end, -1=error returned in eax (RD).
++  |  mov BASE, RB			// Restore BASE.
++  |  test RDd, RDd;  jg ->fff_res2	// Found key/value.
++  |  js ->fff_fallback_2		// Invalid key.
++  |  // End of traversal: return nil.
++  |  mov aword [BASE-16], LJ_TNIL
++  |  jmp ->fff_res1
+   |2:  // Set missing 2nd arg to nil.
+   |  mov aword [BASE+8], LJ_TNIL
+   |  jmp <1
+-  |3:  // End of traversal: return nil.
+-  |  mov aword [BASE-16], LJ_TNIL
+-  |  jmp ->fff_res1
+   |
+   |.ffunc_1 pairs
+   |  mov TAB:RB, [BASE]
+@@ -1432,7 +1413,9 @@ static void build_subroutines(BuildCtx *
+   |  // Copy array slot.
+   |  mov RB, [RD]
+   |  mov [BASE-8], RB
+-  |  jmp ->fff_res2
++  |->fff_res2:
++  |  mov RDd, 1+2
++  |  jmp ->fff_res
+   |2:  // Check for empty hash part first. Otherwise call C function.
+   |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
+   |.if X64WIN
+@@ -2011,7 +1994,7 @@ static void build_subroutines(BuildCtx *
+   |.endif
+   |   mov RC, SBUF:CARG1->b
+   |   mov SBUF:CARG1->L, L:RB
+-  |   mov SBUF:CARG1->p, RC
++  |   mov SBUF:CARG1->w, RC
+   |  mov SAVE_PC, PC
+   |  call extern lj_buf_putstr_ .. name
+   |  mov CARG1, rax
+@@ -2470,7 +2453,7 @@ static void build_subroutines(BuildCtx *
+   |  mov r12, [RA]
+   |  mov rsp, RA			// Reposition stack to C frame.
+   |.endif
+-  |  test RDd, RDd; js >9		// Check for error from exit.
++  |  cmp RDd, -LUA_ERRERR; jae >9	// Check for error from exit.
+   |  mov L:RB, SAVE_L
+   |  mov MULTRES, RDd
+   |  mov LFUNC:KBASE, [BASE-16]
+@@ -2486,6 +2469,8 @@ static void build_subroutines(BuildCtx *
+   |  movzx OP, RCL
+   |  add PC, 4
+   |  shr RCd, 16
++  |  cmp MULTRES, -17			// Static dispatch?
++  |  je >5
+   |  cmp OP, BC_FUNCF			// Function header?
+   |  jb >3
+   |  cmp OP, BC_FUNCC+2			// Fast function?
+@@ -2508,9 +2493,20 @@ static void build_subroutines(BuildCtx *
+   |  mov KBASE, [KBASE+PC2PROTO(k)]
+   |  jmp <2
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  mov RA, [DISPATCH+DISPATCH_J(trace)]
++  |  mov TRACE:RA, [RA+RD*8]
++  |  mov RCd, TRACE:RA->startins
++  |  movzx RAd, RCH
++  |  movzx OP, RCL
++  |  shr RCd, 16
++  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]
++  |
+   |9:  // Rethrow error from the right C frame.
++  |  mov CARG2d, RDd
+   |  mov CARG1, L:RB
+-  |  call extern lj_err_run		// (lua_State *L)
++  |  neg CARG2d
++  |  call extern lj_err_trace		// (lua_State *L, int errcode)
+   |.endif
+   |
+   |//-----------------------------------------------------------------------
+@@ -2542,15 +2538,17 @@ static void build_subroutines(BuildCtx *
+   |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
+   |  subsd xmm1, xmm3
+   |  orpd xmm1, xmm2			// Merge sign bit back in.
++  |  sseconst_1 xmm3, RD
+   |  .if mode == 1		// ceil(x)?
+-  |    sseconst_m1 xmm2, RD		// Must subtract -1 to preserve -0.
+   |    cmpsd xmm0, xmm1, 6		// x > result?
++  |    andpd xmm0, xmm3
++  |    addsd xmm1, xmm0			// If yes, add 1.
++  |    orpd xmm1, xmm2			// Merge sign bit back in (again).
+   |  .else			// floor(x)?
+-  |    sseconst_1 xmm2, RD
+   |    cmpsd xmm0, xmm1, 1		// x < result?
++  |    andpd xmm0, xmm3
++  |    subsd xmm1, xmm0			// If yes, subtract 1.
+   |  .endif
+-  |  andpd xmm0, xmm2
+-  |  subsd xmm1, xmm0			// If yes, subtract +-1.
+   |.endif
+   |  movaps xmm0, xmm1
+   |1:
+@@ -2591,41 +2589,6 @@ static void build_subroutines(BuildCtx *
+   |  subsd xmm0, xmm1
+   |  ret
+   |
+-  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
+-  |->vm_powi_sse:
+-  |  cmp eax, 1; jle >6			// i<=1?
+-  |  // Now 1 < (unsigned)i <= 0x80000000.
+-  |1:  // Handle leading zeros.
+-  |  test eax, 1; jnz >2
+-  |  mulsd xmm0, xmm0
+-  |  shr eax, 1
+-  |  jmp <1
+-  |2:
+-  |  shr eax, 1; jz >5
+-  |  movaps xmm1, xmm0
+-  |3:  // Handle trailing bits.
+-  |  mulsd xmm0, xmm0
+-  |  shr eax, 1; jz >4
+-  |  jnc <3
+-  |  mulsd xmm1, xmm0
+-  |  jmp <3
+-  |4:
+-  |  mulsd xmm0, xmm1
+-  |5:
+-  |  ret
+-  |6:
+-  |  je <5				// x^1 ==> x
+-  |  jb >7				// x^0 ==> 1
+-  |  neg eax
+-  |  call <1
+-  |  sseconst_1 xmm1, RD
+-  |  divsd xmm1, xmm0
+-  |  movaps xmm0, xmm1
+-  |  ret
+-  |7:
+-  |  sseconst_1 xmm0, RD
+-  |  ret
+-  |
+   |//-----------------------------------------------------------------------
+   |//-- Miscellaneous functions --------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -2645,6 +2608,67 @@ static void build_subroutines(BuildCtx *
+   |  .if X64WIN; pop rsi; .endif
+   |  ret
+   |
++  |.define NEXT_TAB,		TAB:CARG1
++  |.define NEXT_IDX,		CARG2d
++  |.define NEXT_IDXa,		CARG2
++  |.define NEXT_PTR,		RC
++  |.define NEXT_PTRd,		RCd
++  |.define NEXT_TMP,		CARG3
++  |.define NEXT_ASIZE,		CARG4d
++  |.macro NEXT_RES_IDXL, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
++  |.if X64WIN
++  |.define NEXT_RES_PTR,	[rsp+aword*5]
++  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
++  |.else
++  |.define NEXT_RES_PTR,	[rsp+aword*1]
++  |.macro NEXT_RES_IDX, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
++  |.endif
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in edx.
++  |->vm_next:
++  |.if JIT
++  |  mov NEXT_ASIZE, NEXT_TAB->asize
++  |1:  // Traverse array part.
++  |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5
++  |  mov NEXT_TMP, NEXT_TAB->array
++  |  mov NEXT_TMP, qword [NEXT_TMP+NEXT_IDX*8]
++  |  cmp NEXT_TMP, LJ_TNIL;  je >2
++  |  lea NEXT_PTR, NEXT_RES_PTR
++  |  mov qword [NEXT_PTR], NEXT_TMP
++  |.if DUALNUM
++  |  setint NEXT_TMP, NEXT_IDXa
++  |  mov qword [NEXT_PTR+qword*1], NEXT_TMP
++  |.else
++  |  cvtsi2sd xmm0, NEXT_IDX
++  |  movsd qword [NEXT_PTR+qword*1], xmm0
++  |.endif
++  |  NEXT_RES_IDX 1
++  |  ret
++  |2:  // Skip holes in array part.
++  |  add NEXT_IDX, 1
++  |  jmp <1
++  |
++  |5:  // Traverse hash part.
++  |  sub NEXT_IDX, NEXT_ASIZE
++  |6:
++  |  cmp NEXT_IDX, NEXT_TAB->hmask; ja >9
++  |  imul NEXT_PTRd, NEXT_IDX, #NODE
++  |  add NODE:NEXT_PTR, NEXT_TAB->node
++  |  cmp qword NODE:NEXT_PTR->val, LJ_TNIL; je >7
++  |  NEXT_RES_IDXL NEXT_ASIZE+1
++  |  ret
++  |7:  // Skip holes in hash part.
++  |  add NEXT_IDX, 1
++  |  jmp <6
++  |
++  |9:  // End of iteration. Set the key to nil (not the value).
++  |  NEXT_RES_IDX NEXT_ASIZE
++  |  lea NEXT_PTR, NEXT_RES_PTR
++  |  mov qword [NEXT_PTR+qword*1], LJ_TNIL
++  |  ret
++  |.endif
++  |
+   |//-----------------------------------------------------------------------
+   |//-- Assertions ---------------------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -2731,12 +2755,12 @@ static void build_subroutines(BuildCtx *
+   |
+   |  // Copy stack slots.
+   |  movzx ecx, byte CCSTATE->nsp
+-  |  sub ecx, 1
++  |  sub ecx, 8
+   |  js >2
+   |1:
+-  |  mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
+-  |  mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
+-  |  sub ecx, 1
++  |  mov rax, [CCSTATE+rcx+offsetof(CCallState, stack)]
++  |  mov [rsp+rcx+CCALL_SPS_EXTRA*8], rax
++  |  sub ecx, 8
+   |  jns <1
+   |2:
+   |
+@@ -4056,10 +4080,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     break;
+ 
+   case BC_ITERN:
+-    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+     |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |  hotloop RBd
+     |.endif
++    |->vm_IITERN:
++    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+     |  mov TAB:RB, [BASE+RA*8-16]
+     |  cleartp TAB:RB
+     |  mov RCd, [BASE+RA*8-8]		// Get index from control var.
+@@ -4123,15 +4148,29 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  cmp aword [BASE+RA*8-8], LJ_TNIL; jne >5
+     |  cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
+     |  branchPC RD
+-    |  mov64 TMPR, U64x(fffe7fff, 00000000)
++    |  mov64 TMPR, ((uint64_t)LJ_KEYINDEX << 32)
+     |  mov [BASE+RA*8-8], TMPR		// Initialize control var.
+     |1:
+     |  ins_next
+     |5:  // Despecialize bytecode if any of the checks fail.
+     |  mov PC_OP, BC_JMP
+     |  branchPC RD
++    |.if JIT
++    |  cmp byte [PC], BC_ITERN
++    |  jne >6
++    |.endif
+     |  mov byte [PC], BC_ITERC
+     |  jmp <1
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
++    |  movzx RCd, word [PC+2]
++    |  mov TRACE:RA, [RA+RC*8]
++    |  mov eax, TRACE:RA->startins
++    |  mov al, BC_ITERC
++    |  mov dword [PC], eax
++    |  jmp <1
++    |.endif
+     break;
+ 
+   case BC_VARG:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/vm_x86.dasc
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/vm_x86.dasc
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/vm_x86.dasc
+@@ -1,6 +1,6 @@
+ |// Low-level VM code for x86 CPUs.
+ |// Bytecode interpreter, fast functions and helper functions.
+-|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
++|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+ |
+ |.if P64
+ |.arch x64
+@@ -464,9 +464,6 @@
+ |.macro sseconst_1, reg, tmp		// Synthesize 1.0.
+ |  sseconst_hi reg, tmp, 3ff00000
+ |.endmacro
+-|.macro sseconst_m1, reg, tmp		// Synthesize -1.0.
+-|  sseconst_hi reg, tmp, bff00000
+-|.endmacro
+ |.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
+ |  sseconst_hi reg, tmp, 43300000
+ |.endmacro
+@@ -1372,7 +1369,7 @@ static void build_subroutines(BuildCtx *
+   |  mov LFUNC:RB, [RA-8]
+   |  add NARGS:RD, 1
+   |  // This is fragile. L->base must not move, KBASE must always be defined.
+-  |.if x64
++  |.if X64
+   |  cmp KBASEa, rdx			// Continue with CALLT if flag set.
+   |.else
+   |  cmp KBASE, BASE			// Continue with CALLT if flag set.
+@@ -1673,55 +1670,35 @@ static void build_subroutines(BuildCtx *
+   |  je >2				// Missing 2nd arg?
+   |1:
+   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+-  |  mov L:RB, SAVE_L
+-  |  mov L:RB->base, BASE		// Add frame since C call can throw.
+-  |  mov L:RB->top, BASE		// Dummy frame length is ok.
+   |  mov PC, [BASE-4]
++  |  mov RB, BASE			// Save BASE.
+   |.if X64WIN
+-  |  lea CARG3d, [BASE+8]
+-  |  mov CARG2d, [BASE]			// Caveat: CARG2d == BASE.
+-  |  mov CARG1d, L:RB
++  |  mov CARG1d, [BASE]
++  |  lea CARG3d, [BASE-8]
++  |  lea CARG2d, [BASE+8]		// Caveat: CARG2d == BASE.
+   |.elif X64
+-  |  mov CARG2d, [BASE]
+-  |  lea CARG3d, [BASE+8]		// Caveat: CARG3d == BASE.
+-  |  mov CARG1d, L:RB
++  |  mov CARG1d, [BASE]
++  |  lea CARG2d, [BASE+8]
++  |  lea CARG3d, [BASE-8]		// Caveat: CARG3d == BASE.
+   |.else
+   |  mov TAB:RD, [BASE]
+-  |  mov ARG2, TAB:RD
+-  |  mov ARG1, L:RB
++  |  mov ARG1, TAB:RD
+   |  add BASE, 8
++  |  mov ARG2, BASE
++  |  sub BASE, 8+8
+   |  mov ARG3, BASE
+   |.endif
+-  |  mov SAVE_PC, PC			// Needed for ITERN fallback.
+-  |  call extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
+-  |  // Flag returned in eax (RD).
+-  |  mov BASE, L:RB->base
+-  |  test RD, RD;  jz >3		// End of traversal?
+-  |  // Copy key and value to results.
+-  |.if X64
+-  |  mov RBa, [BASE+8]
+-  |  mov RDa, [BASE+16]
+-  |  mov [BASE-8], RBa
+-  |  mov [BASE], RDa
+-  |.else
+-  |  mov RB, [BASE+8]
+-  |  mov RD, [BASE+12]
+-  |  mov [BASE-8], RB
+-  |  mov [BASE-4], RD
+-  |  mov RB, [BASE+16]
+-  |  mov RD, [BASE+20]
+-  |  mov [BASE], RB
+-  |  mov [BASE+4], RD
+-  |.endif
+-  |->fff_res2:
+-  |  mov RD, 1+2
+-  |  jmp ->fff_res
++  |  call extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |  // 1=found, 0=end, -1=error returned in eax (RD).
++  |  mov BASE, RB			// Restore BASE.
++  |  test RD, RD;  jg ->fff_res2	// Found key/value.
++  |  js ->fff_fallback_2		// Invalid key.
++  |  // End of traversal: return nil.
++  |  mov dword [BASE-4], LJ_TNIL
++  |  jmp ->fff_res1
+   |2:  // Set missing 2nd arg to nil.
+   |  mov dword [BASE+12], LJ_TNIL
+   |  jmp <1
+-  |3:  // End of traversal: return nil.
+-  |  mov dword [BASE-4], LJ_TNIL
+-  |  jmp ->fff_res1
+   |
+   |.ffunc_1 pairs
+   |  mov TAB:RB, [BASE]
+@@ -1775,7 +1752,9 @@ static void build_subroutines(BuildCtx *
+   |  mov [BASE], RB
+   |  mov [BASE+4], RD
+   |.endif
+-  |  jmp ->fff_res2
++  |->fff_res2:
++  |  mov RD, 1+2
++  |  jmp ->fff_res
+   |2:  // Check for empty hash part first. Otherwise call C function.
+   |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
+   |  mov FCARG1, TAB:RB
+@@ -2423,9 +2402,9 @@ static void build_subroutines(BuildCtx *
+   |   lea SBUF:FCARG1, [DISPATCH+DISPATCH_GL(tmpbuf)]
+   |  mov L:RB->base, BASE
+   |  mov STR:FCARG2, [BASE]		// Caveat: FCARG2 == BASE
+-  |   mov RC, SBUF:FCARG1->b
++  |   mov RCa, SBUF:FCARG1->b
+   |   mov SBUF:FCARG1->L, L:RB
+-  |   mov SBUF:FCARG1->p, RC
++  |   mov SBUF:FCARG1->w, RCa
+   |  mov SAVE_PC, PC
+   |  call extern lj_buf_putstr_ .. name .. @8
+   |  mov FCARG1, eax
+@@ -2923,7 +2902,7 @@ static void build_subroutines(BuildCtx *
+   |  mov r13, TMPa
+   |  mov r12, TMPQ
+   |.endif
+-  |  test RD, RD; js >9			// Check for error from exit.
++  |  cmp RD, -LUA_ERRERR; jae >9	// Check for error from exit.
+   |  mov L:RB, SAVE_L
+   |  mov MULTRES, RD
+   |  mov LFUNC:KBASE, [BASE-8]
+@@ -2938,6 +2917,8 @@ static void build_subroutines(BuildCtx *
+   |  movzx OP, RCL
+   |  add PC, 4
+   |  shr RC, 16
++  |  cmp MULTRES, -17			// Static dispatch?
++  |  je >5
+   |  cmp OP, BC_FUNCF			// Function header?
+   |  jb >3
+   |  cmp OP, BC_FUNCC+2			// Fast function?
+@@ -2963,9 +2944,24 @@ static void build_subroutines(BuildCtx *
+   |  mov KBASE, [KBASE+PC2PROTO(k)]
+   |  jmp <2
+   |
++  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
++  |  mov RA, [DISPATCH+DISPATCH_J(trace)]
++  |  mov TRACE:RA, [RA+RD*4]
++  |  mov RC, TRACE:RA->startins
++  |  movzx RA, RCH
++  |  movzx OP, RCL
++  |  shr RC, 16
++  |.if X64
++  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]
++  |.else
++  |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]
++  |.endif
++  |
+   |9:  // Rethrow error from the right C frame.
++  |  mov FCARG2, RD
+   |  mov FCARG1, L:RB
+-  |  call extern lj_err_run@4		// (lua_State *L)
++  |  neg FCARG2
++  |  call extern lj_err_trace@8		// (lua_State *L, int errcode)
+   |.endif
+   |
+   |//-----------------------------------------------------------------------
+@@ -3005,15 +3001,17 @@ static void build_subroutines(BuildCtx *
+   |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
+   |  subsd xmm1, xmm3
+   |  orpd xmm1, xmm2			// Merge sign bit back in.
++  |  sseconst_1 xmm3, RDa
+   |  .if mode == 1		// ceil(x)?
+-  |    sseconst_m1 xmm2, RDa		// Must subtract -1 to preserve -0.
+   |    cmpsd xmm0, xmm1, 6		// x > result?
++  |    andpd xmm0, xmm3
++  |    addsd xmm1, xmm0			// If yes, add 1.
++  |    orpd xmm1, xmm2			// Merge sign bit back in (again).
+   |  .else			// floor(x)?
+-  |    sseconst_1 xmm2, RDa
+   |    cmpsd xmm0, xmm1, 1		// x < result?
++  |    andpd xmm0, xmm3
++  |    subsd xmm1, xmm0			// If yes, subtract 1.
+   |  .endif
+-  |  andpd xmm0, xmm2
+-  |  subsd xmm1, xmm0			// If yes, subtract +-1.
+   |.endif
+   |  movaps xmm0, xmm1
+   |1:
+@@ -3054,41 +3052,6 @@ static void build_subroutines(BuildCtx *
+   |  subsd xmm0, xmm1
+   |  ret
+   |
+-  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
+-  |->vm_powi_sse:
+-  |  cmp eax, 1; jle >6			// i<=1?
+-  |  // Now 1 < (unsigned)i <= 0x80000000.
+-  |1:  // Handle leading zeros.
+-  |  test eax, 1; jnz >2
+-  |  mulsd xmm0, xmm0
+-  |  shr eax, 1
+-  |  jmp <1
+-  |2:
+-  |  shr eax, 1; jz >5
+-  |  movaps xmm1, xmm0
+-  |3:  // Handle trailing bits.
+-  |  mulsd xmm0, xmm0
+-  |  shr eax, 1; jz >4
+-  |  jnc <3
+-  |  mulsd xmm1, xmm0
+-  |  jmp <3
+-  |4:
+-  |  mulsd xmm0, xmm1
+-  |5:
+-  |  ret
+-  |6:
+-  |  je <5				// x^1 ==> x
+-  |  jb >7				// x^0 ==> 1
+-  |  neg eax
+-  |  call <1
+-  |  sseconst_1 xmm1, RDa
+-  |  divsd xmm1, xmm0
+-  |  movaps xmm0, xmm1
+-  |  ret
+-  |7:
+-  |  sseconst_1 xmm0, RDa
+-  |  ret
+-  |
+   |//-----------------------------------------------------------------------
+   |//-- Miscellaneous functions --------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -3136,6 +3099,86 @@ static void build_subroutines(BuildCtx *
+   |  ret
+   |.endif
+   |
++  |.define NEXT_TAB,		TAB:FCARG1
++  |.define NEXT_IDX,		FCARG2
++  |.define NEXT_PTR,		RCa
++  |.define NEXT_PTRd,		RC
++  |.macro NEXT_RES_IDXL, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
++  |.if X64
++  |.define NEXT_TMP,		CARG3d
++  |.define NEXT_TMPq,		CARG3
++  |.define NEXT_ASIZE,		CARG4d
++  |.macro NEXT_ENTER;		.endmacro
++  |.macro NEXT_LEAVE;		ret; .endmacro
++  |.if X64WIN
++  |.define NEXT_RES_PTR,	[rsp+aword*5]
++  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
++  |.else
++  |.define NEXT_RES_PTR,	[rsp+aword*1]
++  |.macro NEXT_RES_IDX, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
++  |.endif
++  |.else
++  |.define NEXT_ASIZE,		esi
++  |.define NEXT_TMP,		edi
++  |.macro NEXT_ENTER;		push esi; push edi; .endmacro
++  |.macro NEXT_LEAVE;		pop edi; pop esi; ret; .endmacro
++  |.define NEXT_RES_PTR,	[esp+dword*3]
++  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
++  |.endif
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx): find the next non-nil entry at or after idx.
++  |// Next idx returned in edx. Array hits and end-of-iteration use the NEXT_RES_PTR stack slot.
++  |->vm_next:
++  |.if JIT  // Body is only assembled when JIT is enabled.
++  |  NEXT_ENTER  // Empty on x64; pushes esi/edi on x86.
++  |  mov NEXT_ASIZE, NEXT_TAB->asize  // Cache t->asize for the array/hash boundary.
++  |1:  // Traverse array part.
++  |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5  // idx >= asize: continue in the hash part.
++  |  mov NEXT_TMP, NEXT_TAB->array
++  |  cmp dword [NEXT_TMP+NEXT_IDX*8+4], LJ_TNIL;  je >2  // Tag word sits at +4 of the 8-byte slot.
++  |  lea NEXT_PTR, NEXT_RES_PTR  // Result pair is built in the stack slot: value at +0, key at +8.
++  |.if X64
++  |  mov NEXT_TMPq, qword [NEXT_TMP+NEXT_IDX*8]  // Copy the whole 8-byte array slot as the value.
++  |  mov qword [NEXT_PTR], NEXT_TMPq
++  |.else
++  |  mov NEXT_ASIZE, dword [NEXT_TMP+NEXT_IDX*8+4]  // NEXT_ASIZE reused as scratch; not read again on this path.
++  |  mov NEXT_TMP, dword [NEXT_TMP+NEXT_IDX*8]
++  |  mov dword [NEXT_PTR+4], NEXT_ASIZE
++  |  mov dword [NEXT_PTR], NEXT_TMP
++  |.endif
++  |.if DUALNUM
++  |  mov dword [NEXT_PTR+dword*3], LJ_TISNUM  // Key = idx stored as a tagged integer.
++  |  mov dword [NEXT_PTR+dword*2], NEXT_IDX
++  |.else
++  |  cvtsi2sd xmm0, NEXT_IDX  // Key = idx converted to a double.
++  |  movsd qword [NEXT_PTR+dword*2], xmm0
++  |.endif
++  |  NEXT_RES_IDX 1  // Next idx = idx+1.
++  |  NEXT_LEAVE
++  |2:  // Skip holes in array part.
++  |  add NEXT_IDX, 1
++  |  jmp <1
++  |
++  |5:  // Traverse hash part.
++  |  sub NEXT_IDX, NEXT_ASIZE  // Rebase idx to a hash node index.
++  |6:
++  |  cmp NEXT_IDX, NEXT_TAB->hmask; ja >9  // Node index > hmask: no entries left.
++  |  imul NEXT_PTRd, NEXT_IDX, #NODE  // Byte offset = node index * node size.
++  |  add NODE:NEXT_PTRd, dword NEXT_TAB->node
++  |  cmp dword NODE:NEXT_PTR->val.it, LJ_TNIL; je >7
++  |  NEXT_RES_IDXL NEXT_ASIZE+1  // edx = node index + asize + 1; NEXT_PTR is left pointing at the node.
++  |  NEXT_LEAVE
++  |7:  // Skip holes in hash part.
++  |  add NEXT_IDX, 1
++  |  jmp <6
++  |
++  |9:  // End of iteration. Set the key to nil (not the value).
++  |  NEXT_RES_IDX NEXT_ASIZE  // Undo the rebase: add asize back to idx.
++  |  lea NEXT_PTR, NEXT_RES_PTR
++  |  mov dword [NEXT_PTR+dword*3], LJ_TNIL  // +12 is the tag word of the key slot at +8.
++  |  NEXT_LEAVE
++  |.endif
++  |
+   |//-----------------------------------------------------------------------
+   |//-- Assertions ---------------------------------------------------------
+   |//-----------------------------------------------------------------------
+@@ -3271,19 +3314,25 @@ static void build_subroutines(BuildCtx *
+   |
+   |  // Copy stack slots.
+   |  movzx ecx, byte CCSTATE->nsp
+-  |  sub ecx, 1
++  |.if X64
++  |  sub ecx, 8
+   |  js >2
+   |1:
+-  |.if X64
+-  |  mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
+-  |  mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
++  |  mov rax, [CCSTATE+rcx+offsetof(CCallState, stack)]
++  |  mov [rsp+rcx+CCALL_SPS_EXTRA*8], rax
++  |  sub ecx, 8
++  |  jns <1
++  |2:
+   |.else
+-  |  mov eax, [CCSTATE+ecx*4+offsetof(CCallState, stack)]
+-  |  mov [esp+ecx*4], eax
+-  |.endif
+-  |  sub ecx, 1
++  |  sub ecx, 4
++  |  js >2
++  |1:
++  |  mov eax, [CCSTATE+ecx+offsetof(CCallState, stack)]
++  |  mov [esp+ecx], eax
++  |  sub ecx, 4
+   |  jns <1
+   |2:
++  |.endif
+   |
+   |.if X64
+   |  movzx eax, byte CCSTATE->nfpr
+@@ -4787,10 +4836,11 @@ static void build_ins(BuildCtx *ctx, BCO
+     break;
+ 
+   case BC_ITERN:
+-    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+     |.if JIT
+-    |  // NYI: add hotloop, record BC_ITERN.
++    |  hotloop RB
+     |.endif
++    |->vm_IITERN:
++    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+     |  mov TMP1, KBASE			// Need two more free registers.
+     |  mov TMP2, DISPATCH
+     |  mov TAB:RB, [BASE+RA*8-16]
+@@ -4878,14 +4928,28 @@ static void build_ins(BuildCtx *ctx, BCO
+     |  cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
+     |  branchPC RD
+     |  mov dword [BASE+RA*8-8], 0	// Initialize control var.
+-    |  mov dword [BASE+RA*8-4], 0xfffe7fff
++    |  mov dword [BASE+RA*8-4], LJ_KEYINDEX
+     |1:
+     |  ins_next
+     |5:  // Despecialize bytecode if any of the checks fail.
+     |  mov PC_OP, BC_JMP
+     |  branchPC RD
++    |.if JIT
++    |  cmp byte [PC], BC_ITERN
++    |  jne >6
++    |.endif
+     |  mov byte [PC], BC_ITERC
+     |  jmp <1
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
++    |  movzx RC, word [PC+2]
++    |  mov TRACE:RA, [RA+RC*4]
++    |  mov eax, TRACE:RA->startins
++    |  mov al, BC_ITERC
++    |  mov dword [PC], eax
++    |  jmp <1
++    |.endif
+     break;
+ 
+   case BC_VARG:
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/xb1build.bat
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/xb1build.bat
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/xb1build.bat
+@@ -14,7 +14,7 @@
+ @set LJMT=mt /nologo
+ @set DASMDIR=..\dynasm
+ @set DASM=%DASMDIR%\dynasm.lua
+-@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
+ 
+ %LJCOMPILE% host\minilua.c
+ @if errorlevel 1 goto :BAD
+@@ -31,6 +31,9 @@ if exist minilua.exe.manifest^
+ minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x64.dasc
+ @if errorlevel 1 goto :BAD
+ 
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
+ %LJCOMPILE% /I "." /I %DASMDIR% /D_DURANGO host\buildvm*.c
+ @if errorlevel 1 goto :BAD
+ %LJLINK% /out:buildvm.exe buildvm*.obj
+Index: wrk-4.2.0/obj/LuaJIT-2.1/src/xedkbuild.bat
+===================================================================
+--- wrk-4.2.0.orig/obj/LuaJIT-2.1/src/xedkbuild.bat
++++ wrk-4.2.0/obj/LuaJIT-2.1/src/xedkbuild.bat
+@@ -14,7 +14,7 @@
+ @set LJMT=mt /nologo
+ @set DASMDIR=..\dynasm
+ @set DASM=%DASMDIR%\dynasm.lua
+-@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c
++@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
+ 
+ %LJCOMPILE% host\minilua.c
+ @if errorlevel 1 goto :BAD
+@@ -31,6 +31,9 @@ if exist minilua.exe.manifest^
+ minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_ppc.dasc
+ @if errorlevel 1 goto :BAD
+ 
++if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
++minilua host\genversion.lua
++
+ %LJCOMPILE% /I "." /I %DASMDIR% /D_XBOX_VER=200 /DLUAJIT_TARGET=LUAJIT_ARCH_PPC  host\buildvm*.c
+ @if errorlevel 1 goto :BAD
+ %LJLINK% /out:buildvm.exe buildvm*.obj
diff --git a/wrk.spec b/wrk.spec
index da73641..70ec643 100644
--- a/wrk.spec
+++ b/wrk.spec
@@ -1,13 +1,18 @@
 Name:                wrk
 Version:             4.2.0
-Release:             1
+Release:             2
 Summary:             Modern HTTP benchmarking tool
 License:             Apache-2.0
 URL:                 https://github.com/wg/wrk
 Source0:             https://github.com/wg/wrk/archive/%{version}.tar.gz#/%{name}-%{version}.tar.gz
 Patch0:              wrk-4.2.0_distrofixes.patch
 Patch1:              fix-wrk-version-or-v-error.patch 
-BuildRequires:       pkgconf openssl-devel
+Patch2:              luajit-riscv-makefile.patch
+
+BuildRequires:       pkgconf openssl-devel unzip
+
+Source24300:         luajit-riscv.patch
+
 %description
 wrk is a modern HTTP benchmarking tool capable of generating significant
 load when run on a single multi-core CPU. It combines a multithreaded
@@ -20,6 +25,8 @@ scripts.
 %autosetup -p1
 
 %build
+make obj/LuaJIT-2.1  # Create the obj/LuaJIT-2.1 tree first; presumably unpacks the bundled LuaJIT (hence the unzip BuildRequires) - TODO confirm against the wrk Makefile
+patch -p1 -E < %{S:24300}  # Apply luajit-riscv.patch (Source24300) here, not via autosetup: its targets live under obj/LuaJIT-2.1, created by the make step above
 %make_build OPTFLAGS="%{optflags}" WITH_OPENSSL=%{_prefix}
 
 %install
@@ -31,6 +38,9 @@ install -D -m 0755 wrk %{buildroot}%{_bindir}/wrk
 %{_bindir}/wrk
 
 %changelog
+* Mon Mar 25 2024 luojun <luojun.oerv@isrc.iscas.ac.cn> - 4.2.0-2
+- Add RISC-V support patch for the bundled LuaJIT
+
 * Wed Oct 25 2023 liyanan <liyanan61@h-parners.com> - 4.2.0-1
 - Update to 4.2.0