From 312c0e65fdb4f2e708706c1f84af89f89dbb22ff Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Tue, 22 Feb 2022 17:26:54 +0900 Subject: [PATCH] Complete JIT compilation engine for arm64 target. (#276) This commit completes the baseline single pass JIT engine for arm64 target. The implementation passes 100% of specification tests and all the e2e tests that have been used for amd64. Notably, the engine is stable under high concurrency where multiple goroutines are holding stores and each of them has a Wasm execution environment. One thing to note is that the assembler (golang-asm) is not goroutine-safe, so we have to take a lock on the assembler usage, therefore the compilation cannot scale to multiple CPU cores. This will be resolved once we build our homemade assembler in #233. resolves #187 Signed-off-by: Takeshi Yoneda --- README.md | 14 +- internal/wasm/jit/engine.go | 20 +-- internal/wasm/jit/jit_amd64.go | 7 +- internal/wasm/jit/jit_arm64.go | 108 ++++++++++++---- internal/wasm/jit/jit_arm64_test.go | 192 +++++++++++++++------------- internal/wasm/jit/jit_other.go | 2 +- internal/wasm/store.go | 12 +- internal/wasm/store_test.go | 6 +- tests/engine/adhoc_test.go | 2 +- tests/spectest/spec_test.go | 2 +- 10 files changed, 216 insertions(+), 149 deletions(-) diff --git a/README.md b/README.md index e218f106f8..2c3d02547a 100644 --- a/README.md +++ b/README.md @@ -35,15 +35,17 @@ wazero is an early project, so APIs are subject to change until version 1.0. There's the concept called "engine" in wazero (which is a word commonly used in Wasm runtimes). Engines are responsible for compiling and executing WebAssembly modules. There are two types of engines are available for wazero: -1. _Interpreter_: a naive interpreter-based implementation of Wasm virtual machine. Its implementation doesn't have any platform (GOARCH, GOOS) specific code, therefore _interpreter_ engine can be used for any compilation target available for Go (such as `arm64`). -2. 
_JIT engine_: compiles WebAssembly modules, generates the machine code, and executing it all at runtime. Currently wazero only implements the JIT compiler for `amd64` target. Generally speaking, _JIT engine_ is faster than _Interpreter_ by order of magnitude. However, the implementation is immature and has a bunch of aspects that could be improved (for example, it just does a singlepass compilation and doesn't do any optimizations, etc.). Please refer to [internal/wasm/jit/RATIONALE.md](internal/wasm/jit/RATIONALE.md) for the design choices and considerations in our JIT engine. +1. _Interpreter_: a naive interpreter-based implementation of Wasm virtual machine. Its implementation doesn't have any platform (GOARCH, GOOS) specific code, therefore _interpreter_ engine can be used for any compilation target available for Go (such as `riscv64`). +2. _JIT engine_: compiles WebAssembly modules, generates the machine code, and executes it all at runtime. Currently wazero implements the JIT compiler for `amd64` and `arm64` targets. Generally speaking, _JIT engine_ is faster than _Interpreter_ by order of magnitude. However, the implementation is immature and has a bunch of aspects that could be improved (for example, it just does a singlepass compilation and doesn't do any optimizations, etc.). Please refer to [internal/wasm/jit/RATIONALE.md](internal/wasm/jit/RATIONALE.md) for the design choices and considerations in our JIT engine. Both of engines passes 100% of [WebAssembly spec test suites]((https://github.com/WebAssembly/spec/tree/wg-1.0/test/core)) (on supported platforms). 
-| Engine | Usage|GOARCH=amd64 | GOARCH=others | -|:----------:|:---:|:-------------:|:------:| -| Interpreter|`wazero.NewEngineInterpreter()`| ✅ | ✅ | -| JIT engine |`wazero.NewEngineJIT()`| ✅ | ❌ | +| Engine | Usage| amd64 | arm64 | others | +|:---:|:---:|:---:|:---:|:---:| +| Interpreter|`wazero.NewEngineInterpreter()`|✅ |✅|✅| +| JIT engine |`wazero.NewEngineJIT()`|✅|✅ |❌| + +*Note:* JIT does not yet work on Windows. Please use the interpreter and track [this issue](https://github.com/tetratelabs/wazero/issues/270) if interested. If you choose no configuration, ex `wazero.NewStore()`, the interpreter is used. You can also choose explicitly like so: ```go diff --git a/internal/wasm/jit/engine.go b/internal/wasm/jit/engine.go index ace48e7627..3576585eba 100644 --- a/internal/wasm/jit/engine.go +++ b/internal/wasm/jit/engine.go @@ -1,7 +1,6 @@ package jit import ( - "encoding/hex" "fmt" "math" "reflect" @@ -533,19 +532,11 @@ jitentry: switch status := e.exitContext.statusCode; status { case jitCallStatusCodeReturned: // Meaning that all the function frames above the previous call frame stack pointer are executed. - if e.globalContext.previousCallFrameStackPointer != e.globalContext.callFrameStackPointer { - panic("bug in JIT compiler") - } case jitCallStatusCodeCallHostFunction: // Not "callFrameTop" but take the below of peek with "callFrameAt(1)" as the top frame is for host function, // but when making host function calls, we need to pass the memory instance of host function caller. 
fn := e.compiledFunctions[e.exitContext.functionCallAddress] callerCompiledFunction := e.callFrameAt(1).compiledFunction - if buildoptions.IsDebugMode { - if fn.source.FunctionKind == wasm.FunctionKindWasm { - panic("jitCallStatusCodeCallHostFunction is only for host functions") - } - } saved := e.globalContext.previousCallFrameStackPointer e.execHostFunction(fn.source.FunctionKind, fn.source.HostFunction, ctx.WithMemory(callerCompiledFunction.source.ModuleInstance.Memory), @@ -669,7 +660,9 @@ func (e *engine) addCompiledFunction(addr wasm.FunctionAddress, compiled *compil } func compileHostFunction(f *wasm.FunctionInstance) (*compiledFunction, error) { - compiler, err := newCompiler(f, nil) + compiler, done, err := newCompiler(f, nil) + defer done() + if err != nil { return nil, err } @@ -706,7 +699,8 @@ func compileWasmFunction(f *wasm.FunctionInstance) (*compiledFunction, error) { fmt.Printf("compilation target wazeroir:\n%s\n", wazeroir.Format(ir.Operations)) } - compiler, err := newCompiler(f, ir) + compiler, done, err := newCompiler(f, ir) + defer done() if err != nil { return nil, fmt.Errorf("failed to initialize assembly builder: %w", err) } @@ -879,10 +873,6 @@ func compileWasmFunction(f *wasm.FunctionInstance) (*compiledFunction, error) { return nil, fmt.Errorf("failed to compile: %w", err) } - if buildoptions.IsDebugMode { - fmt.Printf("compiled code in hex: %s\n", hex.EncodeToString(code)) - } - return &compiledFunction{ source: f, codeSegment: code, diff --git a/internal/wasm/jit/jit_amd64.go b/internal/wasm/jit/jit_amd64.go index 4e748fad46..9d42a468c7 100644 --- a/internal/wasm/jit/jit_amd64.go +++ b/internal/wasm/jit/jit_amd64.go @@ -89,13 +89,14 @@ type archContext struct{} func newArchContext() (ret archContext) { return } // newCompiler returns a new compiler interface which can be used to compile the given function instance. +// The function returned must be invoked when finished compiling, so use `defer` to ensure this. 
// Note: ir param can be nil for host functions. -func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (compiler, error) { +func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (compiler, func(), error) { // We can choose arbitrary number instead of 1024 which indicates the cache size in the compiler. // TODO: optimize the number. b, err := asm.NewBuilder("amd64", 1024) if err != nil { - return nil, fmt.Errorf("failed to create a new assembly builder: %w", err) + return nil, func() {}, fmt.Errorf("failed to create a new assembly builder: %w", err) } compiler := &amd64Compiler{ @@ -106,7 +107,7 @@ func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (comp ir: ir, labels: map[string]*labelInfo{}, } - return compiler, nil + return compiler, func() {}, nil } func (c *amd64Compiler) String() string { diff --git a/internal/wasm/jit/jit_arm64.go b/internal/wasm/jit/jit_arm64.go index 71e3f6f2ff..3c84b9a018 100644 --- a/internal/wasm/jit/jit_arm64.go +++ b/internal/wasm/jit/jit_arm64.go @@ -16,6 +16,7 @@ import ( "encoding/binary" "fmt" "math" + "sync" "unsafe" asm "github.com/twitchyliquid64/golang-asm" @@ -69,14 +70,27 @@ const ( // engine is the pointer to the "*engine" as uintptr. func jitcall(codeSegment, engine uintptr) +// golang-asm is not goroutine-safe so we take lock until we complete the compilation. +// TODO: delete after https://github.com/tetratelabs/wazero/issues/233 +var assemblerMutex = &sync.Mutex{} + +func unlockAssembler() { + assemblerMutex.Unlock() +} + // newCompiler returns a new compiler interface which can be used to compile the given function instance. +// The function returned must be invoked when finished compiling, so use `defer` to ensure this. // Note: ir param can be nil for host functions. 
-func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (compiler, error) { +func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (c compiler, done func(), err error) { + // golang-asm is not goroutine-safe so we take lock until we complete the compilation. + // TODO: delete after https://github.com/tetratelabs/wazero/issues/233 + assemblerMutex.Lock() + // We can choose arbitrary number instead of 1024 which indicates the cache size in the compiler. // TODO: optimize the number. b, err := asm.NewBuilder("arm64", 1024) if err != nil { - return nil, fmt.Errorf("failed to create a new assembly builder: %w", err) + return nil, unlockAssembler, fmt.Errorf("failed to create a new assembly builder: %w", err) } compiler := &arm64Compiler{ @@ -86,7 +100,7 @@ func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (comp ir: ir, labels: map[string]*labelInfo{}, } - return compiler, nil + return compiler, unlockAssembler, nil } type arm64Compiler struct { @@ -145,6 +159,7 @@ func (c *arm64Compiler) compile() (code []byte, staticData compiledFunctionStati return } + staticData = c.staticData return } @@ -597,6 +612,10 @@ func (c *arm64Compiler) compileExitFromNativeCode(status jitCallStatusCode) erro // compileHostFunction implements compiler.compileHostFunction for the arm64 architecture. func (c *arm64Compiler) compileHostFunction(address wasm.FunctionAddress) error { + // The assembler skips the first instruction so we intentionally add NOP here. + // TODO: delete after #233 + c.compileNOP() + // First we must update the location stack to reflect the number of host function inputs. c.pushFunctionParams() @@ -667,6 +686,15 @@ func (c *arm64Compiler) compileSwap(o *wazeroir.OperationSwap) error { return nil } +// Only used in test, but define this in the main file as sometimes +// we need to call this from the main code when debugging. 
+//nolint:unused +func (c *arm64Compiler) undefined() { + ud := c.newProg() + ud.As = obj.AUNDEF + c.addInstruction(ud) +} + // compileGlobalGet implements compiler.compileGlobalGet for the arm64 architecture. func (c *arm64Compiler) compileGlobalGet(o *wazeroir.OperationGlobalGet) error { c.maybeCompileMoveTopConditionalToFreeGeneralPurposeRegister() @@ -687,7 +715,7 @@ func (c *arm64Compiler) compileGlobalGet(o *wazeroir.OperationGlobalGet) error { intMov = arm64.AMOVWU floatMov = arm64.AFMOVS case wasm.ValueTypeF64: - intMov = arm64.AMOVW + intMov = arm64.AMOVD floatMov = arm64.AFMOVD } @@ -763,7 +791,7 @@ func (c *arm64Compiler) compileReadGlobalAddress(globalIndex uint32) (destinatio c.compileConstToRegisterInstruction( // globalIndex is an index to []*GlobalInstance, therefore // we have to multiply it by the size of *GlobalInstance == the pointer size == 8. - arm64.AMOVW, int64(globalIndex)*8, destinationRegister, + arm64.AMOVD, int64(globalIndex)*8, destinationRegister, ) // "reservedRegisterForTemporary = &globals[0]" @@ -773,7 +801,7 @@ func (c *arm64Compiler) compileReadGlobalAddress(globalIndex uint32) (destinatio reservedRegisterForTemporary, ) - // "destinationRegister = [reservedRegisterForTemporary + destinationRegister] (== &globals[globalIndex])". + // "destinationRegister = [reservedRegisterForTemporary + destinationRegister] (== globals[globalIndex])". c.compileMemoryWithRegisterOffsetToRegisterInstruction( arm64.AMOVD, reservedRegisterForTemporary, destinationRegister, @@ -1192,7 +1220,7 @@ func (c *arm64Compiler) compileCallImpl(addr wasm.FunctionAddress, addrRegister compiledFunctionAddressRegister) } else { // Shift addrRegister by 3 because the size of *compiledFunction equals 8 bytes. 
- c.compileConstToRegisterInstruction(arm64.ALSL, 3, addrRegister) + c.compileConstToRegisterInstruction(arm64.ALSLW, 3, addrRegister) c.compileMemoryWithRegisterOffsetToRegisterInstruction( arm64.AMOVD, tmp, addrRegister, @@ -1465,7 +1493,7 @@ func (c *arm64Compiler) compileDropRange(r *wazeroir.InclusiveRange) error { c.maybeCompileMoveTopConditionalToFreeGeneralPurposeRegister() // Save the live values because we pop and release values in drop range below. - liveValues := c.locationStack.stack[c.locationStack.sp-uint64(r.Start):] + liveValues := c.locationStack.stack[c.locationStack.sp-uint64(r.Start) : c.locationStack.sp] c.locationStack.sp -= uint64(r.Start) // Note: drop target range is inclusive. @@ -1498,6 +1526,8 @@ func (c *arm64Compiler) compileSelect() error { return err } + c.markRegisterUsed(cv.register) + x1, x2, err := c.popTwoValuesOnRegisters() if err != nil { return err @@ -1518,7 +1548,7 @@ func (c *arm64Compiler) compileSelect() error { // So we explicitly assign a general purpuse register to x1 here. if isZeroRegister(x1.register) { // Mark x2 and cv's regiseters are used so they won't be chosen. - c.markRegisterUsed(x2.register, cv.register) + c.markRegisterUsed(x2.register) // Pick the non-zero register for x1. x1Reg, err := c.allocateRegister(generalPurposeRegisterTypeInt) if err != nil { @@ -1896,7 +1926,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide brIfDividendNotMinInt := c.compilelBranchInstruction(arm64.ABNE) // Otherwise, we raise overflow error. 
- c.compileExitFromNativeCode(jitCallStatusIntegerDivisionByZero) + c.compileExitFromNativeCode(jitCallStatusIntegerOverflow) c.setBranchTargetOnNext(brIfDivisorNonMinusOne, brIfDividendNotMinInt) } @@ -2340,24 +2370,37 @@ func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) err c.compileRegisterToRegisterInstruction(arm64.AMSR, zeroRegister, arm64.REG_FPSR) var convinst obj.As - if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedInt32 { + var is32bitFloat = o.InputType == wazeroir.Float32 + if is32bitFloat && o.OutputType == wazeroir.SignedInt32 { convinst = arm64.AFCVTZSSW - } else if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedInt64 { + } else if is32bitFloat && o.OutputType == wazeroir.SignedInt64 { convinst = arm64.AFCVTZSS - } else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedInt32 { + } else if !is32bitFloat && o.OutputType == wazeroir.SignedInt32 { convinst = arm64.AFCVTZSDW - } else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedInt64 { + } else if !is32bitFloat && o.OutputType == wazeroir.SignedInt64 { convinst = arm64.AFCVTZSD - } else if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedUint32 { + } else if is32bitFloat && o.OutputType == wazeroir.SignedUint32 { convinst = arm64.AFCVTZUSW - } else if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedUint64 { + } else if is32bitFloat && o.OutputType == wazeroir.SignedUint64 { convinst = arm64.AFCVTZUS - } else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedUint32 { + } else if !is32bitFloat && o.OutputType == wazeroir.SignedUint32 { convinst = arm64.AFCVTZUDW - } else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedUint64 { + } else if !is32bitFloat && o.OutputType == wazeroir.SignedUint64 { convinst = arm64.AFCVTZUD } - c.compileSimpleConversion(convinst, generalPurposeRegisterTypeInt) + + source, err := 
c.popValueOnRegister() + if err != nil { + return err + } + + destinationReg, err := c.allocateRegister(generalPurposeRegisterTypeInt) + if err != nil { + return err + } + + c.compileRegisterToRegisterInstruction(convinst, source.register, destinationReg) + c.locationStack.pushValueLocationOnRegister(destinationReg) // Obtain the floating point status register value into the general purpose register, // so that we can check if the conversion resulted in undefined behavior. @@ -2366,12 +2409,30 @@ func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) err // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register c.compileRegisterAndConstSourceToNoneInstruction(arm64.ACMP, reservedRegisterForTemporary, 1) - // If so, exit the execution with jitCallStatusCodeInvalidFloatToIntConversion. - br := c.compilelBranchInstruction(arm64.ABNE) - c.compileExitFromNativeCode(jitCallStatusCodeInvalidFloatToIntConversion) + brOK := c.compilelBranchInstruction(arm64.ABNE) + + // If so, exit the execution with errors depending on whether or not the source value is NaN. + { + var floatcmp obj.As + if is32bitFloat { + floatcmp = arm64.AFCMPS + } else { + floatcmp = arm64.AFCMPD + } + c.compileTwoRegistersToNoneInstruction(floatcmp, source.register, source.register) + // VS flag is set if at least one of values for FCMP is NaN. + // https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code + brIfSourceNaN := c.compilelBranchInstruction(arm64.ABVS) + + // If the source value is not NaN, the operation was overflow. + c.compileExitFromNativeCode(jitCallStatusIntegerOverflow) + // Otherwise, the operation was invalid as this is trying to convert NaN to integer. + c.setBranchTargetOnNext(brIfSourceNaN) + c.compileExitFromNativeCode(jitCallStatusCodeInvalidFloatToIntConversion) + } // Otherwise, we branch into the next instruction. 
- c.setBranchTargetOnNext(br) + c.setBranchTargetOnNext(brOK) return nil } @@ -3336,7 +3397,6 @@ func (c *arm64Compiler) compileModuleContextInitialization() error { arm64.AMOVD, tmpX, reservedRegisterForEngine, engineModuleContextGlobalElement0AddressOffset, ) - } // Update memoryElement0Address and memorySliceLen. diff --git a/internal/wasm/jit/jit_arm64_test.go b/internal/wasm/jit/jit_arm64_test.go index 78447a4769..bb34d0f14e 100644 --- a/internal/wasm/jit/jit_arm64_test.go +++ b/internal/wasm/jit/jit_arm64_test.go @@ -43,11 +43,13 @@ func requirePushTwoFloat32Consts(t *testing.T, x1, x2 float32, compiler *arm64Co } func (j *jitEnv) requireNewCompiler(t *testing.T) *arm64Compiler { - cmp, err := newCompiler(&wasm.FunctionInstance{ + cmp, done, err := newCompiler(&wasm.FunctionInstance{ ModuleInstance: j.moduleInstance, FunctionKind: wasm.FunctionKindWasm, }, nil) require.NoError(t, err) + t.Cleanup(done) + ret, ok := cmp.(*arm64Compiler) require.True(t, ok) ret.labels = make(map[string]*labelInfo) @@ -90,37 +92,42 @@ func TestArm64Compiler_returnFunction(t *testing.T) { const callFrameNums = 10 stackPointerToExpectedValue := map[uint64]uint32{} for funcaddr := wasm.FunctionAddress(0); funcaddr < callFrameNums; funcaddr++ { - // Each function pushes its funcaddr and soon returns. - compiler := env.requireNewCompiler(t) - err := compiler.compilePreamble() - require.NoError(t, err) + // We have to do compilation in a separate subtest since each compilation takes + // the mutext lock and must release on the cleanup of each subtest. + // TODO: delete after https://github.com/tetratelabs/wazero/issues/233 + t.Run(fmt.Sprintf("compiling existing callframe %d", funcaddr), func(t *testing.T) { + // Each function pushes its funcaddr and soon returns. + compiler := env.requireNewCompiler(t) + err := compiler.compilePreamble() + require.NoError(t, err) - // Push its funcaddr. 
- expValue := uint32(funcaddr) - err = compiler.compileConstI32(&wazeroir.OperationConstI32{Value: expValue}) - require.NoError(t, err) + // Push its funcaddr. + expValue := uint32(funcaddr) + err = compiler.compileConstI32(&wazeroir.OperationConstI32{Value: expValue}) + require.NoError(t, err) - err = compiler.compileReturnFunction() - require.NoError(t, err) + err = compiler.compileReturnFunction() + require.NoError(t, err) - code, _, _, err := compiler.compile() - require.NoError(t, err) + code, _, _, err := compiler.compile() + require.NoError(t, err) - // Compiles and adds to the engine. - compiledFunction := &compiledFunction{codeSegment: code, codeInitialAddress: uintptr(unsafe.Pointer(&code[0]))} - engine.addCompiledFunction(funcaddr, compiledFunction) - - // Pushes the frame whose return address equals the beginning of the function just compiled. - frame := callFrame{ - // Set the return address to the beginning of the function so that we can execute the constI32 above. - returnAddress: compiledFunction.codeInitialAddress, - // Note: return stack base pointer is set to funcaddr*10 and this is where the const should be pushed. - returnStackBasePointer: uint64(funcaddr) * 10, - compiledFunction: compiledFunction, - } - engine.callFrameStack[engine.globalContext.callFrameStackPointer] = frame - engine.globalContext.callFrameStackPointer++ - stackPointerToExpectedValue[frame.returnStackBasePointer] = expValue + // Compiles and adds to the engine. + compiledFunction := &compiledFunction{codeSegment: code, codeInitialAddress: uintptr(unsafe.Pointer(&code[0]))} + engine.addCompiledFunction(funcaddr, compiledFunction) + + // Pushes the frame whose return address equals the beginning of the function just compiled. + frame := callFrame{ + // Set the return address to the beginning of the function so that we can execute the constI32 above. 
+ returnAddress: compiledFunction.codeInitialAddress, + // Note: return stack base pointer is set to funcaddr*10 and this is where the const should be pushed. + returnStackBasePointer: uint64(funcaddr) * 10, + compiledFunction: compiledFunction, + } + engine.callFrameStack[engine.globalContext.callFrameStackPointer] = frame + engine.globalContext.callFrameStackPointer++ + stackPointerToExpectedValue[frame.returnStackBasePointer] = expValue + }) } require.Equal(t, uint64(callFrameNums), env.callFrameStackPointer()) @@ -1700,32 +1707,37 @@ func TestArm64Compiler_compileCall(t *testing.T) { addTargetValue := uint32(100 + i) expectedValue += addTargetValue - compiler := env.requireNewCompiler(t) - compiler.f = &wasm.FunctionInstance{ - FunctionKind: wasm.FunctionKindWasm, - FunctionType: &wasm.TypeInstance{Type: targetFunctionType}, - ModuleInstance: &wasm.ModuleInstance{}, - } + // We have to do compilation in a separate subtest since each compilation takes + // the mutext lock and must release on the cleanup of each subtest. 
+ // TODO: delete after https://github.com/tetratelabs/wazero/issues/233 + t.Run(fmt.Sprintf("compiling call target %d", i), func(t *testing.T) { + compiler := env.requireNewCompiler(t) + compiler.f = &wasm.FunctionInstance{ + FunctionKind: wasm.FunctionKindWasm, + FunctionType: &wasm.TypeInstance{Type: targetFunctionType}, + ModuleInstance: &wasm.ModuleInstance{}, + } - err := compiler.compilePreamble() - require.NoError(t, err) + err := compiler.compilePreamble() + require.NoError(t, err) - err = compiler.compileConstI32(&wazeroir.OperationConstI32{Value: uint32(addTargetValue)}) - require.NoError(t, err) - err = compiler.compileAdd(&wazeroir.OperationAdd{Type: wazeroir.UnsignedTypeI32}) - require.NoError(t, err) - err = compiler.compileReturnFunction() - require.NoError(t, err) + err = compiler.compileConstI32(&wazeroir.OperationConstI32{Value: uint32(addTargetValue)}) + require.NoError(t, err) + err = compiler.compileAdd(&wazeroir.OperationAdd{Type: wazeroir.UnsignedTypeI32}) + require.NoError(t, err) + err = compiler.compileReturnFunction() + require.NoError(t, err) - code, _, _, err := compiler.compile() - require.NoError(t, err) - addr := wasm.FunctionAddress(i) - engine.addCompiledFunction(addr, &compiledFunction{ - codeSegment: code, - codeInitialAddress: uintptr(unsafe.Pointer(&code[0])), + code, _, _, err := compiler.compile() + require.NoError(t, err) + addr := wasm.FunctionAddress(i) + engine.addCompiledFunction(addr, &compiledFunction{ + codeSegment: code, + codeInitialAddress: uintptr(unsafe.Pointer(&code[0])), + }) + env.moduleInstance.Functions = append(env.moduleInstance.Functions, + &wasm.FunctionInstance{FunctionType: &wasm.TypeInstance{Type: targetFunctionType}, Address: addr}) }) - env.moduleInstance.Functions = append(env.moduleInstance.Functions, - &wasm.FunctionInstance{FunctionType: &wasm.TypeInstance{Type: targetFunctionType}, Address: addr}) } // Now we start building the caller's code. 
@@ -1898,33 +1910,37 @@ func TestArm64Compiler_compileCallIndirect(t *testing.T) { } for i := 0; i < len(table); i++ { - t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - env := newJITEnvironment() - env.setTable(table) - engine := env.engine() + env := newJITEnvironment() + env.setTable(table) + engine := env.engine() - // First we create the call target function with function address = i, - // and it returns one value. - expectedReturnValue := uint32(i * 1000) - { - compiler := env.requireNewCompiler(t) - err := compiler.compilePreamble() - require.NoError(t, err) - err = compiler.compileConstI32(&wazeroir.OperationConstI32{Value: expectedReturnValue}) - require.NoError(t, err) - err = compiler.compileReturnFunction() - require.NoError(t, err) + // First we create the call target function with function address = i, + // and it returns one value. + expectedReturnValue := uint32(i * 1000) - code, _, _, err := compiler.compile() - require.NoError(t, err) + // We have to do compilation in a separate subtest since each compilation takes + // the mutext lock and must release on the cleanup of each subtest. 
+ // TODO: delete after https://github.com/tetratelabs/wazero/issues/233 + t.Run(fmt.Sprintf("compiling call target for %d", i), func(t *testing.T) { + compiler := env.requireNewCompiler(t) + err := compiler.compilePreamble() + require.NoError(t, err) + err = compiler.compileConstI32(&wazeroir.OperationConstI32{Value: expectedReturnValue}) + require.NoError(t, err) + err = compiler.compileReturnFunction() + require.NoError(t, err) - cf := &compiledFunction{ - codeSegment: code, - codeInitialAddress: uintptr(unsafe.Pointer(&code[0])), - } - engine.addCompiledFunction(table[i].FunctionAddress, cf) + code, _, _, err := compiler.compile() + require.NoError(t, err) + + cf := &compiledFunction{ + codeSegment: code, + codeInitialAddress: uintptr(unsafe.Pointer(&code[0])), } + engine.addCompiledFunction(table[i].FunctionAddress, cf) + }) + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { if growCallFrameStack { env.setCallFrameStackPointer(engine.globalContext.callFrameStackLen - 1) env.setPreviousCallFrameStackPointer(engine.globalContext.callFrameStackLen - 1) @@ -2071,11 +2087,9 @@ func TestArm64Compiler_compileSwap(t *testing.T) { err = compiler.compileReturnFunction() require.NoError(t, err) - // Generate the code under test. + // Generate the code under test and run. code, _, _, err := compiler.compile() require.NoError(t, err) - - // Run code. env.exec(code) require.Equal(t, uint64(op.Depth+1), env.stackPointer()) @@ -2161,7 +2175,6 @@ func TestArm64Compiler_compileModuleContextInitialization(t *testing.T) { code, _, _, err := compiler.compile() require.NoError(t, err) - // Run codes env.exec(code) // Check the exit status. @@ -2279,8 +2292,6 @@ func TestArm64Compiler_compileGlobalSet(t *testing.T) { // Generate the code under test. code, _, _, err := compiler.compile() require.NoError(t, err) - - // Run code. env.exec(code) // The global value should be set to valueToSet. 
@@ -3057,8 +3068,10 @@ func TestArm64Compiler_compile_Div_Rem(t *testing.T) { } case wazeroir.SignedTypeInt32: v1, v2 := int32(x1), int32(x2) - if v2 == 0 || (v1 == math.MinInt32 && v2 == -1) { + if v2 == 0 { require.Equal(t, jitCallStatusIntegerDivisionByZero, env.jitStatus()) + } else if v1 == math.MinInt32 && v2 == -1 { + require.Equal(t, jitCallStatusIntegerOverflow, env.jitStatus()) } else { require.Equal(t, v1/v2, env.stackTopAsInt32()) } @@ -3070,8 +3083,10 @@ func TestArm64Compiler_compile_Div_Rem(t *testing.T) { } case wazeroir.SignedTypeInt64: v1, v2 := int64(x1), int64(x2) - if v2 == 0 || (v1 == math.MinInt64 && v2 == -1) { + if v2 == 0 { require.Equal(t, jitCallStatusIntegerDivisionByZero, env.jitStatus()) + } else if v1 == math.MinInt64 && v2 == -1 { + require.Equal(t, jitCallStatusIntegerOverflow, env.jitStatus()) } else { require.Equal(t, v1/v2, env.stackTopAsInt64()) } @@ -3902,7 +3917,7 @@ func TestArm64Compiler_compileITruncFromF(t *testing.T) { if tc.inputType == wazeroir.Float32 && tc.outputType == wazeroir.SignedInt32 { f32 := float32(v) if f32 < math.MinInt32 || f32 >= math.MaxInt32 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, int32(math.Trunc(float64(f32))), env.stackTopAsInt32()) @@ -3910,21 +3925,21 @@ func TestArm64Compiler_compileITruncFromF(t *testing.T) { } else if tc.inputType == wazeroir.Float32 && tc.outputType == wazeroir.SignedInt64 { f32 := float32(v) if f32 < math.MinInt64 || f32 >= math.MaxInt64 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, int64(math.Trunc(float64(f32))), env.stackTopAsInt64()) } } else if tc.inputType == wazeroir.Float64 && tc.outputType == wazeroir.SignedInt32 { if v < math.MinInt32 || v > math.MaxInt32 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + 
expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, int32(math.Trunc(v)), env.stackTopAsInt32()) } } else if tc.inputType == wazeroir.Float64 && tc.outputType == wazeroir.SignedInt64 { if v < math.MinInt64 || v >= math.MaxInt64 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, int64(math.Trunc(v)), env.stackTopAsInt64()) @@ -3932,14 +3947,14 @@ func TestArm64Compiler_compileITruncFromF(t *testing.T) { } else if tc.inputType == wazeroir.Float32 && tc.outputType == wazeroir.SignedUint32 { f32 := float32(v) if f32 < 0 || f32 >= math.MaxUint32 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, uint32(math.Trunc(float64(f32))), env.stackTopAsUint32()) } } else if tc.inputType == wazeroir.Float64 && tc.outputType == wazeroir.SignedUint32 { if v < 0 || v > math.MaxUint32 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, uint32(math.Trunc(v)), env.stackTopAsUint32()) @@ -3947,14 +3962,14 @@ func TestArm64Compiler_compileITruncFromF(t *testing.T) { } else if tc.inputType == wazeroir.Float32 && tc.outputType == wazeroir.SignedUint64 { f32 := float32(v) if f32 < 0 || f32 >= math.MaxUint64 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { require.Equal(t, uint64(math.Trunc(float64(f32))), env.stackTopAsUint64()) } } else if tc.inputType == wazeroir.Float64 && tc.outputType == wazeroir.SignedUint64 { if v < 0 || v >= math.MaxUint64 { - expStatus = jitCallStatusCodeInvalidFloatToIntConversion + expStatus = jitCallStatusIntegerOverflow } if expStatus == jitCallStatusCodeReturned { 
require.Equal(t, uint64(math.Trunc(v)), env.stackTopAsUint64()) @@ -4074,7 +4089,6 @@ func TestAmd64Compiler_compileBrTable(t *testing.T) { env := newJITEnvironment() code, _, _, err := c.compile() require.NoError(t, err) - // fmt.Println(hex.EncodeToString(code)) env.exec(code) // Check the returned value. diff --git a/internal/wasm/jit/jit_other.go b/internal/wasm/jit/jit_other.go index 8f2c919716..b5ec1d7054 100644 --- a/internal/wasm/jit/jit_other.go +++ b/internal/wasm/jit/jit_other.go @@ -16,6 +16,6 @@ func jitcall(codeSegment, engine uintptr) { panic("unsupported GOARCH") } -func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (compiler, error) { +func newCompiler(f *wasm.FunctionInstance, ir *wazeroir.CompilationResult) (compiler, func(), error) { panic("unsupported GOARCH") } diff --git a/internal/wasm/store.go b/internal/wasm/store.go index 4d3ee34900..7e78447051 100644 --- a/internal/wasm/store.go +++ b/internal/wasm/store.go @@ -49,7 +49,7 @@ type ( // maximumFunctionAddress represents the limit on the number of function addresses (= function instances) in a store. // Note: this is fixed to 2^27 but have this a field for testability. - maximumFunctionAddress int + maximumFunctionAddress FunctionAddress // maximumFunctionTypes represents the limit on the number of function types in a store. // Note: this is fixed to 2^27 but have this a field for testability. 
maximumFunctionTypes int @@ -361,15 +361,17 @@ func (s *Store) getExport(moduleName string, name string, kind ExportKind) (exp } func (s *Store) addFunctionInstance(f *FunctionInstance) error { - l := len(s.Functions) - if l >= s.maximumFunctionAddress { + if f.Address >= s.maximumFunctionAddress { return fmt.Errorf("too many functions in a store") } - f.Address = FunctionAddress(len(s.Functions)) s.Functions = append(s.Functions, f) return nil } +func (s *Store) nextFunctionAddress() FunctionAddress { + return FunctionAddress(len(s.Functions)) +} + func (s *Store) resolveImports(module *Module, target *ModuleInstance) error { for _, is := range module.ImportSection { if err := s.resolveImport(target, is); err != nil { @@ -630,6 +632,7 @@ func (s *Store) buildFunctionInstances(module *Module, target *ModuleInstance) ( Body: module.CodeSection[codeIndex].Body, LocalTypes: module.CodeSection[codeIndex].LocalTypes, ModuleInstance: target, + Address: s.nextFunctionAddress(), } if err := validateFunctionInstance(f, funcs, globals, mems, tables, module.TypeSection, maximumValuesOnStack); err != nil { @@ -862,6 +865,7 @@ func (s *Store) AddHostFunction(moduleName string, hf *GoFunc) (*FunctionInstanc FunctionKind: hf.functionKind, FunctionType: typeInstance, ModuleInstance: m, + Address: s.nextFunctionAddress(), } if err = s.Engine.Compile(f); err != nil { diff --git a/internal/wasm/store_test.go b/internal/wasm/store_test.go index 1ac3d7ff68..38ca7f1155 100644 --- a/internal/wasm/store_test.go +++ b/internal/wasm/store_test.go @@ -138,8 +138,7 @@ func TestStore_addHostFunction(t *testing.T) { s := NewStore(context.Background(), nopEngineInstance) const max = 10 s.maximumFunctionAddress = max - s.Functions = make([]*FunctionInstance, max) - err := s.addFunctionInstance(nil) + err := s.addFunctionInstance(&FunctionInstance{Address: max + 1}) require.Error(t, err) }) t.Run("ok", func(t *testing.T) { @@ -153,9 +152,6 @@ func TestStore_addHostFunction(t *testing.T) { // After 
the addition, one instance is added. require.Len(t, s.Functions, i+1) - - // The added function instance must have i for its address. - require.Equal(t, FunctionAddress(i), f.Address) } }) } diff --git a/tests/engine/adhoc_test.go b/tests/engine/adhoc_test.go index 3472517628..4d43e30c5d 100644 --- a/tests/engine/adhoc_test.go +++ b/tests/engine/adhoc_test.go @@ -17,7 +17,7 @@ import ( ) func TestJIT(t *testing.T) { - if runtime.GOARCH != "amd64" { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { t.Skip() } runTests(t, wazero.NewEngineJIT) diff --git a/tests/spectest/spec_test.go b/tests/spectest/spec_test.go index 02208c1735..45aac5e800 100644 --- a/tests/spectest/spec_test.go +++ b/tests/spectest/spec_test.go @@ -230,7 +230,7 @@ func addSpectestModule(t *testing.T, store *wasm.Store) { } func TestJIT(t *testing.T) { - if runtime.GOARCH != "amd64" { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { t.Skip() } runTest(t, jit.NewEngine)