ARMv8-a JIT (work in progress)

5 years ago · 228e718c04
parent 93fec18991
commit 228e718c04
9 changed files with 597 additions and 64 deletions
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -90,6 +90,7 @@ namespace randomx {
 	constexpr int StoreL3Condition = 14;

 	//Prevent some unsafe configurations.
+#define RANDOMX_UNSAFE 1
 #ifndef RANDOMX_UNSAFE
 	static_assert((uint64_t)ArgonBlockSize * RANDOMX_CACHE_ACCESSES * RANDOMX_ARGON_MEMORY + 33554432 >= (uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE, "Unsafe configuration: Memory-time tradeoffs");
 	static_assert((128 + RANDOMX_PROGRAM_SIZE * RANDOMX_FREQ_ISTORE / 256) * (RANDOMX_PROGRAM_COUNT * RANDOMX_PROGRAM_ITERATIONS) >= RANDOMX_SCRATCHPAD_L3, "Unsafe configuration: Insufficient Scratchpad writes");
@@ -119,7 +120,7 @@ namespace randomx {
 	class JitCompilerX86;
 	using JitCompiler = JitCompilerX86;
 #elif defined(__aarch64__)
-	#define RANDOMX_HAVE_COMPILER 0
+	#define RANDOMX_HAVE_COMPILER 1
 	class JitCompilerA64;
 	using JitCompiler = JitCompilerA64;
 #else
--- a/src/configuration.h
+++ b/src/configuration.h
@@ -83,43 +83,43 @@ Total sum of frequencies must be 256

 //Integer instructions
 #define RANDOMX_FREQ_IADD_RS       16
-#define RANDOMX_FREQ_IADD_M         7
-#define RANDOMX_FREQ_ISUB_R        16
-#define RANDOMX_FREQ_ISUB_M         7
-#define RANDOMX_FREQ_IMUL_R        16
-#define RANDOMX_FREQ_IMUL_M         4
-#define RANDOMX_FREQ_IMULH_R        4
-#define RANDOMX_FREQ_IMULH_M        1
-#define RANDOMX_FREQ_ISMULH_R       4
-#define RANDOMX_FREQ_ISMULH_M       1
-#define RANDOMX_FREQ_IMUL_RCP       8
-#define RANDOMX_FREQ_INEG_R         2
-#define RANDOMX_FREQ_IXOR_R        15
-#define RANDOMX_FREQ_IXOR_M         5
-#define RANDOMX_FREQ_IROR_R         8
-#define RANDOMX_FREQ_IROL_R         2
-#define RANDOMX_FREQ_ISWAP_R        4
+#define RANDOMX_FREQ_IADD_M         0
+#define RANDOMX_FREQ_ISUB_R         0
+#define RANDOMX_FREQ_ISUB_M         0
+#define RANDOMX_FREQ_IMUL_R         0
+#define RANDOMX_FREQ_IMUL_M         0
+#define RANDOMX_FREQ_IMULH_R        0
+#define RANDOMX_FREQ_IMULH_M        0
+#define RANDOMX_FREQ_ISMULH_R       0
+#define RANDOMX_FREQ_ISMULH_M       0
+#define RANDOMX_FREQ_IMUL_RCP       0
+#define RANDOMX_FREQ_INEG_R         0
+#define RANDOMX_FREQ_IXOR_R         0
+#define RANDOMX_FREQ_IXOR_M         0
+#define RANDOMX_FREQ_IROR_R         0
+#define RANDOMX_FREQ_IROL_R         0
+#define RANDOMX_FREQ_ISWAP_R        0

 //Floating point instructions
-#define RANDOMX_FREQ_FSWAP_R        4
-#define RANDOMX_FREQ_FADD_R        16
-#define RANDOMX_FREQ_FADD_M         5
-#define RANDOMX_FREQ_FSUB_R        16
-#define RANDOMX_FREQ_FSUB_M         5
-#define RANDOMX_FREQ_FSCAL_R        6
-#define RANDOMX_FREQ_FMUL_R        32
-#define RANDOMX_FREQ_FDIV_M         4
-#define RANDOMX_FREQ_FSQRT_R        6
+#define RANDOMX_FREQ_FSWAP_R        0
+#define RANDOMX_FREQ_FADD_R         0
+#define RANDOMX_FREQ_FADD_M         0
+#define RANDOMX_FREQ_FSUB_R         0
+#define RANDOMX_FREQ_FSUB_M         0
+#define RANDOMX_FREQ_FSCAL_R        0
+#define RANDOMX_FREQ_FMUL_R         0
+#define RANDOMX_FREQ_FDIV_M         0
+#define RANDOMX_FREQ_FSQRT_R        0

 //Control instructions
-#define RANDOMX_FREQ_CBRANCH       25
-#define RANDOMX_FREQ_CFROUND        1
+#define RANDOMX_FREQ_CBRANCH        0
+#define RANDOMX_FREQ_CFROUND        0

 //Store instruction
-#define RANDOMX_FREQ_ISTORE        16
+#define RANDOMX_FREQ_ISTORE         0

 //No-op instruction
-#define RANDOMX_FREQ_NOP            0
+#define RANDOMX_FREQ_NOP          240
 /*                               ------
                                  256
 */
--- a/src/instruction_weights.hpp
+++ b/src/instruction_weights.hpp
@@ -66,6 +66,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define REP64(x) REP32(x) REP32(x)
 #define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
 #define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x)
+#define REP240(x) REP128(x) REP64(x) REP32(x) REP16(x)
 #define REP256(x) REP128(x) REP128(x)
 #define REPNX(x,N) REP##N(x)
 #define REPN(x,N) REPNX(x,N)
--- a/src/jit_compiler_a64.cpp
+++ b/src/jit_compiler_a64.cpp
@@ -0,0 +1,177 @@
+/*
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "jit_compiler_a64.hpp"
+#include "program.hpp"
+#include "virtual_memory.hpp"
+
+namespace ARMV8A { // base opcode words of the AArch64 instructions the JIT emits; register numbers, shifts and immediates are OR-ed in at emission time
+
+constexpr uint32_t B     = 0x14000000; // unconditional branch; the offset in 4-byte words goes into the low bits
+constexpr uint32_t EOR   = 0xCA000000; // exclusive-or of two 64-bit (x) registers
+constexpr uint32_t EOR32 = 0x4A000000; // exclusive-or of two 32-bit (w) registers
+constexpr uint32_t ADD   = 0x8B000000; // 64-bit add with an optionally left-shifted second source register
+constexpr uint32_t MOVZ  = 0xD2800000; // move a 16-bit immediate, used here for the high half of a non-negative imm32
+constexpr uint32_t MOVN  = 0x92800000; // move an inverted 16-bit immediate, used here for the high half of a negative imm32
+constexpr uint32_t MOVK  = 0xF2800000; // insert a 16-bit immediate, used here for the low half of imm32
+
+}
+
+namespace randomx {
+
+static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64); // size in bytes of the whole assembly template, from its first label to its end label
+static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); // bytes that precede the buffer reserved for generated VM instructions
+
+constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; // VM integer registers r0-r7 -> AArch64 x4-x7 and x12-x15 (see the register allocation table in the assembly template)
+
+JitCompilerA64::JitCompilerA64()
+	: code((uint8_t*) allocMemoryPages(CodeSize))
+{
+	memcpy(code, (void*) randomx_program_aarch64, CodeSize);
+	enableAll();
+}
+
+JitCompilerA64::~JitCompilerA64() // returns the CodeSize-byte buffer allocated in the constructor
+{
+	freePagedMemory(code, CodeSize);
+}
+
+void JitCompilerA64::enableAll() // applies setPagesRWX to the entire code buffer
+{
+	setPagesRWX(code, CodeSize);
+}
+
+void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) // emits native code for every instruction of `program` into the reserved buffer, then patches the two spMix instructions that depend on `config`
+{
+	uint32_t codePos = PrologueSize; // write cursor (byte offset into `code`); generated code starts right after the prologue
+
+	for (uint32_t i = 0; i < program.getSize(); ++i) // NOTE(review): nothing here checks that the emitted code fits in the reserved buffer
+	{
+		Instruction& instr = program(i);
+		instr.src %= RegistersCount; // register indices are reduced in place; the handlers use them to index IntRegMap
+		instr.dst %= RegistersCount;
+		(this->*engine[instr.opcode])(instr, i, codePos); // dispatch on the opcode; the handler advances codePos past what it emitted
+	}
+
+	// Update spMix2 (emitted directly after the last generated instruction)
+	// eor w11, config.readReg2, config.readReg3
+	emit32(ARMV8A::EOR32 | 11 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
+
+	// Branch forward over the unused rest of the buffer to the code at
+	// randomx_program_aarch64_vm_instructions_end (the remainder of the main loop)
+	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
+	emit32(ARMV8A::B | (offset / 4), code, codePos);
+
+	// Update spMix1: overwrite the instruction at randomx_program_aarch64_update_spMix1
+	// eor x10, config.readReg0, config.readReg1
+	codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
+	emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
+
+#ifdef __GNUC__
+	__builtin___clear_cache(reinterpret_cast<char*>(code + PrologueSize), reinterpret_cast<char*>(code + codePos)); // codePos is now just past the patched spMix1 instruction, so this range covers everything written above
+#endif
+}
+
+size_t JitCompilerA64::getCodeSize() // size in bytes of the code buffer (the whole assembly template)
+{
+	return CodeSize;
+}
+
+void JitCompilerA64::h_IADD_RS(Instruction& instr, int i, uint32_t& codePos) // emits dst += src << shift, followed by dst += sign-extended imm32 when dst is RegisterNeedsDisplacement; `i` is not used
+{
+	uint32_t k = codePos; // local write cursor, stored back into codePos at the end
+
+	const uint32_t src = IntRegMap[instr.src];
+	const uint32_t dst = IntRegMap[instr.dst];
+	const uint32_t shift = instr.getModShift();
+
+	// add dst, dst, src, lsl shift
+	emit32(ARMV8A::ADD | dst | (dst << 5) | (shift << 10) | (src << 16), code, k);
+
+	if (instr.dst == RegisterNeedsDisplacement)
+	{
+		const uint32_t imm32 = instr.getImm32();
+		if (static_cast<int32_t>(imm32) < 0)
+		{
+			// movn x21, ~imm32 (16 high bits), lsl 16 -- leaves every other bit of x21 set, which sign-extends the value
+			emit32(ARMV8A::MOVN | 21 | (1 << 21) | ((~imm32 >> 16) << 5), code, k);
+		}
+		else
+		{
+			// movz x21, imm32 (16 high bits), lsl 16 -- leaves every other bit of x21 clear
+			emit32(ARMV8A::MOVZ | 21 | (1 << 21) | ((imm32 >> 16) << 5), code, k);
+		}
+		// movk x21, imm32 (16 low bits)
+		emit32(ARMV8A::MOVK | 21 | ((imm32 & 0xFFFF) << 5), code, k);
+		// add dst, dst, x21
+		emit32(ARMV8A::ADD | dst | (dst << 5) | (21 << 16), code, k);
+	}
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_NOP(Instruction& instr, int i, uint32_t& codePos) // emits no code and leaves codePos unchanged
+{
+}
+
+#include "instruction_weights.hpp"
+#define INST_HANDLE(x) REPN(&JitCompilerA64::h_##x, WT(x)) // repeats the handler pointer WT(x) times; NOTE(review): WT(x) is presumably the RANDOMX_FREQ_x weight, and a zero weight is presumably what lets handlers that do not exist yet be listed -- confirm in instruction_weights.hpp
+
+	InstructionGeneratorA64 JitCompilerA64::engine[256] = { // handler table indexed by Instruction::opcode in generateProgram
+		INST_HANDLE(IADD_RS)
+		INST_HANDLE(IADD_M)
+		INST_HANDLE(ISUB_R)
+		INST_HANDLE(ISUB_M)
+		INST_HANDLE(IMUL_R)
+		INST_HANDLE(IMUL_M)
+		INST_HANDLE(IMULH_R)
+		INST_HANDLE(IMULH_M)
+		INST_HANDLE(ISMULH_R)
+		INST_HANDLE(ISMULH_M)
+		INST_HANDLE(IMUL_RCP)
+		INST_HANDLE(INEG_R)
+		INST_HANDLE(IXOR_R)
+		INST_HANDLE(IXOR_M)
+		INST_HANDLE(IROR_R)
+		INST_HANDLE(IROL_R)
+		INST_HANDLE(ISWAP_R)
+		INST_HANDLE(FSWAP_R)
+		INST_HANDLE(FADD_R)
+		INST_HANDLE(FADD_M)
+		INST_HANDLE(FSUB_R)
+		INST_HANDLE(FSUB_M)
+		INST_HANDLE(FSCAL_R)
+		INST_HANDLE(FMUL_R)
+		INST_HANDLE(FDIV_M)
+		INST_HANDLE(FSQRT_R)
+		INST_HANDLE(CBRANCH)
+		INST_HANDLE(CFROUND)
+		INST_HANDLE(ISTORE)
+		INST_HANDLE(NOP)
+	};
+
+}
--- a/src/jit_compiler_a64.hpp
+++ b/src/jit_compiler_a64.hpp
@@ -32,21 +32,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 #include <stdexcept>
 #include "common.hpp"
+#include "jit_compiler_a64_static.hpp"

 namespace randomx {

 	class Program;
 	class ProgramConfiguration;
 	class SuperscalarProgram;
+	class Instruction;
+
+	typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, int, uint32_t&);

 	class JitCompilerA64 {
 	public:
-		JitCompilerA64() {
-			throw std::runtime_error("ARM64 JIT compiler is not implemented yet.");
-		}
-		void generateProgram(Program&, ProgramConfiguration&) {
+		JitCompilerA64();
+		~JitCompilerA64();
+
+		void generateProgram(Program&, ProgramConfiguration&);

-		}
 		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {
 			
 		}
@@ -58,19 +61,31 @@ namespace randomx {

 		}
 		ProgramFunc* getProgramFunc() {
-			return nullptr;
+			return reinterpret_cast<ProgramFunc*>(code);
 		}
 		DatasetInitFunc* getDatasetInitFunc() {
 			return nullptr;
 		}
 		uint8_t* getCode() {
-			return nullptr;
-		}
-		size_t getCodeSize() {
-			return 0;
+			return code;
 		}
+		size_t getCodeSize();
+
 		void enableWriting() {}
 		void enableExecution() {}
-		void enableAll() {}
+		void enableAll();
+
+	private:
+		static InstructionGeneratorA64 engine[256];
+		uint8_t* code;
+
+		static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
+		{
+			*(uint32_t*)(code + codePos) = val;
+			codePos += sizeof(val);
+		}
+
+		void h_IADD_RS(Instruction&, int, uint32_t&);
+		void h_NOP(Instruction&, int, uint32_t&);
 	};
-}
+}
--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@@ -0,0 +1,275 @@
+	.arch armv8-a
+	.text
+	.global	randomx_program_aarch64
+	.global	randomx_program_aarch64_vm_instructions
+	.global	randomx_program_aarch64_vm_instructions_end
+	.global randomx_program_aarch64_update_spMix1
+	.global	randomx_program_aarch64_end
+
+# Register allocation
+
+# x0  -> pointer to reg buffer
+# x1  -> pointer to mem buffer and then to dataset
+# x2  -> pointer to scratchpad
+# x3  -> loop counter
+# x4  -> "r0"
+# x5  -> "r1"
+# x6  -> "r2"
+# x7  -> "r3"
+# x8  -> temporary
+# x9  -> mx, ma
+# x10 -> spMix1
+# x11 -> spMix2
+# x12 -> "r4"
+# x13 -> "r5"
+# x14 -> "r6"
+# x15 -> "r7"
+# x16 -> ScratchpadL1Mask64
+# x17 -> ScratchpadL2Mask64
+# x18 -> ScratchpadL3Mask64
+# x19 -> spAddr0
+# x20 -> spAddr1
+# x21 -> temporary
+# x22 -> temporary
+# x23 -> temporary
+# x24 -> temporary
+# x25 -> temporary
+# x26 -> ScratchpadL1Mask8
+# x27 -> ScratchpadL2Mask8
+# x28 -> ScratchpadL3Mask8
+# x29 -> CacheLineAlignMask
+
+# v0-v7 -> temporary
+# v8-v15 -> not used
+# v16 -> "f0"
+# v17 -> "f1"
+# v18 -> "f2"
+# v19 -> "f3"
+# v20 -> "e0"
+# v21 -> "e1"
+# v22 -> "e2"
+# v23 -> "e3"
+# v24 -> "a0"
+# v25 -> "a1"
+# v26 -> "a2"
+# v27 -> "a3"
+# v28 -> temporary
+# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
+# v30 -> E 'or' mask  = 0x3*00000000******3*00000000******
+# v31 -> scale mask   = 0x81f000000000000081f0000000000000
+
+randomx_program_aarch64:
+	# Save x16-x29, x8 and x30 in a 128-byte stack frame; they are restored before returning
+	sub	sp, sp, 128
+	stp	x16, x17, [sp]
+	stp	x18, x19, [sp, 16]
+	stp	x20, x21, [sp, 32]
+	stp	x22, x23, [sp, 48]
+	stp	x24, x25, [sp, 64]
+	stp	x26, x27, [sp, 80]
+	stp	x28, x29, [sp, 96]
+	stp	x8, x30, [sp, 112]
+
+	# Zero the eight VM integer registers r0-r7 (x4-x7, x12-x15)
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x6, xzr
+	mov	x7, xzr
+	mov	x12, xzr
+	mov	x13, xzr
+	mov	x14, xzr
+	mov	x15, xzr
+
+	# Load the mx/ma value into x9 and the dataset pointer into x1 (x1 points at the mem buffer on entry)
+	ldp	x9, x1, [x1]
+
+	# Initial spMix1 is the mx/ma value just loaded
+	mov	x10, x9
+
+	# Load Scratchpad masks: size - 64 (Mask64) and size - 8 (Mask8) for the 16 KiB, 256 KiB and 2 MiB levels
+	mov	x16, 16384 - 64
+	mov	x17, 262144 - 64
+	mov	x18, 2097152 - 64
+	mov	x26, 16384 - 8
+	mov	x27, 262144 - 8
+	mov	x28, 2097152 - 8
+
+	# Load CacheLineAlignMask
+	mov	x29, 0x7FFFFFC0
+
+	# Load group A registers a0-a3 (v24-v27) from the register buffer at offset 192
+	ldp	q24, q25, [x0, 192]
+	ldp	q26, q27, [x0, 224]
+
+	# Load E 'and' mask into both 64-bit lanes of v29
+	mov	x21, 0x00FFFFFFFFFFFFFF
+	ins	v29.d[0], x21
+	ins	v29.d[1], x21
+
+	# Load E 'or' mask (stored in reg.f[0], i.e. offset 64 of the register buffer)
+	ldr	q30, [x0, 64]
+
+	# Load scale mask into both 64-bit lanes of v31
+	mov	x21, 0x81f0000000000000
+	ins	v31.d[0], x21
+	ins	v31.d[1], x21
+
+main_loop:
+	# spAddr0 = spMix1 & ScratchpadL3Mask64;
+	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
+	lsr	x21, x10, 32
+	and	w19, w10, w18
+	and	w20, w21, w18
+
+	# x19 = scratchpad + spAddr0
+	# x20 = scratchpad + spAddr1
+	add	x19, x19, x2
+	add	x20, x20, x2
+
+	# xor integer registers r0-r7 with 64 bytes of scratchpad data (spAddr0)
+	ldp	x21, x22, [x19]
+	ldp	x23, x24, [x19, 16]
+	eor	x4, x4, x21
+	eor	x5, x5, x22
+	eor	x6, x6, x23
+	eor	x7, x7, x24
+	ldp	x21, x22, [x19, 32]
+	ldp	x23, x24, [x19, 48]
+	eor	x12, x12, x21
+	eor	x13, x13, x22
+	eor	x14, x14, x23
+	eor	x15, x15, x24
+
+	# Load group F registers (spAddr1): eight sign-extended 32-bit integers converted to doubles
+	ldpsw	x21, x22, [x20]
+	ldpsw	x23, x24, [x20, 8]
+	ins	v16.d[0], x21
+	ins	v16.d[1], x22
+	ins	v17.d[0], x23
+	ins	v17.d[1], x24
+	ldpsw	x21, x22, [x20, 16]
+	ldpsw	x23, x24, [x20, 24]
+	ins	v18.d[0], x21
+	ins	v18.d[1], x22
+	ins	v19.d[0], x23
+	ins	v19.d[1], x24
+	scvtf	v16.2d, v16.2d
+	scvtf	v17.2d, v17.2d
+	scvtf	v18.2d, v18.2d
+	scvtf	v19.2d, v19.2d
+
+	# Load group E registers (spAddr1 + 32): same conversion, then apply the E 'and' mask and the E 'or' mask
+	ldpsw	x21, x22, [x20, 32]
+	ldpsw	x23, x24, [x20, 40]
+	ins	v20.d[0], x21
+	ins	v20.d[1], x22
+	ins	v21.d[0], x23
+	ins	v21.d[1], x24
+	ldpsw	x21, x22, [x20, 48]
+	ldpsw	x23, x24, [x20, 56]
+	ins	v22.d[0], x21
+	ins	v22.d[1], x22
+	ins	v23.d[0], x23
+	ins	v23.d[1], x24
+	scvtf	v20.2d, v20.2d
+	scvtf	v21.2d, v21.2d
+	scvtf	v22.2d, v22.2d
+	scvtf	v23.2d, v23.2d
+	and	v20.16b, v20.16b, v29.16b
+	and	v21.16b, v21.16b, v29.16b
+	and	v22.16b, v22.16b, v29.16b
+	and	v23.16b, v23.16b, v29.16b
+	orr	v20.16b, v20.16b, v30.16b
+	orr	v21.16b, v21.16b, v30.16b
+	orr	v22.16b, v22.16b, v30.16b
+	orr	v23.16b, v23.16b, v30.16b
+
+	# Execute VM instructions (written into the buffer below by the JIT compiler)
+randomx_program_aarch64_vm_instructions:
+
+	# 16 KB buffer for generated instructions (4096 zero-filled 4-byte words)
+	.fill 4096,4,0
+
+randomx_program_aarch64_vm_instructions_end:
+
+	# mx ^= r[readReg2] ^ r[readReg3]; (x11 = spMix2, computed by the JIT-generated code)
+	eor	x9, x9, x11
+
+	# Calculate dataset pointer for dataset prefetch
+	mov	w25, w9
+	and	x25, x25, x29
+	add	x25, x25, x1
+
+	# Prefetch dataset data
+	prfm	pldl2strm, [x25]
+
+	# mx <-> ma (swap the two 32-bit halves of x9)
+	ror	x9, x9, 32
+
+	# Calculate dataset pointer for dataset read
+	mov	w25, w9
+	and	x25, x25, x29
+	add	x25, x25, x1
+
+	# xor integer registers r0-r7 with 64 bytes of dataset data
+	ldp	x21, x22, [x25]
+	ldp	x23, x24, [x25, 16]
+	eor	x4, x4, x21
+	eor	x5, x5, x22
+	eor	x6, x6, x23
+	eor	x7, x7, x24
+	ldp	x21, x22, [x25, 32]
+	ldp	x23, x24, [x25, 48]
+	eor	x12, x12, x21
+	eor	x13, x13, x22
+	eor	x14, x14, x23
+	eor	x15, x15, x24
+
+randomx_program_aarch64_update_spMix1:
+	# Placeholder: the JIT compiler overwrites this instruction with eor x10, r[readReg0], r[readReg1]
+	eor	x10, x0, x0
+
+	# Store integer registers to scratchpad (spAddr1)
+	stp	x4, x5, [x20, 0]
+	stp	x6, x7, [x20, 16]
+	stp	x12, x13, [x20, 32]
+	stp	x14, x15, [x20, 48]
+
+	# xor group F and group E registers (result replaces group F)
+	eor	v16.16b, v16.16b, v20.16b
+	eor	v17.16b, v17.16b, v21.16b
+	eor	v18.16b, v18.16b, v22.16b
+	eor	v19.16b, v19.16b, v23.16b
+
+	# Store FP registers to scratchpad (spAddr0)
+	stp	q16, q17, [x19, 0]
+	stp	q18, q19, [x19, 32]
+
+	subs	x3, x3, 1
+	bne	main_loop
+	
+	# Store integer registers to the register buffer
+	stp	x4, x5, [x0, 0]
+	stp	x6, x7, [x0, 16]
+	stp	x12, x13, [x0, 32]
+	stp	x14, x15, [x0, 48]
+
+	# Store FP registers (groups F and E) to the register buffer
+	stp	q16, q17, [x0, 64]
+	stp	q18, q19, [x0, 96]
+	stp	q20, q21, [x0, 128]
+	stp	q22, q23, [x0, 160]
+
+	# Restore the registers saved in the prologue and release the stack frame
+	ldp	x16, x17, [sp]
+	ldp	x18, x19, [sp, 16]
+	ldp	x20, x21, [sp, 32]
+	ldp	x22, x23, [sp, 48]
+	ldp	x24, x25, [sp, 64]
+	ldp	x26, x27, [sp, 80]
+	ldp	x28, x29, [sp, 96]
+	ldp	x8, x30, [sp, 112]
+	add	sp, sp, 128
+
+	ret
+
+randomx_program_aarch64_end:
--- a/src/jit_compiler_a64_static.hpp
+++ b/src/jit_compiler_a64_static.hpp
@@ -0,0 +1,37 @@
+/*
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+extern "C" { // symbols defined in jit_compiler_a64_static.S
+	void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); // start of the program template; its register table lists x0-x3 as reg buffer, mem buffer, scratchpad and loop counter
+	void randomx_program_aarch64_vm_instructions(); // label: start of the buffer reserved for generated instructions (used only for its address)
+	void randomx_program_aarch64_vm_instructions_end(); // label: first instruction after that buffer (used only for its address)
+	void randomx_program_aarch64_update_spMix1(); // label: placeholder instruction patched by the JIT compiler (used only for its address)
+	void randomx_program_aarch64_end(); // label: end of the template (used only for its address)
+}
--- a/src/tests/benchmark.cpp
+++ b/src/tests/benchmark.cpp
@@ -159,7 +159,6 @@ int main(int argc, char** argv) {
 	}

 	std::atomic<uint32_t> atomicNonce(0);
-	AtomicHash result;
 	std::vector<randomx_vm*> vms;
 	std::vector<std::thread> threads;
 	randomx_dataset* dataset;
@@ -231,22 +230,42 @@ int main(int argc, char** argv) {
 			if (dataset == nullptr) {
 				throw DatasetAllocException();
 			}
-			uint32_t datasetItemCount = randomx_dataset_item_count();
-			if (initThreadCount > 1) {
-				auto perThread = datasetItemCount / initThreadCount;
-				auto remainder = datasetItemCount % initThreadCount;
-				uint32_t startItem = 0;
-				for (int i = 0; i < initThreadCount; ++i) {
-					auto count = perThread + (i == initThreadCount - 1 ? remainder : 0);
-					threads.push_back(std::thread(&randomx_init_dataset, dataset, cache, startItem, count));
-					startItem += count;
+
+			char* dataset_memory = reinterpret_cast<char*>(randomx_get_dataset_memory(dataset));
+			bool read_ok = false;
+
+			FILE* fp = fopen("dataset.bin", "rb");
+			if (fp)
+			{
+				read_ok = (fread(dataset_memory, 1, randomx::DatasetSize, fp) == randomx::DatasetSize);
+				fclose(fp);
+			}
+
+			if (!read_ok) {
+				uint32_t datasetItemCount = randomx_dataset_item_count();
+				if (initThreadCount > 1) {
+					auto perThread = datasetItemCount / initThreadCount;
+					auto remainder = datasetItemCount % initThreadCount;
+					uint32_t startItem = 0;
+					for (int i = 0; i < initThreadCount; ++i) {
+						auto count = perThread + (i == initThreadCount - 1 ? remainder : 0);
+						threads.push_back(std::thread(&randomx_init_dataset, dataset, cache, startItem, count));
+						startItem += count;
+					}
+					for (unsigned i = 0; i < threads.size(); ++i) {
+						threads[i].join();
+					}
 				}
-				for (unsigned i = 0; i < threads.size(); ++i) {
-					threads[i].join();
+				else {
+					randomx_init_dataset(dataset, cache, 0, datasetItemCount);
+				}
+
+				fp = fopen("dataset.bin", "wb");
+				if (fp)
+				{
+					fwrite(dataset_memory, 1, randomx::DatasetSize, fp);
+					fclose(fp);
 				}
-			}
-			else {
-				randomx_init_dataset(dataset, cache, 0, datasetItemCount);
 			}
 			randomx_release_cache(cache);
 			cache = nullptr;
@@ -267,7 +286,10 @@ int main(int argc, char** argv) {
 			}
 			vms.push_back(vm);
 		}
-		std::cout << "Running benchmark (" << noncesCount << " nonces) ..." << std::endl;
+		for (int iter = 0; iter < 100; ++iter) {
+		std::cout << "Running benchmark (" << noncesCount << " nonces, iteration " << iter << ") ..." << std::endl;
+		atomicNonce = 0;
+		AtomicHash result;
 		sw.restart();
 		if (threadCount > 1) {
 			for (unsigned i = 0; i < vms.size(); ++i) {
@@ -282,18 +304,13 @@ int main(int argc, char** argv) {
 			for (unsigned i = 0; i < threads.size(); ++i) {
 				threads[i].join();
 			}
+			threads.clear();
 		}
 		else {
 			mine(vms[0], std::ref(atomicNonce), std::ref(result), noncesCount, 0);
 		}

 		double elapsed = sw.getElapsed();
-		for (unsigned i = 0; i < vms.size(); ++i)
-			randomx_destroy_vm(vms[i]);
-		if (miningMode)
-			randomx_release_dataset(dataset);
-		else
-			randomx_release_cache(cache);
 		std::cout << "Calculated result: ";
 		result.print(std::cout);
 		if (noncesCount == 1000 && seedValue == 0)
@@ -304,6 +321,13 @@ int main(int argc, char** argv) {
 		else {
 			std::cout << "Performance: " << noncesCount / elapsed << " hashes per second" << std::endl;
 		}
+		}
+		for (unsigned i = 0; i < vms.size(); ++i)
+			randomx_destroy_vm(vms[i]);
+		if (miningMode)
+			randomx_release_dataset(dataset);
+		else
+			randomx_release_cache(cache);
 	}
 	catch (MemoryException& e) {
 		std::cout << "ERROR: " << e.what() << std::endl;
--- a/src/vm_compiled.cpp
+++ b/src/vm_compiled.cpp
@@ -63,6 +63,9 @@ namespace randomx {

 	template<class Allocator, bool softAes, bool secureJit>
 	void CompiledVm<Allocator, softAes, secureJit>::execute() {
+#ifdef __aarch64__
+		memcpy(reg.f, config.eMask, sizeof(config.eMask));
+#endif
 		compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS);
 	}