From c6468a381646ee708eb2cc40c996856fda8c2313 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 22 Sep 2019 21:06:22 +0200 Subject: [PATCH] JIT compiler for ARMv8 (#125) JIT compiler for ARMv8 --- CMakeLists.txt | 6 + src/common.hpp | 2 +- src/intrin_portable.h | 9 +- src/jit_compiler_a64.cpp | 1068 +++++++++++++++++++++++++++++++ src/jit_compiler_a64.hpp | 104 ++- src/jit_compiler_a64_static.S | 579 +++++++++++++++++ src/jit_compiler_a64_static.hpp | 51 ++ src/vm_compiled.cpp | 3 + 8 files changed, 1794 insertions(+), 28 deletions(-) create mode 100644 src/jit_compiler_a64.cpp create mode 100644 src/jit_compiler_a64_static.S create mode 100644 src/jit_compiler_a64_static.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9af9741..8abfea5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,12 @@ endif() # ARMv8 if (ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv8-a") + list(APPEND randomx_sources + src/jit_compiler_a64_static.S + src/jit_compiler_a64.cpp) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/jit_compiler_a64_static.S PROPERTY LANGUAGE C) + if(ARCH STREQUAL "native") add_flag("-march=native") else() diff --git a/src/common.hpp b/src/common.hpp index a96fe06..a77feb3 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -119,7 +119,7 @@ namespace randomx { class JitCompilerX86; using JitCompiler = JitCompilerX86; #elif defined(__aarch64__) - #define RANDOMX_HAVE_COMPILER 0 + #define RANDOMX_HAVE_COMPILER 1 class JitCompilerA64; using JitCompiler = JitCompilerA64; #else diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 3a75d2f..7cd23fe 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -385,7 +385,14 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { typedef uint8x16_t rx_vec_i128; typedef float64x2_t rx_vec_f128; -#define rx_aligned_alloc(size, align) aligned_alloc(align, size) +inline void* rx_aligned_alloc(size_t size, size_t align) { + void* p; + if (posix_memalign(&p, align, size) == 0) + return p; + + return 0; +}; + #define rx_aligned_free(a) free(a) inline void rx_prefetch_nta(void* ptr) { diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp new file mode 100644 index 0000000..5f6fcb3 --- /dev/null +++ b/src/jit_compiler_a64.cpp @@ -0,0 +1,1068 @@ +/* +Copyright (c) 2018-2019, tevador +Copyright (c) 2019, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "jit_compiler_a64.hpp" +#include "superscalar.hpp" +#include "program.hpp" +#include "reciprocal.h" +#include "virtual_memory.hpp" + +namespace ARMV8A { + +constexpr uint32_t B = 0x14000000; +constexpr uint32_t EOR = 0xCA000000; +constexpr uint32_t EOR32 = 0x4A000000; +constexpr uint32_t ADD = 0x8B000000; +constexpr uint32_t SUB = 0xCB000000; +constexpr uint32_t MUL = 0x9B007C00; +constexpr uint32_t UMULH = 0x9BC07C00; +constexpr uint32_t SMULH = 0x9B407C00; +constexpr uint32_t MOVZ = 0xD2800000; +constexpr uint32_t MOVN = 0x92800000; +constexpr uint32_t MOVK = 0xF2800000; +constexpr uint32_t ADD_IMM_LO = 0x91000000; +constexpr uint32_t ADD_IMM_HI = 0x91400000; +constexpr uint32_t LDR_LITERAL = 0x58000000; +constexpr uint32_t ROR = 0x9AC02C00; +constexpr uint32_t ROR_IMM = 0x93C00000; +constexpr uint32_t MOV_REG = 0xAA0003E0; +constexpr uint32_t MOV_VREG_EL = 0x6E080400; +constexpr uint32_t FADD = 0x4E60D400; +constexpr uint32_t FSUB = 0x4EE0D400; +constexpr uint32_t FEOR = 0x6E201C00; +constexpr uint32_t FMUL = 0x6E60DC00; +constexpr uint32_t FDIV = 0x6E60FC00; +constexpr uint32_t FSQRT = 0x6EE1F800; + +} + +namespace randomx { + +static const size_t CodeSize = ((uint8_t*)randomx_init_dataset_aarch64_end) - ((uint8_t*)randomx_program_aarch64); +static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64); +static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); +static const size_t ImulRcpLiteralsEnd = ((uint8_t*)randomx_program_aarch64_imul_rcp_literals_end) - ((uint8_t*)randomx_program_aarch64); + +static const size_t CalcDatasetItemSize = + // Prologue + ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch - (uint8_t*)randomx_calc_dataset_item_aarch64) + + // Main loop + RANDOMX_CACHE_ACCESSES * ( + // Main loop prologue + ((uint8_t*)randomx_calc_dataset_item_aarch64_mix - ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch)) + 4 + + // Inner main loop (instructions) + ((RANDOMX_SUPERSCALAR_LATENCY * 3) + 2) * 16 + + // Main loop epilogue + ((uint8_t*)randomx_calc_dataset_item_aarch64_store_result - (uint8_t*)randomx_calc_dataset_item_aarch64_mix) + 4 + ) + + // Epilogue + ((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result); + +constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; + +template static constexpr size_t Log2(T value) { return (value > 1) ? 
(Log2(value / 2) + 1) : 0; } + +JitCompilerA64::JitCompilerA64() + : code((uint8_t*) allocMemoryPages(CodeSize + CalcDatasetItemSize)) + , literalPos(ImulRcpLiteralsEnd) + , num32bitLiterals(0) +{ + memset(reg_changed_offset, 0, sizeof(reg_changed_offset)); + memcpy(code, (void*) randomx_program_aarch64, CodeSize); +} + +JitCompilerA64::~JitCompilerA64() +{ + freePagedMemory(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::enableWriting() +{ + setPagesRW(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::enableExecution() +{ + setPagesRX(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::enableAll() +{ + setPagesRWX(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) +{ + uint32_t codePos = MainLoopBegin + 4; + + // and w16, w10, ScratchpadL3Mask64 + emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + // and w17, w18, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + codePos = PrologueSize; + literalPos = ImulRcpLiteralsEnd; + num32bitLiterals = 0; + + for (uint32_t i = 0; i < RegistersCount; ++i) + reg_changed_offset[i] = codePos; + + for (uint32_t i = 0; i < program.getSize(); ++i) + { + Instruction& instr = program(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + (this->*engine[instr.opcode])(instr, codePos); + } + + // Update spMix2 + // eor w18, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + + // Jump back to the main loop + const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; + emit32(ARMV8A::B | (offset / 4), code, codePos); + + // and w18, w18, CacheLineAlignMask + codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); + emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + + // and w10, w10, CacheLineAlignMask + codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); + emit32(0x121A0000 | 10 | (10 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + + // Update spMix1 + // eor x10, config.readReg0, config.readReg1 + codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64); + emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos); + +#ifdef __GNUC__ + __builtin___clear_cache(reinterpret_cast(code + MainLoopBegin), reinterpret_cast(code + codePos)); +#endif +} + +void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config, uint32_t datasetOffset) +{ + uint32_t codePos = MainLoopBegin + 4; + + // and w16, w10, ScratchpadL3Mask64 + emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + // and w17, w18, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + codePos = PrologueSize; + literalPos = ImulRcpLiteralsEnd; + num32bitLiterals = 0; + + for (uint32_t i = 0; i < RegistersCount; ++i) + reg_changed_offset[i] = codePos; + + for (uint32_t i = 0; i < program.getSize(); ++i) + { + Instruction& instr = program(i); + instr.src %= 
RegistersCount; + instr.dst %= RegistersCount; + (this->*engine[instr.opcode])(instr, codePos); + } + + // Update spMix2 + // eor w18, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + + // Jump back to the main loop + const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos; + emit32(ARMV8A::B | (offset / 4), code, codePos); + + // and w2, w9, CacheLineAlignMask + codePos = (((uint8_t*)randomx_program_aarch64_light_cacheline_align_mask) - ((uint8_t*)randomx_program_aarch64)); + emit32(0x121A0000 | 2 | (9 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + + // Update spMix1 + // eor x10, config.readReg0, config.readReg1 + codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64); + emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos); + + // Apply dataset offset + codePos = ((uint8_t*)randomx_program_aarch64_light_dataset_offset) - ((uint8_t*)randomx_program_aarch64); + + datasetOffset /= CacheLineSize; + const uint32_t imm_lo = datasetOffset & ((1 << 12) - 1); + const uint32_t imm_hi = datasetOffset >> 12; + + emit32(ARMV8A::ADD_IMM_LO | 2 | (2 << 5) | (imm_lo << 10), code, codePos); + emit32(ARMV8A::ADD_IMM_HI | 2 | (2 << 5) | (imm_hi << 10), code, codePos); + +#ifdef __GNUC__ + __builtin___clear_cache(reinterpret_cast(code + MainLoopBegin), reinterpret_cast(code + codePos)); +#endif +} + +template +void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &reciprocalCache) +{ + uint32_t codePos = CodeSize; + + uint8_t* p1 = (uint8_t*)randomx_calc_dataset_item_aarch64; + uint8_t* p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_prefetch; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + + num32bitLiterals = 64; + constexpr uint32_t tmp_reg = 12; + + for (size_t i = 0; i < N; ++i) + { + // and x11, x10, CacheSize / CacheLineSize - 1 + emit32(0x92400000 | 11 | (10 << 5) | ((Log2(CacheSize / CacheLineSize) - 1) << 10), code, codePos); + + p1 = ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch) + 4; + p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + + SuperscalarProgram& prog = programs[i]; + const size_t progSize = prog.getSize(); + + uint32_t jmp_pos = codePos; + codePos += 4; + + // Fill in literal pool + for (size_t j = 0; j < progSize; ++j) + { + const Instruction& instr = prog(j); + if (static_cast(instr.opcode) == randomx::SuperscalarInstructionType::IMUL_RCP) + emit64(reciprocalCache[instr.getImm32()], code, codePos); + } + + // Jump over literal pool + uint32_t literal_pos = jmp_pos; + emit32(ARMV8A::B | ((codePos - jmp_pos) / 4), code, literal_pos); + + for (size_t j = 0; j < progSize; ++j) + { + const Instruction& instr = prog(j); + const uint32_t src = instr.src; + const uint32_t dst = instr.dst; + + switch (static_cast(instr.opcode)) + { + case randomx::SuperscalarInstructionType::ISUB_R: + emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + emit32(ARMV8A::ADD | dst | (dst << 5) | (instr.getModShift() << 10) | (src << 16), code, codePos); + break; + case 
randomx::SuperscalarInstructionType::IMUL_R: + emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IROR_C: + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + emitAddImmediate(dst, dst, instr.getImm32(), code, codePos); + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + emitMovImmediate(tmp_reg, instr.getImm32(), code, codePos); + emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IMULH_R: + emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + { + int32_t offset = (literal_pos - codePos) / 4; + offset &= (1 << 19) - 1; + literal_pos += 8; + + // ldr tmp_reg, reciprocal + emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, codePos); + + // mul dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, codePos); + } + break; + default: + break; + } + } + + p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix; + p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + + // Update registerValue + emit32(ARMV8A::MOV_REG | 10 | (prog.getAddressRegister() << 16), code, codePos); + } + + p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result; + p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_end; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + +#ifdef __GNUC__ + __builtin___clear_cache(reinterpret_cast(code + CodeSize), reinterpret_cast(code + codePos)); +#endif +} + +template void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector &reciprocalCache); + +DatasetInitFunc* JitCompilerA64::getDatasetInitFunc() +{ + return (DatasetInitFunc*)(code + (((uint8_t*)randomx_init_dataset_aarch64) - ((uint8_t*)randomx_program_aarch64))); +} + +size_t JitCompilerA64::getCodeSize() +{ + return CodeSize; +} + +void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + if (imm < (1 << 16)) + { + // movz tmp_reg, imm32 (16 low bits) + emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k); + } + else + { + if (num32bitLiterals < 64) + { + if (static_cast(imm) < 0) + { + // smov dst, vN.s[M] + emit32(0x4E042C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k); + } + else + { + // umov dst, vN.s[M] + emit32(0x0E043C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k); + } + + ((uint32_t*)(code + ImulRcpLiteralsEnd))[num32bitLiterals] = imm; + ++num32bitLiterals; + } + else + { + if (static_cast(imm) < 0) + { + // movn tmp_reg, ~imm32 (16 high bits) + emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k); + } + else + { + // movz tmp_reg, imm32 (16 high bits) + emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k); + } + + // movk tmp_reg, imm32 (16 low bits) + emit32(ARMV8A::MOVK | dst | 
((imm & 0xFFFF) << 5), code, k); + } + } + + codePos = k; +} + +void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + if (imm < (1 << 24)) + { + const uint32_t imm_lo = imm & ((1 << 12) - 1); + const uint32_t imm_hi = imm >> 12; + + if (imm_lo && imm_hi) + { + emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); + emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k); + } + else if (imm_lo) + { + emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); + } + else + { + emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k); + } + } + else + { + constexpr uint32_t tmp_reg = 18; + emitMovImmediate(tmp_reg, imm, code, k); + + // add dst, src, tmp_reg + emit32(ARMV8A::ADD | dst | (src << 5) | (tmp_reg << 16), code, k); + } + + codePos = k; +} + +template +void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t imm = instr.getImm32(); + + if (src != dst) + { + imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + emitAddImmediate(tmp_reg, src, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + + emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k); + + // ldr tmp_reg, [x2, tmp_reg] + emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k); + } + else + { + imm = (imm & ScratchpadL3Mask) >> 3; + emitMovImmediate(tmp_reg, imm, code, k); + + // ldr tmp_reg, [x2, tmp_reg, lsl 3] + emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + } + + codePos = k; +} + +template +void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t imm = instr.getImm32(); + constexpr uint32_t tmp_reg = 18; + + imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + emitAddImmediate(tmp_reg, src, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + + emit32(instr.getModMem() ? 
andInstrL1 : andInstrL2, code, k); + + // add tmp_reg, x2, tmp_reg + emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k); + + // ldpsw tmp_reg, tmp_reg + 1, [tmp_reg] + emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k); + + // ins tmp_reg_fp.d[0], tmp_reg + emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k); + + // ins tmp_reg_fp.d[1], tmp_reg + 1 + emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k); + + // scvtf tmp_reg_fp.2d, tmp_reg_fp.2d + emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IADD_RS(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + const uint32_t shift = instr.getModShift(); + + // add dst, src << shift + emit32(ARMV8A::ADD | dst | (dst << 5) | (shift << 10) | (src << 16), code, k); + + if (instr.dst == RegisterNeedsDisplacement) + emitAddImmediate(dst, dst, instr.getImm32(), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // add dst, dst, tmp_reg + emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISUB_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + // sub dst, dst, src + emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k); + } + else + { + emitAddImmediate(dst, dst, -instr.getImm32(), code, k); + } + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::SUB | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + { + src = 18; + emitMovImmediate(src, instr.getImm32(), code, k); + } + + // mul dst, dst, src + emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMULH_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + // umulh dst, dst, src + emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, k); + + 
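+		// Example encoding: with dst = 4 ("r0" lives in x4) and src = 5 ("r1" in x5),
+		// the word above is 0x9BC07C00 | 4 | (4 << 5) | (5 << 16) = 0x9BC57C84,
+		// i.e. "umulh x4, x4, x5".
+		// reg_changed_offset records where dst was last modified; h_CBRANCH uses
+		// this offset as the target of its backward branch.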
reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // umulh dst, dst, tmp_reg + emit32(ARMV8A::UMULH | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISMULH_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + // smulh dst, dst, src + emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // smulh dst, dst, tmp_reg + emit32(ARMV8A::SMULH | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) +{ + const uint64_t divisor = instr.getImm32(); + if (isZeroOrPowerOf2(divisor)) + return; + + uint32_t k = codePos; + + constexpr uint32_t tmp_reg = 18; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint64_t N = 1ULL << 63; + const uint64_t q = N / divisor; + const uint64_t r = N % divisor; +#ifdef __GNUC__ + const uint64_t shift = 64 - __builtin_clzll(divisor); +#else + uint64_t shift = 32; + for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1) + --shift; +#endif + + const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); + + literalPos -= sizeof(uint64_t); + *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); + + if (literal_id < 13) + { + static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 }; + + // mul dst, dst, literal_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k); + } + else + { + // ldr tmp_reg, reciprocal + const uint32_t offset = (literalPos - k) / 4; + emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k); + + // mul dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); + } + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_INEG_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t dst = IntRegMap[instr.dst]; + + // sub dst, xzr, dst + emit32(ARMV8A::SUB | dst | (31 << 5) | (dst << 16), code, codePos); + + reg_changed_offset[instr.dst] = codePos; +} + +void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + { + src = 18; + emitMovImmediate(src, instr.getImm32(), code, k); + } + + // eor dst, dst, src + emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t 
tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // eor dst, dst, tmp_reg + emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + // ror dst, dst, src + emit32(ARMV8A::ROR | dst | (dst << 5) | (src << 16), code, codePos); + } + else + { + // ror dst, dst, imm + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); + } + + reg_changed_offset[instr.dst] = codePos; +} + +void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + constexpr uint32_t tmp_reg = 18; + + // sub tmp_reg, xzr, src + emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); + + // ror dst, dst, tmp_reg + emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k); + } + else + { + // ror dst, dst, imm + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); + } + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + return; + + uint32_t k = codePos; + + constexpr uint32_t tmp_reg = 18; + emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); + emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); + emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.src] = k; + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t dst = instr.dst + 16; + + constexpr uint32_t tmp_reg_fp = 28; + constexpr uint32_t src_index1 = 1 << 14; + constexpr uint32_t dst_index1 = 1 << 20; + + emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k); + emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k); + emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FADD_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = (instr.src % 4) + 24; + const uint32_t dst = (instr.dst % 4) + 16; + + emit32(ARMV8A::FADD | dst | (dst << 5) | (src << 16), code, codePos); +} + +void JitCompilerA64::h_FADD_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = (instr.dst % 4) + 16; + + constexpr uint32_t tmp_reg_fp = 28; + emitMemLoadFP(src, instr, code, k); + + emit32(ARMV8A::FADD | dst | (dst << 5) | (tmp_reg_fp << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FSUB_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = (instr.src % 4) + 24; + const uint32_t dst = (instr.dst % 4) + 16; + + emit32(ARMV8A::FSUB | dst | (dst << 5) | (src << 16), code, codePos); +} + +void JitCompilerA64::h_FSUB_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = (instr.dst % 4) + 16; + + constexpr uint32_t tmp_reg_fp = 28; + emitMemLoadFP(src, instr, code, k); + + emit32(ARMV8A::FSUB | dst | (dst << 5) | (tmp_reg_fp << 
16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FSCAL_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t dst = (instr.dst % 4) + 16; + + emit32(ARMV8A::FEOR | dst | (dst << 5) | (31 << 16), code, codePos); +} + +void JitCompilerA64::h_FMUL_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = (instr.src % 4) + 24; + const uint32_t dst = (instr.dst % 4) + 20; + + emit32(ARMV8A::FMUL | dst | (dst << 5) | (src << 16), code, codePos); +} + +void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = (instr.dst % 4) + 20; + + constexpr uint32_t tmp_reg_fp = 28; + emitMemLoadFP(src, instr, code, k); + + // and tmp_reg_fp, tmp_reg_fp, and_mask_reg + emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k); + + // orr tmp_reg_fp, tmp_reg_fp, or_mask_reg + emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k); + + emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FSQRT_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t dst = (instr.dst % 4) + 20; + + emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos); +} + +void JitCompilerA64::h_CBRANCH(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t dst = IntRegMap[instr.dst]; + const uint32_t modCond = instr.getModCond(); + const uint32_t shift = modCond + ConditionOffset; + const uint32_t imm = (instr.getImm32() | (1U << shift)) & ~(1U << (shift - 1)); + + emitAddImmediate(dst, dst, imm, code, k); + + // tst dst, mask + static_assert((ConditionMask == 0xFF) && (ConditionOffset == 8), "Update tst encoding for different mask and offset"); + emit32((0xF2781C1F - (modCond << 16)) | (dst << 5), code, k); + + int32_t offset = reg_changed_offset[instr.dst]; + offset = ((offset - k) >> 2) & ((1 << 19) - 1); + + // beq target + emit32(0x54000000 | (offset << 5), code, k); + + for (uint32_t i = 0; i < RegistersCount; ++i) + reg_changed_offset[i] = k; + + codePos = k; +} + +void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + + constexpr uint32_t tmp_reg = 18; + constexpr uint32_t fpcr_tmp_reg = 8; + + // ror tmp_reg, src, imm + emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); + + // bfi fpcr_tmp_reg, tmp_reg, 40, 2 + emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); + + // rbit tmp_reg, fpcr_tmp_reg + emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k); + + // msr fpcr, tmp_reg + emit32(0xD51B4400 | tmp_reg, code, k); + + codePos = k; +} + +void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + constexpr uint32_t tmp_reg = 18; + + uint32_t imm = instr.getImm32(); + + if (instr.getModCond() < StoreL3Condition) + imm &= instr.getModMem() ? 
(RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + else + imm &= RANDOMX_SCRATCHPAD_L3 - 1; + + emitAddImmediate(tmp_reg, dst, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + constexpr uint32_t andInstrL3 = t | ((Log2(RANDOMX_SCRATCHPAD_L3) - 4) << 10); + + emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k); + + // str src, [x2, tmp_reg] + emit32(0xF8206840 | src | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_NOP(Instruction& instr, uint32_t& codePos) +{ +} + +#include "instruction_weights.hpp" +#define INST_HANDLE(x) REPN(&JitCompilerA64::h_##x, WT(x)) + + InstructionGeneratorA64 JitCompilerA64::engine[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; +} diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp index 5a075bf..a4adb80 100644 --- a/src/jit_compiler_a64.hpp +++ b/src/jit_compiler_a64.hpp @@ -1,5 +1,6 @@ /* Copyright (c) 2018-2019, tevador +Copyright (c) 2019, SChernykh All rights reserved. @@ -32,45 +33,96 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <cstdint> #include <vector> #include "common.hpp" +#include "jit_compiler_a64_static.hpp" namespace randomx { class Program; class ProgramConfiguration; class SuperscalarProgram; + class Instruction; + + typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&); class JitCompilerA64 { public: - JitCompilerA64() { - throw std::runtime_error("ARM64 JIT compiler is not implemented yet."); - } - void generateProgram(Program&, ProgramConfiguration&) { + JitCompilerA64(); + ~JitCompilerA64(); + + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); - } - void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { - - } template<size_t N> - void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) { + void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &); - } - void generateDatasetInitCode() { + void generateDatasetInitCode() {} + ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); } + DatasetInitFunc* getDatasetInitFunc(); + uint8_t* getCode() { return code; } + size_t getCodeSize(); + + void enableWriting(); + void enableExecution(); + void enableAll(); + + private: + static InstructionGeneratorA64 engine[256]; + uint32_t reg_changed_offset[8]; + uint8_t* code; + uint32_t literalPos; + uint32_t num32bitLiterals; + + static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos) + { + *(uint32_t*)(code + codePos) = val; + codePos += sizeof(val); } - ProgramFunc* getProgramFunc() { - return nullptr; - } - DatasetInitFunc* getDatasetInitFunc() { - return nullptr; - } - uint8_t* getCode() { - return nullptr; - } - size_t getCodeSize() { - return 0; + + static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos) + { + *(uint64_t*)(code + codePos) = val; + codePos += sizeof(val); } - void enableWriting() {} - void enableExecution() {} - void enableAll() {} + + void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos); + void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos); + + template<uint32_t tmp_reg> + void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); + + template<uint32_t tmp_reg_fp> + void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); + + void h_IADD_RS(Instruction&, uint32_t&); + void h_IADD_M(Instruction&, uint32_t&); + void h_ISUB_R(Instruction&, uint32_t&); + void h_ISUB_M(Instruction&, uint32_t&); + void h_IMUL_R(Instruction&, uint32_t&); + void h_IMUL_M(Instruction&, uint32_t&); + void h_IMULH_R(Instruction&, uint32_t&); + void h_IMULH_M(Instruction&, uint32_t&); + void h_ISMULH_R(Instruction&, uint32_t&); + void h_ISMULH_M(Instruction&, uint32_t&); + void h_IMUL_RCP(Instruction&, uint32_t&); + void h_INEG_R(Instruction&, uint32_t&); + void h_IXOR_R(Instruction&, uint32_t&); + void h_IXOR_M(Instruction&, uint32_t&); + void h_IROR_R(Instruction&, uint32_t&); + void h_IROL_R(Instruction&, uint32_t&); + void h_ISWAP_R(Instruction&, uint32_t&); + void h_FSWAP_R(Instruction&, uint32_t&); + void h_FADD_R(Instruction&, uint32_t&); + void h_FADD_M(Instruction&, uint32_t&); + void h_FSUB_R(Instruction&, uint32_t&); + void h_FSUB_M(Instruction&, uint32_t&); + void h_FSCAL_R(Instruction&, uint32_t&); + void h_FMUL_R(Instruction&, uint32_t&); + void h_FDIV_M(Instruction&, uint32_t&); + void h_FSQRT_R(Instruction&, uint32_t&); + void h_CBRANCH(Instruction&, uint32_t&); + void h_CFROUND(Instruction&, uint32_t&); + void
h_ISTORE(Instruction&, uint32_t&); + void h_NOP(Instruction&, uint32_t&); }; -} \ No newline at end of file +} diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S new file mode 100644 index 0000000..2c0e439 --- /dev/null +++ b/src/jit_compiler_a64_static.S @@ -0,0 +1,579 @@ +# Copyright (c) 2018-2019, tevador +# Copyright (c) 2019, SChernykh +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
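+# Most of the .global labels below are never called from C++; jit_compiler_a64.cpp
+# only takes their addresses to compute copy sizes and patch offsets (CodeSize,
+# MainLoopBegin, the scratchpad/cacheline mask instructions, the spMix1 update, etc.).
+# Only randomx_program_aarch64 and randomx_init_dataset_aarch64 are invoked directly,
+# through getProgramFunc() and getDatasetInitFunc().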
+ + .arch armv8-a + .text + .global randomx_program_aarch64 + .global randomx_program_aarch64_main_loop + .global randomx_program_aarch64_vm_instructions + .global randomx_program_aarch64_imul_rcp_literals_end + .global randomx_program_aarch64_vm_instructions_end + .global randomx_program_aarch64_cacheline_align_mask1 + .global randomx_program_aarch64_cacheline_align_mask2 + .global randomx_program_aarch64_update_spMix1 + .global randomx_program_aarch64_vm_instructions_end_light + .global randomx_program_aarch64_light_cacheline_align_mask + .global randomx_program_aarch64_light_dataset_offset + .global randomx_init_dataset_aarch64 + .global randomx_init_dataset_aarch64_end + .global randomx_calc_dataset_item_aarch64 + .global randomx_calc_dataset_item_aarch64_prefetch + .global randomx_calc_dataset_item_aarch64_mix + .global randomx_calc_dataset_item_aarch64_store_result + .global randomx_calc_dataset_item_aarch64_end + +#include "configuration.h" + +# Register allocation + +# x0 -> pointer to reg buffer and then literal for IMUL_RCP +# x1 -> pointer to mem buffer and then to dataset +# x2 -> pointer to scratchpad +# x3 -> loop counter +# x4 -> "r0" +# x5 -> "r1" +# x6 -> "r2" +# x7 -> "r3" +# x8 -> fpcr (reversed bits) +# x9 -> mx, ma +# x10 -> spMix1 +# x11 -> literal for IMUL_RCP +# x12 -> "r4" +# x13 -> "r5" +# x14 -> "r6" +# x15 -> "r7" +# x16 -> spAddr0 +# x17 -> spAddr1 +# x18 -> temporary +# x19 -> temporary +# x20 -> literal for IMUL_RCP +# x21 -> literal for IMUL_RCP +# x22 -> literal for IMUL_RCP +# x23 -> literal for IMUL_RCP +# x24 -> literal for IMUL_RCP +# x25 -> literal for IMUL_RCP +# x26 -> literal for IMUL_RCP +# x27 -> literal for IMUL_RCP +# x28 -> literal for IMUL_RCP +# x29 -> literal for IMUL_RCP +# x30 -> literal for IMUL_RCP + +# v0-v15 -> store 32-bit literals +# v16 -> "f0" +# v17 -> "f1" +# v18 -> "f2" +# v19 -> "f3" +# v20 -> "e0" +# v21 -> "e1" +# v22 -> "e2" +# v23 -> "e3" +# v24 -> "a0" +# v25 -> "a1" +# v26 -> "a2" +# v27 -> "a3" +# v28 -> temporary +# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff +# v30 -> E 'or' mask = 0x3*00000000******3*00000000****** +# v31 -> scale mask = 0x81f000000000000081f0000000000000 + +randomx_program_aarch64: + # Save callee-saved registers + sub sp, sp, 192 + stp x16, x17, [sp] + stp x18, x19, [sp, 16] + stp x20, x21, [sp, 32] + stp x22, x23, [sp, 48] + stp x24, x25, [sp, 64] + stp x26, x27, [sp, 80] + stp x28, x29, [sp, 96] + stp x8, x30, [sp, 112] + stp d8, d9, [sp, 128] + stp d10, d11, [sp, 144] + stp d12, d13, [sp, 160] + stp d14, d15, [sp, 176] + + # Zero integer registers + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + + # Load ma, mx and dataset pointer + ldp x9, x1, [x1] + + # Load initial spMix value + mov x10, x9 + + # Load group A registers + ldp q24, q25, [x0, 192] + ldp q26, q27, [x0, 224] + + # Load E 'and' mask + mov x16, 0x00FFFFFFFFFFFFFF + ins v29.d[0], x16 + ins v29.d[1], x16 + + # Load E 'or' mask (stored in reg.f[0]) + ldr q30, [x0, 64] + + # Load scale mask + mov x16, 0x80f0000000000000 + ins v31.d[0], x16 + ins v31.d[1], x16 + + # Read fpcr + mrs x8, fpcr + rbit x8, x8 + + # Save x0 + str x0, [sp, -16]! 
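+	# From here on x0 and v0-v15 are free to serve as literal storage: the JIT
+	# compiler writes IMUL_RCP reciprocals into the literal_x* slots and packed
+	# 32-bit immediates into the literal_v* slots, and the loads below pick them
+	# up before the main loop starts.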
+ + # Read literals + ldr x0, literal_x0 + ldr x11, literal_x11 + ldr x20, literal_x20 + ldr x21, literal_x21 + ldr x22, literal_x22 + ldr x23, literal_x23 + ldr x24, literal_x24 + ldr x25, literal_x25 + ldr x26, literal_x26 + ldr x27, literal_x27 + ldr x28, literal_x28 + ldr x29, literal_x29 + ldr x30, literal_x30 + + ldr q0, literal_v0 + ldr q1, literal_v1 + ldr q2, literal_v2 + ldr q3, literal_v3 + ldr q4, literal_v4 + ldr q5, literal_v5 + ldr q6, literal_v6 + ldr q7, literal_v7 + ldr q8, literal_v8 + ldr q9, literal_v9 + ldr q10, literal_v10 + ldr q11, literal_v11 + ldr q12, literal_v12 + ldr q13, literal_v13 + ldr q14, literal_v14 + ldr q15, literal_v15 + +randomx_program_aarch64_main_loop: + # spAddr0 = spMix1 & ScratchpadL3Mask64; + # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; + lsr x18, x10, 32 + + # Actual mask will be inserted by JIT compiler + and w16, w10, 1 + and w17, w18, 1 + + # x16 = scratchpad + spAddr0 + # x17 = scratchpad + spAddr1 + add x16, x16, x2 + add x17, x17, x2 + + # xor integer registers with scratchpad data (spAddr0) + ldp x18, x19, [x16] + eor x4, x4, x18 + eor x5, x5, x19 + ldp x18, x19, [x16, 16] + eor x6, x6, x18 + eor x7, x7, x19 + ldp x18, x19, [x16, 32] + eor x12, x12, x18 + eor x13, x13, x19 + ldp x18, x19, [x16, 48] + eor x14, x14, x18 + eor x15, x15, x19 + + # Load group F registers (spAddr1) + ldpsw x18, x19, [x17] + ins v16.d[0], x18 + ins v16.d[1], x19 + ldpsw x18, x19, [x17, 8] + ins v17.d[0], x18 + ins v17.d[1], x19 + ldpsw x18, x19, [x17, 16] + ins v18.d[0], x18 + ins v18.d[1], x19 + ldpsw x18, x19, [x17, 24] + ins v19.d[0], x18 + ins v19.d[1], x19 + scvtf v16.2d, v16.2d + scvtf v17.2d, v17.2d + scvtf v18.2d, v18.2d + scvtf v19.2d, v19.2d + + # Load group E registers (spAddr1) + ldpsw x18, x19, [x17, 32] + ins v20.d[0], x18 + ins v20.d[1], x19 + ldpsw x18, x19, [x17, 40] + ins v21.d[0], x18 + ins v21.d[1], x19 + ldpsw x18, x19, [x17, 48] + ins v22.d[0], x18 + ins v22.d[1], x19 + ldpsw x18, x19, [x17, 56] + ins v23.d[0], x18 + ins v23.d[1], x19 + scvtf v20.2d, v20.2d + scvtf v21.2d, v21.2d + scvtf v22.2d, v22.2d + scvtf v23.2d, v23.2d + and v20.16b, v20.16b, v29.16b + and v21.16b, v21.16b, v29.16b + and v22.16b, v22.16b, v29.16b + and v23.16b, v23.16b, v29.16b + orr v20.16b, v20.16b, v30.16b + orr v21.16b, v21.16b, v30.16b + orr v22.16b, v22.16b, v30.16b + orr v23.16b, v23.16b, v30.16b + + # Execute VM instructions +randomx_program_aarch64_vm_instructions: + + # buffer for generated instructions + # FDIV_M is the largest instruction taking up to 12 ARMv8 instructions + .fill RANDOMX_PROGRAM_SIZE*12,4,0 + +literal_x0: .fill 1,8,0 +literal_x11: .fill 1,8,0 +literal_x20: .fill 1,8,0 +literal_x21: .fill 1,8,0 +literal_x22: .fill 1,8,0 +literal_x23: .fill 1,8,0 +literal_x24: .fill 1,8,0 +literal_x25: .fill 1,8,0 +literal_x26: .fill 1,8,0 +literal_x27: .fill 1,8,0 +literal_x28: .fill 1,8,0 +literal_x29: .fill 1,8,0 +literal_x30: .fill 1,8,0 +randomx_program_aarch64_imul_rcp_literals_end: + +literal_v0: .fill 2,8,0 +literal_v1: .fill 2,8,0 +literal_v2: .fill 2,8,0 +literal_v3: .fill 2,8,0 +literal_v4: .fill 2,8,0 +literal_v5: .fill 2,8,0 +literal_v6: .fill 2,8,0 +literal_v7: .fill 2,8,0 +literal_v8: .fill 2,8,0 +literal_v9: .fill 2,8,0 +literal_v10: .fill 2,8,0 +literal_v11: .fill 2,8,0 +literal_v12: .fill 2,8,0 +literal_v13: .fill 2,8,0 +literal_v14: .fill 2,8,0 +literal_v15: .fill 2,8,0 + +randomx_program_aarch64_vm_instructions_end: + + # mx ^= r[readReg2] ^ r[readReg3]; + eor x9, x9, x18 + + # Calculate dataset pointer for dataset prefetch + mov 
w18, w9 +randomx_program_aarch64_cacheline_align_mask1: + # Actual mask will be inserted by JIT compiler + and x18, x18, 1 + add x18, x18, x1 + + # Prefetch dataset data + prfm pldl2strm, [x18] + + # mx <-> ma + ror x9, x9, 32 + + # Calculate dataset pointer for dataset read + mov w10, w9 +randomx_program_aarch64_cacheline_align_mask2: + # Actual mask will be inserted by JIT compiler + and x10, x10, 1 + add x10, x10, x1 + +randomx_program_aarch64_xor_with_dataset_line: + # xor integer registers with dataset data + ldp x18, x19, [x10] + eor x4, x4, x18 + eor x5, x5, x19 + ldp x18, x19, [x10, 16] + eor x6, x6, x18 + eor x7, x7, x19 + ldp x18, x19, [x10, 32] + eor x12, x12, x18 + eor x13, x13, x19 + ldp x18, x19, [x10, 48] + eor x14, x14, x18 + eor x15, x15, x19 + +randomx_program_aarch64_update_spMix1: + # JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1" + eor x10, x0, x0 + + # Store integer registers to scratchpad (spAddr1) + stp x4, x5, [x17, 0] + stp x6, x7, [x17, 16] + stp x12, x13, [x17, 32] + stp x14, x15, [x17, 48] + + # xor group F and group E registers + eor v16.16b, v16.16b, v20.16b + eor v17.16b, v17.16b, v21.16b + eor v18.16b, v18.16b, v22.16b + eor v19.16b, v19.16b, v23.16b + + # Store FP registers to scratchpad (spAddr0) + stp q16, q17, [x16, 0] + stp q18, q19, [x16, 32] + + subs x3, x3, 1 + bne randomx_program_aarch64_main_loop + + # Restore x0 + ldr x0, [sp], 16 + + # Store integer registers + stp x4, x5, [x0, 0] + stp x6, x7, [x0, 16] + stp x12, x13, [x0, 32] + stp x14, x15, [x0, 48] + + # Store FP registers + stp q16, q17, [x0, 64] + stp q18, q19, [x0, 96] + stp q20, q21, [x0, 128] + stp q22, q23, [x0, 160] + + # Restore callee-saved registers + ldp x16, x17, [sp] + ldp x18, x19, [sp, 16] + ldp x20, x21, [sp, 32] + ldp x22, x23, [sp, 48] + ldp x24, x25, [sp, 64] + ldp x26, x27, [sp, 80] + ldp x28, x29, [sp, 96] + ldp x8, x30, [sp, 112] + ldp d8, d9, [sp, 128] + ldp d10, d11, [sp, 144] + ldp d12, d13, [sp, 160] + ldp d14, d15, [sp, 176] + add sp, sp, 192 + + ret + +randomx_program_aarch64_vm_instructions_end_light: + sub sp, sp, 96 + stp x0, x1, [sp, 64] + stp x2, x30, [sp, 80] + + # mx ^= r[readReg2] ^ r[readReg3]; + eor x9, x9, x18 + + # mx <-> ma + ror x9, x9, 32 + + # x0 -> pointer to cache memory + mov x0, x1 + + # x1 -> pointer to output + mov x1, sp + +randomx_program_aarch64_light_cacheline_align_mask: + # Actual mask will be inserted by JIT compiler + and w2, w9, 1 + + # x2 -> item number + lsr x2, x2, 6 + +randomx_program_aarch64_light_dataset_offset: + # Apply dataset offset (filled in by JIT compiler) + add x2, x2, 0 + add x2, x2, 0 + + bl randomx_calc_dataset_item_aarch64 + + mov x10, sp + ldp x0, x1, [sp, 64] + ldp x2, x30, [sp, 80] + add sp, sp, 96 + + b randomx_program_aarch64_xor_with_dataset_line + + + +# Input parameters +# +# x0 -> pointer to cache +# x1 -> pointer to dataset memory at startItem +# x2 -> start item +# x3 -> end item + +randomx_init_dataset_aarch64: + # Save x30 (return address) + str x30, [sp, -16]! 
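+	# Each iteration of the loop below computes one 64-byte dataset item into [x1],
+	# then advances the output pointer and the item number until x2 reaches endItem (x3).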
+ + # Load pointer to cache memory + ldr x0, [x0] + +randomx_init_dataset_aarch64_main_loop: + bl randomx_calc_dataset_item_aarch64 + add x1, x1, 64 + add x2, x2, 1 + cmp x2, x3 + bne randomx_init_dataset_aarch64_main_loop + + # Restore x30 (return address) + ldr x30, [sp], 16 + + ret + +randomx_init_dataset_aarch64_end: + +# Input parameters +# +# x0 -> pointer to cache memory +# x1 -> pointer to output +# x2 -> item number +# +# Register allocation +# +# x0-x7 -> output value (calculated dataset item) +# x8 -> pointer to cache memory +# x9 -> pointer to output +# x10 -> registerValue +# x11 -> mixBlock +# x12 -> temporary +# x13 -> temporary + +randomx_calc_dataset_item_aarch64: + sub sp, sp, 112 + stp x0, x1, [sp] + stp x2, x3, [sp, 16] + stp x4, x5, [sp, 32] + stp x6, x7, [sp, 48] + stp x8, x9, [sp, 64] + stp x10, x11, [sp, 80] + stp x12, x13, [sp, 96] + + mov x8, x0 + mov x9, x1 + mov x10, x2 + + # rl[0] = (itemNumber + 1) * superscalarMul0; + ldr x12, superscalarMul0 + madd x0, x2, x12, x12 + + # rl[1] = rl[0] ^ superscalarAdd1; + ldr x12, superscalarAdd1 + eor x1, x0, x12 + + # rl[2] = rl[0] ^ superscalarAdd2; + ldr x12, superscalarAdd2 + eor x2, x0, x12 + + # rl[3] = rl[0] ^ superscalarAdd3; + ldr x12, superscalarAdd3 + eor x3, x0, x12 + + # rl[4] = rl[0] ^ superscalarAdd4; + ldr x12, superscalarAdd4 + eor x4, x0, x12 + + # rl[5] = rl[0] ^ superscalarAdd5; + ldr x12, superscalarAdd5 + eor x5, x0, x12 + + # rl[6] = rl[0] ^ superscalarAdd6; + ldr x12, superscalarAdd6 + eor x6, x0, x12 + + # rl[7] = rl[0] ^ superscalarAdd7; + ldr x12, superscalarAdd7 + eor x7, x0, x12 + + b randomx_calc_dataset_item_aarch64_prefetch + +superscalarMul0: .quad 6364136223846793005 +superscalarAdd1: .quad 9298411001130361340 +superscalarAdd2: .quad 12065312585734608966 +superscalarAdd3: .quad 9306329213124626780 +superscalarAdd4: .quad 5281919268842080866 +superscalarAdd5: .quad 10536153434571861004 +superscalarAdd6: .quad 3398623926847679864 +superscalarAdd7: .quad 9549104520008361294 + +# Prefetch -> SuperScalar hash -> Mix will be repeated N times + +randomx_calc_dataset_item_aarch64_prefetch: + # Actual mask will be inserted by JIT compiler + and x11, x10, 1 + add x11, x8, x11, lsl 6 + prfm pldl2strm, [x11] + + # Generated SuperScalar hash program goes here + +randomx_calc_dataset_item_aarch64_mix: + ldp x12, x13, [x11] + eor x0, x0, x12 + eor x1, x1, x13 + ldp x12, x13, [x11, 16] + eor x2, x2, x12 + eor x3, x3, x13 + ldp x12, x13, [x11, 32] + eor x4, x4, x12 + eor x5, x5, x13 + ldp x12, x13, [x11, 48] + eor x6, x6, x12 + eor x7, x7, x13 + +randomx_calc_dataset_item_aarch64_store_result: + stp x0, x1, [x9] + stp x2, x3, [x9, 16] + stp x4, x5, [x9, 32] + stp x6, x7, [x9, 48] + + ldp x0, x1, [sp] + ldp x2, x3, [sp, 16] + ldp x4, x5, [sp, 32] + ldp x6, x7, [sp, 48] + ldp x8, x9, [sp, 64] + ldp x10, x11, [sp, 80] + ldp x12, x13, [sp, 96] + add sp, sp, 112 + + ret + +randomx_calc_dataset_item_aarch64_end: diff --git a/src/jit_compiler_a64_static.hpp b/src/jit_compiler_a64_static.hpp new file mode 100644 index 0000000..a9b922e --- /dev/null +++ b/src/jit_compiler_a64_static.hpp @@ -0,0 +1,51 @@ +/* +Copyright (c) 2018-2019, tevador +Copyright (c) 2019, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); + void randomx_program_aarch64_main_loop(); + void randomx_program_aarch64_vm_instructions(); + void randomx_program_aarch64_imul_rcp_literals_end(); + void randomx_program_aarch64_vm_instructions_end(); + void randomx_program_aarch64_cacheline_align_mask1(); + void randomx_program_aarch64_cacheline_align_mask2(); + void randomx_program_aarch64_update_spMix1(); + void randomx_program_aarch64_vm_instructions_end_light(); + void randomx_program_aarch64_light_cacheline_align_mask(); + void randomx_program_aarch64_light_dataset_offset(); + void randomx_init_dataset_aarch64(); + void randomx_init_dataset_aarch64_end(); + void randomx_calc_dataset_item_aarch64(); + void randomx_calc_dataset_item_aarch64_prefetch(); + void randomx_calc_dataset_item_aarch64_mix(); + void randomx_calc_dataset_item_aarch64_store_result(); + void randomx_calc_dataset_item_aarch64_end(); +} diff --git a/src/vm_compiled.cpp b/src/vm_compiled.cpp index c03f6b4..060abec 100644 --- a/src/vm_compiled.cpp +++ b/src/vm_compiled.cpp @@ -63,6 +63,9 @@ namespace randomx { template void CompiledVm::execute() { +#ifdef __aarch64__ + memcpy(reg.f, config.eMask, sizeof(config.eMask)); +#endif compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS); }
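A note on the IMUL_RCP literal computed in h_IMUL_RCP above: the stored 64-bit constant is (q << shift) + ((r << shift) / divisor), where q and r are the quotient and remainder of 2^63 / divisor and shift is the bit width of the divisor, which works out to floor(2^(63 + shift) / divisor). The standalone sketch below is not part of the patch; the reciprocal() helper name and the sample divisor are illustrative only, and it uses the same GCC/Clang builtins the patch relies on. It reproduces the computation and cross-checks it against a 128-bit division:

    #include <cstdint>
    #include <cstdio>

    // Same formula as the literal written by h_IMUL_RCP: floor(2^(63 + shift) / divisor),
    // where shift is the bit width of the divisor. The divisor is assumed to be
    // non-zero and not a power of two (h_IMUL_RCP returns early for those cases).
    static uint64_t reciprocal(uint32_t divisor) {
        const uint64_t N = 1ULL << 63;
        const uint64_t q = N / divisor;
        const uint64_t r = N % divisor;
        const uint64_t shift = 64 - __builtin_clzll(divisor);
        return (q << shift) + ((r << shift) / divisor);
    }

    int main() {
        const uint32_t divisor = 123456789; // sample value, for illustration only
        const uint64_t shift = 64 - __builtin_clzll(divisor);

        // Cross-check against a straight 128-bit division of 2^(63 + shift).
        const unsigned __int128 n = (unsigned __int128)1 << (63 + shift);
        std::printf("reciprocal = 0x%016llx, check = 0x%016llx\n",
                    (unsigned long long)reciprocal(divisor),
                    (unsigned long long)(uint64_t)(n / divisor));
        return 0;
    }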