From c6468a381646ee708eb2cc40c996856fda8c2313 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 22 Sep 2019 21:06:22 +0200 Subject: [PATCH] JIT compiler for ARMv8 (#125) JIT compiler for ARMv8 --- CMakeLists.txt | 6 + src/common.hpp | 2 +- src/intrin_portable.h | 9 +- src/jit_compiler_a64.cpp | 1068 +++++++++++++++++++++++++++++++ src/jit_compiler_a64.hpp | 104 ++- src/jit_compiler_a64_static.S | 579 +++++++++++++++++ src/jit_compiler_a64_static.hpp | 51 ++ src/vm_compiled.cpp | 3 + 8 files changed, 1794 insertions(+), 28 deletions(-) create mode 100644 src/jit_compiler_a64.cpp create mode 100644 src/jit_compiler_a64_static.S create mode 100644 src/jit_compiler_a64_static.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9af9741..8abfea5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,12 @@ endif() # ARMv8 if (ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv8-a") + list(APPEND randomx_sources + src/jit_compiler_a64_static.S + src/jit_compiler_a64.cpp) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/jit_compiler_a64_static.S PROPERTY LANGUAGE C) + if(ARCH STREQUAL "native") add_flag("-march=native") else() diff --git a/src/common.hpp b/src/common.hpp index a96fe06..a77feb3 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -119,7 +119,7 @@ namespace randomx { class JitCompilerX86; using JitCompiler = JitCompilerX86; #elif defined(__aarch64__) - #define RANDOMX_HAVE_COMPILER 0 + #define RANDOMX_HAVE_COMPILER 1 class JitCompilerA64; using JitCompiler = JitCompilerA64; #else diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 3a75d2f..7cd23fe 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -385,7 +385,14 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { typedef uint8x16_t rx_vec_i128; typedef float64x2_t rx_vec_f128; -#define rx_aligned_alloc(size, align) aligned_alloc(align, size) +inline void* rx_aligned_alloc(size_t size, size_t align) { + void* p; + if (posix_memalign(&p, align, size) == 0) + return p; + + return 0; +}; + #define rx_aligned_free(a) free(a) inline void rx_prefetch_nta(void* ptr) { diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp new file mode 100644 index 0000000..5f6fcb3 --- /dev/null +++ b/src/jit_compiler_a64.cpp @@ -0,0 +1,1068 @@ +/* +Copyright (c) 2018-2019, tevador +Copyright (c) 2019, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "jit_compiler_a64.hpp" +#include "superscalar.hpp" +#include "program.hpp" +#include "reciprocal.h" +#include "virtual_memory.hpp" + +namespace ARMV8A { + +constexpr uint32_t B = 0x14000000; +constexpr uint32_t EOR = 0xCA000000; +constexpr uint32_t EOR32 = 0x4A000000; +constexpr uint32_t ADD = 0x8B000000; +constexpr uint32_t SUB = 0xCB000000; +constexpr uint32_t MUL = 0x9B007C00; +constexpr uint32_t UMULH = 0x9BC07C00; +constexpr uint32_t SMULH = 0x9B407C00; +constexpr uint32_t MOVZ = 0xD2800000; +constexpr uint32_t MOVN = 0x92800000; +constexpr uint32_t MOVK = 0xF2800000; +constexpr uint32_t ADD_IMM_LO = 0x91000000; +constexpr uint32_t ADD_IMM_HI = 0x91400000; +constexpr uint32_t LDR_LITERAL = 0x58000000; +constexpr uint32_t ROR = 0x9AC02C00; +constexpr uint32_t ROR_IMM = 0x93C00000; +constexpr uint32_t MOV_REG = 0xAA0003E0; +constexpr uint32_t MOV_VREG_EL = 0x6E080400; +constexpr uint32_t FADD = 0x4E60D400; +constexpr uint32_t FSUB = 0x4EE0D400; +constexpr uint32_t FEOR = 0x6E201C00; +constexpr uint32_t FMUL = 0x6E60DC00; +constexpr uint32_t FDIV = 0x6E60FC00; +constexpr uint32_t FSQRT = 0x6EE1F800; + +} + +namespace randomx { + +static const size_t CodeSize = ((uint8_t*)randomx_init_dataset_aarch64_end) - ((uint8_t*)randomx_program_aarch64); +static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64); +static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); +static const size_t ImulRcpLiteralsEnd = ((uint8_t*)randomx_program_aarch64_imul_rcp_literals_end) - ((uint8_t*)randomx_program_aarch64); + +static const size_t CalcDatasetItemSize = + // Prologue + ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch - (uint8_t*)randomx_calc_dataset_item_aarch64) + + // Main loop + RANDOMX_CACHE_ACCESSES * ( + // Main loop prologue + ((uint8_t*)randomx_calc_dataset_item_aarch64_mix - ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch)) + 4 + + // Inner main loop (instructions) + ((RANDOMX_SUPERSCALAR_LATENCY * 3) + 2) * 16 + + // Main loop epilogue + ((uint8_t*)randomx_calc_dataset_item_aarch64_store_result - (uint8_t*)randomx_calc_dataset_item_aarch64_mix) + 4 + ) + + // Epilogue + ((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result); + +constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; + +template static constexpr size_t Log2(T value) { return (value > 1) ? 
(Log2(value / 2) + 1) : 0; } + +JitCompilerA64::JitCompilerA64() + : code((uint8_t*) allocMemoryPages(CodeSize + CalcDatasetItemSize)) + , literalPos(ImulRcpLiteralsEnd) + , num32bitLiterals(0) +{ + memset(reg_changed_offset, 0, sizeof(reg_changed_offset)); + memcpy(code, (void*) randomx_program_aarch64, CodeSize); +} + +JitCompilerA64::~JitCompilerA64() +{ + freePagedMemory(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::enableWriting() +{ + setPagesRW(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::enableExecution() +{ + setPagesRX(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::enableAll() +{ + setPagesRWX(code, CodeSize + CalcDatasetItemSize); +} + +void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) +{ + uint32_t codePos = MainLoopBegin + 4; + + // and w16, w10, ScratchpadL3Mask64 + emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + // and w17, w18, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + codePos = PrologueSize; + literalPos = ImulRcpLiteralsEnd; + num32bitLiterals = 0; + + for (uint32_t i = 0; i < RegistersCount; ++i) + reg_changed_offset[i] = codePos; + + for (uint32_t i = 0; i < program.getSize(); ++i) + { + Instruction& instr = program(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + (this->*engine[instr.opcode])(instr, codePos); + } + + // Update spMix2 + // eor w18, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + + // Jump back to the main loop + const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; + emit32(ARMV8A::B | (offset / 4), code, codePos); + + // and w18, w18, CacheLineAlignMask + codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); + emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + + // and w10, w10, CacheLineAlignMask + codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); + emit32(0x121A0000 | 10 | (10 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + + // Update spMix1 + // eor x10, config.readReg0, config.readReg1 + codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64); + emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos); + +#ifdef __GNUC__ + __builtin___clear_cache(reinterpret_cast(code + MainLoopBegin), reinterpret_cast(code + codePos)); +#endif +} + +void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config, uint32_t datasetOffset) +{ + uint32_t codePos = MainLoopBegin + 4; + + // and w16, w10, ScratchpadL3Mask64 + emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + // and w17, w18, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + + codePos = PrologueSize; + literalPos = ImulRcpLiteralsEnd; + num32bitLiterals = 0; + + for (uint32_t i = 0; i < RegistersCount; ++i) + reg_changed_offset[i] = codePos; + + for (uint32_t i = 0; i < program.getSize(); ++i) + { + Instruction& instr = program(i); + instr.src %= 
RegistersCount; + instr.dst %= RegistersCount; + (this->*engine[instr.opcode])(instr, codePos); + } + + // Update spMix2 + // eor w18, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + + // Jump back to the main loop + const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos; + emit32(ARMV8A::B | (offset / 4), code, codePos); + + // and w2, w9, CacheLineAlignMask + codePos = (((uint8_t*)randomx_program_aarch64_light_cacheline_align_mask) - ((uint8_t*)randomx_program_aarch64)); + emit32(0x121A0000 | 2 | (9 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + + // Update spMix1 + // eor x10, config.readReg0, config.readReg1 + codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64); + emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos); + + // Apply dataset offset + codePos = ((uint8_t*)randomx_program_aarch64_light_dataset_offset) - ((uint8_t*)randomx_program_aarch64); + + datasetOffset /= CacheLineSize; + const uint32_t imm_lo = datasetOffset & ((1 << 12) - 1); + const uint32_t imm_hi = datasetOffset >> 12; + + emit32(ARMV8A::ADD_IMM_LO | 2 | (2 << 5) | (imm_lo << 10), code, codePos); + emit32(ARMV8A::ADD_IMM_HI | 2 | (2 << 5) | (imm_hi << 10), code, codePos); + +#ifdef __GNUC__ + __builtin___clear_cache(reinterpret_cast(code + MainLoopBegin), reinterpret_cast(code + codePos)); +#endif +} + +template +void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &reciprocalCache) +{ + uint32_t codePos = CodeSize; + + uint8_t* p1 = (uint8_t*)randomx_calc_dataset_item_aarch64; + uint8_t* p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_prefetch; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + + num32bitLiterals = 64; + constexpr uint32_t tmp_reg = 12; + + for (size_t i = 0; i < N; ++i) + { + // and x11, x10, CacheSize / CacheLineSize - 1 + emit32(0x92400000 | 11 | (10 << 5) | ((Log2(CacheSize / CacheLineSize) - 1) << 10), code, codePos); + + p1 = ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch) + 4; + p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + + SuperscalarProgram& prog = programs[i]; + const size_t progSize = prog.getSize(); + + uint32_t jmp_pos = codePos; + codePos += 4; + + // Fill in literal pool + for (size_t j = 0; j < progSize; ++j) + { + const Instruction& instr = prog(j); + if (static_cast(instr.opcode) == randomx::SuperscalarInstructionType::IMUL_RCP) + emit64(reciprocalCache[instr.getImm32()], code, codePos); + } + + // Jump over literal pool + uint32_t literal_pos = jmp_pos; + emit32(ARMV8A::B | ((codePos - jmp_pos) / 4), code, literal_pos); + + for (size_t j = 0; j < progSize; ++j) + { + const Instruction& instr = prog(j); + const uint32_t src = instr.src; + const uint32_t dst = instr.dst; + + switch (static_cast(instr.opcode)) + { + case randomx::SuperscalarInstructionType::ISUB_R: + emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + emit32(ARMV8A::ADD | dst | (dst << 5) | (instr.getModShift() << 10) | (src << 16), code, codePos); + break; + case 
randomx::SuperscalarInstructionType::IMUL_R: + emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IROR_C: + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + emitAddImmediate(dst, dst, instr.getImm32(), code, codePos); + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + emitMovImmediate(tmp_reg, instr.getImm32(), code, codePos); + emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IMULH_R: + emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, codePos); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + { + int32_t offset = (literal_pos - codePos) / 4; + offset &= (1 << 19) - 1; + literal_pos += 8; + + // ldr tmp_reg, reciprocal + emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, codePos); + + // mul dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, codePos); + } + break; + default: + break; + } + } + + p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix; + p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + + // Update registerValue + emit32(ARMV8A::MOV_REG | 10 | (prog.getAddressRegister() << 16), code, codePos); + } + + p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result; + p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_end; + memcpy(code + codePos, p1, p2 - p1); + codePos += p2 - p1; + +#ifdef __GNUC__ + __builtin___clear_cache(reinterpret_cast(code + CodeSize), reinterpret_cast(code + codePos)); +#endif +} + +template void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector &reciprocalCache); + +DatasetInitFunc* JitCompilerA64::getDatasetInitFunc() +{ + return (DatasetInitFunc*)(code + (((uint8_t*)randomx_init_dataset_aarch64) - ((uint8_t*)randomx_program_aarch64))); +} + +size_t JitCompilerA64::getCodeSize() +{ + return CodeSize; +} + +void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + if (imm < (1 << 16)) + { + // movz tmp_reg, imm32 (16 low bits) + emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k); + } + else + { + if (num32bitLiterals < 64) + { + if (static_cast(imm) < 0) + { + // smov dst, vN.s[M] + emit32(0x4E042C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k); + } + else + { + // umov dst, vN.s[M] + emit32(0x0E043C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k); + } + + ((uint32_t*)(code + ImulRcpLiteralsEnd))[num32bitLiterals] = imm; + ++num32bitLiterals; + } + else + { + if (static_cast(imm) < 0) + { + // movn tmp_reg, ~imm32 (16 high bits) + emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k); + } + else + { + // movz tmp_reg, imm32 (16 high bits) + emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k); + } + + // movk tmp_reg, imm32 (16 low bits) + emit32(ARMV8A::MOVK | dst | 
((imm & 0xFFFF) << 5), code, k); + } + } + + codePos = k; +} + +void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + if (imm < (1 << 24)) + { + const uint32_t imm_lo = imm & ((1 << 12) - 1); + const uint32_t imm_hi = imm >> 12; + + if (imm_lo && imm_hi) + { + emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); + emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k); + } + else if (imm_lo) + { + emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); + } + else + { + emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k); + } + } + else + { + constexpr uint32_t tmp_reg = 18; + emitMovImmediate(tmp_reg, imm, code, k); + + // add dst, src, tmp_reg + emit32(ARMV8A::ADD | dst | (src << 5) | (tmp_reg << 16), code, k); + } + + codePos = k; +} + +template +void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t imm = instr.getImm32(); + + if (src != dst) + { + imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + emitAddImmediate(tmp_reg, src, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + + emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k); + + // ldr tmp_reg, [x2, tmp_reg] + emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k); + } + else + { + imm = (imm & ScratchpadL3Mask) >> 3; + emitMovImmediate(tmp_reg, imm, code, k); + + // ldr tmp_reg, [x2, tmp_reg, lsl 3] + emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + } + + codePos = k; +} + +template +void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t imm = instr.getImm32(); + constexpr uint32_t tmp_reg = 18; + + imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + emitAddImmediate(tmp_reg, src, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + + emit32(instr.getModMem() ? 
andInstrL1 : andInstrL2, code, k); + + // add tmp_reg, x2, tmp_reg + emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k); + + // ldpsw tmp_reg, tmp_reg + 1, [tmp_reg] + emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k); + + // ins tmp_reg_fp.d[0], tmp_reg + emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k); + + // ins tmp_reg_fp.d[1], tmp_reg + 1 + emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k); + + // scvtf tmp_reg_fp.2d, tmp_reg_fp.2d + emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IADD_RS(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + const uint32_t shift = instr.getModShift(); + + // add dst, src << shift + emit32(ARMV8A::ADD | dst | (dst << 5) | (shift << 10) | (src << 16), code, k); + + if (instr.dst == RegisterNeedsDisplacement) + emitAddImmediate(dst, dst, instr.getImm32(), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // add dst, dst, tmp_reg + emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISUB_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + // sub dst, dst, src + emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k); + } + else + { + emitAddImmediate(dst, dst, -instr.getImm32(), code, k); + } + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::SUB | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + { + src = 18; + emitMovImmediate(src, instr.getImm32(), code, k); + } + + // mul dst, dst, src + emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMULH_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + // umulh dst, dst, src + emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, k); + + 
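+		// Example encoding: with dst = 4 ("r0" lives in x4) and src = 5 ("r1" in x5),
+		// the word above is 0x9BC07C00 | 4 | (4 << 5) | (5 << 16) = 0x9BC57C84,
+		// i.e. "umulh x4, x4, x5".
+		// reg_changed_offset records where dst was last modified; h_CBRANCH uses
+		// this offset as the target of its backward branch.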
reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // umulh dst, dst, tmp_reg + emit32(ARMV8A::UMULH | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISMULH_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + // smulh dst, dst, src + emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // smulh dst, dst, tmp_reg + emit32(ARMV8A::SMULH | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) +{ + const uint64_t divisor = instr.getImm32(); + if (isZeroOrPowerOf2(divisor)) + return; + + uint32_t k = codePos; + + constexpr uint32_t tmp_reg = 18; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint64_t N = 1ULL << 63; + const uint64_t q = N / divisor; + const uint64_t r = N % divisor; +#ifdef __GNUC__ + const uint64_t shift = 64 - __builtin_clzll(divisor); +#else + uint64_t shift = 32; + for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1) + --shift; +#endif + + const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); + + literalPos -= sizeof(uint64_t); + *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); + + if (literal_id < 13) + { + static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 }; + + // mul dst, dst, literal_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k); + } + else + { + // ldr tmp_reg, reciprocal + const uint32_t offset = (literalPos - k) / 4; + emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k); + + // mul dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); + } + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_INEG_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t dst = IntRegMap[instr.dst]; + + // sub dst, xzr, dst + emit32(ARMV8A::SUB | dst | (31 << 5) | (dst << 16), code, codePos); + + reg_changed_offset[instr.dst] = codePos; +} + +void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + { + src = 18; + emitMovImmediate(src, instr.getImm32(), code, k); + } + + // eor dst, dst, src + emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t 
tmp_reg = 18; + emitMemLoad(dst, src, instr, code, k); + + // eor dst, dst, tmp_reg + emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + // ror dst, dst, src + emit32(ARMV8A::ROR | dst | (dst << 5) | (src << 16), code, codePos); + } + else + { + // ror dst, dst, imm + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); + } + + reg_changed_offset[instr.dst] = codePos; +} + +void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + constexpr uint32_t tmp_reg = 18; + + // sub tmp_reg, xzr, src + emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); + + // ror dst, dst, tmp_reg + emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k); + } + else + { + // ror dst, dst, imm + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); + } + + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + return; + + uint32_t k = codePos; + + constexpr uint32_t tmp_reg = 18; + emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); + emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); + emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); + + reg_changed_offset[instr.src] = k; + reg_changed_offset[instr.dst] = k; + codePos = k; +} + +void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t dst = instr.dst + 16; + + constexpr uint32_t tmp_reg_fp = 28; + constexpr uint32_t src_index1 = 1 << 14; + constexpr uint32_t dst_index1 = 1 << 20; + + emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k); + emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k); + emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FADD_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = (instr.src % 4) + 24; + const uint32_t dst = (instr.dst % 4) + 16; + + emit32(ARMV8A::FADD | dst | (dst << 5) | (src << 16), code, codePos); +} + +void JitCompilerA64::h_FADD_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = (instr.dst % 4) + 16; + + constexpr uint32_t tmp_reg_fp = 28; + emitMemLoadFP(src, instr, code, k); + + emit32(ARMV8A::FADD | dst | (dst << 5) | (tmp_reg_fp << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FSUB_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = (instr.src % 4) + 24; + const uint32_t dst = (instr.dst % 4) + 16; + + emit32(ARMV8A::FSUB | dst | (dst << 5) | (src << 16), code, codePos); +} + +void JitCompilerA64::h_FSUB_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = (instr.dst % 4) + 16; + + constexpr uint32_t tmp_reg_fp = 28; + emitMemLoadFP(src, instr, code, k); + + emit32(ARMV8A::FSUB | dst | (dst << 5) | (tmp_reg_fp << 
16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FSCAL_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t dst = (instr.dst % 4) + 16; + + emit32(ARMV8A::FEOR | dst | (dst << 5) | (31 << 16), code, codePos); +} + +void JitCompilerA64::h_FMUL_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t src = (instr.src % 4) + 24; + const uint32_t dst = (instr.dst % 4) + 20; + + emit32(ARMV8A::FMUL | dst | (dst << 5) | (src << 16), code, codePos); +} + +void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = (instr.dst % 4) + 20; + + constexpr uint32_t tmp_reg_fp = 28; + emitMemLoadFP(src, instr, code, k); + + // and tmp_reg_fp, tmp_reg_fp, and_mask_reg + emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k); + + // orr tmp_reg_fp, tmp_reg_fp, or_mask_reg + emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k); + + emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_FSQRT_R(Instruction& instr, uint32_t& codePos) +{ + const uint32_t dst = (instr.dst % 4) + 20; + + emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos); +} + +void JitCompilerA64::h_CBRANCH(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t dst = IntRegMap[instr.dst]; + const uint32_t modCond = instr.getModCond(); + const uint32_t shift = modCond + ConditionOffset; + const uint32_t imm = (instr.getImm32() | (1U << shift)) & ~(1U << (shift - 1)); + + emitAddImmediate(dst, dst, imm, code, k); + + // tst dst, mask + static_assert((ConditionMask == 0xFF) && (ConditionOffset == 8), "Update tst encoding for different mask and offset"); + emit32((0xF2781C1F - (modCond << 16)) | (dst << 5), code, k); + + int32_t offset = reg_changed_offset[instr.dst]; + offset = ((offset - k) >> 2) & ((1 << 19) - 1); + + // beq target + emit32(0x54000000 | (offset << 5), code, k); + + for (uint32_t i = 0; i < RegistersCount; ++i) + reg_changed_offset[i] = k; + + codePos = k; +} + +void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + + constexpr uint32_t tmp_reg = 18; + constexpr uint32_t fpcr_tmp_reg = 8; + + // ror tmp_reg, src, imm + emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); + + // bfi fpcr_tmp_reg, tmp_reg, 40, 2 + emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); + + // rbit tmp_reg, fpcr_tmp_reg + emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k); + + // msr fpcr, tmp_reg + emit32(0xD51B4400 | tmp_reg, code, k); + + codePos = k; +} + +void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + constexpr uint32_t tmp_reg = 18; + + uint32_t imm = instr.getImm32(); + + if (instr.getModCond() < StoreL3Condition) + imm &= instr.getModMem() ? 
(RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + else + imm &= RANDOMX_SCRATCHPAD_L3 - 1; + + emitAddImmediate(tmp_reg, dst, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + constexpr uint32_t andInstrL3 = t | ((Log2(RANDOMX_SCRATCHPAD_L3) - 4) << 10); + + emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k); + + // str src, [x2, tmp_reg] + emit32(0xF8206840 | src | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_NOP(Instruction& instr, uint32_t& codePos) +{ +} + +#include "instruction_weights.hpp" +#define INST_HANDLE(x) REPN(&JitCompilerA64::h_##x, WT(x)) + + InstructionGeneratorA64 JitCompilerA64::engine[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; +} diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp index 5a075bf..a4adb80 100644 --- a/src/jit_compiler_a64.hpp +++ b/src/jit_compiler_a64.hpp @@ -1,5 +1,6 @@ /* Copyright (c) 2018-2019, tevador +Copyright (c) 2019, SChernykh All rights reserved. @@ -32,45 +33,96 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <cstdint> #include <vector> #include "common.hpp" +#include "jit_compiler_a64_static.hpp" namespace randomx { class Program; class ProgramConfiguration; class SuperscalarProgram; + class Instruction; + + typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&); class JitCompilerA64 { public: - JitCompilerA64() { - throw std::runtime_error("ARM64 JIT compiler is not implemented yet."); - } - void generateProgram(Program&, ProgramConfiguration&) { + JitCompilerA64(); + ~JitCompilerA64(); + + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); - } - void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { - - } template<size_t N> - void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) { + void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &); - } - void generateDatasetInitCode() { + void generateDatasetInitCode() {} + ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); } + DatasetInitFunc* getDatasetInitFunc(); + uint8_t* getCode() { return code; } + size_t getCodeSize(); + + void enableWriting(); + void enableExecution(); + void enableAll(); + + private: + static InstructionGeneratorA64 engine[256]; + uint32_t reg_changed_offset[8]; + uint8_t* code; + uint32_t literalPos; + uint32_t num32bitLiterals; + + static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos) + { + *(uint32_t*)(code + codePos) = val; + codePos += sizeof(val); } - ProgramFunc* getProgramFunc() { - return nullptr; - } - DatasetInitFunc* getDatasetInitFunc() { - return nullptr; - } - uint8_t* getCode() { - return nullptr; - } - size_t getCodeSize() { - return 0; + + static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos) + { + *(uint64_t*)(code + codePos) = val; + codePos += sizeof(val); } - void enableWriting() {} - void enableExecution() {} - void enableAll() {} + + void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos); + void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos); + + template<uint32_t tmp_reg> + void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); + + template<uint32_t tmp_reg_fp> + void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); + + void h_IADD_RS(Instruction&, uint32_t&); + void h_IADD_M(Instruction&, uint32_t&); + void h_ISUB_R(Instruction&, uint32_t&); + void h_ISUB_M(Instruction&, uint32_t&); + void h_IMUL_R(Instruction&, uint32_t&); + void h_IMUL_M(Instruction&, uint32_t&); + void h_IMULH_R(Instruction&, uint32_t&); + void h_IMULH_M(Instruction&, uint32_t&); + void h_ISMULH_R(Instruction&, uint32_t&); + void h_ISMULH_M(Instruction&, uint32_t&); + void h_IMUL_RCP(Instruction&, uint32_t&); + void h_INEG_R(Instruction&, uint32_t&); + void h_IXOR_R(Instruction&, uint32_t&); + void h_IXOR_M(Instruction&, uint32_t&); + void h_IROR_R(Instruction&, uint32_t&); + void h_IROL_R(Instruction&, uint32_t&); + void h_ISWAP_R(Instruction&, uint32_t&); + void h_FSWAP_R(Instruction&, uint32_t&); + void h_FADD_R(Instruction&, uint32_t&); + void h_FADD_M(Instruction&, uint32_t&); + void h_FSUB_R(Instruction&, uint32_t&); + void h_FSUB_M(Instruction&, uint32_t&); + void h_FSCAL_R(Instruction&, uint32_t&); + void h_FMUL_R(Instruction&, uint32_t&); + void h_FDIV_M(Instruction&, uint32_t&); + void h_FSQRT_R(Instruction&, uint32_t&); + void h_CBRANCH(Instruction&, uint32_t&); + void h_CFROUND(Instruction&, uint32_t&); + void
h_ISTORE(Instruction&, uint32_t&); + void h_NOP(Instruction&, uint32_t&); }; -} \ No newline at end of file +} diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S new file mode 100644 index 0000000..2c0e439 --- /dev/null +++ b/src/jit_compiler_a64_static.S @@ -0,0 +1,579 @@ +# Copyright (c) 2018-2019, tevador +# Copyright (c) 2019, SChernykh +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
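+# Most of the .global labels below are never called from C++; jit_compiler_a64.cpp
+# only takes their addresses to compute copy sizes and patch offsets (CodeSize,
+# MainLoopBegin, the scratchpad/cacheline mask instructions, the spMix1 update, etc.).
+# Only randomx_program_aarch64 and randomx_init_dataset_aarch64 are invoked directly,
+# through getProgramFunc() and getDatasetInitFunc().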
+ + .arch armv8-a + .text + .global randomx_program_aarch64 + .global randomx_program_aarch64_main_loop + .global randomx_program_aarch64_vm_instructions + .global randomx_program_aarch64_imul_rcp_literals_end + .global randomx_program_aarch64_vm_instructions_end + .global randomx_program_aarch64_cacheline_align_mask1 + .global randomx_program_aarch64_cacheline_align_mask2 + .global randomx_program_aarch64_update_spMix1 + .global randomx_program_aarch64_vm_instructions_end_light + .global randomx_program_aarch64_light_cacheline_align_mask + .global randomx_program_aarch64_light_dataset_offset + .global randomx_init_dataset_aarch64 + .global randomx_init_dataset_aarch64_end + .global randomx_calc_dataset_item_aarch64 + .global randomx_calc_dataset_item_aarch64_prefetch + .global randomx_calc_dataset_item_aarch64_mix + .global randomx_calc_dataset_item_aarch64_store_result + .global randomx_calc_dataset_item_aarch64_end + +#include "configuration.h" + +# Register allocation + +# x0 -> pointer to reg buffer and then literal for IMUL_RCP +# x1 -> pointer to mem buffer and then to dataset +# x2 -> pointer to scratchpad +# x3 -> loop counter +# x4 -> "r0" +# x5 -> "r1" +# x6 -> "r2" +# x7 -> "r3" +# x8 -> fpcr (reversed bits) +# x9 -> mx, ma +# x10 -> spMix1 +# x11 -> literal for IMUL_RCP +# x12 -> "r4" +# x13 -> "r5" +# x14 -> "r6" +# x15 -> "r7" +# x16 -> spAddr0 +# x17 -> spAddr1 +# x18 -> temporary +# x19 -> temporary +# x20 -> literal for IMUL_RCP +# x21 -> literal for IMUL_RCP +# x22 -> literal for IMUL_RCP +# x23 -> literal for IMUL_RCP +# x24 -> literal for IMUL_RCP +# x25 -> literal for IMUL_RCP +# x26 -> literal for IMUL_RCP +# x27 -> literal for IMUL_RCP +# x28 -> literal for IMUL_RCP +# x29 -> literal for IMUL_RCP +# x30 -> literal for IMUL_RCP + +# v0-v15 -> store 32-bit literals +# v16 -> "f0" +# v17 -> "f1" +# v18 -> "f2" +# v19 -> "f3" +# v20 -> "e0" +# v21 -> "e1" +# v22 -> "e2" +# v23 -> "e3" +# v24 -> "a0" +# v25 -> "a1" +# v26 -> "a2" +# v27 -> "a3" +# v28 -> temporary +# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff +# v30 -> E 'or' mask = 0x3*00000000******3*00000000****** +# v31 -> scale mask = 0x81f000000000000081f0000000000000 + +randomx_program_aarch64: + # Save callee-saved registers + sub sp, sp, 192 + stp x16, x17, [sp] + stp x18, x19, [sp, 16] + stp x20, x21, [sp, 32] + stp x22, x23, [sp, 48] + stp x24, x25, [sp, 64] + stp x26, x27, [sp, 80] + stp x28, x29, [sp, 96] + stp x8, x30, [sp, 112] + stp d8, d9, [sp, 128] + stp d10, d11, [sp, 144] + stp d12, d13, [sp, 160] + stp d14, d15, [sp, 176] + + # Zero integer registers + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + + # Load ma, mx and dataset pointer + ldp x9, x1, [x1] + + # Load initial spMix value + mov x10, x9 + + # Load group A registers + ldp q24, q25, [x0, 192] + ldp q26, q27, [x0, 224] + + # Load E 'and' mask + mov x16, 0x00FFFFFFFFFFFFFF + ins v29.d[0], x16 + ins v29.d[1], x16 + + # Load E 'or' mask (stored in reg.f[0]) + ldr q30, [x0, 64] + + # Load scale mask + mov x16, 0x80f0000000000000 + ins v31.d[0], x16 + ins v31.d[1], x16 + + # Read fpcr + mrs x8, fpcr + rbit x8, x8 + + # Save x0 + str x0, [sp, -16]! 
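+	# From here on x0 and v0-v15 are free to serve as literal storage: the JIT
+	# compiler writes IMUL_RCP reciprocals into the literal_x* slots and packed
+	# 32-bit immediates into the literal_v* slots, and the loads below pick them
+	# up before the main loop starts.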
+ + # Read literals + ldr x0, literal_x0 + ldr x11, literal_x11 + ldr x20, literal_x20 + ldr x21, literal_x21 + ldr x22, literal_x22 + ldr x23, literal_x23 + ldr x24, literal_x24 + ldr x25, literal_x25 + ldr x26, literal_x26 + ldr x27, literal_x27 + ldr x28, literal_x28 + ldr x29, literal_x29 + ldr x30, literal_x30 + + ldr q0, literal_v0 + ldr q1, literal_v1 + ldr q2, literal_v2 + ldr q3, literal_v3 + ldr q4, literal_v4 + ldr q5, literal_v5 + ldr q6, literal_v6 + ldr q7, literal_v7 + ldr q8, literal_v8 + ldr q9, literal_v9 + ldr q10, literal_v10 + ldr q11, literal_v11 + ldr q12, literal_v12 + ldr q13, literal_v13 + ldr q14, literal_v14 + ldr q15, literal_v15 + +randomx_program_aarch64_main_loop: + # spAddr0 = spMix1 & ScratchpadL3Mask64; + # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; + lsr x18, x10, 32 + + # Actual mask will be inserted by JIT compiler + and w16, w10, 1 + and w17, w18, 1 + + # x16 = scratchpad + spAddr0 + # x17 = scratchpad + spAddr1 + add x16, x16, x2 + add x17, x17, x2 + + # xor integer registers with scratchpad data (spAddr0) + ldp x18, x19, [x16] + eor x4, x4, x18 + eor x5, x5, x19 + ldp x18, x19, [x16, 16] + eor x6, x6, x18 + eor x7, x7, x19 + ldp x18, x19, [x16, 32] + eor x12, x12, x18 + eor x13, x13, x19 + ldp x18, x19, [x16, 48] + eor x14, x14, x18 + eor x15, x15, x19 + + # Load group F registers (spAddr1) + ldpsw x18, x19, [x17] + ins v16.d[0], x18 + ins v16.d[1], x19 + ldpsw x18, x19, [x17, 8] + ins v17.d[0], x18 + ins v17.d[1], x19 + ldpsw x18, x19, [x17, 16] + ins v18.d[0], x18 + ins v18.d[1], x19 + ldpsw x18, x19, [x17, 24] + ins v19.d[0], x18 + ins v19.d[1], x19 + scvtf v16.2d, v16.2d + scvtf v17.2d, v17.2d + scvtf v18.2d, v18.2d + scvtf v19.2d, v19.2d + + # Load group E registers (spAddr1) + ldpsw x18, x19, [x17, 32] + ins v20.d[0], x18 + ins v20.d[1], x19 + ldpsw x18, x19, [x17, 40] + ins v21.d[0], x18 + ins v21.d[1], x19 + ldpsw x18, x19, [x17, 48] + ins v22.d[0], x18 + ins v22.d[1], x19 + ldpsw x18, x19, [x17, 56] + ins v23.d[0], x18 + ins v23.d[1], x19 + scvtf v20.2d, v20.2d + scvtf v21.2d, v21.2d + scvtf v22.2d, v22.2d + scvtf v23.2d, v23.2d + and v20.16b, v20.16b, v29.16b + and v21.16b, v21.16b, v29.16b + and v22.16b, v22.16b, v29.16b + and v23.16b, v23.16b, v29.16b + orr v20.16b, v20.16b, v30.16b + orr v21.16b, v21.16b, v30.16b + orr v22.16b, v22.16b, v30.16b + orr v23.16b, v23.16b, v30.16b + + # Execute VM instructions +randomx_program_aarch64_vm_instructions: + + # buffer for generated instructions + # FDIV_M is the largest instruction taking up to 12 ARMv8 instructions + .fill RANDOMX_PROGRAM_SIZE*12,4,0 + +literal_x0: .fill 1,8,0 +literal_x11: .fill 1,8,0 +literal_x20: .fill 1,8,0 +literal_x21: .fill 1,8,0 +literal_x22: .fill 1,8,0 +literal_x23: .fill 1,8,0 +literal_x24: .fill 1,8,0 +literal_x25: .fill 1,8,0 +literal_x26: .fill 1,8,0 +literal_x27: .fill 1,8,0 +literal_x28: .fill 1,8,0 +literal_x29: .fill 1,8,0 +literal_x30: .fill 1,8,0 +randomx_program_aarch64_imul_rcp_literals_end: + +literal_v0: .fill 2,8,0 +literal_v1: .fill 2,8,0 +literal_v2: .fill 2,8,0 +literal_v3: .fill 2,8,0 +literal_v4: .fill 2,8,0 +literal_v5: .fill 2,8,0 +literal_v6: .fill 2,8,0 +literal_v7: .fill 2,8,0 +literal_v8: .fill 2,8,0 +literal_v9: .fill 2,8,0 +literal_v10: .fill 2,8,0 +literal_v11: .fill 2,8,0 +literal_v12: .fill 2,8,0 +literal_v13: .fill 2,8,0 +literal_v14: .fill 2,8,0 +literal_v15: .fill 2,8,0 + +randomx_program_aarch64_vm_instructions_end: + + # mx ^= r[readReg2] ^ r[readReg3]; + eor x9, x9, x18 + + # Calculate dataset pointer for dataset prefetch + mov 
w18, w9 +randomx_program_aarch64_cacheline_align_mask1: + # Actual mask will be inserted by JIT compiler + and x18, x18, 1 + add x18, x18, x1 + + # Prefetch dataset data + prfm pldl2strm, [x18] + + # mx <-> ma + ror x9, x9, 32 + + # Calculate dataset pointer for dataset read + mov w10, w9 +randomx_program_aarch64_cacheline_align_mask2: + # Actual mask will be inserted by JIT compiler + and x10, x10, 1 + add x10, x10, x1 + +randomx_program_aarch64_xor_with_dataset_line: + # xor integer registers with dataset data + ldp x18, x19, [x10] + eor x4, x4, x18 + eor x5, x5, x19 + ldp x18, x19, [x10, 16] + eor x6, x6, x18 + eor x7, x7, x19 + ldp x18, x19, [x10, 32] + eor x12, x12, x18 + eor x13, x13, x19 + ldp x18, x19, [x10, 48] + eor x14, x14, x18 + eor x15, x15, x19 + +randomx_program_aarch64_update_spMix1: + # JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1" + eor x10, x0, x0 + + # Store integer registers to scratchpad (spAddr1) + stp x4, x5, [x17, 0] + stp x6, x7, [x17, 16] + stp x12, x13, [x17, 32] + stp x14, x15, [x17, 48] + + # xor group F and group E registers + eor v16.16b, v16.16b, v20.16b + eor v17.16b, v17.16b, v21.16b + eor v18.16b, v18.16b, v22.16b + eor v19.16b, v19.16b, v23.16b + + # Store FP registers to scratchpad (spAddr0) + stp q16, q17, [x16, 0] + stp q18, q19, [x16, 32] + + subs x3, x3, 1 + bne randomx_program_aarch64_main_loop + + # Restore x0 + ldr x0, [sp], 16 + + # Store integer registers + stp x4, x5, [x0, 0] + stp x6, x7, [x0, 16] + stp x12, x13, [x0, 32] + stp x14, x15, [x0, 48] + + # Store FP registers + stp q16, q17, [x0, 64] + stp q18, q19, [x0, 96] + stp q20, q21, [x0, 128] + stp q22, q23, [x0, 160] + + # Restore callee-saved registers + ldp x16, x17, [sp] + ldp x18, x19, [sp, 16] + ldp x20, x21, [sp, 32] + ldp x22, x23, [sp, 48] + ldp x24, x25, [sp, 64] + ldp x26, x27, [sp, 80] + ldp x28, x29, [sp, 96] + ldp x8, x30, [sp, 112] + ldp d8, d9, [sp, 128] + ldp d10, d11, [sp, 144] + ldp d12, d13, [sp, 160] + ldp d14, d15, [sp, 176] + add sp, sp, 192 + + ret + +randomx_program_aarch64_vm_instructions_end_light: + sub sp, sp, 96 + stp x0, x1, [sp, 64] + stp x2, x30, [sp, 80] + + # mx ^= r[readReg2] ^ r[readReg3]; + eor x9, x9, x18 + + # mx <-> ma + ror x9, x9, 32 + + # x0 -> pointer to cache memory + mov x0, x1 + + # x1 -> pointer to output + mov x1, sp + +randomx_program_aarch64_light_cacheline_align_mask: + # Actual mask will be inserted by JIT compiler + and w2, w9, 1 + + # x2 -> item number + lsr x2, x2, 6 + +randomx_program_aarch64_light_dataset_offset: + # Apply dataset offset (filled in by JIT compiler) + add x2, x2, 0 + add x2, x2, 0 + + bl randomx_calc_dataset_item_aarch64 + + mov x10, sp + ldp x0, x1, [sp, 64] + ldp x2, x30, [sp, 80] + add sp, sp, 96 + + b randomx_program_aarch64_xor_with_dataset_line + + + +# Input parameters +# +# x0 -> pointer to cache +# x1 -> pointer to dataset memory at startItem +# x2 -> start item +# x3 -> end item + +randomx_init_dataset_aarch64: + # Save x30 (return address) + str x30, [sp, -16]! 
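+	# Each iteration of the loop below computes one 64-byte dataset item into [x1],
+	# then advances the output pointer and the item number until x2 reaches endItem (x3).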
+ + # Load pointer to cache memory + ldr x0, [x0] + +randomx_init_dataset_aarch64_main_loop: + bl randomx_calc_dataset_item_aarch64 + add x1, x1, 64 + add x2, x2, 1 + cmp x2, x3 + bne randomx_init_dataset_aarch64_main_loop + + # Restore x30 (return address) + ldr x30, [sp], 16 + + ret + +randomx_init_dataset_aarch64_end: + +# Input parameters +# +# x0 -> pointer to cache memory +# x1 -> pointer to output +# x2 -> item number +# +# Register allocation +# +# x0-x7 -> output value (calculated dataset item) +# x8 -> pointer to cache memory +# x9 -> pointer to output +# x10 -> registerValue +# x11 -> mixBlock +# x12 -> temporary +# x13 -> temporary + +randomx_calc_dataset_item_aarch64: + sub sp, sp, 112 + stp x0, x1, [sp] + stp x2, x3, [sp, 16] + stp x4, x5, [sp, 32] + stp x6, x7, [sp, 48] + stp x8, x9, [sp, 64] + stp x10, x11, [sp, 80] + stp x12, x13, [sp, 96] + + mov x8, x0 + mov x9, x1 + mov x10, x2 + + # rl[0] = (itemNumber + 1) * superscalarMul0; + ldr x12, superscalarMul0 + madd x0, x2, x12, x12 + + # rl[1] = rl[0] ^ superscalarAdd1; + ldr x12, superscalarAdd1 + eor x1, x0, x12 + + # rl[2] = rl[0] ^ superscalarAdd2; + ldr x12, superscalarAdd2 + eor x2, x0, x12 + + # rl[3] = rl[0] ^ superscalarAdd3; + ldr x12, superscalarAdd3 + eor x3, x0, x12 + + # rl[4] = rl[0] ^ superscalarAdd4; + ldr x12, superscalarAdd4 + eor x4, x0, x12 + + # rl[5] = rl[0] ^ superscalarAdd5; + ldr x12, superscalarAdd5 + eor x5, x0, x12 + + # rl[6] = rl[0] ^ superscalarAdd6; + ldr x12, superscalarAdd6 + eor x6, x0, x12 + + # rl[7] = rl[0] ^ superscalarAdd7; + ldr x12, superscalarAdd7 + eor x7, x0, x12 + + b randomx_calc_dataset_item_aarch64_prefetch + +superscalarMul0: .quad 6364136223846793005 +superscalarAdd1: .quad 9298411001130361340 +superscalarAdd2: .quad 12065312585734608966 +superscalarAdd3: .quad 9306329213124626780 +superscalarAdd4: .quad 5281919268842080866 +superscalarAdd5: .quad 10536153434571861004 +superscalarAdd6: .quad 3398623926847679864 +superscalarAdd7: .quad 9549104520008361294 + +# Prefetch -> SuperScalar hash -> Mix will be repeated N times + +randomx_calc_dataset_item_aarch64_prefetch: + # Actual mask will be inserted by JIT compiler + and x11, x10, 1 + add x11, x8, x11, lsl 6 + prfm pldl2strm, [x11] + + # Generated SuperScalar hash program goes here + +randomx_calc_dataset_item_aarch64_mix: + ldp x12, x13, [x11] + eor x0, x0, x12 + eor x1, x1, x13 + ldp x12, x13, [x11, 16] + eor x2, x2, x12 + eor x3, x3, x13 + ldp x12, x13, [x11, 32] + eor x4, x4, x12 + eor x5, x5, x13 + ldp x12, x13, [x11, 48] + eor x6, x6, x12 + eor x7, x7, x13 + +randomx_calc_dataset_item_aarch64_store_result: + stp x0, x1, [x9] + stp x2, x3, [x9, 16] + stp x4, x5, [x9, 32] + stp x6, x7, [x9, 48] + + ldp x0, x1, [sp] + ldp x2, x3, [sp, 16] + ldp x4, x5, [sp, 32] + ldp x6, x7, [sp, 48] + ldp x8, x9, [sp, 64] + ldp x10, x11, [sp, 80] + ldp x12, x13, [sp, 96] + add sp, sp, 112 + + ret + +randomx_calc_dataset_item_aarch64_end: diff --git a/src/jit_compiler_a64_static.hpp b/src/jit_compiler_a64_static.hpp new file mode 100644 index 0000000..a9b922e --- /dev/null +++ b/src/jit_compiler_a64_static.hpp @@ -0,0 +1,51 @@ +/* +Copyright (c) 2018-2019, tevador +Copyright (c) 2019, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); + void randomx_program_aarch64_main_loop(); + void randomx_program_aarch64_vm_instructions(); + void randomx_program_aarch64_imul_rcp_literals_end(); + void randomx_program_aarch64_vm_instructions_end(); + void randomx_program_aarch64_cacheline_align_mask1(); + void randomx_program_aarch64_cacheline_align_mask2(); + void randomx_program_aarch64_update_spMix1(); + void randomx_program_aarch64_vm_instructions_end_light(); + void randomx_program_aarch64_light_cacheline_align_mask(); + void randomx_program_aarch64_light_dataset_offset(); + void randomx_init_dataset_aarch64(); + void randomx_init_dataset_aarch64_end(); + void randomx_calc_dataset_item_aarch64(); + void randomx_calc_dataset_item_aarch64_prefetch(); + void randomx_calc_dataset_item_aarch64_mix(); + void randomx_calc_dataset_item_aarch64_store_result(); + void randomx_calc_dataset_item_aarch64_end(); +} diff --git a/src/vm_compiled.cpp b/src/vm_compiled.cpp index c03f6b4..060abec 100644 --- a/src/vm_compiled.cpp +++ b/src/vm_compiled.cpp @@ -63,6 +63,9 @@ namespace randomx { template void CompiledVm::execute() { +#ifdef __aarch64__ + memcpy(reg.f, config.eMask, sizeof(config.eMask)); +#endif compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS); }
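A note on the IMUL_RCP literal computed in h_IMUL_RCP above: the stored 64-bit constant is (q << shift) + ((r << shift) / divisor), where q and r are the quotient and remainder of 2^63 / divisor and shift is the bit width of the divisor, which works out to floor(2^(63 + shift) / divisor). The standalone sketch below is not part of the patch; the reciprocal() helper name and the sample divisor are illustrative only, and it uses the same GCC/Clang builtins the patch relies on. It reproduces the computation and cross-checks it against a 128-bit division:

    #include <cstdint>
    #include <cstdio>

    // Same formula as the literal written by h_IMUL_RCP: floor(2^(63 + shift) / divisor),
    // where shift is the bit width of the divisor. The divisor is assumed to be
    // non-zero and not a power of two (h_IMUL_RCP returns early for those cases).
    static uint64_t reciprocal(uint32_t divisor) {
        const uint64_t N = 1ULL << 63;
        const uint64_t q = N / divisor;
        const uint64_t r = N % divisor;
        const uint64_t shift = 64 - __builtin_clzll(divisor);
        return (q << shift) + ((r << shift) / divisor);
    }

    int main() {
        const uint32_t divisor = 123456789; // sample value, for illustration only
        const uint64_t shift = 64 - __builtin_clzll(divisor);

        // Cross-check against a straight 128-bit division of 2^(63 + shift).
        const unsigned __int128 n = (unsigned __int128)1 << (63 + shift);
        std::printf("reciprocal = 0x%016llx, check = 0x%016llx\n",
                    (unsigned long long)reciprocal(divisor),
                    (unsigned long long)(uint64_t)(n / divisor));
        return 0;
    }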