diff --git a/src/configuration.h b/src/configuration.h index 935f891..dbbf25e 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -85,20 +85,20 @@ Total sum of frequencies must be 256 #define RANDOMX_FREQ_IADD_RS 16 #define RANDOMX_FREQ_IADD_M 7 #define RANDOMX_FREQ_ISUB_R 16 -#define RANDOMX_FREQ_ISUB_M 0 -#define RANDOMX_FREQ_IMUL_R 0 -#define RANDOMX_FREQ_IMUL_M 0 -#define RANDOMX_FREQ_IMULH_R 0 -#define RANDOMX_FREQ_IMULH_M 0 -#define RANDOMX_FREQ_ISMULH_R 0 -#define RANDOMX_FREQ_ISMULH_M 0 -#define RANDOMX_FREQ_IMUL_RCP 0 -#define RANDOMX_FREQ_INEG_R 0 -#define RANDOMX_FREQ_IXOR_R 0 -#define RANDOMX_FREQ_IXOR_M 0 -#define RANDOMX_FREQ_IROR_R 0 -#define RANDOMX_FREQ_IROL_R 0 -#define RANDOMX_FREQ_ISWAP_R 0 +#define RANDOMX_FREQ_ISUB_M 7 +#define RANDOMX_FREQ_IMUL_R 16 +#define RANDOMX_FREQ_IMUL_M 4 +#define RANDOMX_FREQ_IMULH_R 4 +#define RANDOMX_FREQ_IMULH_M 1 +#define RANDOMX_FREQ_ISMULH_R 4 +#define RANDOMX_FREQ_ISMULH_M 1 +#define RANDOMX_FREQ_IMUL_RCP 8 +#define RANDOMX_FREQ_INEG_R 2 +#define RANDOMX_FREQ_IXOR_R 15 +#define RANDOMX_FREQ_IXOR_M 5 +#define RANDOMX_FREQ_IROR_R 8 +#define RANDOMX_FREQ_IROL_R 2 +#define RANDOMX_FREQ_ISWAP_R 4 //Floating point instructions #define RANDOMX_FREQ_FSWAP_R 0 @@ -116,10 +116,10 @@ Total sum of frequencies must be 256 #define RANDOMX_FREQ_CFROUND 0 //Store instruction -#define RANDOMX_FREQ_ISTORE 0 +#define RANDOMX_FREQ_ISTORE 16 //No-op instruction -#define RANDOMX_FREQ_NOP 217 +#define RANDOMX_FREQ_NOP 120 /* ------ 256 */ diff --git a/src/instruction_weights.hpp b/src/instruction_weights.hpp index db088f6..6c63008 100644 --- a/src/instruction_weights.hpp +++ b/src/instruction_weights.hpp @@ -64,8 +64,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define REP33(x) REP32(x) x, #define REP40(x) REP32(x) REP8(x) #define REP64(x) REP32(x) REP32(x) +#define REP120(x) REP64(x) REP32(x) REP24(x) #define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x) -#define REP217(x) REP128(x) REP64(x) REP25(x) #define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x) #define REP256(x) REP128(x) REP128(x) #define REPNX(x,N) REP##N(x) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index f09e13d..caa634f 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -30,19 +30,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "jit_compiler_a64.hpp" #include "program.hpp" #include "virtual_memory.hpp" +#include "reciprocal.h" namespace ARMV8A { -constexpr uint32_t B = 0x14000000; -constexpr uint32_t EOR = 0xCA000000; -constexpr uint32_t EOR32 = 0x4A000000; -constexpr uint32_t ADD = 0x8B000000; -constexpr uint32_t SUB = 0xCB000000; -constexpr uint32_t MOVZ = 0xD2800000; -constexpr uint32_t MOVN = 0x92800000; -constexpr uint32_t MOVK = 0xF2800000; -constexpr uint32_t ADD_IMM_LO = 0x91000000; -constexpr uint32_t ADD_IMM_HI = 0x91400000; +constexpr uint32_t B = 0x14000000; +constexpr uint32_t EOR = 0xCA000000; +constexpr uint32_t EOR32 = 0x4A000000; +constexpr uint32_t ADD = 0x8B000000; +constexpr uint32_t SUB = 0xCB000000; +constexpr uint32_t MUL = 0x9B007C00; +constexpr uint32_t UMULH = 0x9BC07C00; +constexpr uint32_t SMULH = 0x9B407C00; +constexpr uint32_t MOVZ = 0xD2800000; +constexpr uint32_t MOVN = 0x92800000; +constexpr uint32_t MOVK = 0xF2800000; +constexpr uint32_t ADD_IMM_LO = 0x91000000; +constexpr uint32_t ADD_IMM_HI = 0x91400000; +constexpr uint32_t LDR_LITERAL = 0x58000000; +constexpr uint32_t ROR = 0x9AC02C00; +constexpr uint32_t ROR_IMM = 0x93C00000; +constexpr uint32_t MOV_REG = 0xAA0003E0; } @@ -50,6 +58,7 @@ namespace randomx { static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64); static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); +static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64); constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -57,6 +66,7 @@ template static constexpr size_t Log2(T value) { return (value > 1) JitCompilerA64::JitCompilerA64() : code((uint8_t*) allocMemoryPages(CodeSize)) + , literalPos(InstructionsEnd) { memcpy(code, (void*) randomx_program_aarch64, CodeSize); enableAll(); @@ -75,6 +85,7 @@ void JitCompilerA64::enableAll() void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) { uint32_t codePos = PrologueSize; + literalPos = InstructionsEnd; for (uint32_t i = 0; i < program.getSize(); ++i) { @@ -171,6 +182,39 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, codePos = k; } +template +void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t imm = instr.getImm32(); + + if (src != dst) + { + imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + emitAddImmediate(tmp_reg, src, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + + emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k); + + // ldr tmp_reg, [x2, tmp_reg] + emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k); + } + else + { + imm = (imm & ScratchpadL3Mask) >> 3; + emitMovImmediate(tmp_reg, imm, code, k); + + // ldr tmp_reg, [x2, tmp_reg, lsl 3] + emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + } + + codePos = k; +} + void JitCompilerA64::h_IADD_RS(Instruction& instr, int i, uint32_t& codePos) { uint32_t k = codePos; @@ -194,55 +238,299 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, int i, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - uint32_t imm = instr.getImm32(); constexpr uint32_t tmp_reg = 21; + emitMemLoad(dst, src, instr, code, k); - if (src != dst) - { - imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); - emitAddImmediate(tmp_reg, src, imm, code, k); + // add dst, dst, tmp_reg + emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); - constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); - constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + codePos = k; +} - emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k); +void JitCompilerA64::h_ISUB_R(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; - // ldr tmp_reg, [x2, tmp_reg] - emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k); + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + // sub dst, dst, src + emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k); } else { - imm = (imm & ScratchpadL3Mask) >> 3; - emitMovImmediate(tmp_reg, imm, code, k); + emitAddImmediate(dst, dst, -instr.getImm32(), code, k); + } - // ldr tmp_reg, [x2, tmp_reg, lsl 3] - emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + codePos = k; +} + +void JitCompilerA64::h_ISUB_M(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 21; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::SUB | dst | (dst << 5) | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IMUL_R(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + { + src = 21; + emitMovImmediate(src, instr.getImm32(), code, k); } - // add dst, dst, tmp_reg - emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k); + // mul dst, dst, src + emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, k); codePos = k; } -void JitCompilerA64::h_ISUB_R(Instruction& instr, int i, uint32_t& codePos) +void JitCompilerA64::h_IMUL_M(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 21; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IMULH_R(Instruction& instr, int i, uint32_t& codePos) { uint32_t k = codePos; const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; + // umulh dst, dst, src + emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IMULH_M(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 21; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::UMULH | dst | (dst << 5) | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_ISMULH_R(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + // umulh dst, dst, src + emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_ISMULH_M(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 21; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::SMULH | dst | (dst << 5) | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IMUL_RCP(Instruction& instr, int i, uint32_t& codePos) +{ + const uint64_t divisor = instr.getImm32(); + if (isZeroOrPowerOf2(divisor)) + return; + + uint32_t k = codePos; + + constexpr uint32_t tmp_reg = 21; + const uint32_t dst = IntRegMap[instr.dst]; + + literalPos -= sizeof(uint64_t); + *(uint64_t*)(code + literalPos) = randomx_reciprocal(divisor); + + // ldr tmp_reg, reciprocal + const uint32_t offset = (literalPos - k) / 4; + emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k); + + // mul dst, dst, src + emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_INEG_R(Instruction& instr, int i, uint32_t& codePos) +{ + const uint32_t dst = IntRegMap[instr.dst]; + + // sub dst, xzr, dst + emit32(ARMV8A::SUB | dst | (31 << 5) | (dst << 16), code, codePos); +} + +void JitCompilerA64::h_IXOR_R(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + { + src = 21; + emitMovImmediate(src, instr.getImm32(), code, k); + } + + // eor dst, dst, src + emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IXOR_M(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + constexpr uint32_t tmp_reg = 21; + emitMemLoad(dst, src, instr, code, k); + + // sub dst, dst, tmp_reg + emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_IROR_R(Instruction& instr, int i, uint32_t& codePos) +{ + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + if (src != dst) { - // sub dst, dst, src - emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k); + // ror dst, dst, src + emit32(ARMV8A::ROR | dst | (dst << 5) | (src << 16), code, codePos); } else { - emitAddImmediate(dst, dst, -instr.getImm32(), code, k); + // ror dst, dst, imm + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); } +} + +void JitCompilerA64::h_IROL_R(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src != dst) + { + constexpr uint32_t tmp_reg = 21; + + // sub tmp_reg, xzr, src + emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); + + // ror dst, dst, src + emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k); + } + else + { + // ror dst, dst, imm + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); + } + + codePos = k; +} + +void JitCompilerA64::h_ISWAP_R(Instruction& instr, int i, uint32_t& codePos) +{ + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + + if (src == dst) + return; + + uint32_t k = codePos; + + constexpr uint32_t tmp_reg = 21; + emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); + emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); + emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); + + codePos = k; +} + +void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos) +{ + uint32_t k = codePos; + + const uint32_t src = IntRegMap[instr.src]; + const uint32_t dst = IntRegMap[instr.dst]; + constexpr uint32_t tmp_reg = 21; + + uint32_t imm = instr.getImm32(); + + if (instr.getModCond() < StoreL3Condition) + imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); + else + imm &= RANDOMX_SCRATCHPAD_L3 - 1; + + emitAddImmediate(tmp_reg, dst, imm, code, k); + + constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10); + constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10); + constexpr uint32_t andInstrL3 = t | ((Log2(RANDOMX_SCRATCHPAD_L3) - 4) << 10); + + emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k); + + // str src, [x2, tmp_reg] + emit32(0xf8356840 | src | (tmp_reg << 16), code, k); codePos = k; } diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp index d24e62a..33ed591 100644 --- a/src/jit_compiler_a64.hpp +++ b/src/jit_compiler_a64.hpp @@ -79,6 +79,7 @@ namespace randomx { private: static InstructionGeneratorA64 engine[256]; uint8_t* code; + uint32_t literalPos; static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos) { @@ -89,9 +90,27 @@ namespace randomx { void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos); void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos); + template + void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); + void h_IADD_RS(Instruction&, int, uint32_t&); void h_IADD_M(Instruction&, int, uint32_t&); void h_ISUB_R(Instruction&, int, uint32_t&); + void h_ISUB_M(Instruction&, int, uint32_t&); + void h_IMUL_R(Instruction&, int, uint32_t&); + void h_IMUL_M(Instruction&, int, uint32_t&); + void h_IMULH_R(Instruction&, int, uint32_t&); + void h_IMULH_M(Instruction&, int, uint32_t&); + void h_ISMULH_R(Instruction&, int, uint32_t&); + void h_ISMULH_M(Instruction&, int, uint32_t&); + void h_IMUL_RCP(Instruction&, int, uint32_t&); + void h_INEG_R(Instruction&, int, uint32_t&); + void h_IXOR_R(Instruction&, int, uint32_t&); + void h_IXOR_M(Instruction&, int, uint32_t&); + void h_IROR_R(Instruction&, int, uint32_t&); + void h_IROL_R(Instruction&, int, uint32_t&); + void h_ISWAP_R(Instruction&, int, uint32_t&); + void h_ISTORE(Instruction&, int, uint32_t&); void h_NOP(Instruction&, int, uint32_t&); }; }