Added integer instructions

armv8-a-jit
SChernykh 5 years ago
parent e6065a95e9
commit a6b79dd3aa

@ -85,20 +85,20 @@ Total sum of frequencies must be 256
#define RANDOMX_FREQ_IADD_RS 16
#define RANDOMX_FREQ_IADD_M 7
#define RANDOMX_FREQ_ISUB_R 16
#define RANDOMX_FREQ_ISUB_M 0
#define RANDOMX_FREQ_IMUL_R 0
#define RANDOMX_FREQ_IMUL_M 0
#define RANDOMX_FREQ_IMULH_R 0
#define RANDOMX_FREQ_IMULH_M 0
#define RANDOMX_FREQ_ISMULH_R 0
#define RANDOMX_FREQ_ISMULH_M 0
#define RANDOMX_FREQ_IMUL_RCP 0
#define RANDOMX_FREQ_INEG_R 0
#define RANDOMX_FREQ_IXOR_R 0
#define RANDOMX_FREQ_IXOR_M 0
#define RANDOMX_FREQ_IROR_R 0
#define RANDOMX_FREQ_IROL_R 0
#define RANDOMX_FREQ_ISWAP_R 0
#define RANDOMX_FREQ_ISUB_M 7
#define RANDOMX_FREQ_IMUL_R 16
#define RANDOMX_FREQ_IMUL_M 4
#define RANDOMX_FREQ_IMULH_R 4
#define RANDOMX_FREQ_IMULH_M 1
#define RANDOMX_FREQ_ISMULH_R 4
#define RANDOMX_FREQ_ISMULH_M 1
#define RANDOMX_FREQ_IMUL_RCP 8
#define RANDOMX_FREQ_INEG_R 2
#define RANDOMX_FREQ_IXOR_R 15
#define RANDOMX_FREQ_IXOR_M 5
#define RANDOMX_FREQ_IROR_R 8
#define RANDOMX_FREQ_IROL_R 2
#define RANDOMX_FREQ_ISWAP_R 4
//Floating point instructions
#define RANDOMX_FREQ_FSWAP_R 0
@ -116,10 +116,10 @@ Total sum of frequencies must be 256
#define RANDOMX_FREQ_CFROUND 0
//Store instruction
#define RANDOMX_FREQ_ISTORE 0
#define RANDOMX_FREQ_ISTORE 16
//No-op instruction
#define RANDOMX_FREQ_NOP 217
#define RANDOMX_FREQ_NOP 120
/* ------
256
*/

@ -64,8 +64,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define REP33(x) REP32(x) x,
#define REP40(x) REP32(x) REP8(x)
#define REP64(x) REP32(x) REP32(x)
#define REP120(x) REP64(x) REP32(x) REP24(x)
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
#define REP217(x) REP128(x) REP64(x) REP25(x)
#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x)
#define REP256(x) REP128(x) REP128(x)
#define REPNX(x,N) REP##N(x)

@ -30,19 +30,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "jit_compiler_a64.hpp"
#include "program.hpp"
#include "virtual_memory.hpp"
#include "reciprocal.h"
namespace ARMV8A {
constexpr uint32_t B = 0x14000000;
constexpr uint32_t EOR = 0xCA000000;
constexpr uint32_t EOR32 = 0x4A000000;
constexpr uint32_t ADD = 0x8B000000;
constexpr uint32_t SUB = 0xCB000000;
constexpr uint32_t MOVZ = 0xD2800000;
constexpr uint32_t MOVN = 0x92800000;
constexpr uint32_t MOVK = 0xF2800000;
constexpr uint32_t ADD_IMM_LO = 0x91000000;
constexpr uint32_t ADD_IMM_HI = 0x91400000;
constexpr uint32_t B = 0x14000000;
constexpr uint32_t EOR = 0xCA000000;
constexpr uint32_t EOR32 = 0x4A000000;
constexpr uint32_t ADD = 0x8B000000;
constexpr uint32_t SUB = 0xCB000000;
constexpr uint32_t MUL = 0x9B007C00;
constexpr uint32_t UMULH = 0x9BC07C00;
constexpr uint32_t SMULH = 0x9B407C00;
constexpr uint32_t MOVZ = 0xD2800000;
constexpr uint32_t MOVN = 0x92800000;
constexpr uint32_t MOVK = 0xF2800000;
constexpr uint32_t ADD_IMM_LO = 0x91000000;
constexpr uint32_t ADD_IMM_HI = 0x91400000;
constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000;
constexpr uint32_t MOV_REG = 0xAA0003E0;
}
@ -50,6 +58,7 @@ namespace randomx {
static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64);
constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
@ -57,6 +66,7 @@ template<typename T> static constexpr size_t Log2(T value) { return (value > 1)
JitCompilerA64::JitCompilerA64()
: code((uint8_t*) allocMemoryPages(CodeSize))
, literalPos(InstructionsEnd)
{
memcpy(code, (void*) randomx_program_aarch64, CodeSize);
enableAll();
@ -75,6 +85,7 @@ void JitCompilerA64::enableAll()
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
{
uint32_t codePos = PrologueSize;
literalPos = InstructionsEnd;
for (uint32_t i = 0; i < program.getSize(); ++i)
{
@ -171,6 +182,39 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
codePos = k;
}
template<uint32_t tmp_reg>
void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos)
{
uint32_t k = codePos;
uint32_t imm = instr.getImm32();
if (src != dst)
{
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
emitAddImmediate(tmp_reg, src, imm, code, k);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10);
constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10);
emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
// ldr tmp_reg, [x2, tmp_reg]
emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k);
}
else
{
imm = (imm & ScratchpadL3Mask) >> 3;
emitMovImmediate(tmp_reg, imm, code, k);
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
}
codePos = k;
}
void JitCompilerA64::h_IADD_RS(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
@ -194,55 +238,299 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
uint32_t imm = instr.getImm32();
constexpr uint32_t tmp_reg = 21;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
if (src != dst)
{
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
emitAddImmediate(tmp_reg, src, imm, code, k);
// add dst, dst, tmp_reg
emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10);
constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10);
codePos = k;
}
emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
void JitCompilerA64::h_ISUB_R(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
// ldr tmp_reg, [x2, tmp_reg]
emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, k);
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
if (src != dst)
{
// sub dst, dst, src
emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k);
}
else
{
imm = (imm & ScratchpadL3Mask) >> 3;
emitMovImmediate(tmp_reg, imm, code, k);
emitAddImmediate(dst, dst, -instr.getImm32(), code, k);
}
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_ISUB_M(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
emit32(ARMV8A::SUB | dst | (dst << 5) | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_IMUL_R(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
if (src == dst)
{
src = 21;
emitMovImmediate(src, instr.getImm32(), code, k);
}
// add dst, dst, tmp_reg
emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k);
// mul dst, dst, src
emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_ISUB_R(Instruction& instr, int i, uint32_t& codePos)
void JitCompilerA64::h_IMUL_M(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_IMULH_R(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
// umulh dst, dst, src
emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_IMULH_M(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
emit32(ARMV8A::UMULH | dst | (dst << 5) | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_ISMULH_R(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
// umulh dst, dst, src
emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_ISMULH_M(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
emit32(ARMV8A::SMULH | dst | (dst << 5) | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_IMUL_RCP(Instruction& instr, int i, uint32_t& codePos)
{
const uint64_t divisor = instr.getImm32();
if (isZeroOrPowerOf2(divisor))
return;
uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21;
const uint32_t dst = IntRegMap[instr.dst];
literalPos -= sizeof(uint64_t);
*(uint64_t*)(code + literalPos) = randomx_reciprocal(divisor);
// ldr tmp_reg, reciprocal
const uint32_t offset = (literalPos - k) / 4;
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
// mul dst, dst, src
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_INEG_R(Instruction& instr, int i, uint32_t& codePos)
{
const uint32_t dst = IntRegMap[instr.dst];
// sub dst, xzr, dst
emit32(ARMV8A::SUB | dst | (31 << 5) | (dst << 16), code, codePos);
}
void JitCompilerA64::h_IXOR_R(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
if (src == dst)
{
src = 21;
emitMovImmediate(src, instr.getImm32(), code, k);
}
// eor dst, dst, src
emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_IXOR_M(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_IROR_R(Instruction& instr, int i, uint32_t& codePos)
{
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
if (src != dst)
{
// sub dst, dst, src
emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, k);
// ror dst, dst, src
emit32(ARMV8A::ROR | dst | (dst << 5) | (src << 16), code, codePos);
}
else
{
emitAddImmediate(dst, dst, -instr.getImm32(), code, k);
// ror dst, dst, imm
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
}
}
void JitCompilerA64::h_IROL_R(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
if (src != dst)
{
constexpr uint32_t tmp_reg = 21;
// sub tmp_reg, xzr, src
emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
// ror dst, dst, src
emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k);
}
else
{
// ror dst, dst, imm
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
}
codePos = k;
}
void JitCompilerA64::h_ISWAP_R(Instruction& instr, int i, uint32_t& codePos)
{
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
if (src == dst)
return;
uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21;
emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
codePos = k;
}
void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
uint32_t imm = instr.getImm32();
if (instr.getModCond() < StoreL3Condition)
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
else
imm &= RANDOMX_SCRATCHPAD_L3 - 1;
emitAddImmediate(tmp_reg, dst, imm, code, k);
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
constexpr uint32_t andInstrL1 = t | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10);
constexpr uint32_t andInstrL2 = t | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10);
constexpr uint32_t andInstrL3 = t | ((Log2(RANDOMX_SCRATCHPAD_L3) - 4) << 10);
emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k);
// str src, [x2, tmp_reg]
emit32(0xf8356840 | src | (tmp_reg << 16), code, k);
codePos = k;
}

@ -79,6 +79,7 @@ namespace randomx {
private:
static InstructionGeneratorA64 engine[256];
uint8_t* code;
uint32_t literalPos;
static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
{
@ -89,9 +90,27 @@ namespace randomx {
void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos);
void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos);
template<uint32_t tmp_reg>
void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
void h_IADD_RS(Instruction&, int, uint32_t&);
void h_IADD_M(Instruction&, int, uint32_t&);
void h_ISUB_R(Instruction&, int, uint32_t&);
void h_ISUB_M(Instruction&, int, uint32_t&);
void h_IMUL_R(Instruction&, int, uint32_t&);
void h_IMUL_M(Instruction&, int, uint32_t&);
void h_IMULH_R(Instruction&, int, uint32_t&);
void h_IMULH_M(Instruction&, int, uint32_t&);
void h_ISMULH_R(Instruction&, int, uint32_t&);
void h_ISMULH_M(Instruction&, int, uint32_t&);
void h_IMUL_RCP(Instruction&, int, uint32_t&);
void h_INEG_R(Instruction&, int, uint32_t&);
void h_IXOR_R(Instruction&, int, uint32_t&);
void h_IXOR_M(Instruction&, int, uint32_t&);
void h_IROR_R(Instruction&, int, uint32_t&);
void h_IROL_R(Instruction&, int, uint32_t&);
void h_ISWAP_R(Instruction&, int, uint32_t&);
void h_ISTORE(Instruction&, int, uint32_t&);
void h_NOP(Instruction&, int, uint32_t&);
};
}

Loading…
Cancel
Save