Added CFROUND

Also optimized register allocation
armv8-a-jit
SChernykh 5 years ago
parent e49499043c
commit 3a6993290c

@ -113,13 +113,13 @@ Total sum of frequencies must be 256
//Control instructions //Control instructions
#define RANDOMX_FREQ_CBRANCH 0 #define RANDOMX_FREQ_CBRANCH 0
#define RANDOMX_FREQ_CFROUND 0 #define RANDOMX_FREQ_CFROUND 1
//Store instruction //Store instruction
#define RANDOMX_FREQ_ISTORE 16 #define RANDOMX_FREQ_ISTORE 16
//No-op instruction //No-op instruction
#define RANDOMX_FREQ_NOP 26 #define RANDOMX_FREQ_NOP 25
/* ------ /* ------
256 256
*/ */

@ -64,6 +64,7 @@ constexpr uint32_t FSQRT = 0x6EE1F800;
namespace randomx { namespace randomx {
static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64); static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64); static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64);
@ -91,7 +92,15 @@ void JitCompilerA64::enableAll()
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
{ {
uint32_t codePos = PrologueSize; uint32_t codePos = MainLoopBegin + 4;
// and w16, w10, ScratchpadL3Mask64
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
// and w17, w18, ScratchpadL3Mask64
emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
codePos = PrologueSize;
literalPos = InstructionsEnd; literalPos = InstructionsEnd;
for (uint32_t i = 0; i < program.getSize(); ++i) for (uint32_t i = 0; i < program.getSize(); ++i)
@ -110,13 +119,21 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
emit32(ARMV8A::B | (offset / 4), code, codePos); emit32(ARMV8A::B | (offset / 4), code, codePos);
// and w20, w20, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
// and w20, w20, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
// Update spMix1 // Update spMix1
// eor x10, config.readReg0, config.readReg1 // eor x10, config.readReg0, config.readReg1
codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64); codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos); emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
#ifdef __GNUC__ #ifdef __GNUC__
__builtin___clear_cache(reinterpret_cast<char*>(code + PrologueSize), reinterpret_cast<char*>(code + codePos)); __builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
#endif #endif
} }
@ -179,7 +196,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
} }
else else
{ {
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMovImmediate(tmp_reg, imm, code, k); emitMovImmediate(tmp_reg, imm, code, k);
// add dst, src, tmp_reg // add dst, src, tmp_reg
@ -228,7 +245,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
uint32_t k = codePos; uint32_t k = codePos;
uint32_t imm = instr.getImm32(); uint32_t imm = instr.getImm32();
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
emitAddImmediate(tmp_reg, src, imm, code, k); emitAddImmediate(tmp_reg, src, imm, code, k);
@ -281,7 +298,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// add dst, dst, tmp_reg // add dst, dst, tmp_reg
@ -317,7 +334,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // sub dst, dst, tmp_reg
@ -335,7 +352,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, int i, uint32_t& codePos)
if (src == dst) if (src == dst)
{ {
src = 21; src = 18;
emitMovImmediate(src, instr.getImm32(), code, k); emitMovImmediate(src, instr.getImm32(), code, k);
} }
@ -352,7 +369,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // sub dst, dst, tmp_reg
@ -381,7 +398,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// umulh dst, dst, tmp_reg // umulh dst, dst, tmp_reg
@ -410,7 +427,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// smulh dst, dst, tmp_reg // smulh dst, dst, tmp_reg
@ -427,23 +444,33 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, int i, uint32_t& codePos)
uint32_t k = codePos; uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
const uint64_t N = 1ULL << 63; constexpr uint64_t N = 1ULL << 63;
const uint64_t q = N / divisor; const uint64_t q = N / divisor;
const uint64_t r = N % divisor; const uint64_t r = N % divisor;
const uint64_t shift = 64 - __builtin_clzll(divisor); const uint64_t shift = 64 - __builtin_clzll(divisor);
const uint32_t literal_id = (InstructionsEnd - literalPos) / sizeof(uint64_t);
literalPos -= sizeof(uint64_t); literalPos -= sizeof(uint64_t);
*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
// ldr tmp_reg, reciprocal if (literal_id < 10)
const uint32_t offset = (literalPos - k) / 4; {
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k); // mul dst, dst, literal_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | ((30 - literal_id) << 16), code, k);
}
else
{
// ldr tmp_reg, reciprocal
const uint32_t offset = (literalPos - k) / 4;
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
// mul dst, dst, src // mul dst, dst, src
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
}
codePos = k; codePos = k;
} }
@ -465,7 +492,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, int i, uint32_t& codePos)
if (src == dst) if (src == dst)
{ {
src = 21; src = 18;
emitMovImmediate(src, instr.getImm32(), code, k); emitMovImmediate(src, instr.getImm32(), code, k);
} }
@ -482,7 +509,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // sub dst, dst, tmp_reg
@ -517,12 +544,12 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, int i, uint32_t& codePos)
if (src != dst) if (src != dst)
{ {
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
// sub tmp_reg, xzr, src // sub tmp_reg, xzr, src
emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
// ror dst, dst, src // ror dst, dst, tmp_reg
emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k); emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k);
} }
else else
@ -544,7 +571,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, int i, uint32_t& codePos)
uint32_t k = codePos; uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
@ -658,13 +685,46 @@ void JitCompilerA64::h_FSQRT_R(Instruction& instr, int i, uint32_t& codePos)
emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos); emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos);
} }
// Stub handler for the CBRANCH VM instruction. RANDOMX_FREQ_CBRANCH is 0 in
// this configuration, so the instruction is never selected and no machine
// code is emitted yet; the handler only preserves the codePos contract used
// by every other h_* handler (copy in, write back).
void JitCompilerA64::h_CBRANCH(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
// NOTE(review): the value is named `dst` but is read from instr.src — looks
// like a copy-paste placeholder; confirm IntRegMap[instr.dst] is intended
// when this handler is actually implemented. Currently unused.
const uint32_t dst = IntRegMap[instr.src];
codePos = k;
}
// CFROUND: set the FPU rounding mode from 2 bits of an integer register.
// Rotates src right by imm (mod 64), inserts the resulting low bits into the
// bit-reversed FPCR shadow kept in x8 (see "# x8 -> fpcr (reversed bits)" in
// the asm prologue, which does `mrs x8, fpcr; rbit x8, x8`), then un-reverses
// and writes the real FPCR via MSR. Each emit32 writes one raw A64 encoding.
void JitCompilerA64::h_CFROUND(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
// x18 is the designated scratch register for generated code in this layout.
constexpr uint32_t tmp_reg = 18;
// x8 holds the bit-reversed copy of FPCR maintained across the main loop.
constexpr uint32_t fprc_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fprc_tmp_reg, tmp_reg, 40, 2
// Inserts the 2 rounding-mode bits into the reversed shadow; keeping the
// shadow bit-reversed lets a single fixed-position BFI do the update.
// NOTE(review): 0xB3580400 is a hand-assembled BFM — verify the immr/imms
// fields match the claimed (lsb=40, width=2) against the Arm ARM.
emit32(0xB3580400 | fprc_tmp_reg | (tmp_reg << 5), code, k);
// rbit tmp_reg, fprc_tmp_reg
emit32(0xDAC00000 | tmp_reg | (fprc_tmp_reg << 5), code, k);
// msr fpcr, tmp_reg
emit32(0xD51B4400 | tmp_reg, code, k);
codePos = k;
}
void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos) void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
{ {
uint32_t k = codePos; uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
uint32_t imm = instr.getImm32(); uint32_t imm = instr.getImm32();
@ -683,7 +743,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k); emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k);
// str src, [x2, tmp_reg] // str src, [x2, tmp_reg]
emit32(0xf8356840 | src | (tmp_reg << 16), code, k); emit32(0xF8206840 | src | (tmp_reg << 16), code, k);
codePos = k; codePos = k;
} }

@ -122,6 +122,8 @@ namespace randomx {
void h_FMUL_R(Instruction&, int, uint32_t&); void h_FMUL_R(Instruction&, int, uint32_t&);
void h_FDIV_M(Instruction&, int, uint32_t&); void h_FDIV_M(Instruction&, int, uint32_t&);
void h_FSQRT_R(Instruction&, int, uint32_t&); void h_FSQRT_R(Instruction&, int, uint32_t&);
void h_CBRANCH(Instruction&, int, uint32_t&);
void h_CFROUND(Instruction&, int, uint32_t&);
void h_ISTORE(Instruction&, int, uint32_t&); void h_ISTORE(Instruction&, int, uint32_t&);
void h_NOP(Instruction&, int, uint32_t&); void h_NOP(Instruction&, int, uint32_t&);
}; };

@ -28,8 +28,11 @@
.arch armv8-a .arch armv8-a
.text .text
.global randomx_program_aarch64 .global randomx_program_aarch64
.global randomx_program_aarch64_main_loop
.global randomx_program_aarch64_vm_instructions .global randomx_program_aarch64_vm_instructions
.global randomx_program_aarch64_vm_instructions_end .global randomx_program_aarch64_vm_instructions_end
.global randomx_program_aarch64_cacheline_align_mask1
.global randomx_program_aarch64_cacheline_align_mask2
.global randomx_program_aarch64_update_spMix1 .global randomx_program_aarch64_update_spMix1
.global randomx_program_aarch64_end .global randomx_program_aarch64_end
@ -43,7 +46,7 @@
# x5 -> "r1" # x5 -> "r1"
# x6 -> "r2" # x6 -> "r2"
# x7 -> "r3" # x7 -> "r3"
# x8 -> temporary # x8 -> fpcr (reversed bits)
# x9 -> mx, ma # x9 -> mx, ma
# x10 -> spMix1 # x10 -> spMix1
# x11 -> spMix2 # x11 -> spMix2
@ -51,23 +54,23 @@
# x13 -> "r5" # x13 -> "r5"
# x14 -> "r6" # x14 -> "r6"
# x15 -> "r7" # x15 -> "r7"
# x16 -> ScratchpadL1Mask64 # x16 -> spAddr0
# x17 -> ScratchpadL2Mask64 # x17 -> spAddr1
# x18 -> ScratchpadL3Mask64 # x18 -> temporary
# x19 -> spAddr0 # x19 -> temporary
# x20 -> spAddr1 # x20 -> temporary
# x21 -> temporary # x21 -> literal for IMUL_RCP
# x22 -> temporary # x22 -> literal for IMUL_RCP
# x23 -> temporary # x23 -> literal for IMUL_RCP
# x24 -> temporary # x24 -> literal for IMUL_RCP
# x25 -> temporary # x25 -> literal for IMUL_RCP
# x26 -> ScratchpadL1Mask8 # x26 -> literal for IMUL_RCP
# x27 -> ScratchpadL2Mask8 # x27 -> literal for IMUL_RCP
# x28 -> ScratchpadL3Mask8 # x28 -> literal for IMUL_RCP
# x29 -> CacheLineAlignMask # x29 -> literal for IMUL_RCP
# x30 -> literal for IMUL_RCP
# v0-v7 -> temporary
# v8-v15 -> not used # v0-v15 -> not used
# v16 -> "f0" # v16 -> "f0"
# v17 -> "f1" # v17 -> "f1"
# v18 -> "f2" # v18 -> "f2"
@ -113,91 +116,98 @@ randomx_program_aarch64:
# Load initial spMix value # Load initial spMix value
mov x10, x9 mov x10, x9
# Load Scratchpad masks
mov x16, 16384 - 64
mov x17, 262144 - 64
mov x18, 2097152 - 64
mov x26, 16384 - 8
mov x27, 262144 - 8
mov x28, 2097152 - 8
# Load CacheLineAlignMask
mov x29, 0x7FFFFFC0
# Load group A registers # Load group A registers
ldp q24, q25, [x0, 192] ldp q24, q25, [x0, 192]
ldp q26, q27, [x0, 224] ldp q26, q27, [x0, 224]
# Load E 'and' mask # Load E 'and' mask
mov x21, 0x00FFFFFFFFFFFFFF mov x16, 0x00FFFFFFFFFFFFFF
ins v29.d[0], x21 ins v29.d[0], x16
ins v29.d[1], x21 ins v29.d[1], x16
# Load E 'or' mask (stored in reg.f[0]) # Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64] ldr q30, [x0, 64]
# Load scale mask # Load scale mask
mov x21, 0x80f0000000000000 mov x16, 0x80f0000000000000
ins v31.d[0], x21 ins v31.d[0], x16
ins v31.d[1], x21 ins v31.d[1], x16
main_loop: # Read fpcr
mrs x8, fpcr
rbit x8, x8
# Read literals
ldr x21, literal_x21
ldr x22, literal_x22
ldr x23, literal_x23
ldr x24, literal_x24
ldr x25, literal_x25
ldr x26, literal_x26
ldr x27, literal_x27
ldr x28, literal_x28
ldr x29, literal_x29
ldr x30, literal_x30
randomx_program_aarch64_main_loop:
# spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
lsr x21, x10, 32 lsr x18, x10, 32
and w19, w10, w18
and w20, w21, w18 # Actual mask will be inserted by JIT compiler
and w16, w10, 1
and w17, w18, 1
# x19 = scratchpad + spAddr0 # x16 = scratchpad + spAddr0
# x20 = scratchpad + spAddr1 # x17 = scratchpad + spAddr1
add x19, x19, x2 add x16, x16, x2
add x20, x20, x2 add x17, x17, x2
# xor integer registers with scratchpad data (spAddr0) # xor integer registers with scratchpad data (spAddr0)
ldp x21, x22, [x19] ldp x18, x19, [x16]
ldp x23, x24, [x19, 16] eor x4, x4, x18
eor x4, x4, x21 eor x5, x5, x19
eor x5, x5, x22 ldp x18, x19, [x16, 16]
eor x6, x6, x23 eor x6, x6, x18
eor x7, x7, x24 eor x7, x7, x19
ldp x21, x22, [x19, 32] ldp x18, x19, [x16, 32]
ldp x23, x24, [x19, 48] eor x12, x12, x18
eor x12, x12, x21 eor x13, x13, x19
eor x13, x13, x22 ldp x18, x19, [x16, 48]
eor x14, x14, x23 eor x14, x14, x18
eor x15, x15, x24 eor x15, x15, x19
# Load group F registers (spAddr1) # Load group F registers (spAddr1)
ldpsw x21, x22, [x20] ldpsw x18, x19, [x17]
ldpsw x23, x24, [x20, 8] ins v16.d[0], x18
ins v16.d[0], x21 ins v16.d[1], x19
ins v16.d[1], x22 ldpsw x18, x19, [x17, 8]
ins v17.d[0], x23 ins v17.d[0], x18
ins v17.d[1], x24 ins v17.d[1], x19
ldpsw x21, x22, [x20, 16] ldpsw x18, x19, [x17, 16]
ldpsw x23, x24, [x20, 24] ins v18.d[0], x18
ins v18.d[0], x21 ins v18.d[1], x19
ins v18.d[1], x22 ldpsw x18, x19, [x17, 24]
ins v19.d[0], x23 ins v19.d[0], x18
ins v19.d[1], x24 ins v19.d[1], x19
scvtf v16.2d, v16.2d scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d scvtf v19.2d, v19.2d
# Load group E registers (spAddr1) # Load group E registers (spAddr1)
ldpsw x21, x22, [x20, 32] ldpsw x18, x19, [x17, 32]
ldpsw x23, x24, [x20, 40] ins v20.d[0], x18
ins v20.d[0], x21 ins v20.d[1], x19
ins v20.d[1], x22 ldpsw x18, x19, [x17, 40]
ins v21.d[0], x23 ins v21.d[0], x18
ins v21.d[1], x24 ins v21.d[1], x19
ldpsw x21, x22, [x20, 48] ldpsw x18, x19, [x17, 48]
ldpsw x23, x24, [x20, 56] ins v22.d[0], x18
ins v22.d[0], x21 ins v22.d[1], x19
ins v22.d[1], x22 ldpsw x18, x19, [x17, 56]
ins v23.d[0], x23 ins v23.d[0], x18
ins v23.d[1], x24 ins v23.d[1], x19
scvtf v20.2d, v20.2d scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d scvtf v22.2d, v22.2d
@ -214,8 +224,38 @@ main_loop:
# Execute VM instructions # Execute VM instructions
randomx_program_aarch64_vm_instructions: randomx_program_aarch64_vm_instructions:
# 16 KB buffer for generated instructions # 12 KB buffer for generated instructions
.fill 4096,4,0 .fill 3072,4,0
literal_x21:
.fill 1,8,0
literal_x22:
.fill 1,8,0
literal_x23:
.fill 1,8,0
literal_x24:
.fill 1,8,0
literal_x25:
.fill 1,8,0
literal_x26:
.fill 1,8,0
literal_x27:
.fill 1,8,0
literal_x28:
.fill 1,8,0
literal_x29:
.fill 1,8,0
literal_x30:
.fill 1,8,0
randomx_program_aarch64_vm_instructions_end: randomx_program_aarch64_vm_instructions_end:
@ -223,43 +263,47 @@ randomx_program_aarch64_vm_instructions_end:
eor x9, x9, x11 eor x9, x9, x11
# Calculate dataset pointer for dataset prefetch # Calculate dataset pointer for dataset prefetch
mov w25, w9 mov w20, w9
and x25, x25, x29 randomx_program_aarch64_cacheline_align_mask1:
add x25, x25, x1 # Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# Prefetch dataset data # Prefetch dataset data
prfm pldl2strm, [x25] prfm pldl2strm, [x20]
# mx <-> ma # mx <-> ma
ror x9, x9, 32 ror x9, x9, 32
# Calculate dataset pointer for dataset read # Calculate dataset pointer for dataset read
mov w25, w9 mov w20, w9
and x25, x25, x29 randomx_program_aarch64_cacheline_align_mask2:
add x25, x25, x1 # Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# xor integer registers with dataset data # xor integer registers with dataset data
ldp x21, x22, [x25] ldp x18, x19, [x20]
ldp x23, x24, [x25, 16] eor x4, x4, x18
eor x4, x4, x21 eor x5, x5, x19
eor x5, x5, x22 ldp x18, x19, [x20, 16]
eor x6, x6, x23 eor x6, x6, x18
eor x7, x7, x24 eor x7, x7, x19
ldp x21, x22, [x25, 32] ldp x18, x19, [x20, 32]
ldp x23, x24, [x25, 48] eor x12, x12, x18
eor x12, x12, x21 eor x13, x13, x19
eor x13, x13, x22 ldp x18, x19, [x20, 48]
eor x14, x14, x23 eor x14, x14, x18
eor x15, x15, x24 eor x15, x15, x19
randomx_program_aarch64_update_spMix1: randomx_program_aarch64_update_spMix1:
eor x10, x0, x0 eor x10, x0, x0
# Store integer registers to scratchpad (spAddr1) # Store integer registers to scratchpad (spAddr1)
stp x4, x5, [x20, 0] stp x4, x5, [x17, 0]
stp x6, x7, [x20, 16] stp x6, x7, [x17, 16]
stp x12, x13, [x20, 32] stp x12, x13, [x17, 32]
stp x14, x15, [x20, 48] stp x14, x15, [x17, 48]
# xor group F and group E registers # xor group F and group E registers
eor v16.16b, v16.16b, v20.16b eor v16.16b, v16.16b, v20.16b
@ -268,11 +312,11 @@ randomx_program_aarch64_update_spMix1:
eor v19.16b, v19.16b, v23.16b eor v19.16b, v19.16b, v23.16b
# Store FP registers to scratchpad (spAddr0) # Store FP registers to scratchpad (spAddr0)
stp q16, q17, [x19, 0] stp q16, q17, [x16, 0]
stp q18, q19, [x19, 32] stp q18, q19, [x16, 32]
subs x3, x3, 1 subs x3, x3, 1
bne main_loop bne randomx_program_aarch64_main_loop
# Store integer registers # Store integer registers
stp x4, x5, [x0, 0] stp x4, x5, [x0, 0]

@ -31,8 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" { extern "C" {
void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
void randomx_program_aarch64_main_loop();
void randomx_program_aarch64_vm_instructions(); void randomx_program_aarch64_vm_instructions();
void randomx_program_aarch64_vm_instructions_end(); void randomx_program_aarch64_vm_instructions_end();
void randomx_program_aarch64_cacheline_align_mask1();
void randomx_program_aarch64_cacheline_align_mask2();
void randomx_program_aarch64_update_spMix1(); void randomx_program_aarch64_update_spMix1();
void randomx_program_aarch64_end(); void randomx_program_aarch64_end();
} }

Loading…
Cancel
Save