diff --git a/src/configuration.h b/src/configuration.h
index a561159..fe4bea8 100644
--- a/src/configuration.h
+++ b/src/configuration.h
@@ -113,13 +113,13 @@ Total sum of frequencies must be 256
 
 //Control instructions
 #define RANDOMX_FREQ_CBRANCH          0
-#define RANDOMX_FREQ_CFROUND          0
+#define RANDOMX_FREQ_CFROUND          1
 
 //Store instruction
 #define RANDOMX_FREQ_ISTORE          16
 
 //No-op instruction
-#define RANDOMX_FREQ_NOP             26
+#define RANDOMX_FREQ_NOP             25
 /*                                ------
                                      256
 */
diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp
index 72cfac6..38216c9 100644
--- a/src/jit_compiler_a64.cpp
+++ b/src/jit_compiler_a64.cpp
@@ -64,6 +64,7 @@ constexpr uint32_t FSQRT = 0x6EE1F800;
 namespace randomx {
 
 static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
+static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
 static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
 static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64);
 
@@ -91,7 +92,15 @@ void JitCompilerA64::enableAll()
 
 void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
 {
-	uint32_t codePos = PrologueSize;
+	uint32_t codePos = MainLoopBegin + 4;
+
+	// and w16, w10, ScratchpadL3Mask64
+	emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
+
+	// and w17, w18, ScratchpadL3Mask64
+	emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
+
+	codePos = PrologueSize;
 	literalPos = InstructionsEnd;
 
 	for (uint32_t i = 0; i < program.getSize(); ++i)
@@ -110,13 +119,21 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
 	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
 	emit32(ARMV8A::B | (offset / 4), code, codePos);
 
+	// and w20, w20, CacheLineAlignMask
+	codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
+	emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
+
+	// and w20, w20, CacheLineAlignMask
+	codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
+	emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
+
 	// Update spMix1
 	// eor x10, config.readReg0, config.readReg1
 	codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
 	emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
 
 #ifdef __GNUC__
-	__builtin___clear_cache(reinterpret_cast<char*>(code + PrologueSize), reinterpret_cast<char*>(code + codePos));
+	__builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
 #endif
 }
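Note: the two emit32 calls above patch the placeholder `and w16, w10, 1` / `and w17, w18, 1` instructions that follow the randomx_program_aarch64_main_loop label in jit_compiler_a64_static.S further down. A minimal sketch of the encoding arithmetic (the helper name andMask64 and the local Log2 are illustrative stand-ins, not part of the patch):

    #include <cstdint>

    // Stand-in for the Log2 helper the patch calls.
    constexpr uint32_t Log2(uint64_t v) { return (v > 1) ? (1 + Log2(v / 2)) : 0; }

    // and wRd, wRn, #(size - 64), for power-of-two size >= 128.
    // 0x121A0000 is AND (immediate, 32-bit) with immr = 26 preset;
    // imms = Log2(size) - 7 selects Log2(size) - 6 ones, which the rotation
    // shifts up by 6 bits, giving the mask size - 64 (0x1FFFC0 for 2 MiB).
    constexpr uint32_t andMask64(uint32_t rd, uint32_t rn, uint64_t size)
    {
        return 0x121A0000 | rd | (rn << 5) | ((Log2(size) - 7) << 10);
    }

    // First emit32 above, with the default 2 MiB RANDOMX_SCRATCHPAD_L3:
    static_assert(andMask64(16, 10, 2097152) == 0x121A3950, "and w16, w10, #0x1FFFC0");

The same base opcode serves the two CacheLineAlignMask patches, where the mask value is RANDOMX_DATASET_BASE_SIZE - 64.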
@@ -179,7 +196,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
 	}
 	else
 	{
-		constexpr uint32_t tmp_reg = 21;
+		constexpr uint32_t tmp_reg = 18;
 		emitMovImmediate(tmp_reg, imm, code, k);
 
 		// add dst, src, tmp_reg
@@ -228,7 +245,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
 	uint32_t k = codePos;
 	uint32_t imm = instr.getImm32();
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 
 	imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
 	emitAddImmediate(tmp_reg, src, imm, code, k);
@@ -281,7 +298,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, int i, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// add dst, dst, tmp_reg
@@ -317,7 +334,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, int i, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// sub dst, dst, tmp_reg
@@ -335,7 +352,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, int i, uint32_t& codePos)
 
 	if (src == dst)
 	{
-		src = 21;
+		src = 18;
 		emitMovImmediate(src, instr.getImm32(), code, k);
 	}
 
@@ -352,7 +369,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, int i, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// mul dst, dst, tmp_reg
@@ -381,7 +398,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, int i, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// umulh dst, dst, tmp_reg
@@ -410,7 +427,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, int i, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// smulh dst, dst, tmp_reg
@@ -427,23 +444,33 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, int i, uint32_t& codePos)
 
 	uint32_t k = codePos;
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	const uint64_t N = 1ULL << 63;
+	constexpr uint64_t N = 1ULL << 63;
 	const uint64_t q = N / divisor;
 	const uint64_t r = N % divisor;
 	const uint64_t shift = 64 - __builtin_clzll(divisor);
 
+	const uint32_t literal_id = (InstructionsEnd - literalPos) / sizeof(uint64_t);
+
 	literalPos -= sizeof(uint64_t);
 	*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
 
-	// ldr tmp_reg, reciprocal
-	const uint32_t offset = (literalPos - k) / 4;
-	emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
+	if (literal_id < 10)
+	{
+		// mul dst, dst, literal_reg
+		emit32(ARMV8A::MUL | dst | (dst << 5) | ((30 - literal_id) << 16), code, k);
+	}
+	else
+	{
+		// ldr tmp_reg, reciprocal
+		const uint32_t offset = (literalPos - k) / 4;
+		emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
 
-	// mul dst, dst, src
-	emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
+		// mul dst, dst, tmp_reg
+		emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
+	}
 
 	codePos = k;
 }
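Note: in h_IMUL_RCP, the literal written into the pool is floor(2^(63 + shift) / divisor), computed without 128-bit arithmetic by splitting 2^63 = q * divisor + r. A self-contained sketch (the helper name reciprocal is illustrative; it is assumed to match what the handler computes inline):

    #include <cstdint>

    constexpr uint64_t reciprocal(uint64_t divisor)
    {
        // shift is the bit width of divisor, so the result uses all 64 bits.
        const uint64_t N = 1ULL << 63;
        const uint64_t q = N / divisor;
        const uint64_t r = N % divisor;
        const uint64_t shift = 64 - __builtin_clzll(divisor);
        return (q << shift) + ((r << shift) / divisor);
    }

    // divisor = 3: shift = 2, and floor(2^65 / 3) = 0xAAAAAAAAAAAAAAAA.
    static_assert(reciprocal(3) == 0xAAAAAAAAAAAAAAAAULL, "spot check");

The emitted MUL then multiplies dst by this constant modulo 2^64.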
@@ -465,7 +492,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, int i, uint32_t& codePos)
 
 	if (src == dst)
 	{
-		src = 21;
+		src = 18;
 		emitMovImmediate(src, instr.getImm32(), code, k);
 	}
 
@@ -482,7 +509,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, int i, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// eor dst, dst, tmp_reg
@@ -517,12 +544,12 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, int i, uint32_t& codePos)
 
 	if (src != dst)
 	{
-		constexpr uint32_t tmp_reg = 21;
+		constexpr uint32_t tmp_reg = 18;
 
 		// sub tmp_reg, xzr, src
 		emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
 
-		// ror dst, dst, src
+		// ror dst, dst, tmp_reg
 		emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k);
 	}
 	else
@@ -544,7 +571,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, int i, uint32_t& codePos)
 
 	uint32_t k = codePos;
 
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 	emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
 	emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
 	emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
@@ -658,13 +685,46 @@ void JitCompilerA64::h_FSQRT_R(Instruction& instr, int i, uint32_t& codePos)
 	emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos);
 }
 
+void JitCompilerA64::h_CBRANCH(Instruction& instr, int i, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t dst = IntRegMap[instr.dst];
+
+	codePos = k;
+}
+
+void JitCompilerA64::h_CFROUND(Instruction& instr, int i, uint32_t& codePos)
+{
+	uint32_t k = codePos;
+
+	const uint32_t src = IntRegMap[instr.src];
+
+	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t fpcr_tmp_reg = 8;
+
+	// ror tmp_reg, src, imm
+	emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
+
+	// bfi fpcr_tmp_reg, tmp_reg, 40, 2
+	emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
+
+	// rbit tmp_reg, fpcr_tmp_reg
+	emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k);
+
+	// msr fpcr, tmp_reg
+	emit32(0xD51B4400 | tmp_reg, code, k);
+
+	codePos = k;
+}
+
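Note: the CFROUND sequence relies on x8 holding a bit-reversed copy of fpcr (set up in the program prologue in jit_compiler_a64_static.S below). RandomX rounding modes 0 to 3 (nearest, down, up, toward zero) are exactly the 2-bit reversals of the ARM FPCR.RMode encodings (00 nearest, 01 up, 10 down, 11 toward zero), and RMode lives in FPCR bits 23:22, i.e. positions 40 and 41 of the reversed word. A sketch of the arithmetic (helper names are illustrative; mode tables as assumed from the RandomX spec and the Arm ARM):

    #include <cstdint>

    // Software model of the AArch64 RBIT instruction.
    constexpr uint64_t rbit64(uint64_t v)
    {
        uint64_t r = 0;
        for (int i = 0; i < 64; ++i)
            r |= ((v >> i) & 1) << (63 - i);
        return r;
    }

    // Models the emitted sequence; x8 already holds rbit64(fpcr).
    constexpr uint64_t applyCfround(uint64_t x8, uint64_t mode)
    {
        // bfi x8, tmp_reg, 40, 2: insert the 2-bit RandomX mode at bit 40
        x8 = (x8 & ~(3ULL << 40)) | ((mode & 3) << 40);
        // rbit tmp_reg, x8; msr fpcr, tmp_reg
        return rbit64(x8);
    }

    // RandomX mode 1 (round down) must yield FPCR.RMode = 0b10.
    static_assert(((applyCfround(rbit64(0), 1) >> 22) & 3) == 2, "reversed mapping");

This avoids any mode-translation table or branches in the generated code.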
 void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
 {
 	uint32_t k = codePos;
 
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
-	constexpr uint32_t tmp_reg = 21;
+	constexpr uint32_t tmp_reg = 18;
 
 	uint32_t imm = instr.getImm32();
 
@@ -683,7 +743,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
 	emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k);
 
 	// str src, [x2, tmp_reg]
-	emit32(0xf8356840 | src | (tmp_reg << 16), code, k);
+	emit32(0xF8206840 | src | (tmp_reg << 16), code, k);
 
 	codePos = k;
 }
diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp
index 6b69580..c59db13 100644
--- a/src/jit_compiler_a64.hpp
+++ b/src/jit_compiler_a64.hpp
@@ -122,6 +122,8 @@ namespace randomx {
 		void h_FMUL_R(Instruction&, int, uint32_t&);
 		void h_FDIV_M(Instruction&, int, uint32_t&);
 		void h_FSQRT_R(Instruction&, int, uint32_t&);
+		void h_CBRANCH(Instruction&, int, uint32_t&);
+		void h_CFROUND(Instruction&, int, uint32_t&);
 		void h_ISTORE(Instruction&, int, uint32_t&);
 		void h_NOP(Instruction&, int, uint32_t&);
 	};
diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S
index 60aeea7..5de5269 100644
--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@@ -28,8 +28,11 @@
 	.arch armv8-a
 	.text
 	.global randomx_program_aarch64
+	.global randomx_program_aarch64_main_loop
 	.global randomx_program_aarch64_vm_instructions
 	.global randomx_program_aarch64_vm_instructions_end
+	.global randomx_program_aarch64_cacheline_align_mask1
+	.global randomx_program_aarch64_cacheline_align_mask2
 	.global randomx_program_aarch64_update_spMix1
 	.global randomx_program_aarch64_end
 
@@ -43,7 +46,7 @@
 # x5 -> "r1"
 # x6 -> "r2"
 # x7 -> "r3"
-# x8 -> temporary
+# x8 -> fpcr (reversed bits)
 # x9 -> mx, ma
 # x10 -> spMix1
 # x11 -> spMix2
@@ -51,23 +54,23 @@
 # x13 -> "r5"
 # x14 -> "r6"
 # x15 -> "r7"
-# x16 -> ScratchpadL1Mask64
-# x17 -> ScratchpadL2Mask64
-# x18 -> ScratchpadL3Mask64
-# x19 -> spAddr0
-# x20 -> spAddr1
-# x21 -> temporary
-# x22 -> temporary
-# x23 -> temporary
-# x24 -> temporary
-# x25 -> temporary
-# x26 -> ScratchpadL1Mask8
-# x27 -> ScratchpadL2Mask8
-# x28 -> ScratchpadL3Mask8
-# x29 -> CacheLineAlignMask
-
-# v0-v7 -> temporary
-# v8-v15 -> not used
+# x16 -> spAddr0
+# x17 -> spAddr1
+# x18 -> temporary
+# x19 -> temporary
+# x20 -> temporary
+# x21 -> literal for IMUL_RCP
+# x22 -> literal for IMUL_RCP
+# x23 -> literal for IMUL_RCP
+# x24 -> literal for IMUL_RCP
+# x25 -> literal for IMUL_RCP
+# x26 -> literal for IMUL_RCP
+# x27 -> literal for IMUL_RCP
+# x28 -> literal for IMUL_RCP
+# x29 -> literal for IMUL_RCP
+# x30 -> literal for IMUL_RCP
+
+# v0-v15 -> not used
 # v16 -> "f0"
 # v17 -> "f1"
 # v18 -> "f2"
@@ -113,91 +116,98 @@ randomx_program_aarch64:
 	# Load initial spMix value
 	mov	x10, x9
 
-	# Load Scratchpad masks
-	mov	x16, 16384 - 64
-	mov	x17, 262144 - 64
-	mov	x18, 2097152 - 64
-	mov	x26, 16384 - 8
-	mov	x27, 262144 - 8
-	mov	x28, 2097152 - 8
-
-	# Load CacheLineAlignMask
-	mov	x29, 0x7FFFFFC0
-
 	# Load group A registers
 	ldp	q24, q25, [x0, 192]
 	ldp	q26, q27, [x0, 224]
 
 	# Load E 'and' mask
-	mov	x21, 0x00FFFFFFFFFFFFFF
-	ins	v29.d[0], x21
-	ins	v29.d[1], x21
+	mov	x16, 0x00FFFFFFFFFFFFFF
+	ins	v29.d[0], x16
+	ins	v29.d[1], x16
 
 	# Load E 'or' mask (stored in reg.f[0])
 	ldr	q30, [x0, 64]
 
 	# Load scale mask
-	mov	x21, 0x80f0000000000000
-	ins	v31.d[0], x21
-	ins	v31.d[1], x21
-
-main_loop:
+	mov	x16, 0x80f0000000000000
+	ins	v31.d[0], x16
+	ins	v31.d[1], x16
+
+	# Read fpcr
+	mrs	x8, fpcr
+	rbit	x8, x8
+
+	# Read literals
+	ldr	x21, literal_x21
+	ldr	x22, literal_x22
+	ldr	x23, literal_x23
+	ldr	x24, literal_x24
+	ldr	x25, literal_x25
+	ldr	x26, literal_x26
+	ldr	x27, literal_x27
+	ldr	x28, literal_x28
+	ldr	x29, literal_x29
+	ldr	x30, literal_x30
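Note: because literalPos counts down from InstructionsEnd, the first IMUL_RCP literal lands in literal_x30 (the slot right before randomx_program_aarch64_vm_instructions_end) and is multiplied directly from x30; the second comes from x29, and so on down to x21. Only the eleventh and later reciprocals fall back to an LDR through the temporary. A sketch of the mapping implied by the (30 - literal_id) expression in h_IMUL_RCP (hypothetical helper):

    #include <cstdint>

    // literal_id counts reciprocals in emission order: 0, 1, 2, ...
    // Returns the register preloaded with the literal, or 18 (tmp_reg)
    // when the value must instead be loaded with LDR from the pool.
    constexpr uint32_t literalRegister(uint32_t literal_id)
    {
        return (literal_id < 10) ? (30 - literal_id) : 18;
    }

    static_assert(literalRegister(0) == 30, "first literal sits in literal_x30");
    static_assert(literalRegister(9) == 21, "tenth literal sits in literal_x21");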
+
+randomx_program_aarch64_main_loop:
 	# spAddr0 = spMix1 & ScratchpadL3Mask64;
 	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
-	lsr	x21, x10, 32
-	and	w19, w10, w18
-	and	w20, w21, w18
+	lsr	x18, x10, 32
+
+	# Actual mask will be inserted by JIT compiler
+	and	w16, w10, 1
+	and	w17, w18, 1
 
-	# x19 = scratchpad + spAddr0
-	# x20 = scratchpad + spAddr1
-	add	x19, x19, x2
-	add	x20, x20, x2
+	# x16 = scratchpad + spAddr0
+	# x17 = scratchpad + spAddr1
+	add	x16, x16, x2
+	add	x17, x17, x2
 
 	# xor integer registers with scratchpad data (spAddr0)
-	ldp	x21, x22, [x19]
-	ldp	x23, x24, [x19, 16]
-	eor	x4, x4, x21
-	eor	x5, x5, x22
-	eor	x6, x6, x23
-	eor	x7, x7, x24
-	ldp	x21, x22, [x19, 32]
-	ldp	x23, x24, [x19, 48]
-	eor	x12, x12, x21
-	eor	x13, x13, x22
-	eor	x14, x14, x23
-	eor	x15, x15, x24
+	ldp	x18, x19, [x16]
+	eor	x4, x4, x18
+	eor	x5, x5, x19
+	ldp	x18, x19, [x16, 16]
+	eor	x6, x6, x18
+	eor	x7, x7, x19
+	ldp	x18, x19, [x16, 32]
+	eor	x12, x12, x18
+	eor	x13, x13, x19
+	ldp	x18, x19, [x16, 48]
+	eor	x14, x14, x18
+	eor	x15, x15, x19
 
 	# Load group F registers (spAddr1)
-	ldpsw	x21, x22, [x20]
-	ldpsw	x23, x24, [x20, 8]
-	ins	v16.d[0], x21
-	ins	v16.d[1], x22
-	ins	v17.d[0], x23
-	ins	v17.d[1], x24
-	ldpsw	x21, x22, [x20, 16]
-	ldpsw	x23, x24, [x20, 24]
-	ins	v18.d[0], x21
-	ins	v18.d[1], x22
-	ins	v19.d[0], x23
-	ins	v19.d[1], x24
+	ldpsw	x18, x19, [x17]
+	ins	v16.d[0], x18
+	ins	v16.d[1], x19
+	ldpsw	x18, x19, [x17, 8]
+	ins	v17.d[0], x18
+	ins	v17.d[1], x19
+	ldpsw	x18, x19, [x17, 16]
+	ins	v18.d[0], x18
+	ins	v18.d[1], x19
+	ldpsw	x18, x19, [x17, 24]
+	ins	v19.d[0], x18
+	ins	v19.d[1], x19
 	scvtf	v16.2d, v16.2d
 	scvtf	v17.2d, v17.2d
 	scvtf	v18.2d, v18.2d
 	scvtf	v19.2d, v19.2d
 
 	# Load group E registers (spAddr1)
-	ldpsw	x21, x22, [x20, 32]
-	ldpsw	x23, x24, [x20, 40]
-	ins	v20.d[0], x21
-	ins	v20.d[1], x22
-	ins	v21.d[0], x23
-	ins	v21.d[1], x24
-	ldpsw	x21, x22, [x20, 48]
-	ldpsw	x23, x24, [x20, 56]
-	ins	v22.d[0], x21
-	ins	v22.d[1], x22
-	ins	v23.d[0], x23
-	ins	v23.d[1], x24
+	ldpsw	x18, x19, [x17, 32]
+	ins	v20.d[0], x18
+	ins	v20.d[1], x19
+	ldpsw	x18, x19, [x17, 40]
+	ins	v21.d[0], x18
+	ins	v21.d[1], x19
+	ldpsw	x18, x19, [x17, 48]
+	ins	v22.d[0], x18
+	ins	v22.d[1], x19
+	ldpsw	x18, x19, [x17, 56]
+	ins	v23.d[0], x18
+	ins	v23.d[1], x19
 	scvtf	v20.2d, v20.2d
 	scvtf	v21.2d, v21.2d
 	scvtf	v22.2d, v22.2d
@@ -214,8 +224,38 @@ main_loop:
 
 	# Execute VM instructions
 randomx_program_aarch64_vm_instructions:
-	# 16 KB buffer for generated instructions
-	.fill 4096,4,0
+	# 12 KB buffer for generated instructions
+	.fill 3072,4,0
+
+literal_x21:
+	.fill 1,8,0
+
+literal_x22:
+	.fill 1,8,0
+
+literal_x23:
+	.fill 1,8,0
+
+literal_x24:
+	.fill 1,8,0
+
+literal_x25:
+	.fill 1,8,0
+
+literal_x26:
+	.fill 1,8,0
+
+literal_x27:
+	.fill 1,8,0
+
+literal_x28:
+	.fill 1,8,0
+
+literal_x29:
+	.fill 1,8,0
+
+literal_x30:
+	.fill 1,8,0
 
 randomx_program_aarch64_vm_instructions_end:
@@ -223,43 +263,47 @@ randomx_program_aarch64_vm_instructions_end:
 	eor	x9, x9, x11
 
 	# Calculate dataset pointer for dataset prefetch
-	mov	w25, w9
-	and	x25, x25, x29
-	add	x25, x25, x1
+	mov	w20, w9
+randomx_program_aarch64_cacheline_align_mask1:
+	# Actual mask will be inserted by JIT compiler
+	and	x20, x20, 1
+	add	x20, x20, x1
 
 	# Prefetch dataset data
-	prfm	pldl2strm, [x25]
+	prfm	pldl2strm, [x20]
 
 	# mx <-> ma
 	ror	x9, x9, 32
 
 	# Calculate dataset pointer for dataset read
-	mov	w25, w9
-	and	x25, x25, x29
-	add	x25, x25, x1
+	mov	w20, w9
+randomx_program_aarch64_cacheline_align_mask2:
+	# Actual mask will be inserted by JIT compiler
+	and	x20, x20, 1
+	add	x20, x20, x1
 
 	# xor integer registers with dataset data
-	ldp	x21, x22, [x25]
-	ldp	x23, x24, [x25, 16]
-	eor	x4, x4, x21
-	eor	x5, x5, x22
-	eor	x6, x6, x23
-	eor	x7, x7, x24
-	ldp	x21, x22, [x25, 32]
-	ldp	x23, x24, [x25, 48]
-	eor	x12, x12, x21
-	eor	x13, x13, x22
-	eor	x14, x14, x23
-	eor	x15, x15, x24
+	ldp	x18, x19, [x20]
+	eor	x4, x4, x18
+	eor	x5, x5, x19
+	ldp	x18, x19, [x20, 16]
+	eor	x6, x6, x18
+	eor	x7, x7, x19
+	ldp	x18, x19, [x20, 32]
+	eor	x12, x12, x18
+	eor	x13, x13, x19
+	ldp	x18, x19, [x20, 48]
+	eor	x14, x14, x18
+	eor	x15, x15, x19
 
 randomx_program_aarch64_update_spMix1:
 	eor	x10, x0, x0
 
 	# Store integer registers to scratchpad (spAddr1)
-	stp	x4, x5, [x20, 0]
-	stp	x6, x7, [x20, 16]
-	stp	x12, x13, [x20, 32]
-	stp	x14, x15, [x20, 48]
+	stp	x4, x5, [x17, 0]
+	stp	x6, x7, [x17, 16]
+	stp	x12, x13, [x17, 32]
+	stp	x14, x15, [x17, 48]
 
 	# xor group F and group E registers
 	eor	v16.16b, v16.16b, v20.16b
@@ -268,11 +312,11 @@ randomx_program_aarch64_update_spMix1:
 	eor	v19.16b, v19.16b, v23.16b
 
 	# Store FP registers to scratchpad (spAddr0)
-	stp	q16, q17, [x19, 0]
-	stp	q18, q19, [x19, 32]
+	stp	q16, q17, [x16, 0]
+	stp	q18, q19, [x16, 32]
 
 	subs	x3, x3, 1
-	bne	main_loop
+	bne	randomx_program_aarch64_main_loop
 
 	# Store integer registers
 	stp	x4, x5, [x0, 0]
diff --git a/src/jit_compiler_a64_static.hpp b/src/jit_compiler_a64_static.hpp
index 4f29a26..a8b0459 100644
--- a/src/jit_compiler_a64_static.hpp
+++ b/src/jit_compiler_a64_static.hpp
@@ -31,8 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 extern "C" {
 	void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
+	void randomx_program_aarch64_main_loop();
 	void randomx_program_aarch64_vm_instructions();
 	void randomx_program_aarch64_vm_instructions_end();
+	void randomx_program_aarch64_cacheline_align_mask1();
+	void randomx_program_aarch64_cacheline_align_mask2();
 	void randomx_program_aarch64_update_spMix1();
 	void randomx_program_aarch64_end();
 }
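A closing note on the ISTORE opcode fix above: str Xt, [Xn, Xm] (register offset, LSL #0) encodes as 0xF8206800 | Rt | (Rn << 5) | (Rm << 16). The old constant 0xf8356840 already had Rm = 21 baked in, which was only harmless while tmp_reg was x21; with tmp_reg now x18, OR-ing (tmp_reg << 16) into a dirty Rm field would corrupt the register number. A sketch of the check (hypothetical helper):

    #include <cstdint>

    // str Xt, [Xn, Xm] with option = LSL #0; Rn = 2 is the scratchpad base.
    constexpr uint32_t strRegOffset(uint32_t rt, uint32_t rn, uint32_t rm)
    {
        return 0xF8206800 | rt | (rn << 5) | (rm << 16);
    }

    static_assert(strRegOffset(0, 2, 21) == 0xF8356840, "old opcode had x21 baked in");
    static_assert(strRegOffset(0, 2, 18) == 0xF8326840, "tmp_reg = 18 needs the clean base");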