Added CFROUND

Also optimized register allocation
armv8-a-jit
SChernykh 5 years ago
parent e49499043c
commit 3a6993290c

@ -113,13 +113,13 @@ Total sum of frequencies must be 256
//Control instructions //Control instructions
#define RANDOMX_FREQ_CBRANCH 0 #define RANDOMX_FREQ_CBRANCH 0
#define RANDOMX_FREQ_CFROUND 0 #define RANDOMX_FREQ_CFROUND 1
//Store instruction //Store instruction
#define RANDOMX_FREQ_ISTORE 16 #define RANDOMX_FREQ_ISTORE 16
//No-op instruction //No-op instruction
#define RANDOMX_FREQ_NOP 26 #define RANDOMX_FREQ_NOP 25
/* ------ /* ------
256 256
*/ */

@ -64,6 +64,7 @@ constexpr uint32_t FSQRT = 0x6EE1F800;
namespace randomx { namespace randomx {
static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64); static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64); static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64);
@ -91,7 +92,15 @@ void JitCompilerA64::enableAll()
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
{ {
uint32_t codePos = PrologueSize; uint32_t codePos = MainLoopBegin + 4;
// and w16, w10, ScratchpadL3Mask64
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
// and w17, w18, ScratchpadL3Mask64
emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
codePos = PrologueSize;
literalPos = InstructionsEnd; literalPos = InstructionsEnd;
for (uint32_t i = 0; i < program.getSize(); ++i) for (uint32_t i = 0; i < program.getSize(); ++i)
@ -110,13 +119,21 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
emit32(ARMV8A::B | (offset / 4), code, codePos); emit32(ARMV8A::B | (offset / 4), code, codePos);
// and w20, w20, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
// and w20, w20, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
// Update spMix1 // Update spMix1
// eor x10, config.readReg0, config.readReg1 // eor x10, config.readReg0, config.readReg1
codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64); codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos); emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
#ifdef __GNUC__ #ifdef __GNUC__
__builtin___clear_cache(reinterpret_cast<char*>(code + PrologueSize), reinterpret_cast<char*>(code + codePos)); __builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
#endif #endif
} }
@ -179,7 +196,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
} }
else else
{ {
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMovImmediate(tmp_reg, imm, code, k); emitMovImmediate(tmp_reg, imm, code, k);
// add dst, src, tmp_reg // add dst, src, tmp_reg
@ -228,7 +245,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
uint32_t k = codePos; uint32_t k = codePos;
uint32_t imm = instr.getImm32(); uint32_t imm = instr.getImm32();
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
emitAddImmediate(tmp_reg, src, imm, code, k); emitAddImmediate(tmp_reg, src, imm, code, k);
@ -281,7 +298,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// add dst, dst, tmp_reg // add dst, dst, tmp_reg
@ -317,7 +334,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // sub dst, dst, tmp_reg
@ -335,7 +352,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, int i, uint32_t& codePos)
if (src == dst) if (src == dst)
{ {
src = 21; src = 18;
emitMovImmediate(src, instr.getImm32(), code, k); emitMovImmediate(src, instr.getImm32(), code, k);
} }
@ -352,7 +369,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // sub dst, dst, tmp_reg
@ -381,7 +398,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// umulh dst, dst, tmp_reg // umulh dst, dst, tmp_reg
@ -410,7 +427,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// smulh dst, dst, tmp_reg // smulh dst, dst, tmp_reg
@ -427,23 +444,33 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, int i, uint32_t& codePos)
uint32_t k = codePos; uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
const uint64_t N = 1ULL << 63; constexpr uint64_t N = 1ULL << 63;
const uint64_t q = N / divisor; const uint64_t q = N / divisor;
const uint64_t r = N % divisor; const uint64_t r = N % divisor;
const uint64_t shift = 64 - __builtin_clzll(divisor); const uint64_t shift = 64 - __builtin_clzll(divisor);
const uint32_t literal_id = (InstructionsEnd - literalPos) / sizeof(uint64_t);
literalPos -= sizeof(uint64_t); literalPos -= sizeof(uint64_t);
*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
// ldr tmp_reg, reciprocal if (literal_id < 10)
const uint32_t offset = (literalPos - k) / 4; {
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k); // mul dst, dst, literal_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | ((30 - literal_id) << 16), code, k);
}
else
{
// ldr tmp_reg, reciprocal
const uint32_t offset = (literalPos - k) / 4;
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
// mul dst, dst, src // mul dst, dst, src
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
}
codePos = k; codePos = k;
} }
@ -465,7 +492,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, int i, uint32_t& codePos)
if (src == dst) if (src == dst)
{ {
src = 21; src = 18;
emitMovImmediate(src, instr.getImm32(), code, k); emitMovImmediate(src, instr.getImm32(), code, k);
} }
@ -482,7 +509,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // sub dst, dst, tmp_reg
@ -517,12 +544,12 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, int i, uint32_t& codePos)
if (src != dst) if (src != dst)
{ {
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
// sub tmp_reg, xzr, src // sub tmp_reg, xzr, src
emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
// ror dst, dst, src // ror dst, dst, tmp_reg
emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k); emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k);
} }
else else
@ -544,7 +571,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, int i, uint32_t& codePos)
uint32_t k = codePos; uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
@ -658,13 +685,46 @@ void JitCompilerA64::h_FSQRT_R(Instruction& instr, int i, uint32_t& codePos)
emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos); emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos);
} }
// Stub handler for the CBRANCH VM instruction. RANDOMX_FREQ_CBRANCH is 0 in
// this configuration, so the instruction is never selected and no machine
// code is emitted yet; the handler only preserves the codePos contract used
// by every other h_* handler (copy in, write back).
void JitCompilerA64::h_CBRANCH(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
// NOTE(review): the value is named `dst` but is read from instr.src — looks
// like a copy-paste placeholder; confirm IntRegMap[instr.dst] is intended
// when this handler is actually implemented. Currently unused.
const uint32_t dst = IntRegMap[instr.src];
codePos = k;
}
// CFROUND: set the FPU rounding mode from 2 bits of an integer register.
// Rotates src right by imm (mod 64), inserts the resulting low bits into the
// bit-reversed FPCR shadow kept in x8 (see "# x8 -> fpcr (reversed bits)" in
// the asm prologue, which does `mrs x8, fpcr; rbit x8, x8`), then un-reverses
// and writes the real FPCR via MSR. Each emit32 writes one raw A64 encoding.
void JitCompilerA64::h_CFROUND(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
// x18 is the designated scratch register for generated code in this layout.
constexpr uint32_t tmp_reg = 18;
// x8 holds the bit-reversed copy of FPCR maintained across the main loop.
constexpr uint32_t fprc_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fprc_tmp_reg, tmp_reg, 40, 2
// Inserts the 2 rounding-mode bits into the reversed shadow; keeping the
// shadow bit-reversed lets a single fixed-position BFI do the update.
// NOTE(review): 0xB3580400 is a hand-assembled BFM — verify the immr/imms
// fields match the claimed (lsb=40, width=2) against the Arm ARM.
emit32(0xB3580400 | fprc_tmp_reg | (tmp_reg << 5), code, k);
// rbit tmp_reg, fprc_tmp_reg
emit32(0xDAC00000 | tmp_reg | (fprc_tmp_reg << 5), code, k);
// msr fpcr, tmp_reg
emit32(0xD51B4400 | tmp_reg, code, k);
codePos = k;
}
void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos) void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
{ {
uint32_t k = codePos; uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src]; const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst]; const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21; constexpr uint32_t tmp_reg = 18;
uint32_t imm = instr.getImm32(); uint32_t imm = instr.getImm32();
@ -683,7 +743,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k); emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k);
// str src, [x2, tmp_reg] // str src, [x2, tmp_reg]
emit32(0xf8356840 | src | (tmp_reg << 16), code, k); emit32(0xF8206840 | src | (tmp_reg << 16), code, k);
codePos = k; codePos = k;
} }

@ -122,6 +122,8 @@ namespace randomx {
void h_FMUL_R(Instruction&, int, uint32_t&); void h_FMUL_R(Instruction&, int, uint32_t&);
void h_FDIV_M(Instruction&, int, uint32_t&); void h_FDIV_M(Instruction&, int, uint32_t&);
void h_FSQRT_R(Instruction&, int, uint32_t&); void h_FSQRT_R(Instruction&, int, uint32_t&);
void h_CBRANCH(Instruction&, int, uint32_t&);
void h_CFROUND(Instruction&, int, uint32_t&);
void h_ISTORE(Instruction&, int, uint32_t&); void h_ISTORE(Instruction&, int, uint32_t&);
void h_NOP(Instruction&, int, uint32_t&); void h_NOP(Instruction&, int, uint32_t&);
}; };

@ -28,8 +28,11 @@
.arch armv8-a .arch armv8-a
.text .text
.global randomx_program_aarch64 .global randomx_program_aarch64
.global randomx_program_aarch64_main_loop
.global randomx_program_aarch64_vm_instructions .global randomx_program_aarch64_vm_instructions
.global randomx_program_aarch64_vm_instructions_end .global randomx_program_aarch64_vm_instructions_end
.global randomx_program_aarch64_cacheline_align_mask1
.global randomx_program_aarch64_cacheline_align_mask2
.global randomx_program_aarch64_update_spMix1 .global randomx_program_aarch64_update_spMix1
.global randomx_program_aarch64_end .global randomx_program_aarch64_end
@ -43,7 +46,7 @@
# x5 -> "r1" # x5 -> "r1"
# x6 -> "r2" # x6 -> "r2"
# x7 -> "r3" # x7 -> "r3"
# x8 -> temporary # x8 -> fpcr (reversed bits)
# x9 -> mx, ma # x9 -> mx, ma
# x10 -> spMix1 # x10 -> spMix1
# x11 -> spMix2 # x11 -> spMix2
@ -51,23 +54,23 @@
# x13 -> "r5" # x13 -> "r5"
# x14 -> "r6" # x14 -> "r6"
# x15 -> "r7" # x15 -> "r7"
# x16 -> ScratchpadL1Mask64 # x16 -> spAddr0
# x17 -> ScratchpadL2Mask64 # x17 -> spAddr1
# x18 -> ScratchpadL3Mask64 # x18 -> temporary
# x19 -> spAddr0 # x19 -> temporary
# x20 -> spAddr1 # x20 -> temporary
# x21 -> temporary # x21 -> literal for IMUL_RCP
# x22 -> temporary # x22 -> literal for IMUL_RCP
# x23 -> temporary # x23 -> literal for IMUL_RCP
# x24 -> temporary # x24 -> literal for IMUL_RCP
# x25 -> temporary # x25 -> literal for IMUL_RCP
# x26 -> ScratchpadL1Mask8 # x26 -> literal for IMUL_RCP
# x27 -> ScratchpadL2Mask8 # x27 -> literal for IMUL_RCP
# x28 -> ScratchpadL3Mask8 # x28 -> literal for IMUL_RCP
# x29 -> CacheLineAlignMask # x29 -> literal for IMUL_RCP
# x30 -> literal for IMUL_RCP
# v0-v7 -> temporary
# v8-v15 -> not used # v0-v15 -> not used
# v16 -> "f0" # v16 -> "f0"
# v17 -> "f1" # v17 -> "f1"
# v18 -> "f2" # v18 -> "f2"
@ -113,91 +116,98 @@ randomx_program_aarch64:
# Load initial spMix value # Load initial spMix value
mov x10, x9 mov x10, x9
# Load Scratchpad masks
mov x16, 16384 - 64
mov x17, 262144 - 64
mov x18, 2097152 - 64
mov x26, 16384 - 8
mov x27, 262144 - 8
mov x28, 2097152 - 8
# Load CacheLineAlignMask
mov x29, 0x7FFFFFC0
# Load group A registers # Load group A registers
ldp q24, q25, [x0, 192] ldp q24, q25, [x0, 192]
ldp q26, q27, [x0, 224] ldp q26, q27, [x0, 224]
# Load E 'and' mask # Load E 'and' mask
mov x21, 0x00FFFFFFFFFFFFFF mov x16, 0x00FFFFFFFFFFFFFF
ins v29.d[0], x21 ins v29.d[0], x16
ins v29.d[1], x21 ins v29.d[1], x16
# Load E 'or' mask (stored in reg.f[0]) # Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64] ldr q30, [x0, 64]
# Load scale mask # Load scale mask
mov x21, 0x80f0000000000000 mov x16, 0x80f0000000000000
ins v31.d[0], x21 ins v31.d[0], x16
ins v31.d[1], x21 ins v31.d[1], x16
main_loop: # Read fpcr
mrs x8, fpcr
rbit x8, x8
# Read literals
ldr x21, literal_x21
ldr x22, literal_x22
ldr x23, literal_x23
ldr x24, literal_x24
ldr x25, literal_x25
ldr x26, literal_x26
ldr x27, literal_x27
ldr x28, literal_x28
ldr x29, literal_x29
ldr x30, literal_x30
randomx_program_aarch64_main_loop:
# spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
lsr x21, x10, 32 lsr x18, x10, 32
and w19, w10, w18
and w20, w21, w18 # Actual mask will be inserted by JIT compiler
and w16, w10, 1
and w17, w18, 1
# x19 = scratchpad + spAddr0 # x16 = scratchpad + spAddr0
# x20 = scratchpad + spAddr1 # x17 = scratchpad + spAddr1
add x19, x19, x2 add x16, x16, x2
add x20, x20, x2 add x17, x17, x2
# xor integer registers with scratchpad data (spAddr0) # xor integer registers with scratchpad data (spAddr0)
ldp x21, x22, [x19] ldp x18, x19, [x16]
ldp x23, x24, [x19, 16] eor x4, x4, x18
eor x4, x4, x21 eor x5, x5, x19
eor x5, x5, x22 ldp x18, x19, [x16, 16]
eor x6, x6, x23 eor x6, x6, x18
eor x7, x7, x24 eor x7, x7, x19
ldp x21, x22, [x19, 32] ldp x18, x19, [x16, 32]
ldp x23, x24, [x19, 48] eor x12, x12, x18
eor x12, x12, x21 eor x13, x13, x19
eor x13, x13, x22 ldp x18, x19, [x16, 48]
eor x14, x14, x23 eor x14, x14, x18
eor x15, x15, x24 eor x15, x15, x19
# Load group F registers (spAddr1) # Load group F registers (spAddr1)
ldpsw x21, x22, [x20] ldpsw x18, x19, [x17]
ldpsw x23, x24, [x20, 8] ins v16.d[0], x18
ins v16.d[0], x21 ins v16.d[1], x19
ins v16.d[1], x22 ldpsw x18, x19, [x17, 8]
ins v17.d[0], x23 ins v17.d[0], x18
ins v17.d[1], x24 ins v17.d[1], x19
ldpsw x21, x22, [x20, 16] ldpsw x18, x19, [x17, 16]
ldpsw x23, x24, [x20, 24] ins v18.d[0], x18
ins v18.d[0], x21 ins v18.d[1], x19
ins v18.d[1], x22 ldpsw x18, x19, [x17, 24]
ins v19.d[0], x23 ins v19.d[0], x18
ins v19.d[1], x24 ins v19.d[1], x19
scvtf v16.2d, v16.2d scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d scvtf v19.2d, v19.2d
# Load group E registers (spAddr1) # Load group E registers (spAddr1)
ldpsw x21, x22, [x20, 32] ldpsw x18, x19, [x17, 32]
ldpsw x23, x24, [x20, 40] ins v20.d[0], x18
ins v20.d[0], x21 ins v20.d[1], x19
ins v20.d[1], x22 ldpsw x18, x19, [x17, 40]
ins v21.d[0], x23 ins v21.d[0], x18
ins v21.d[1], x24 ins v21.d[1], x19
ldpsw x21, x22, [x20, 48] ldpsw x18, x19, [x17, 48]
ldpsw x23, x24, [x20, 56] ins v22.d[0], x18
ins v22.d[0], x21 ins v22.d[1], x19
ins v22.d[1], x22 ldpsw x18, x19, [x17, 56]
ins v23.d[0], x23 ins v23.d[0], x18
ins v23.d[1], x24 ins v23.d[1], x19
scvtf v20.2d, v20.2d scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d scvtf v22.2d, v22.2d
@ -214,8 +224,38 @@ main_loop:
# Execute VM instructions # Execute VM instructions
randomx_program_aarch64_vm_instructions: randomx_program_aarch64_vm_instructions:
# 16 KB buffer for generated instructions # 12 KB buffer for generated instructions
.fill 4096,4,0 .fill 3072,4,0
literal_x21:
.fill 1,8,0
literal_x22:
.fill 1,8,0
literal_x23:
.fill 1,8,0
literal_x24:
.fill 1,8,0
literal_x25:
.fill 1,8,0
literal_x26:
.fill 1,8,0
literal_x27:
.fill 1,8,0
literal_x28:
.fill 1,8,0
literal_x29:
.fill 1,8,0
literal_x30:
.fill 1,8,0
randomx_program_aarch64_vm_instructions_end: randomx_program_aarch64_vm_instructions_end:
@ -223,43 +263,47 @@ randomx_program_aarch64_vm_instructions_end:
eor x9, x9, x11 eor x9, x9, x11
# Calculate dataset pointer for dataset prefetch # Calculate dataset pointer for dataset prefetch
mov w25, w9 mov w20, w9
and x25, x25, x29 randomx_program_aarch64_cacheline_align_mask1:
add x25, x25, x1 # Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# Prefetch dataset data # Prefetch dataset data
prfm pldl2strm, [x25] prfm pldl2strm, [x20]
# mx <-> ma # mx <-> ma
ror x9, x9, 32 ror x9, x9, 32
# Calculate dataset pointer for dataset read # Calculate dataset pointer for dataset read
mov w25, w9 mov w20, w9
and x25, x25, x29 randomx_program_aarch64_cacheline_align_mask2:
add x25, x25, x1 # Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# xor integer registers with dataset data # xor integer registers with dataset data
ldp x21, x22, [x25] ldp x18, x19, [x20]
ldp x23, x24, [x25, 16] eor x4, x4, x18
eor x4, x4, x21 eor x5, x5, x19
eor x5, x5, x22 ldp x18, x19, [x20, 16]
eor x6, x6, x23 eor x6, x6, x18
eor x7, x7, x24 eor x7, x7, x19
ldp x21, x22, [x25, 32] ldp x18, x19, [x20, 32]
ldp x23, x24, [x25, 48] eor x12, x12, x18
eor x12, x12, x21 eor x13, x13, x19
eor x13, x13, x22 ldp x18, x19, [x20, 48]
eor x14, x14, x23 eor x14, x14, x18
eor x15, x15, x24 eor x15, x15, x19
randomx_program_aarch64_update_spMix1: randomx_program_aarch64_update_spMix1:
eor x10, x0, x0 eor x10, x0, x0
# Store integer registers to scratchpad (spAddr1) # Store integer registers to scratchpad (spAddr1)
stp x4, x5, [x20, 0] stp x4, x5, [x17, 0]
stp x6, x7, [x20, 16] stp x6, x7, [x17, 16]
stp x12, x13, [x20, 32] stp x12, x13, [x17, 32]
stp x14, x15, [x20, 48] stp x14, x15, [x17, 48]
# xor group F and group E registers # xor group F and group E registers
eor v16.16b, v16.16b, v20.16b eor v16.16b, v16.16b, v20.16b
@ -268,11 +312,11 @@ randomx_program_aarch64_update_spMix1:
eor v19.16b, v19.16b, v23.16b eor v19.16b, v19.16b, v23.16b
# Store FP registers to scratchpad (spAddr0) # Store FP registers to scratchpad (spAddr0)
stp q16, q17, [x19, 0] stp q16, q17, [x16, 0]
stp q18, q19, [x19, 32] stp q18, q19, [x16, 32]
subs x3, x3, 1 subs x3, x3, 1
bne main_loop bne randomx_program_aarch64_main_loop
# Store integer registers # Store integer registers
stp x4, x5, [x0, 0] stp x4, x5, [x0, 0]

@ -31,8 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" { extern "C" {
void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
void randomx_program_aarch64_main_loop();
void randomx_program_aarch64_vm_instructions(); void randomx_program_aarch64_vm_instructions();
void randomx_program_aarch64_vm_instructions_end(); void randomx_program_aarch64_vm_instructions_end();
void randomx_program_aarch64_cacheline_align_mask1();
void randomx_program_aarch64_cacheline_align_mask2();
void randomx_program_aarch64_update_spMix1(); void randomx_program_aarch64_update_spMix1();
void randomx_program_aarch64_end(); void randomx_program_aarch64_end();
} }

Loading…
Cancel
Save