Added CFROUND

Also optimized register allocation
armv8-a-jit
SChernykh 5 years ago
parent e49499043c
commit 3a6993290c

@@ -113,13 +113,13 @@ Total sum of frequencies must be 256
//Control instructions
#define RANDOMX_FREQ_CBRANCH 0
#define RANDOMX_FREQ_CFROUND 0
#define RANDOMX_FREQ_CFROUND 1
//Store instruction
#define RANDOMX_FREQ_ISTORE 16
//No-op instruction
#define RANDOMX_FREQ_NOP 26
#define RANDOMX_FREQ_NOP 25
/* ------
256
*/
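
CFROUND's slot is paid for by NOP (26 -> 25), so the opcode budget stays at 256. A minimal compile-time sanity check, assuming the values in the hunk above; kOtherFreqs is a hypothetical stand-in for every RANDOMX_FREQ_* value not shown here:

// Sketch only: every RandomX opcode byte (0-255) must map to exactly one
// instruction, so all the frequency defines have to total 256.
constexpr int kOtherFreqs = 256 - (0 + 0 + 16 + 26); // everything outside this hunk, pre-change
static_assert(kOtherFreqs + RANDOMX_FREQ_CBRANCH + RANDOMX_FREQ_CFROUND
            + RANDOMX_FREQ_ISTORE + RANDOMX_FREQ_NOP == 256,
              "Total sum of frequencies must be 256");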

@@ -64,6 +64,7 @@ constexpr uint32_t FSQRT = 0x6EE1F800;
namespace randomx {
static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64);
@@ -91,7 +92,15 @@ void JitCompilerA64::enableAll()
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
{
uint32_t codePos = PrologueSize;
uint32_t codePos = MainLoopBegin + 4;
// and w16, w10, ScratchpadL3Mask64
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
// and w17, w18, ScratchpadL3Mask64
emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
codePos = PrologueSize;
literalPos = InstructionsEnd;
for (uint32_t i = 0; i < program.getSize(); ++i)
@@ -110,13 +119,21 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
emit32(ARMV8A::B | (offset / 4), code, codePos);
// and w20, w20, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
// and w20, w20, CacheLineAlignMask
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
// Update spMix1
// eor x10, config.readReg0, config.readReg1
codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
#ifdef __GNUC__
__builtin___clear_cache(reinterpret_cast<char*>(code + PrologueSize), reinterpret_cast<char*>(code + codePos));
__builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
#endif
}
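
generateProgram now patches the two placeholder `and` instructions at the head of the main loop with the real ScratchpadL3Mask64, which is also why the __builtin___clear_cache range starts at MainLoopBegin instead of PrologueSize. A sketch of what 0x121A0000 | ... encodes, assuming RANDOMX_SCRATCHPAD_L3 = 2 MiB (andImmMask is a hypothetical helper, not from this repo):

#include <cstdint>
// AND (immediate), 32-bit: the base 0x121A0000 carries immr = 26; OR-ing in
// (Log2(size) - 7) << 10 sets imms, producing a bitmask of imms + 1 one-bits
// rotated into bits 6..Log2(size)-1, i.e. (size - 64) with the low 6 bits clear.
constexpr uint32_t andImmMask(uint32_t log2size) {
    const uint32_t ones = (log2size - 7) + 1;   // imms + 1 one-bits
    return ((1u << ones) - 1u) << 6;            // rotate right by immr = 26 == left by 6
}
static_assert(andImmMask(21) == 2097152 - 64, "ScratchpadL3Mask64 for a 2 MiB scratchpad");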
@@ -179,7 +196,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
}
else
{
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMovImmediate(tmp_reg, imm, code, k);
// add dst, src, tmp_reg
@@ -228,7 +245,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
uint32_t k = codePos;
uint32_t imm = instr.getImm32();
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
emitAddImmediate(tmp_reg, src, imm, code, k);
@@ -281,7 +298,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// add dst, dst, tmp_reg
@@ -317,7 +334,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
@@ -335,7 +352,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, int i, uint32_t& codePos)
if (src == dst)
{
src = 21;
src = 18;
emitMovImmediate(src, instr.getImm32(), code, k);
}
@@ -352,7 +369,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// mul dst, dst, tmp_reg
@@ -381,7 +398,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// umulh dst, dst, tmp_reg
@@ -410,7 +427,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// smulh dst, dst, tmp_reg
@@ -427,23 +444,33 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, int i, uint32_t& codePos)
uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
const uint32_t dst = IntRegMap[instr.dst];
const uint64_t N = 1ULL << 63;
constexpr uint64_t N = 1ULL << 63;
const uint64_t q = N / divisor;
const uint64_t r = N % divisor;
const uint64_t shift = 64 - __builtin_clzll(divisor);
const uint32_t literal_id = (InstructionsEnd - literalPos) / sizeof(uint64_t);
literalPos -= sizeof(uint64_t);
*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
// ldr tmp_reg, reciprocal
const uint32_t offset = (literalPos - k) / 4;
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
if (literal_id < 10)
{
// mul dst, dst, literal_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | ((30 - literal_id) << 16), code, k);
}
else
{
// ldr tmp_reg, reciprocal
const uint32_t offset = (literalPos - k) / 4;
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, k);
// mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
// mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
}
codePos = k;
}
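
The literal written here is the fixed-point reciprocal floor(2^(63 + shift) / divisor), computed in two steps to avoid 128-bit division; the first 10 literals stay pre-loaded in x30 down to x21 (register 30 - literal_id), and only later ones fall back to LDR (literal). A standalone sketch of the same computation, assuming a non-power-of-two divisor:

#include <cstdint>
// With N = 2^63, q = N / d and r = N % d, (q << shift) + ((r << shift) / d)
// equals floor(2^(63 + shift) / d), where shift = bit-width of d. IMUL_RCP
// then replaces the division by a multiply with this 64-bit constant.
static uint64_t reciprocal(uint64_t divisor) {
    const uint64_t N = 1ULL << 63;
    const uint64_t q = N / divisor;
    const uint64_t r = N % divisor;
    const uint64_t shift = 64 - __builtin_clzll(divisor);
    return (q << shift) + ((r << shift) / divisor);
}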
@@ -465,7 +492,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, int i, uint32_t& codePos)
if (src == dst)
{
src = 21;
src = 18;
emitMovImmediate(src, instr.getImm32(), code, k);
}
@@ -482,7 +509,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, int i, uint32_t& codePos)
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// eor dst, dst, tmp_reg
@@ -517,12 +544,12 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, int i, uint32_t& codePos)
if (src != dst)
{
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
// sub tmp_reg, xzr, src
emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
// ror dst, dst, src
// ror dst, dst, tmp_reg
emit32(ARMV8A::ROR | dst | (dst << 5) | (tmp_reg << 16), code, k);
}
else
@@ -544,7 +571,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, int i, uint32_t& codePos)
uint32_t k = codePos;
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
@@ -658,13 +685,46 @@ void JitCompilerA64::h_FSQRT_R(Instruction& instr, int i, uint32_t& codePos)
emit32(ARMV8A::FSQRT | dst | (dst << 5), code, codePos);
}
void JitCompilerA64::h_CBRANCH(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t dst = IntRegMap[instr.dst];
codePos = k;
}
void JitCompilerA64::h_CFROUND(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
constexpr uint32_t tmp_reg = 18;
constexpr uint32_t fprc_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fprc_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fprc_tmp_reg | (tmp_reg << 5), code, k);
// rbit tmp_reg, fprc_tmp_reg
emit32(0xDAC00000 | tmp_reg | (fprc_tmp_reg << 5), code, k);
// msr fpcr, tmp_reg
emit32(0xD51B4400 | tmp_reg, code, k);
codePos = k;
}
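
h_CFROUND works on a bit-reversed copy of fpcr kept in x8: after RBIT, FPCR.RMode (bits 23:22) lands in bits 40:41, so the single `bfi` writes the source's low mode bit into FPCR bit 23 and its high bit into FPCR bit 22, swapping the two bits. That swap is exactly the difference between the RandomX and ARM rounding-mode encodings; a sketch of the mapping (armRMode is a hypothetical helper, not from this repo):

// RandomX fprc: 0 = nearest, 1 = toward -inf, 2 = toward +inf, 3 = toward zero.
// ARM FPCR.RMode: 00 = RN, 01 = RP (+inf), 10 = RM (-inf), 11 = RZ.
static unsigned armRMode(unsigned fprc) {
    const unsigned m = fprc & 3;
    return ((m & 1) << 1) | (m >> 1);  // swap the two bits, as the rbit/bfi pair does
}
// armRMode(1) == 0b10 (RM) and armRMode(2) == 0b01 (RP); 0 and 3 map to themselves.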
void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
{
uint32_t k = codePos;
const uint32_t src = IntRegMap[instr.src];
const uint32_t dst = IntRegMap[instr.dst];
constexpr uint32_t tmp_reg = 21;
constexpr uint32_t tmp_reg = 18;
uint32_t imm = instr.getImm32();
@@ -683,7 +743,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, int i, uint32_t& codePos)
emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andInstrL1 : andInstrL2) : andInstrL3, code, k);
// str src, [x2, tmp_reg]
emit32(0xf8356840 | src | (tmp_reg << 16), code, k);
emit32(0xF8206840 | src | (tmp_reg << 16), code, k);
codePos = k;
}
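
The 0xf8356840 -> 0xF8206840 change matters because the old STR base constant already had Rm = 21 (the old x21 temporary) baked into bits 20:16, so OR-ing the new tmp_reg = 18 into it would have yielded Rm = 21 | 18 = 23. The new base leaves the Rm field zero and the OR inserts the register cleanly:

// Sanity check on the Rm field (bits 20:16) of "str Xt, [x2, Xm]":
static_assert(((0xf8356840u >> 16) & 31) == 21, "old base had x21 hard-coded");
static_assert(((0xF8206840u >> 16) & 31) == 0,  "new base keeps Rm clear for tmp_reg");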

@@ -122,6 +122,8 @@ namespace randomx {
void h_FMUL_R(Instruction&, int, uint32_t&);
void h_FDIV_M(Instruction&, int, uint32_t&);
void h_FSQRT_R(Instruction&, int, uint32_t&);
void h_CBRANCH(Instruction&, int, uint32_t&);
void h_CFROUND(Instruction&, int, uint32_t&);
void h_ISTORE(Instruction&, int, uint32_t&);
void h_NOP(Instruction&, int, uint32_t&);
};

@@ -28,8 +28,11 @@
.arch armv8-a
.text
.global randomx_program_aarch64
.global randomx_program_aarch64_main_loop
.global randomx_program_aarch64_vm_instructions
.global randomx_program_aarch64_vm_instructions_end
.global randomx_program_aarch64_cacheline_align_mask1
.global randomx_program_aarch64_cacheline_align_mask2
.global randomx_program_aarch64_update_spMix1
.global randomx_program_aarch64_end
@@ -43,7 +46,7 @@
# x5 -> "r1"
# x6 -> "r2"
# x7 -> "r3"
# x8 -> temporary
# x8 -> fpcr (reversed bits)
# x9 -> mx, ma
# x10 -> spMix1
# x11 -> spMix2
@@ -51,23 +54,23 @@
# x13 -> "r5"
# x14 -> "r6"
# x15 -> "r7"
# x16 -> ScratchpadL1Mask64
# x17 -> ScratchpadL2Mask64
# x18 -> ScratchpadL3Mask64
# x19 -> spAddr0
# x20 -> spAddr1
# x21 -> temporary
# x22 -> temporary
# x23 -> temporary
# x24 -> temporary
# x25 -> temporary
# x26 -> ScratchpadL1Mask8
# x27 -> ScratchpadL2Mask8
# x28 -> ScratchpadL3Mask8
# x29 -> CacheLineAlignMask
# v0-v7 -> temporary
# v8-v15 -> not used
# x16 -> spAddr0
# x17 -> spAddr1
# x18 -> temporary
# x19 -> temporary
# x20 -> temporary
# x21 -> literal for IMUL_RCP
# x22 -> literal for IMUL_RCP
# x23 -> literal for IMUL_RCP
# x24 -> literal for IMUL_RCP
# x25 -> literal for IMUL_RCP
# x26 -> literal for IMUL_RCP
# x27 -> literal for IMUL_RCP
# x28 -> literal for IMUL_RCP
# x29 -> literal for IMUL_RCP
# x30 -> literal for IMUL_RCP
# v0-v15 -> not used
# v16 -> "f0"
# v17 -> "f1"
# v18 -> "f2"
@@ -113,91 +116,98 @@ randomx_program_aarch64:
# Load initial spMix value
mov x10, x9
# Load Scratchpad masks
mov x16, 16384 - 64
mov x17, 262144 - 64
mov x18, 2097152 - 64
mov x26, 16384 - 8
mov x27, 262144 - 8
mov x28, 2097152 - 8
# Load CacheLineAlignMask
mov x29, 0x7FFFFFC0
# Load group A registers
ldp q24, q25, [x0, 192]
ldp q26, q27, [x0, 224]
# Load E 'and' mask
mov x21, 0x00FFFFFFFFFFFFFF
ins v29.d[0], x21
ins v29.d[1], x21
mov x16, 0x00FFFFFFFFFFFFFF
ins v29.d[0], x16
ins v29.d[1], x16
# Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64]
# Load scale mask
mov x21, 0x80f0000000000000
ins v31.d[0], x21
ins v31.d[1], x21
main_loop:
mov x16, 0x80f0000000000000
ins v31.d[0], x16
ins v31.d[1], x16
# Read fpcr
mrs x8, fpcr
rbit x8, x8
# Read literals
ldr x21, literal_x21
ldr x22, literal_x22
ldr x23, literal_x23
ldr x24, literal_x24
ldr x25, literal_x25
ldr x26, literal_x26
ldr x27, literal_x27
ldr x28, literal_x28
ldr x29, literal_x29
ldr x30, literal_x30
randomx_program_aarch64_main_loop:
# spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
lsr x21, x10, 32
and w19, w10, w18
and w20, w21, w18
lsr x18, x10, 32
# Actual mask will be inserted by JIT compiler
and w16, w10, 1
and w17, w18, 1
# x19 = scratchpad + spAddr0
# x20 = scratchpad + spAddr1
add x19, x19, x2
add x20, x20, x2
# x16 = scratchpad + spAddr0
# x17 = scratchpad + spAddr1
add x16, x16, x2
add x17, x17, x2
# xor integer registers with scratchpad data (spAddr0)
ldp x21, x22, [x19]
ldp x23, x24, [x19, 16]
eor x4, x4, x21
eor x5, x5, x22
eor x6, x6, x23
eor x7, x7, x24
ldp x21, x22, [x19, 32]
ldp x23, x24, [x19, 48]
eor x12, x12, x21
eor x13, x13, x22
eor x14, x14, x23
eor x15, x15, x24
ldp x18, x19, [x16]
eor x4, x4, x18
eor x5, x5, x19
ldp x18, x19, [x16, 16]
eor x6, x6, x18
eor x7, x7, x19
ldp x18, x19, [x16, 32]
eor x12, x12, x18
eor x13, x13, x19
ldp x18, x19, [x16, 48]
eor x14, x14, x18
eor x15, x15, x19
# Load group F registers (spAddr1)
ldpsw x21, x22, [x20]
ldpsw x23, x24, [x20, 8]
ins v16.d[0], x21
ins v16.d[1], x22
ins v17.d[0], x23
ins v17.d[1], x24
ldpsw x21, x22, [x20, 16]
ldpsw x23, x24, [x20, 24]
ins v18.d[0], x21
ins v18.d[1], x22
ins v19.d[0], x23
ins v19.d[1], x24
ldpsw x18, x19, [x17]
ins v16.d[0], x18
ins v16.d[1], x19
ldpsw x18, x19, [x17, 8]
ins v17.d[0], x18
ins v17.d[1], x19
ldpsw x18, x19, [x17, 16]
ins v18.d[0], x18
ins v18.d[1], x19
ldpsw x18, x19, [x17, 24]
ins v19.d[0], x18
ins v19.d[1], x19
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d
# Load group E registers (spAddr1)
ldpsw x21, x22, [x20, 32]
ldpsw x23, x24, [x20, 40]
ins v20.d[0], x21
ins v20.d[1], x22
ins v21.d[0], x23
ins v21.d[1], x24
ldpsw x21, x22, [x20, 48]
ldpsw x23, x24, [x20, 56]
ins v22.d[0], x21
ins v22.d[1], x22
ins v23.d[0], x23
ins v23.d[1], x24
ldpsw x18, x19, [x17, 32]
ins v20.d[0], x18
ins v20.d[1], x19
ldpsw x18, x19, [x17, 40]
ins v21.d[0], x18
ins v21.d[1], x19
ldpsw x18, x19, [x17, 48]
ins v22.d[0], x18
ins v22.d[1], x19
ldpsw x18, x19, [x17, 56]
ins v23.d[0], x18
ins v23.d[1], x19
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
@@ -214,8 +224,38 @@ main_loop:
# Execute VM instructions
randomx_program_aarch64_vm_instructions:
# 16 KB buffer for generated instructions
.fill 4096,4,0
# 12 KB buffer for generated instructions
.fill 3072,4,0
literal_x21:
.fill 1,8,0
literal_x22:
.fill 1,8,0
literal_x23:
.fill 1,8,0
literal_x24:
.fill 1,8,0
literal_x25:
.fill 1,8,0
literal_x26:
.fill 1,8,0
literal_x27:
.fill 1,8,0
literal_x28:
.fill 1,8,0
literal_x29:
.fill 1,8,0
literal_x30:
.fill 1,8,0
randomx_program_aarch64_vm_instructions_end:
@@ -223,43 +263,47 @@ randomx_program_aarch64_vm_instructions_end:
eor x9, x9, x11
# Calculate dataset pointer for dataset prefetch
mov w25, w9
and x25, x25, x29
add x25, x25, x1
mov w20, w9
randomx_program_aarch64_cacheline_align_mask1:
# Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# Prefetch dataset data
prfm pldl2strm, [x25]
prfm pldl2strm, [x20]
# mx <-> ma
ror x9, x9, 32
# Calculate dataset pointer for dataset read
mov w25, w9
and x25, x25, x29
add x25, x25, x1
mov w20, w9
randomx_program_aarch64_cacheline_align_mask2:
# Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# xor integer registers with dataset data
ldp x21, x22, [x25]
ldp x23, x24, [x25, 16]
eor x4, x4, x21
eor x5, x5, x22
eor x6, x6, x23
eor x7, x7, x24
ldp x21, x22, [x25, 32]
ldp x23, x24, [x25, 48]
eor x12, x12, x21
eor x13, x13, x22
eor x14, x14, x23
eor x15, x15, x24
ldp x18, x19, [x20]
eor x4, x4, x18
eor x5, x5, x19
ldp x18, x19, [x20, 16]
eor x6, x6, x18
eor x7, x7, x19
ldp x18, x19, [x20, 32]
eor x12, x12, x18
eor x13, x13, x19
ldp x18, x19, [x20, 48]
eor x14, x14, x18
eor x15, x15, x19
randomx_program_aarch64_update_spMix1:
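# Actual registers will be inserted by JIT compiler: eor x10, config.readReg0, config.readReg1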
eor x10, x0, x0
# Store integer registers to scratchpad (spAddr1)
stp x4, x5, [x20, 0]
stp x6, x7, [x20, 16]
stp x12, x13, [x20, 32]
stp x14, x15, [x20, 48]
stp x4, x5, [x17, 0]
stp x6, x7, [x17, 16]
stp x12, x13, [x17, 32]
stp x14, x15, [x17, 48]
# xor group F and group E registers
eor v16.16b, v16.16b, v20.16b
@@ -268,11 +312,11 @@ randomx_program_aarch64_update_spMix1:
eor v19.16b, v19.16b, v23.16b
# Store FP registers to scratchpad (spAddr0)
stp q16, q17, [x19, 0]
stp q18, q19, [x19, 32]
stp q16, q17, [x16, 0]
stp q18, q19, [x16, 32]
subs x3, x3, 1
bne main_loop
bne randomx_program_aarch64_main_loop
# Store integer registers
stp x4, x5, [x0, 0]

@@ -31,8 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" {
void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
void randomx_program_aarch64_main_loop();
void randomx_program_aarch64_vm_instructions();
void randomx_program_aarch64_vm_instructions_end();
void randomx_program_aarch64_cacheline_align_mask1();
void randomx_program_aarch64_cacheline_align_mask2();
void randomx_program_aarch64_update_spMix1();
void randomx_program_aarch64_end();
}
