Added explicit STORE instructions

JIT compiler
6 years ago · 005c67f64c
parent d2cb086221
commit 005c67f64c
27 changed files with 1688 additions and 1455 deletions
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@ -75,6 +75,11 @@ namespace RandomX {
 		asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
 	}

+	void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { // Emits asm that leaves the masked scratchpad offset taken from register dst in eax. NOTE(review): the default argument is given on this definition only, so it applies just to calls that follow it in this file.
+		asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl; // eax = dst register, named via the regR32 table
+		asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; // alt % 4 != 0 selects the L1 mask, otherwise L2; "& (-maskAlign)" also clears the low bits when maskAlign is a power of two (8 and 16 at the visible call sites)
+	}
+
 	int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
 		return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
 	}
@ -425,7 +430,7 @@ namespace RandomX {

 	//6 uOPs
 	void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
-		asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
+		asmCode << "\tmov rax, " << regR[instr.src] << std::endl;
 		int rotate = (13 - (instr.alt & 63)) & 63;
 		if (rotate != 0)
 			asmCode << "\trol rax, " << rotate << std::endl;
@ -474,6 +479,18 @@ namespace RandomX {
 		asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl;
 	}

+	//3 uOPs - ISTORE: emits a 64-bit store of integer register src to [rsi + masked dst]
+	void AssemblyGeneratorX86::h_ISTORE(Instruction& instr, int i) { // i is not used by this handler
+		genAddressRegDst(instr); // eax = dst masked with the default maskAlign of 8
+		asmCode << "\tmov qword ptr [rsi+rax], " << regR[instr.src] << std::endl; // qword store of regR[src] at rsi + rax
+	}
+
+	//3 uOPs - FSTORE: emits a 16-byte movapd store of regFE[src] to [rsi + masked dst]
+	void AssemblyGeneratorX86::h_FSTORE(Instruction& instr, int i) { // i is not used by this handler
+		genAddressRegDst(instr, 16); // maskAlign 16 makes the offset in eax a multiple of 16
+		asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl; // movapd requires a 16-byte aligned address; assumes the base in rsi is itself 16-byte aligned - TODO confirm
+	}
+
 #include "instructionWeights.hpp"
 #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x))

@ -520,5 +537,8 @@ namespace RandomX {
 		INST_HANDLE(COND_R)
 		INST_HANDLE(COND_M)
 		INST_HANDLE(CFROUND)
+
+		INST_HANDLE(ISTORE)
+		INST_HANDLE(FSTORE)
 	};
 }
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@ -38,16 +38,8 @@ namespace RandomX {
 		static InstructionGenerator engine[256];
 		std::stringstream asmCode;

-		void gena(Instruction&, int);
-		void genar(Instruction&, int);
-		void genaf(Instruction&, int);
-		void genbiashift(Instruction&, const char*);
-		void genbia(Instruction&);
-		void genbia32(Instruction&);
-		void genbf(Instruction&, const char*);
-		void gencr(Instruction&, bool);
-		void gencf(Instruction&, bool);
 		void genAddressReg(Instruction&, const char*);
+		void genAddressRegDst(Instruction&, int);
 		int32_t genAddressImm(Instruction&);

 		void generateCode(Instruction&, int);
@ -85,5 +77,7 @@ namespace RandomX {
 		void  h_COND_R(Instruction&, int);
 		void  h_COND_M(Instruction&, int);
 		void  h_CFROUND(Instruction&, int);
+		void  h_ISTORE(Instruction&, int);
+		void  h_FSTORE(Instruction&, int);
 	};
 }
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@ -71,14 +71,14 @@ namespace RandomX {
 			reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64);
 		}
 		compiler.generateProgram(gen);
-		mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
+		mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & -64;
 		mem.mx = *(((uint32_t*)seed) + 5);
 	}

 	void CompiledVirtualMachine::execute() {
-		executeProgram(reg, mem, scratchpad, InstructionCount);
+		//executeProgram(reg, mem, scratchpad, InstructionCount);
 		totalSize += compiler.getCodeSize();
-		//compiler.getProgramFunc()(reg, mem, scratchpad);
+		compiler.getProgramFunc()(reg, mem, scratchpad, InstructionCount);
 #ifdef TRACEVM
 		for (int32_t i = InstructionCount - 1; i >= 0; --i) {
 			std::cout << std::hex << tracepad[i].u64 << std::endl;
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@ -32,6 +32,10 @@ namespace RandomX {
 		os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
 	}

+	void Instruction::genAddressRegDst(std::ostream& os) const { // Prints the destination address operand as "L1[rN]" or "L2[rN]", where N is dst.
+		os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; // alt % 4 != 0 prints "L1", otherwise "L2"; the int cast streams dst as a number (dst is presumably a narrow integer field - TODO confirm)
+	}
+
 	void Instruction::genAddressImm(std::ostream& os) const {
 		os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
 	}
@ -276,7 +280,7 @@ namespace RandomX {
 	}

 	void Instruction::h_CFROUND(std::ostream& os) const {
-		os << "r" << (int)dst << ", " << (alt & 63) << std::endl;
+		os << "r" << (int)src << ", " << (alt & 63) << std::endl;
 	}

 	static inline const char* condition(int index) {
@ -311,6 +315,18 @@ namespace RandomX {
 		os << ", " << imm32 << ")" << std::endl;
 	}

+	void  Instruction::h_ISTORE(std::ostream& os) const { // Prints the ISTORE operands: "<L1|L2>[r<dst>], r<src>" followed by a newline.
+		genAddressRegDst(os); // destination address operand built from dst
+		os << ", r" << (int)src << std::endl; // source integer register index, streamed as a number
+	}
+
+	void  Instruction::h_FSTORE(std::ostream& os) const { // Prints the FSTORE operands: "<L1|L2>[r<dst>], <f|e><src % 4>" followed by a newline.
+		const char reg = (src >= 4) ? 'e' : 'f'; // src values below 4 name an "f" register, 4 and above an "e" register
+		genAddressRegDst(os); // destination address operand built from dst
+		auto srcIndex = src % 4; // index within the chosen register group
+		os << ", " << reg << srcIndex << std::endl;
+	}
+
 #include "instructionWeights.hpp"
 #define INST_NAME(x) REPN(#x, WT(x))
 #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x))
@ -358,6 +374,9 @@ namespace RandomX {
 		INST_NAME(COND_R)
 		INST_NAME(COND_M)
 		INST_NAME(CFROUND)
+
+		INST_NAME(ISTORE)
+		INST_NAME(FSTORE)
 	};

 	InstructionVisualizer Instruction::engine[256] = {
@ -403,6 +422,9 @@ namespace RandomX {
 		INST_HANDLE(COND_R)
 		INST_HANDLE(COND_M)
 		INST_HANDLE(CFROUND)
+
+		INST_HANDLE(ISTORE)
+		INST_HANDLE(FSTORE)
 	};

 }
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@ -49,6 +49,7 @@ namespace RandomX {

 		void genAddressReg(std::ostream& os) const;
 		void genAddressImm(std::ostream& os) const;
+		void genAddressRegDst(std::ostream&) const;

 		void  h_IADD_R(std::ostream&) const;
 		void  h_IADD_M(std::ostream&) const;
@ -83,6 +84,8 @@ namespace RandomX {
 		void  h_COND_R(std::ostream&) const;
 		void  h_COND_M(std::ostream&) const;
 		void  h_CFROUND(std::ostream&) const;
+		void  h_ISTORE(std::ostream&) const;
+		void  h_FSTORE(std::ostream&) const;
 	};

 	static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction");
--- a/src/JitCompilerX86-static.S
+++ b/src/JitCompilerX86-static.S
@ -27,11 +27,16 @@
 #define DECL(x) x
 #endif
 .global DECL(randomx_program_prologue)
-.global DECL(randomx_program_begin)
+.global DECL(randomx_loop_begin)
+.global DECL(randomx_program_load_int)
+.global DECL(randomx_program_load_flt)
+.global DECL(randomx_program_start)
+.global DECL(randomx_program_read_dataset)
+.global DECL(randomx_program_store_int)
+.global DECL(randomx_program_store_flt)
+.global DECL(randomx_program_loop_end)
 .global DECL(randomx_program_epilogue)
-.global DECL(randomx_program_read)
 .global DECL(randomx_program_end)
-.global DECL(randomx_program_transform)

 #define db .byte

@ -40,21 +45,37 @@ DECL(randomx_program_prologue):
 	#include "asm/program_prologue_linux.inc"

 .align 64
-DECL(randomx_program_begin):
+	#include "asm/program_xmm_constants.inc"
+
+.align 64
+DECL(randomx_loop_begin):
+	nop
+
+DECL(randomx_program_load_int):
+	#include "asm/program_load_int.inc"
+
+DECL(randomx_program_load_flt):
+	#include "asm/program_load_flt.inc"
+
+DECL(randomx_program_start):
+	nop
+
+DECL(randomx_program_read_dataset):
+	#include "asm/program_read_dataset.inc"
+
+DECL(randomx_program_store_int):
+	#include "asm/program_store_int.inc"
+
+DECL(randomx_program_store_flt):
+	#include "asm/program_store_flt.inc"
+
+DECL(randomx_program_loop_end):
 	nop

 .align 64
 DECL(randomx_program_epilogue):
 	#include "asm/program_epilogue_linux.inc"

-.align 64
-DECL(randomx_program_read):
-	#include "asm/program_read.inc"
-
 .align 64
 DECL(randomx_program_end):
 	nop
-
-.align 8
-DECL(randomx_program_transform):
-	#include "asm/program_transform_address.inc"
--- a/src/JitCompilerX86-static.asm
+++ b/src/JitCompilerX86-static.asm
@ -20,12 +20,16 @@ IFDEF RAX
 _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE

 PUBLIC randomx_program_prologue
-PUBLIC randomx_program_begin
+PUBLIC randomx_loop_begin
+PUBLIC randomx_program_load_int
+PUBLIC randomx_program_load_flt
+PUBLIC randomx_program_start
+PUBLIC randomx_program_read_dataset
+PUBLIC randomx_program_store_int
+PUBLIC randomx_program_store_flt
+PUBLIC randomx_program_loop_end
 PUBLIC randomx_program_epilogue
-PUBLIC randomx_program_read
 PUBLIC randomx_program_end
-PUBLIC randomx_program_transform
-

 ALIGN 64
 randomx_program_prologue PROC
@ -33,30 +37,51 @@ randomx_program_prologue PROC
 randomx_program_prologue ENDP

 ALIGN 64
-randomx_program_begin PROC
+	include asm/program_xmm_constants.inc
+
+ALIGN 64
+randomx_loop_begin PROC
+	nop
+randomx_loop_begin ENDP
+
+randomx_program_load_int PROC
+	include asm/program_load_int.inc
+randomx_program_load_int ENDP
+
+randomx_program_load_flt PROC
+	include asm/program_load_flt.inc
+randomx_program_load_flt ENDP
+
+randomx_program_start PROC
 	nop
-randomx_program_begin ENDP
+randomx_program_start ENDP
+
+randomx_program_read_dataset PROC
+	include asm/program_read_dataset.inc
+randomx_program_read_dataset ENDP
+
+randomx_program_store_int PROC
+	include asm/program_store_int.inc
+randomx_program_store_int ENDP
+
+randomx_program_store_flt PROC
+	include asm/program_store_flt.inc
+randomx_program_store_flt ENDP
+
+randomx_program_loop_end PROC
+	nop
+randomx_program_loop_end ENDP

 ALIGN 64
 randomx_program_epilogue PROC
 	include asm/program_epilogue_win64.inc
 randomx_program_epilogue ENDP

-ALIGN 64
-randomx_program_read PROC
-	include asm/program_read.inc
-randomx_program_read ENDP
-
 ALIGN 64
 randomx_program_end PROC
 	nop
 randomx_program_end ENDP

-ALIGN 8
-randomx_program_transform PROC
-	include asm/program_transform_address.inc
-randomx_program_transform ENDP
-
 _RANDOMX_JITX86_STATIC ENDS

 ENDIF
--- a/src/JitCompilerX86-static.hpp
+++ b/src/JitCompilerX86-static.hpp
@ -18,10 +18,15 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 extern "C" {
-  void randomx_program_prologue();
-  void randomx_program_begin();
-  void randomx_program_epilogue();
-  void randomx_program_transform();
-  void randomx_program_read();
-  void randomx_program_end();
+	void randomx_program_prologue();
+	void randomx_loop_begin();
+	void randomx_program_load_int();
+	void randomx_program_load_flt();
+	void randomx_program_start();
+	void randomx_program_read_dataset();
+	void randomx_program_store_int();
+	void randomx_program_store_flt();
+	void randomx_program_loop_end();
+	void randomx_program_epilogue();
+	void randomx_program_end();
 }
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@ -30,16 +30,10 @@ namespace RandomX {

 	class JitCompilerX86;

-	typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int);
+	typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&);

 	constexpr uint32_t CodeSize = 64 * 1024;

-	struct CallOffset {
-		CallOffset(int32_t p, int32_t i) : pos(p), index(i) {}
-		int32_t pos;
-		int32_t index;
-	};
-
 	class JitCompilerX86 {
 	public:
 		JitCompilerX86();
@ -55,66 +49,82 @@ namespace RandomX {
 		static InstructionGeneratorX86 engine[256];
 		uint8_t* code;
 		int32_t codePos;
-		std::vector<int32_t> instructionOffsets;
-		std::vector<CallOffset> callOffsets;
-
-		void gena(Instruction&);
-		void genar(Instruction&);
-		void genaf(Instruction&);
-		void genbiashift(Instruction&, uint16_t, uint16_t);
-		void genbia(Instruction&, uint16_t, uint16_t);
-		void genbia32(Instruction&, uint16_t, uint8_t);
-		void genbf(Instruction&, uint8_t);
-		void scratchpadStoreR(Instruction&, uint32_t, bool);
-		void scratchpadStoreF(Instruction&, int, uint32_t, bool);
-		void gencr(Instruction&, bool);
-		void gencf(Instruction&);
-		void generateCode(Instruction&, int);
-		void fixCallOffsets();
+
+		void genAddressReg(Instruction&, bool);
+		void genAddressRegDst(Instruction&, bool);
+		void genAddressImm(Instruction&);
+		void genSIB(int scale, int index, int base);
+
+		void generateCode(Instruction&);

 		void emitByte(uint8_t val) {
 			code[codePos] = val;
 			codePos++;
 		}

-		template<typename T>
-		void emit(T val) {
-			*reinterpret_cast<T*>(code + codePos) = val;
-			codePos += sizeof(T);
+		void emit32(uint32_t val) { // Appends val to the code buffer as 4 bytes, least significant byte first, regardless of host byte order. No bounds check is performed here.
+			code[codePos + 0] = val; // implicit narrowing to uint8_t keeps only the low byte
+			code[codePos + 1] = val >> 8;
+			code[codePos + 2] = val >> 16;
+			code[codePos + 3] = val >> 24;
+			codePos += 4; // advance the write position past the bytes just written
+		}
+
+		void emit64(uint64_t val) { // Appends val to the code buffer as 8 bytes, least significant byte first, regardless of host byte order. No bounds check is performed here.
+			code[codePos + 0] = val; // implicit narrowing to uint8_t keeps only the low byte
+			code[codePos + 1] = val >> 8;
+			code[codePos + 2] = val >> 16;
+			code[codePos + 3] = val >> 24;
+			code[codePos + 4] = val >> 32;
+			code[codePos + 5] = val >> 40;
+			code[codePos + 6] = val >> 48;
+			code[codePos + 7] = val >> 56;
+			codePos += 8; // advance the write position past the bytes just written
+		}
+
+		template<size_t N> // N is deduced from the size of the array argument
+		void emit(const uint8_t (&src)[N]) { // Appends the N bytes of the array src to the code buffer and advances codePos by N. No bounds check is performed here.
+			for (int i = 0; i < N; ++i) { // NOTE(review): int i is compared against size_t N (signed/unsigned mismatch)
+				code[codePos + i] = src[i];
+			}
+			codePos += N;
+		}

-		void h_ADD_64(Instruction&, int);
-		void h_ADD_32(Instruction&, int);
-		void h_SUB_64(Instruction&, int);
-		void h_SUB_32(Instruction&, int);
-		void h_MUL_64(Instruction&, int);
-		void h_MULH_64(Instruction&, int);
-		void h_MUL_32(Instruction&, int);
-		void h_IMUL_32(Instruction&, int);
-		void h_IMULH_64(Instruction&, int);
-		void h_DIV_64(Instruction&, int);
-		void h_IDIV_64(Instruction&, int);
-		void h_AND_64(Instruction&, int);
-		void h_AND_32(Instruction&, int);
-		void h_OR_64(Instruction&, int);
-		void h_OR_32(Instruction&, int);
-		void h_XOR_64(Instruction&, int);
-		void h_XOR_32(Instruction&, int);
-		void h_SHL_64(Instruction&, int);
-		void h_SHR_64(Instruction&, int);
-		void h_SAR_64(Instruction&, int);
-		void h_ROL_64(Instruction&, int);
-		void h_ROR_64(Instruction&, int);
-		void h_FPADD(Instruction&, int);
-		void h_FPSUB(Instruction&, int);
-		void h_FPMUL(Instruction&, int);
-		void h_FPDIV(Instruction&, int);
-		void h_FPSQRT(Instruction&, int);
-		void h_FPROUND(Instruction&, int);
-		void h_JUMP(Instruction&, int);
-		void h_CALL(Instruction&, int);
-		void h_RET(Instruction&, int);
-		void h_NOP(Instruction&, int);
+		void  h_IADD_R(Instruction&);
+		void  h_IADD_M(Instruction&);
+		void  h_IADD_RC(Instruction&);
+		void  h_ISUB_R(Instruction&);
+		void  h_ISUB_M(Instruction&);
+		void  h_IMUL_9C(Instruction&);
+		void  h_IMUL_R(Instruction&);
+		void  h_IMUL_M(Instruction&);
+		void  h_IMULH_R(Instruction&);
+		void  h_IMULH_M(Instruction&);
+		void  h_ISMULH_R(Instruction&);
+		void  h_ISMULH_M(Instruction&);
+		void  h_IDIV_C(Instruction&);
+		void  h_ISDIV_C(Instruction&);
+		void  h_INEG_R(Instruction&);
+		void  h_IXOR_R(Instruction&);
+		void  h_IXOR_M(Instruction&);
+		void  h_IROR_R(Instruction&);
+		void  h_IROL_R(Instruction&);
+		void  h_FPSWAP_R(Instruction&);
+		void  h_FPADD_R(Instruction&);
+		void  h_FPADD_M(Instruction&);
+		void  h_FPSUB_R(Instruction&);
+		void  h_FPSUB_M(Instruction&);
+		void  h_FPNEG_R(Instruction&);
+		void  h_FPMUL_R(Instruction&);
+		void  h_FPMUL_M(Instruction&);
+		void  h_FPDIV_R(Instruction&);
+		void  h_FPDIV_M(Instruction&);
+		void  h_FPSQRT_R(Instruction&);
+		void  h_COND_R(Instruction&);
+		void  h_COND_M(Instruction&);
+		void  h_CFROUND(Instruction&);
+		void  h_ISTORE(Instruction&);
+		void  h_FSTORE(Instruction&);
 	};

 }
--- a/src/asm/program_epilogue_store.inc
+++ b/src/asm/program_epilogue_store.inc
@ -1,9 +1,5 @@
-	;# unroll VM stack
-	mov rsp, rdi
-
 	;# save VM register values
 	pop rcx
-	pop rcx
 	mov qword ptr [rcx+0], r8
 	mov qword ptr [rcx+8], r9
 	mov qword ptr [rcx+16], r10
@ -12,12 +8,12 @@
 	mov qword ptr [rcx+40], r13
 	mov qword ptr [rcx+48], r14
 	mov qword ptr [rcx+56], r15
-	movapd xmmword ptr [rcx+64], xmm8
-	movapd xmmword ptr [rcx+80], xmm9
-	movapd xmmword ptr [rcx+96], xmm2
-	movapd xmmword ptr [rcx+112], xmm3
+	movdqa xmmword ptr [rcx+64], xmm0
+	movdqa xmmword ptr [rcx+80], xmm1
+	movdqa xmmword ptr [rcx+96], xmm2
+	movdqa xmmword ptr [rcx+112], xmm3
 	lea rcx, [rcx+64]
-	movapd xmmword ptr [rcx+64], xmm4
-	movapd xmmword ptr [rcx+80], xmm5
-	movapd xmmword ptr [rcx+96], xmm6
-	movapd xmmword ptr [rcx+112], xmm7
+	movdqa xmmword ptr [rcx+64], xmm4
+	movdqa xmmword ptr [rcx+80], xmm5
+	movdqa xmmword ptr [rcx+96], xmm6
+	movdqa xmmword ptr [rcx+112], xmm7
--- a/src/asm/program_epilogue_win64.inc
+++ b/src/asm/program_epilogue_win64.inc
@ -1,6 +1,12 @@
 	include program_epilogue_store.inc

 	;# restore callee-saved registers - Microsoft x64 calling convention
+	movdqu xmm15, xmmword ptr [rsp]
+	movdqu xmm14, xmmword ptr [rsp+16]
+	movdqu xmm13, xmmword ptr [rsp+32]
+	movdqu xmm12, xmmword ptr [rsp+48]
+	movdqu xmm11, xmmword ptr [rsp+64]
+	add rsp, 80
 	movdqu xmm10, xmmword ptr [rsp]
 	movdqu xmm9, xmmword ptr [rsp+16]
 	movdqu xmm8, xmmword ptr [rsp+32]
@ -17,4 +23,4 @@
 	pop rbx

 	;# program finished
-	ret	0
+	ret
--- a/src/asm/program_load_flt.inc
+++ b/src/asm/program_load_flt.inc
@ -0,0 +1,14 @@
+	and eax, 262080                    ;# 262080 = 0x3FFC0: keep a 64-byte aligned offset below 256 KiB
+	lea rcx, [rsi+rax]                 ;# rcx = rsi + offset (rsi is loaded with the scratchpad pointer in the prologue)
+	cvtdq2pd xmm0, qword ptr [rcx+0]   ;# each cvtdq2pd converts two packed signed 32-bit integers into two doubles
+	cvtdq2pd xmm1, qword ptr [rcx+8]
+	cvtdq2pd xmm2, qword ptr [rcx+16]
+	cvtdq2pd xmm3, qword ptr [rcx+24]
+	cvtdq2pd xmm4, qword ptr [rcx+32]
+	cvtdq2pd xmm5, qword ptr [rcx+40]
+	cvtdq2pd xmm6, qword ptr [rcx+48]
+	cvtdq2pd xmm7, qword ptr [rcx+56]  ;# 8 loads of 8 bytes: 64 bytes consumed in total
+	andps xmm4, xmm14                  ;# xmm14 is loaded from absMask in the prologue: clears the sign bits of xmm4-xmm7
+	andps xmm5, xmm14
+	andps xmm6, xmm14
+	andps xmm7, xmm14
--- a/src/asm/program_load_int.inc
+++ b/src/asm/program_load_int.inc
@ -0,0 +1,10 @@
+	and eax, 262080                    ;# 262080 = 0x3FFC0: keep a 64-byte aligned offset below 256 KiB
+	lea rcx, [rsi+rax]                 ;# rcx = rsi + offset (rsi is loaded with the scratchpad pointer in the prologue)
+	xor r8,  qword ptr [rcx+0]         ;# XOR the 64 bytes at rcx into r8-r15, one qword per register
+	xor r9,  qword ptr [rcx+8]
+	xor r10, qword ptr [rcx+16]
+	xor r11, qword ptr [rcx+24]
+	xor r12, qword ptr [rcx+32]
+	xor r13, qword ptr [rcx+40]
+	xor r14, qword ptr [rcx+48]
+	xor r15, qword ptr [rcx+56]
--- a/src/asm/program_prologue_linux.inc
+++ b/src/asm/program_prologue_linux.inc
@ -7,13 +7,14 @@
 	push r15

 	;# function arguments
+	mov rbx, rcx                ;# loop counter
 	push rdi                    ;# RegisterFile& registerFile
+	mov rcx, rdi
 	mov rbp, qword ptr [rsi]    ;# "mx", "ma"
-	mov rax, qword ptr [rsi+8]  ;# uint8_t* dataset
-	push rax
+	mov eax, ebp                ;# "mx"
+	mov rdi, qword ptr [rsi+8]  ;# uint8_t* dataset
 	mov rsi, rdx                ;# convertible_t* scratchpad
-	mov rcx, rdi

 	#include "program_prologue_load.inc"

-	jmp randomx_program_begin
+	jmp DECL(randomx_loop_begin)
--- a/src/asm/program_prologue_load.inc
+++ b/src/asm/program_prologue_load.inc
@ -1,27 +1,20 @@
-	mov rdi, rsp      ;# beginning of VM stack
-	mov ebx, 262145   ;# number of VM instructions to execute + 1
+	;# zero integer registers
+	xor r8, r8
+	xor r9, r9
+	xor r10, r10
+	xor r11, r11
+	xor r12, r12
+	xor r13, r13
+	xor r14, r14
+	xor r15, r15

-	xorps xmm10, xmm10
-	cmpeqpd xmm10, xmm10
-	psrlq xmm10, 1    ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
+	;# load constant registers
+	lea rcx, [rcx+120]
+	movapd xmm8, xmmword ptr [rcx+72]
+	movapd xmm9, xmmword ptr [rcx+88]
+	movapd xmm10, xmmword ptr [rcx+104]
+	movapd xmm11, xmmword ptr [rcx+120]
+	movapd xmm13, xmmword ptr [minDbl]
+	movapd xmm14, xmmword ptr [absMask]
+	movapd xmm15, xmmword ptr [signMask]

-	;# load integer registers
-	mov r8, qword ptr [rcx+0]
-	mov r9, qword ptr [rcx+8]
-	mov r10, qword ptr [rcx+16]
-	mov r11, qword ptr [rcx+24]
-	mov r12, qword ptr [rcx+32]
-	mov r13, qword ptr [rcx+40]
-	mov r14, qword ptr [rcx+48]
-	mov r15, qword ptr [rcx+56]
-
-	;# load floating point registers
-	movapd xmm8, xmmword ptr [rcx+64]
-	movapd xmm9, xmmword ptr [rcx+80]
-	movapd xmm2, xmmword ptr [rcx+96]
-	movapd xmm3, xmmword ptr [rcx+112]
-	lea rcx, [rcx+64]
-	movapd xmm4, xmmword ptr [rcx+64]
-	movapd xmm5, xmmword ptr [rcx+80]
-	movapd xmm6, xmmword ptr [rcx+96]
-	movapd xmm7, xmmword ptr [rcx+112]
--- a/src/asm/program_prologue_win64.inc
+++ b/src/asm/program_prologue_win64.inc
@ -13,14 +13,21 @@
 	movdqu xmmword ptr [rsp+32], xmm8
 	movdqu xmmword ptr [rsp+16], xmm9
 	movdqu xmmword ptr [rsp+0], xmm10
+	sub rsp, 80
+	movdqu xmmword ptr [rsp+64], xmm11
+	movdqu xmmword ptr [rsp+48], xmm12
+	movdqu xmmword ptr [rsp+32], xmm13
+	movdqu xmmword ptr [rsp+16], xmm14
+	movdqu xmmword ptr [rsp+0], xmm15

-	;# function arguments
-	push rcx                    ;# RegisterFile& registerFile
-	mov rbp, qword ptr [rdx]    ;# "mx", "ma"
-	mov rax, qword ptr [rdx+8]  ;# uint8_t* dataset
-	push rax
-	mov rsi, r8                 ;# convertible_t* scratchpad
+	; function arguments
+	push rcx                    ; RegisterFile& registerFile
+	mov rbp, qword ptr [rdx]    ; "mx", "ma"
+	mov eax, ebp                ; "mx"
+	mov rdi, qword ptr [rdx+8]  ; uint8_t* dataset
+	mov rsi, r8                 ; convertible_t* scratchpad
+	mov rbx, r9                 ; loop counter

 	include program_prologue_load.inc

-	jmp randomx_program_begin
+	jmp randomx_loop_begin
--- a/src/asm/program_read.inc
+++ b/src/asm/program_read.inc
@ -1,20 +0,0 @@
-	db 0, 0, 0, 0                   ;# TransformAddress placeholder
-	mov rcx, qword ptr [rdi]        ;# load the dataset address
-	xor rbp, rax                    ;# modify "mx"
-	;# prefetch cacheline "mx"
-	and rbp, -64                    ;# align "mx" to the start of a cache line
-	mov edx, ebp                    ;# edx = mx
-	prefetchnta byte ptr [rcx+rdx]
-	;# read cacheline "ma"
-	ror rbp, 32                     ;# swap "ma" and "mx"
-	mov edx, ebp                    ;# edx = ma
-	lea rcx, [rcx+rdx]              ;# dataset cache line
-	xor r8,  qword ptr [rcx+0]
-	xor r9,  qword ptr [rcx+8]
-	xor r10, qword ptr [rcx+16]
-	xor r11, qword ptr [rcx+24]
-	xor r12, qword ptr [rcx+32]
-	xor r13, qword ptr [rcx+40]
-	xor r14, qword ptr [rcx+48]
-	xor r15, qword ptr [rcx+56]
-	ret
--- a/src/asm/program_read_dataset.inc
+++ b/src/asm/program_read_dataset.inc
@ -0,0 +1,16 @@
+	xor rbp, rax                       ;# modify "mx"
+	and rbp, -64                       ;# align "mx" to the start of a cache line
+	mov edx, ebp                       ;# edx = mx
+	prefetchnta byte ptr [rdi+rdx]     ;# non-temporal prefetch of the line at rdi + mx (rdi is loaded with the dataset pointer in the prologue)
+	ror rbp, 32                        ;# swap "ma" and "mx"
+	mov edx, ebp                       ;# edx = ma
+	lea rcx, [rdi+rdx]                 ;# dataset cache line
+	xor r8,  qword ptr [rcx+0]         ;# XOR the 64 bytes at rcx into r8-r15, one qword per register
+	xor r9,  qword ptr [rcx+8]
+	xor r10, qword ptr [rcx+16]
+	xor r11, qword ptr [rcx+24]
+	xor r12, qword ptr [rcx+32]
+	xor r13, qword ptr [rcx+40]
+	xor r14, qword ptr [rcx+48]
+	xor r15, qword ptr [rcx+56]
+	
--- a/src/asm/program_store_flt.inc
+++ b/src/asm/program_store_flt.inc
@ -0,0 +1,11 @@
+	and eax, 262080                    ;# 262080 = 0x3FFC0: keep a 64-byte aligned offset below 256 KiB
+	lea rcx, [rsi+rax]                 ;# rcx = rsi + offset (rsi is loaded with the scratchpad pointer in the prologue)
+	mulpd xmm0, xmm4                   ;# packed double multiply: xmm0-xmm3 *= xmm4-xmm7, pairwise
+	mulpd xmm1, xmm5
+	mulpd xmm2, xmm6
+	mulpd xmm3, xmm7
+	movapd xmmword ptr [rcx+0], xmm0   ;# store the four products: 4 x 16 bytes = 64 bytes written
+	movapd xmmword ptr [rcx+16], xmm1
+	movapd xmmword ptr [rcx+32], xmm2
+	movapd xmmword ptr [rcx+48], xmm3
+
--- a/src/asm/program_store_int.inc
+++ b/src/asm/program_store_int.inc
@ -0,0 +1,10 @@
+	and eax, 262080                    ;# 262080 = 0x3FFC0: keep a 64-byte aligned offset below 256 KiB
+	lea rcx, [rsi+rax]                 ;# rcx = rsi + offset (rsi is loaded with the scratchpad pointer in the prologue)
+	mov qword ptr [rcx+0], r8          ;# write r8-r15 to the 64 bytes at rcx, one qword per register
+	mov qword ptr [rcx+8], r9
+	mov qword ptr [rcx+16], r10
+	mov qword ptr [rcx+24], r11
+	mov qword ptr [rcx+32], r12
+	mov qword ptr [rcx+40], r13
+	mov qword ptr [rcx+48], r14
+	mov qword ptr [rcx+56], r15
--- a/src/asm/program_xmm_constants.inc
+++ b/src/asm/program_xmm_constants.inc
@ -0,0 +1,6 @@
+minDbl:
+	db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 ;# two little-endian qwords 0x0010000000000000: the smallest positive normal double (2^-1022) in each lane
+absMask:
+	db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 ;# 0x7FFFFFFFFFFFFFFF in each lane: every bit except the sign bit
+signMask:
+	db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 ;# 0x8000000000000000 in each lane: only the sign bit
--- a/src/common.hpp
+++ b/src/common.hpp
@ -81,6 +81,8 @@ namespace RandomX {
 	constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t);
 	constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
 	constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
+	constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; // L1 offset mask in 16-byte steps; equals ScratchpadL1Mask & -16 if ScratchpadL1 is a power of two - TODO confirm
+	constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; // L2 offset mask in 16-byte steps; equals ScratchpadL2Mask & -16 if ScratchpadL2 is a power of two - TODO confirm
 	constexpr uint32_t TransformationCount = 90;
 	constexpr int RegistersCount = 8;

@ -129,7 +131,7 @@ namespace RandomX {

 	typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&);

-	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*);
+	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);

 	extern "C" {
 		void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@ -21,14 +21,6 @@ _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE

 PUBLIC executeProgram

-ALIGN 16
-minDbl:
-db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
-absMask:
-db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
-signMask:
-db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128
-
 executeProgram PROC
 	; REGISTER ALLOCATION:
 	; rax -> temporary
@ -114,6 +106,17 @@ executeProgram PROC
 	movapd xmm14, xmmword ptr [absMask]
 	movapd xmm15, xmmword ptr [signMask]

+	jmp program_begin
+
+ALIGN 64
+minDbl:
+	db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
+absMask:
+	db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
+signMask:
+	db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128
+
+ALIGN 64
 program_begin:
 	xor eax, r8d                      ;# read address register 1
 	and eax, 262080
@ -144,7 +147,7 @@ program_begin:

 	;# 256 instructions
 	include program.inc
-	
+
 	mov eax, r8d                       ;# read address register 1
 	xor eax, r9d                       ;# read address register 2
 	xor rbp, rax                       ;# modify "mx"
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@ -22,21 +22,21 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 //Integer
 #define WT_IADD_R 10
 #define WT_IADD_M 3
-#define WT_IADD_RC 12
+#define WT_IADD_RC 10
 #define WT_ISUB_R 10
 #define WT_ISUB_M 3
-#define WT_IMUL_9C 12
-#define WT_IMUL_R 24
-#define WT_IMUL_M 8
+#define WT_IMUL_9C 10
+#define WT_IMUL_R 20
+#define WT_IMUL_M 6
 #define WT_IMULH_R 6
 #define WT_IMULH_M 2
 #define WT_ISMULH_R 6
 #define WT_ISMULH_M 2
 #define WT_IDIV_C 4
-#define WT_ISDIV_C 2
-#define WT_INEG_R 4
-#define WT_IXOR_R 15
-#define WT_IXOR_M 5
+#define WT_ISDIV_C 4
+#define WT_INEG_R 2
+#define WT_IXOR_R 12
+#define WT_IXOR_M 4
 #define WT_IROR_R 10
 #define WT_IROL_R 10

@ -58,10 +58,14 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #define WT_FPSQRT_R 6

 //Control
-#define WT_COND_R 15
-#define WT_COND_M 5
+#define WT_COND_R 12
+#define WT_COND_M 4
 #define WT_CFROUND 1

+//Store
+#define WT_ISTORE 12
+#define WT_FSTORE 6
+
 #define WT_NOP 0

 constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
@ -70,7 +74,7 @@ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
 WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
 WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
 WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \
-WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_NOP;
+WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP;

 static_assert(wtSum == 256,
 	"Sum of instruction weights must be 256");
@ -116,3 +120,40 @@ static_assert(wtSum == 256,
 #define REPN(x,N) REPNX(x,N)
 #define NUM(x) x
 #define WT(x) NUM(WT_##x)
+
+#define REPCASE0(x) // REPCASEn(x) expands to n consecutive "case __COUNTER__:" labels; the argument x is never used in the expansion
+#define REPCASE1(x) case __COUNTER__: // __COUNTER__ is a compiler extension (GCC, Clang, MSVC) that yields the next integer on every expansion
+#define REPCASE2(x) REPCASE1(x) case __COUNTER__:
+#define REPCASE3(x) REPCASE2(x) case __COUNTER__:
+#define REPCASE4(x) REPCASE3(x) case __COUNTER__:
+#define REPCASE5(x) REPCASE4(x) case __COUNTER__:
+#define REPCASE6(x) REPCASE5(x) case __COUNTER__:
+#define REPCASE7(x) REPCASE6(x) case __COUNTER__:
+#define REPCASE8(x) REPCASE7(x) case __COUNTER__:
+#define REPCASE9(x) REPCASE8(x) case __COUNTER__:
+#define REPCASE10(x) REPCASE9(x) case __COUNTER__:
+#define REPCASE11(x) REPCASE10(x) case __COUNTER__:
+#define REPCASE12(x) REPCASE11(x) case __COUNTER__:
+#define REPCASE13(x) REPCASE12(x) case __COUNTER__:
+#define REPCASE14(x) REPCASE13(x) case __COUNTER__:
+#define REPCASE15(x) REPCASE14(x) case __COUNTER__:
+#define REPCASE16(x) REPCASE15(x) case __COUNTER__:
+#define REPCASE17(x) REPCASE16(x) case __COUNTER__:
+#define REPCASE18(x) REPCASE17(x) case __COUNTER__:
+#define REPCASE19(x) REPCASE18(x) case __COUNTER__:
+#define REPCASE20(x) REPCASE19(x) case __COUNTER__:
+#define REPCASE21(x) REPCASE20(x) case __COUNTER__:
+#define REPCASE22(x) REPCASE21(x) case __COUNTER__:
+#define REPCASE23(x) REPCASE22(x) case __COUNTER__:
+#define REPCASE24(x) REPCASE23(x) case __COUNTER__:
+#define REPCASE25(x) REPCASE24(x) case __COUNTER__:
+#define REPCASE26(x) REPCASE25(x) case __COUNTER__:
+#define REPCASE27(x) REPCASE26(x) case __COUNTER__:
+#define REPCASE28(x) REPCASE27(x) case __COUNTER__:
+#define REPCASE29(x) REPCASE28(x) case __COUNTER__:
+#define REPCASE30(x) REPCASE29(x) case __COUNTER__:
+#define REPCASE31(x) REPCASE30(x) case __COUNTER__:
+#define REPCASE32(x) REPCASE31(x) case __COUNTER__: // largest repetition defined here, so weights above 32 have no matching macro
+#define REPCASENX(x,N) REPCASE##N(x) // pastes the already-expanded count onto REPCASE
+#define REPCASEN(x,N) REPCASENX(x,N) // extra level of indirection so N is macro-expanded before the ## paste
+#define CASE_REP(x) REPCASEN(x, WT(x)) // WT(x) expands to the WT_x weight, so this yields WT_x case labels for instruction x
--- a/src/main.cpp
+++ b/src/main.cpp
@ -174,7 +174,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
 		for (int chain = 0; chain < 16; ++chain) {
 			vm->initializeProgram(hash);
 			int segment = hash[3] & 3;
-			vm->setScratchpad(scratchpad);// +segment * RandomX::ScratchpadSize / 4);
+			vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4);
 			vm->execute();
 			vm->getResult(nullptr, 0, hash);
 		}
--- a/src/program.inc
+++ b/src/program.inc