SuperscalarHash interpreter

Linux assembly code
5 years ago · 2132e5fef5
parent b4c02051fa
commit 2132e5fef5
11 changed files with 310 additions and 74 deletions
--- a/5
+++ b/5
@ -9,7 +9,7 @@ OBJDIR=obj
 LDFLAGS=-lpthread
 CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp  src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c
 TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
-ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o)
+ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o LightProgramGenerator.o)
 ifeq ($(PLATFORM),amd64)
    ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
    CXXFLAGS += -maes
@ -99,6 +99,9 @@ $(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtual

 $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@
+
+$(OBJDIR)/LightProgramGenerator.o: $(addprefix $(SRCDIR)/,LightProgramGenerator.cpp LightProgramGenerator.hpp Program.hpp blake2/blake2.h blake2/endian.h configuration.h) | $(OBJDIR)
+	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightProgramGenerator.cpp -o $@
  
 $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h configuration.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@ -30,7 +30,7 @@ namespace RandomX {
 	typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const;

 	namespace InstructionType {
-		constexpr int IADD_R = 0;
+		constexpr int IADD_RS = 0;
 		constexpr int IADD_M = 1;
 		constexpr int IADD_RC = 2;
 		constexpr int ISUB_R = 3;
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@ -36,6 +36,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #ifdef STATS
 #include <algorithm>
 #endif
+#include "LightProgramGenerator.hpp"

 #ifdef FPUCHECK
 constexpr bool fpuCheck = true;
@ -45,17 +46,20 @@ constexpr bool fpuCheck = false;

 namespace RandomX {

-	InterpretedVirtualMachine::~InterpretedVirtualMachine() {
-
-	}
-
-	void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
+	template<bool superscalar> // Stores the dataset handle; the superscalar instantiation additionally precompiles the light programs.
+	void InterpretedVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
 		mem.ds = ds;
 		readDataset = &datasetReadLight; // unconditionally uses the light dataset read routine
 		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
+		if(superscalar) // template parameter, so the condition is fixed per instantiation
+			precompileSuperscalar(programs); // the array reference decays to the LightProgram* this helper takes
 	}

-	void InterpretedVirtualMachine::initialize() {
+	template void InterpretedVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	template void InterpretedVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+
+	template<bool superscalar>
+	void InterpretedVirtualMachine<superscalar>::initialize() {
 		VirtualMachine::initialize();
 		for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) {
 			program(i).src %= RegistersCount;
@ -63,12 +67,19 @@ namespace RandomX {
 		}
 	}

-	void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
+	template void InterpretedVirtualMachine<true>::initialize();
+	template void InterpretedVirtualMachine<false>::initialize();
+
+	template<bool superscalar> // Runs the precompiled bytecode, starting at index 0 and stopping once the index reaches RANDOMX_PROGRAM_SIZE.
+	void InterpretedVirtualMachine<superscalar>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) {
 			executeBytecode(ic, r, f, e, a); // per-instruction overload takes ic as int&, so it may change the loop position
 		}
 	}

+	template void InterpretedVirtualMachine<true>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
+	template void InterpretedVirtualMachine<false>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
+
 	static void print(int_reg_t r) { // Debug helper: writes r to std::cout in hex, padded with '0' to width 16, then std::endl.
 		std::cout << std::hex << std::setw(16) << std::setfill('0') << r << std::endl; // std::hex and the fill character stay set on std::cout afterwards
 	}
@ -98,14 +109,15 @@ namespace RandomX {
 		return std::fpclassify(x) == FP_SUBNORMAL;
 	}

-	FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
+	 template<bool superscalar>
+	 FORCE_INLINE void InterpretedVirtualMachine<superscalar>::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		auto& ibc = byteCode[ic];
 		if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic);
 		//if(trace) printState(r, f, e, a);
 		switch (ibc.type)
 		{
-			case InstructionType::IADD_R: {
-				*ibc.idst += *ibc.isrc;
+			case InstructionType::IADD_RS: {
+				*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm;
 			} break;

 			case InstructionType::IADD_M: {
@ -289,7 +301,8 @@ namespace RandomX {
 #endif
 	}

-	void InterpretedVirtualMachine::execute() {
+	template<bool superscalar>
+	void InterpretedVirtualMachine<superscalar>::execute() {
 		int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 		__m128d f[4];
 		__m128d e[4];
@ -350,11 +363,16 @@ namespace RandomX {

 			mem.mx ^= r[readReg2] ^ r[readReg3];
 			mem.mx &= CacheLineAlignMask;
-			Cache& cache = mem.ds.cache;
-			uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
-			initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
-			for (int i = 0; i < RegistersCount; ++i)
-				r[i] ^= datasetLine[i];
+			if (superscalar) {
+				executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r);
+			}
+			else {
+				Cache& cache = mem.ds.cache;
+				uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
+				initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
+				for (int i = 0; i < RegistersCount; ++i)
+					r[i] ^= datasetLine[i];
+			}
 			std::swap(mem.mx, mem.ma);

 			if (trace) {
@ -419,6 +437,9 @@ namespace RandomX {
 		_mm_store_pd(&reg.e[3].lo, e[3]);
 	}

+	template void InterpretedVirtualMachine<true>::execute();
+	template void InterpretedVirtualMachine<false>::execute();
+
 	static int getConditionRegister(int(&registerUsage)[8]) {
 		int min = INT_MAX;
 		int minIndex;
@ -431,9 +452,118 @@ namespace RandomX {
 		return minIndex;
 	}

+	constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; // multiplier applied to (block number + 1) to seed rl[0] in executeSuperscalar; same value as r0_mul in asm/program_sshash_constants.inc
+	constexpr uint64_t superscalarAdd1 = 9298410992540426048ULL; // superscalarAdd1..7: despite the "Add" name, executeSuperscalar XORs these with rl[0] to seed rl[1]..rl[7]
+	constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; // values match r1_add..r7_add in asm/program_sshash_constants.inc
+	constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL;
+	constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL;
+	constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL;
+	constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL;
+	constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL;
+
+	// Translates a register value into the address of one CacheLineSize-byte line inside cache.memory.
+	// With RANDOMX_ARGON_GROWTH == 0 the line index is obtained by masking with a compile-time
+	// constant; otherwise it is reduced modulo the number of lines derived from cache.size.
+	static uint8_t* getMixBlock(uint64_t registerValue, Cache& cache) {
+		if (RANDOMX_ARGON_GROWTH != 0) {
+			// Line count is taken from cache.size, so use a modulo reduction.
+			const uint32_t lineCount = cache.size / CacheLineSize;
+			const uint64_t lineIndex = registerValue % lineCount;
+			return cache.memory + lineIndex * CacheLineSize;
+		}
+		// Line count is fixed by the configuration, so a constant mask is used instead.
+		constexpr uint32_t lineMask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
+		const uint64_t lineIndex = registerValue & lineMask;
+		return cache.memory + lineIndex * CacheLineSize;
+	}
+
+	// Interpreter counterpart of the SuperscalarHash dataset item computation.
+	//
+	// Seeds eight local registers from blockNumber, then for each of the RANDOMX_CACHE_ACCESSES
+	// precompiled programs: selects a cache line from the current address value, runs the
+	// program on the local registers, XORs the 64 bytes of that cache line into them and takes
+	// the next address value from the register named by the program. The final local registers
+	// are XORed into r.
+	//
+	// blockNumber: index of the dataset block to compute.
+	// r:           integer register file; modified in place.
+	template<bool superscalar>
+	void InterpretedVirtualMachine<superscalar>::executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]) {
+		int_reg_t rl[8];
+		uint8_t* mixBlock;
+		uint64_t registerValue = blockNumber;
+		// Increment in 64 bits (registerValue, not the 32-bit blockNumber) so the seed matches the
+		// assembly path, which computes it with "lea r8, [rbx+1]", for every possible block number.
+		rl[0] = (registerValue + 1) * superscalarMul0;
+		rl[1] = rl[0] ^ superscalarAdd1;
+		rl[2] = rl[0] ^ superscalarAdd2;
+		rl[3] = rl[0] ^ superscalarAdd3;
+		rl[4] = rl[0] ^ superscalarAdd4;
+		rl[5] = rl[0] ^ superscalarAdd5;
+		rl[6] = rl[0] ^ superscalarAdd6;
+		rl[7] = rl[0] ^ superscalarAdd7;
+		Cache& cache = mem.ds.cache;
+		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
+			// The cache line is chosen before the program runs and mixed in after it.
+			mixBlock = getMixBlock(registerValue, cache);
+			LightProgram& prog = superScalarPrograms[i];
+			for (unsigned j = 0; j < prog.getSize(); ++j) {
+				Instruction& instr = prog(j);
+				switch (instr.opcode)
+				{
+					case RandomX::LightInstructionType::ISUB_R:
+						rl[instr.dst] -= rl[instr.src];
+						break;
+					case RandomX::LightInstructionType::IXOR_R:
+						rl[instr.dst] ^= rl[instr.src];
+						break;
+					case RandomX::LightInstructionType::IADD_RS:
+						rl[instr.dst] += rl[instr.src] << (instr.mod % 4);
+						break;
+					case RandomX::LightInstructionType::IMUL_R:
+						rl[instr.dst] *= rl[instr.src];
+						break;
+					case RandomX::LightInstructionType::IROR_C:
+						rl[instr.dst] = rotr(rl[instr.dst], instr.getImm32());
+						break;
+					case RandomX::LightInstructionType::IADD_C7:
+					case RandomX::LightInstructionType::IADD_C8:
+					case RandomX::LightInstructionType::IADD_C9:
+						rl[instr.dst] += signExtend2sCompl(instr.getImm32());
+						break;
+					case RandomX::LightInstructionType::IXOR_C7:
+					case RandomX::LightInstructionType::IXOR_C8:
+					case RandomX::LightInstructionType::IXOR_C9:
+						rl[instr.dst] ^= signExtend2sCompl(instr.getImm32());
+						break;
+					case RandomX::LightInstructionType::IMULH_R:
+						rl[instr.dst] = mulh(rl[instr.dst], rl[instr.src]);
+						break;
+					case RandomX::LightInstructionType::ISMULH_R:
+						rl[instr.dst] = smulh(rl[instr.dst], rl[instr.src]);
+						break;
+					case RandomX::LightInstructionType::IMUL_RCP:
+						// precompileSuperscalar replaced the immediate with an index into reciprocals.
+						rl[instr.dst] *= reciprocals[instr.getImm32()];
+						break;
+					default:
+						UNREACHABLE;
+				}
+			}
+
+			// Mix the selected cache line (8 x 64 bits) into the local registers.
+			for(unsigned q = 0; q < 8; ++q)
+				rl[q] ^= load64(mixBlock + 8 * q);
+
+			// The program names the register that supplies the next address value.
+			registerValue = rl[prog.getAddressRegister()];
+		}
+
+		for (unsigned q = 0; q < 8; ++q)
+			r[q] ^= rl[q];
+	}
+
+	// Takes a private copy of the supplied light programs and rewrites every IMUL_RCP
+	// instruction in that copy: the reciprocal of its immediate is appended to `reciprocals`
+	// and the immediate is replaced by the position of that entry.
+	template<bool superscalar>
+	void InterpretedVirtualMachine<superscalar>::precompileSuperscalar(LightProgram* programs) {
+		memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms));
+		reciprocals.clear();
+		for (LightProgram& prog : superScalarPrograms) {
+			for (unsigned pos = 0; pos < prog.getSize(); ++pos) {
+				Instruction& current = prog(pos);
+				if (current.opcode != LightInstructionType::IMUL_RCP)
+					continue;
+				// Read the original immediate before it is overwritten with the table index.
+				const auto value = reciprocal(current.getImm32());
+				current.setImm32(reciprocals.size());
+				reciprocals.push_back(value);
+			}
+		}
+	}
+
 #include "instructionWeights.hpp"

-	void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
+	template<bool superscalar>
+	void InterpretedVirtualMachine<superscalar>::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		int registerUsage[8];
 		for (unsigned i = 0; i < 8; ++i) {
 			registerUsage[i] = -1;
@ -445,14 +575,17 @@ namespace RandomX {
 				CASE_REP(IADD_RS) {
 					auto dst = instr.dst % RegistersCount;
 					auto src = instr.src % RegistersCount;
-					ibc.type = InstructionType::IADD_R;
+					ibc.type = InstructionType::IADD_RS;
 					ibc.idst = &r[dst];
-					if (src != dst) {
+					if (dst != 5) {
 						ibc.isrc = &r[src];
+						ibc.shift = instr.mod % 4;
+						ibc.imm = 0;
 					}
 					else {
+						ibc.isrc = &r[src];
+						ibc.shift = instr.mod % 4;
 						ibc.imm = signExtend2sCompl(instr.getImm32());
-						ibc.isrc = &ibc.imm;
 					}
 					registerUsage[instr.dst] = i;
 				} break;
--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@ -23,23 +23,17 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "VirtualMachine.hpp"
 #include "Program.hpp"
 #include "intrinPortable.h"
+#include <vector>

 namespace RandomX {

-	class ITransform {
-	public:
-		virtual int32_t apply(int32_t) const = 0;
-		virtual const char* getName() const = 0;
-		virtual std::ostream& printAsm(std::ostream&) const = 0;
-		virtual std::ostream& printCxx(std::ostream&) const = 0;
-	};
-
 	struct InstructionByteCode;
-	class InterpretedVirtualMachine;
+	template<bool superscalar> class InterpretedVirtualMachine;

-	typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&);
+	template<bool superscalar>
+	using InstructionHandler = void(InterpretedVirtualMachine<superscalar>::*)(Instruction&);

-	struct alignas(8) InstructionByteCode {
+	struct InstructionByteCode {
 		union {
 			int_reg_t* idst;
 			__m128d* fdst;
@ -62,6 +56,7 @@ namespace RandomX {

 	constexpr int asedwfagdewsa = sizeof(InstructionByteCode); // NOTE(review): placeholder-looking name; presumably a leftover aid for inspecting sizeof(InstructionByteCode) - confirm whether it is still needed

+	template<bool superscalar>
 	class InterpretedVirtualMachine : public VirtualMachine {
 	public:
 		void* operator new(size_t size) {
@ -74,16 +69,17 @@ namespace RandomX {
 			_mm_free(ptr);
 		}
 		InterpretedVirtualMachine(bool soft) : softAes(soft) {}
-		~InterpretedVirtualMachine();
+		~InterpretedVirtualMachine() {}
 		void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
 		void initialize() override;
 		void execute() override;
 	private:
-		static InstructionHandler engine[256];
+		static InstructionHandler<superscalar> engine[256];
 		DatasetReadFunc readDataset;
 		bool softAes;
 		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
-		
+		std::vector<uint64_t> reciprocals;
+		alignas(64) LightProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES];
 #ifdef STATS
 		int count_ADD_64 = 0;
 		int count_ADD_32 = 0;
@ -131,7 +127,9 @@ namespace RandomX {
 		int datasetAccess[256] = { 0 };
 #endif
 		void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
+		void precompileSuperscalar(LightProgram*);
 		void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
 		void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
+		void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]);
 	};
 }
--- a/src/JitCompilerX86-static.S
+++ b/src/JitCompilerX86-static.S
@ -32,10 +32,18 @@
 .global DECL(randomx_program_start)
 .global DECL(randomx_program_read_dataset)
 .global DECL(randomx_program_read_dataset_light)
+;# Symbols exported for the SuperscalarHash dataset read and dataset init code.
+;# randomx_program_read_dataset_light_sub is already exported further down this list,
+;# so it is not repeated here.
+.global DECL(randomx_program_read_dataset_sshash_init)
+.global DECL(randomx_program_read_dataset_sshash_fin)
+.global DECL(randomx_dataset_init)
 .global DECL(randomx_program_loop_store)
 .global DECL(randomx_program_loop_end)
 .global DECL(randomx_program_read_dataset_light_sub)
 .global DECL(randomx_program_epilogue)
+.global DECL(randomx_sshash_load)
+.global DECL(randomx_sshash_prefetch)
+.global DECL(randomx_sshash_end)
+.global DECL(randomx_sshash_init)
 .global DECL(randomx_program_end)

 #define db .byte
@ -63,6 +71,12 @@ DECL(randomx_program_read_dataset):
 DECL(randomx_program_read_dataset_light):
 	#include "asm/program_read_dataset_light.inc"

+DECL(randomx_program_read_dataset_sshash_init):
+	#include "asm/program_read_dataset_sshash_init.inc"
+
+DECL(randomx_program_read_dataset_sshash_fin):
+	#include "asm/program_read_dataset_sshash_fin.inc"
+
 DECL(randomx_program_loop_store):
 	#include "asm/program_loop_store.inc"

@ -75,10 +89,84 @@ DECL(randomx_program_read_dataset_light_sub):
 squareHashSub:
 	#include "asm/squareHash.inc"

+.balign 64
+DECL(randomx_dataset_init):
+	push rbx      ;# rbx, rbp and r12-r15 are saved here and restored before ret
+	push rbp
+	push r12
+	push r13
+	push r14
+	push r15
+	;# cache in rdi
+	;# dataset in rsi
+	mov rbp, rdx  ;# block index
+	push rcx      ;# max. block index, kept at [rsp] for the loop compare below
+init_block_loop:
+	prefetchw byte ptr [rsi]  ;# prefetch the destination line with intent to write
+	mov rbx, rbp              ;# rbx = current block index (randomx_sshash_init reads rbx)
+	.byte 232 ;# 0xE8 = call
+	;# rel32 operand of the call; it is relative to call_offset, so the target is randomx_dataset_init + 32768
+	.int 32768 - (call_offset - DECL(randomx_dataset_init))
+call_offset:
+	mov qword ptr [rsi+0], r8   ;# store r8-r15 as one 64-byte item at [rsi]
+	mov qword ptr [rsi+8], r9
+	mov qword ptr [rsi+16], r10
+	mov qword ptr [rsi+24], r11
+	mov qword ptr [rsi+32], r12
+	mov qword ptr [rsi+40], r13
+	mov qword ptr [rsi+48], r14
+	mov qword ptr [rsi+56], r15
+	add rbp, 1                  ;# next block index
+	add rsi, 64                 ;# next 64-byte output slot
+	cmp rbp, qword ptr [rsp]    ;# compare with the saved max. block index
+	jb init_block_loop          ;# continue while the block index is below it (unsigned)
+	pop rcx                     ;# drop the saved max. block index
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rbp
+	pop rbx
+	ret
+
 .balign 64
 DECL(randomx_program_epilogue):
 	#include "asm/program_epilogue_linux.inc"

+.balign 64
+DECL(randomx_sshash_load):
+	#include "asm/program_sshash_load.inc"
+
+DECL(randomx_sshash_prefetch):
+	#include "asm/program_sshash_prefetch.inc"
+
+DECL(randomx_sshash_end):
+	nop
+
+.balign 64
+DECL(randomx_sshash_init):
+	lea r8, [rbx+1]                    ;# r8 = block number (rbx) + 1, computed as a 64-bit add
+	#include "asm/program_sshash_prefetch.inc"
+	imul r8, qword ptr r0_mul[rip]     ;# r8 *= r0_mul; r8 is final from here on
+	mov r9, qword ptr r1_add[rip]      ;# r9..r15 = rN_add XOR r8 (the constants are XORed, not added)
+	xor r9, r8
+	mov r10, qword ptr r2_add[rip]
+	xor r10, r8
+	mov r11, qword ptr r3_add[rip]
+	xor r11, r8
+	mov r12, qword ptr r4_add[rip]
+	xor r12, r8
+	mov r13, qword ptr r5_add[rip]
+	xor r13, r8
+	mov r14, qword ptr r6_add[rip]
+	xor r14, r8
+	mov r15, qword ptr r7_add[rip]
+	xor r15, r8
+	jmp DECL(randomx_program_end)      ;# randomx_program_end is the nop marker label at the end of this file
+
+.balign 64
+	#include "asm/program_sshash_constants.inc"
+	
 .balign 64
 DECL(randomx_program_end):
 	nop
--- a/src/JitCompilerX86-static.asm
+++ b/src/JitCompilerX86-static.asm
@ -68,35 +68,11 @@ randomx_program_read_dataset_light PROC
 randomx_program_read_dataset_light ENDP

 randomx_program_read_dataset_sshash_init PROC
-	sub rsp, 72
-	mov qword ptr [rsp+64], rbx
-	mov qword ptr [rsp+56], r8
-	mov qword ptr [rsp+48], r9
-	mov qword ptr [rsp+40], r10
-	mov qword ptr [rsp+32], r11
-	mov qword ptr [rsp+24], r12
-	mov qword ptr [rsp+16], r13
-	mov qword ptr [rsp+8], r14
-	mov qword ptr [rsp+0], r15
-	xor rbp, rax                       ;# modify "mx"
-	ror rbp, 32                        ;# swap "ma" and "mx"
-	mov ebx, ebp                       ;# ecx = ma
-	and ebx, 2147483584                ;# align "ma" to the start of a cache line
-	shr ebx, 6                         ;# ebx = Dataset block number
-	;# call 32768
+	include asm/program_read_dataset_sshash_init.inc
 randomx_program_read_dataset_sshash_init ENDP

 randomx_program_read_dataset_sshash_fin PROC
-	mov rbx, qword ptr [rsp+64]
-	xor r8, qword ptr [rsp+56]
-	xor r9, qword ptr [rsp+48]
-	xor r10, qword ptr [rsp+40]
-	xor r11, qword ptr [rsp+32]
-	xor r12, qword ptr [rsp+24]
-	xor r13, qword ptr [rsp+16]
-	xor r14, qword ptr [rsp+8]
-	xor r15, qword ptr [rsp+0]
-	add rsp, 72
+	include asm/program_read_dataset_sshash_fin.inc
 randomx_program_read_dataset_sshash_fin ENDP

 randomx_program_loop_store PROC
--- a/src/LightProgramGenerator.cpp
+++ b/src/LightProgramGenerator.cpp
@ -17,10 +17,11 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

+#include <stddef.h>
 #include "blake2/blake2.h"
 #include "configuration.h"
 #include "Program.hpp"
-#include "blake2/endian.h";
+#include "blake2/endian.h"
 #include <iostream>
 #include <vector>
 #include <algorithm>
@ -793,7 +794,7 @@ namespace RandomX {
 				mop.setCycle(scheduleCycle);
 				if (scheduleCycle < 0) {
 					if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl;
-					return DBL_MIN;
+					return 0;
 				}

 				if (instrIndex == currentInstruction.getInfo().getSrcOp()) {
--- a/src/asm/program_read_dataset_sshash_fin.inc
+++ b/src/asm/program_read_dataset_sshash_fin.inc
@ -0,0 +1,10 @@
+	mov rbx, qword ptr [rsp+64]        ;# restore rbx from the 72-byte frame filled by program_read_dataset_sshash_init.inc
+	xor r8, qword ptr [rsp+56]         ;# r8..r15 ^= the register values saved at [rsp+56]..[rsp+0]
+	xor r9, qword ptr [rsp+48]
+	xor r10, qword ptr [rsp+40]
+	xor r11, qword ptr [rsp+32]
+	xor r12, qword ptr [rsp+24]
+	xor r13, qword ptr [rsp+16]
+	xor r14, qword ptr [rsp+8]
+	xor r15, qword ptr [rsp+0]
+	add rsp, 72                        ;# release the frame
--- a/src/asm/program_read_dataset_sshash_init.inc
+++ b/src/asm/program_read_dataset_sshash_init.inc
@ -0,0 +1,16 @@
+	sub rsp, 72                        ;# reserve 9 x 8 bytes for rbx and r8-r15
+	mov qword ptr [rsp+64], rbx        ;# save rbx; it is reused below for the block number
+	mov qword ptr [rsp+56], r8         ;# save r8-r15; program_read_dataset_sshash_fin.inc XORs them back in
+	mov qword ptr [rsp+48], r9
+	mov qword ptr [rsp+40], r10
+	mov qword ptr [rsp+32], r11
+	mov qword ptr [rsp+24], r12
+	mov qword ptr [rsp+16], r13
+	mov qword ptr [rsp+8], r14
+	mov qword ptr [rsp+0], r15
+	xor rbp, rax                       ;# modify "mx"
+	ror rbp, 32                        ;# swap "ma" and "mx"
+	mov ebx, ebp                       ;# ebx = ma
+	and ebx, 2147483584                ;# align "ma" to the start of a cache line (mask 0x7FFFFFC0)
+	shr ebx, 6                         ;# ebx = Dataset block number
+	;# call 32768
--- a/src/asm/program_sshash_constants.inc
+++ b/src/asm/program_sshash_constants.inc
@ -1,16 +1,24 @@
-r0_mul: ;# 6364136223846793005
+r0_mul:
+	;#/ 6364136223846793005
 	db 45, 127, 149, 76, 45, 244, 81, 88
-r1_add: ;# 9298410992540426048
+r1_add:
+	;#/ 9298410992540426048
 	db 64, 159, 245, 89, 136, 151, 10, 129
-r2_add: ;# 12065312585734608966
+r2_add:
+	;#/ 12065312585734608966
 	db 70, 216, 194, 56, 223, 153, 112, 167
-r3_add: ;# 9306329213124610396
+r3_add:
+	;#/ 9306329213124610396
 	db 92, 9, 34, 191, 28, 185, 38, 129
-r4_add: ;# 5281919268842080866
+r4_add:
+	;#/ 5281919268842080866
 	db 98, 138, 159, 23, 151, 37, 77, 73
-r5_add: ;# 10536153434571861004
+r5_add:
+	;#/ 10536153434571861004
 	db 12, 236, 170, 206, 185, 239, 55, 146
-r6_add: ;# 3398623926847679864
+r6_add:
+	;#/ 3398623926847679864
 	db 120, 45, 230, 108, 116, 86, 42, 47
-r7_add: ;# 9549104520008361294
+r7_add:
+	;#/ 9549104520008361294
 	db 78, 229, 44, 182, 247, 59, 133, 132
--- a/src/main.cpp
+++ b/src/main.cpp
@ -301,6 +301,7 @@ int main(int argc, char** argv) {
 				RandomX::JitCompilerX86 jit86;
 				jit86.generateSuperScalarHash(programs);
 				jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount);
+			//dump((const char*)dataset.dataset.memory, RANDOMX_DATASET_SIZE, "dataset.dat");
 			}
 			else {
 				if (initThreadCount > 1) {
@ -331,10 +332,12 @@ int main(int argc, char** argv) {
 			else {
 				if (jit && useSuperscalar)
 					vm = new RandomX::CompiledLightVirtualMachine<true>();
-				else if(jit)
+				else if (jit)
 					vm = new RandomX::CompiledLightVirtualMachine<false>();
+				else if (useSuperscalar)
+					vm = new RandomX::InterpretedVirtualMachine<true>(softAes);
 				else
-					vm = new RandomX::InterpretedVirtualMachine(softAes);
+					vm = new RandomX::InterpretedVirtualMachine<false>(softAes);
 			}
 			vm->setDataset(dataset, datasetSize, programs);
 			vms.push_back(vm);