C API - first working version

5 years ago · 296e77eebc
parent 67046a9f38
commit 296e77eebc
36 changed files with 1277 additions and 929 deletions
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -25,7 +25,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "Program.hpp"
 #include "superscalarGenerator.hpp"

-namespace RandomX {
+namespace randomx {

 	static const char* regR[8] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" };
 	static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" };
@@ -69,54 +69,54 @@ namespace RandomX {
 			Instruction& instr = prog(i);
 			switch (instr.opcode)
 			{
-			case RandomX::SuperscalarInstructionType::ISUB_R:
+			case SuperscalarInstructionType::ISUB_R:
 				asmCode << "sub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IXOR_R:
+			case SuperscalarInstructionType::IXOR_R:
 				asmCode << "xor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IADD_RS:
+			case SuperscalarInstructionType::IADD_RS:
 				asmCode << "lea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift2())) << "]" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IMUL_R:
+			case SuperscalarInstructionType::IMUL_R:
 				asmCode << "imul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IROR_C:
+			case SuperscalarInstructionType::IROR_C:
 				asmCode << "ror " << regR[instr.dst] << ", " << instr.getImm32() << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IADD_C7:
+			case SuperscalarInstructionType::IADD_C7:
 				asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IXOR_C7:
+			case SuperscalarInstructionType::IXOR_C7:
 				asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IADD_C8:
+			case SuperscalarInstructionType::IADD_C8:
 				asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
 				asmCode << "nop" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IXOR_C8:
+			case SuperscalarInstructionType::IXOR_C8:
 				asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
 				asmCode << "nop" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IADD_C9:
+			case SuperscalarInstructionType::IADD_C9:
 				asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
 				asmCode << "xchg ax, ax ;nop" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IXOR_C9:
+			case SuperscalarInstructionType::IXOR_C9:
 				asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
 				asmCode << "xchg ax, ax ;nop" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IMULH_R:
+			case SuperscalarInstructionType::IMULH_R:
 				asmCode << "mov rax, " << regR[instr.dst] << std::endl;
 				asmCode << "mul " << regR[instr.src] << std::endl;
 				asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::ISMULH_R:
+			case SuperscalarInstructionType::ISMULH_R:
 				asmCode << "mov rax, " << regR[instr.dst] << std::endl;
 				asmCode << "imul " << regR[instr.src] << std::endl;
 				asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IMUL_RCP:
+			case SuperscalarInstructionType::IMUL_RCP:
 				asmCode << "mov rax, " << (int64_t)reciprocal(instr.getImm32()) << std::endl;
 				asmCode << "imul " << regR[instr.dst] << ", rax" << std::endl;
 				break;
@@ -178,38 +178,38 @@ namespace RandomX {
 			Instruction& instr = prog(i);
 			switch (instr.opcode)
 			{
-			case RandomX::SuperscalarInstructionType::ISUB_R:
+			case SuperscalarInstructionType::ISUB_R:
 				asmCode << regR[instr.dst] << " -= " << regR[instr.src] << ";" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IXOR_R:
+			case SuperscalarInstructionType::IXOR_R:
 				asmCode << regR[instr.dst] << " ^= " << regR[instr.src] << ";" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IADD_RS:
+			case SuperscalarInstructionType::IADD_RS:
 				asmCode << regR[instr.dst] << " += " << regR[instr.src] << "*" << (1 << (instr.getModShift2())) << ";" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IMUL_R:
+			case SuperscalarInstructionType::IMUL_R:
 				asmCode << regR[instr.dst] << " *= " << regR[instr.src] << ";" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IROR_C:
+			case SuperscalarInstructionType::IROR_C:
 				asmCode << regR[instr.dst] << " = rotr(" << regR[instr.dst] << ", " << instr.getImm32() << ");" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IADD_C7:
-			case RandomX::SuperscalarInstructionType::IADD_C8:
-			case RandomX::SuperscalarInstructionType::IADD_C9:
+			case SuperscalarInstructionType::IADD_C7:
+			case SuperscalarInstructionType::IADD_C8:
+			case SuperscalarInstructionType::IADD_C9:
 				asmCode << regR[instr.dst] << " += " << (int32_t)instr.getImm32() << ";" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IXOR_C7:
-			case RandomX::SuperscalarInstructionType::IXOR_C8:
-			case RandomX::SuperscalarInstructionType::IXOR_C9:
+			case SuperscalarInstructionType::IXOR_C7:
+			case SuperscalarInstructionType::IXOR_C8:
+			case SuperscalarInstructionType::IXOR_C9:
 				asmCode << regR[instr.dst] << " ^= " << (int32_t)instr.getImm32() << ";" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IMULH_R:
+			case SuperscalarInstructionType::IMULH_R:
 				asmCode << regR[instr.dst] << " = mulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::ISMULH_R:
+			case SuperscalarInstructionType::ISMULH_R:
 				asmCode << regR[instr.dst] << " = smulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl;
 				break;
-			case RandomX::SuperscalarInstructionType::IMUL_RCP:
+			case SuperscalarInstructionType::IMUL_RCP:
 				asmCode << regR[instr.dst] << " *= " << (int64_t)reciprocal(instr.getImm32()) << ";" << std::endl;
 				break;
 			default:
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -24,7 +24,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "common.hpp"
 #include <sstream>

-namespace RandomX {
+namespace randomx {

 	class Program;
 	class SuperscalarProgram;
--- a/src/Blake2Generator.cpp
+++ b/src/Blake2Generator.cpp
@@ -23,7 +23,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "Blake2Generator.hpp"
 #include "common.hpp"

-namespace RandomX {
+namespace randomx {

 	Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) {
 		memset(data, 0, sizeof(data));
--- a/src/Blake2Generator.hpp
+++ b/src/Blake2Generator.hpp
@@ -20,7 +20,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once
 #include <cstdint>

-namespace RandomX {
+namespace randomx {

 	class Blake2Generator {
 	public:
--- a/src/Cache.cpp
+++ b/src/Cache.cpp
@@ -1,82 +0,0 @@
-/*
-Copyright (c) 2018 tevador
-
-This file is part of RandomX.
-
-RandomX is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-RandomX is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
-*/
-
-#include <cstring>
-#include "Cache.hpp"
-#include "argon2.h"
-#include "argon2_core.h"
-
-namespace RandomX {
-
-	static_assert(RANDOMX_ARGON_MEMORY % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_MEMORY - invalid value");
-	static_assert(RANDOMX_ARGON_GROWTH % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_GROWTH - invalid value");
-
-	void argonFill(Cache& cache, const void* seed, size_t seedSize) {
-		uint32_t memory_blocks, segment_length;
-		argon2_instance_t instance;
-		argon2_context context;
-
-		context.out = nullptr;
-		context.outlen = 0;
-		context.pwd = CONST_CAST(uint8_t *)seed;
-		context.pwdlen = (uint32_t)seedSize;
-		context.salt = CONST_CAST(uint8_t *)RANDOMX_ARGON_SALT;
-		context.saltlen = (uint32_t)ArgonSaltSize;
-		context.secret = NULL;
-		context.secretlen = 0;
-		context.ad = NULL;
-		context.adlen = 0;
-		context.t_cost = RANDOMX_ARGON_ITERATIONS;
-		context.m_cost = cache.size / ArgonBlockSize;
-		context.lanes = RANDOMX_ARGON_LANES;
-		context.threads = 1;
-		context.allocate_cbk = NULL;
-		context.free_cbk = NULL;
-		context.flags = ARGON2_DEFAULT_FLAGS;
-		context.version = ARGON2_VERSION_NUMBER;
-
-		/* 2. Align memory size */
-		/* Minimum memory_blocks = 8L blocks, where L is the number of lanes */
-		memory_blocks = context.m_cost;
-
-		segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS);
-
-		instance.version = context.version;
-		instance.memory = NULL;
-		instance.passes = context.t_cost;
-		instance.memory_blocks = memory_blocks;
-		instance.segment_length = segment_length;
-		instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
-		instance.lanes = context.lanes;
-		instance.threads = context.threads;
-		instance.type = Argon2_d;
-		instance.memory = (block*)cache.memory;
-
-		if (instance.threads > instance.lanes) {
-			instance.threads = instance.lanes;
-		}
-
-		/* 3. Initialization: Hashing inputs, allocating memory, filling first
-		 * blocks
-		 */
-		argon_initialize(&instance, &context);
-
-		fill_memory_blocks(&instance);
-	}
-}
--- a/src/Cache.hpp
+++ b/src/Cache.hpp
@@ -1,52 +0,0 @@
-/*
-Copyright (c) 2018 tevador
-
-This file is part of RandomX.
-
-RandomX is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-RandomX is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
-*/
-
-#pragma once
-
-#include <cstdint>
-#include <new>
-#include "common.hpp"
-#include "intrinPortable.h"
-#include "virtualMemory.hpp"
-
-namespace RandomX {
-
-	void argonFill(Cache& cache, const void* seed, size_t seedSize);
-
-	inline uint8_t* allocCache(size_t size, bool largePages) {
-		if (largePages) {
-			return (uint8_t*)allocLargePagesMemory(size);
-		}
-		else {
-			void* ptr = _mm_malloc(size, CacheLineSize);
-			if (ptr == nullptr)
-				throw std::bad_alloc();
-			return (uint8_t*)ptr;
-		}
-	}
-
-	inline void deallocCache(Cache cache, bool largePages) {
-		if (largePages) {
-			freePagedMemory(cache.memory, cache.size);
-		}
-		else {
-			_mm_free(cache.memory);
-		}
-	}
-}
--- a/src/CompiledLightVirtualMachine.cpp
+++ b/src/CompiledLightVirtualMachine.cpp
@@ -21,27 +21,25 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "common.hpp"
 #include <stdexcept>

-namespace RandomX {
-
-	template<bool superscalar>
-	void CompiledLightVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
-		mem.ds = ds;
-		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
-		if(superscalar)
-			compiler.generateSuperScalarHash(programs);
+namespace randomx {
+
+	template<class Allocator, bool softAes>
+	void CompiledLightVm<Allocator, softAes>::setCache(randomx_cache* cache) {
+		this->mem.memory = cache->memory;
+		//datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
+		this->compiler.generateSuperscalarHash(cache->programs, cache->reciprocalCache);
 		//datasetBasePtr = ds.dataset.memory;
 	}

-	template void CompiledLightVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
-	template void CompiledLightVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
-
-	template<bool superscalar>
-	void CompiledLightVirtualMachine<superscalar>::initialize() {
-		VirtualMachine::initialize();
-		compiler.generateProgramLight<superscalar>(program, config);
+	template<class Allocator, bool softAes>
+	void CompiledLightVm<Allocator, softAes>::initialize() {
+		randomx_vm::initialize();
+		this->compiler.generateProgramLight(this->program, this->config);
 		//mem.ds.dataset.memory = datasetBasePtr + (datasetBase * CacheLineSize);
 	}

-	template void CompiledLightVirtualMachine<true>::initialize();
-	template void CompiledLightVirtualMachine<false>::initialize();
+	template class CompiledLightVm<AlignedAllocator<CacheLineSize>, false>;
+	template class CompiledLightVm<AlignedAllocator<CacheLineSize>, true>;
+	template class CompiledLightVm<LargePageAllocator, false>;
+	template class CompiledLightVm<LargePageAllocator, true>;
 }
--- a/src/CompiledLightVirtualMachine.hpp
+++ b/src/CompiledLightVirtualMachine.hpp
@@ -24,22 +24,27 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "JitCompilerX86.hpp"
 #include "intrinPortable.h"

-namespace RandomX {
+namespace randomx {

-	template<bool superscalar>
-	class CompiledLightVirtualMachine : public CompiledVirtualMachine {
+	template<class Allocator, bool softAes>
+	class CompiledLightVm : public CompiledVm<Allocator, softAes> {
 	public:
 		void* operator new(size_t size) {
-			void* ptr = _mm_malloc(size, 64);
+			void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
 			if (ptr == nullptr)
 				throw std::bad_alloc();
 			return ptr;
 		}
 		void operator delete(void* ptr) {
-			_mm_free(ptr);
+			AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(CompiledLightVm));
 		}
-		CompiledLightVirtualMachine() {}
-		void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
+		void setCache(randomx_cache* cache) override;
+		void setDataset(randomx_dataset* dataset) override {}
 		void initialize() override;
 	};
+
+	using CompiledLightVmDefault = CompiledLightVm<AlignedAllocator<CacheLineSize>, true>; // softAes = true
+	using CompiledLightVmHardAes = CompiledLightVm<AlignedAllocator<CacheLineSize>, false>; // softAes = false
+	using CompiledLightVmLargePage = CompiledLightVm<LargePageAllocator, true>; // softAes = true, same as CompiledLightVmDefault
+	using CompiledLightVmLargePageHardAes = CompiledLightVm<LargePageAllocator, false>; // softAes = false, same as CompiledLightVmHardAes
 }
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -21,34 +21,34 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "common.hpp"
 #include <stdexcept>

-namespace RandomX {
+namespace randomx {

-	//static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct RandomX::MemoryRegisters");
-	static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct RandomX::RegisterFile");
+	static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct randomx::MemoryRegisters");
+	static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct randomx::RegisterFile");

-	CompiledVirtualMachine::CompiledVirtualMachine() {
-	}

-	void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
-		mem.ds = ds;
-		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
-		datasetBasePtr = ds.dataset.memory;
+	template<class Allocator, bool softAes>
+	void CompiledVm<Allocator, softAes>::setDataset(randomx_dataset* dataset) {
+		this->mem.memory = dataset->memory;
+		//datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
+		//datasetBasePtr = ds.dataset.memory;
 	}

-	void CompiledVirtualMachine::initialize() {
-		VirtualMachine::initialize();
-		compiler.generateProgram(program, config);
-		mem.ds.dataset.memory = datasetBasePtr + (datasetBase * CacheLineSize);
+	template<class Allocator, bool softAes>
+	void CompiledVm<Allocator, softAes>::initialize() {
+		randomx_vm::initialize();
+		this->compiler.generateProgram(this->program, this->config);
+		//mem.ds.dataset.memory = datasetBasePtr + (datasetBase * CacheLineSize);
 	}

-	void CompiledVirtualMachine::execute() {
+	template<class Allocator, bool softAes>
+	void CompiledVm<Allocator, softAes>::execute() {
 		//executeProgram(reg, mem, scratchpad, InstructionCount);
-		compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS);
-#ifdef TRACEVM
-		for (int32_t i = InstructionCount - 1; i >= 0; --i) {
-			std::cout << std::hex << tracepad[i].u64 << std::endl;
-		}
-#endif
-
+		compiler.getProgramFunc()(this->reg, this->mem, this->scratchpad, RANDOMX_PROGRAM_ITERATIONS);
 	}
+
+	template class CompiledVm<AlignedAllocator<CacheLineSize>, false>;
+	template class CompiledVm<AlignedAllocator<CacheLineSize>, true>;
+	template class CompiledVm<LargePageAllocator, false>;
+	template class CompiledVm<LargePageAllocator, true>;
 }
--- a/src/CompiledVirtualMachine.hpp
+++ b/src/CompiledVirtualMachine.hpp
@@ -18,38 +18,39 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 #pragma once
-//#define TRACEVM
+
 #include <new>
 #include "VirtualMachine.hpp"
 #include "JitCompilerX86.hpp"
-#include "intrinPortable.h"

-namespace RandomX {
+namespace randomx {

 	extern "C" {
 		void executeProgram(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t);
 	}

-	class CompiledVirtualMachine : public VirtualMachine {
+	template<class Allocator, bool softAes>
+	class CompiledVm : public VmBase<Allocator, softAes> {
 	public:
 		void* operator new(size_t size) {
-			void* ptr = _mm_malloc(size, 64);
+			void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
 			if (ptr == nullptr)
 				throw std::bad_alloc();
 			return ptr;
 		}
 		void operator delete(void* ptr) {
-			_mm_free(ptr);
+			AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(CompiledVm));
 		}
-		CompiledVirtualMachine();
-		void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
+		void setDataset(randomx_dataset* dataset) override;
+		void execute() override;
 		void initialize() override;
-		virtual void execute() override;
-		void* getProgram() {
-			return compiler.getCode();
-		}
 	protected:
 		JitCompilerX86 compiler;
 		uint8_t* datasetBasePtr;
 	};
-}
+
+	using CompiledVmDefault = CompiledVm<AlignedAllocator<CacheLineSize>, true>; // softAes = true
+	using CompiledVmHardAes = CompiledVm<AlignedAllocator<CacheLineSize>, false>; // softAes = false
+	using CompiledVmLargePage = CompiledVm<LargePageAllocator, true>; // softAes = true, same as CompiledVmDefault
+	using CompiledVmLargePageHardAes = CompiledVm<LargePageAllocator, false>; // softAes = false, same as CompiledVmHardAes
+}
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -20,7 +20,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "Instruction.hpp"
 #include "common.hpp"

-namespace RandomX {
+namespace randomx {

 	void Instruction::print(std::ostream& os) const {
 		os << names[opcode] << " ";
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@@ -23,7 +23,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <iostream>
 #include "blake2/endian.h"

-namespace RandomX {
+namespace randomx {

 	class Instruction;

--- a/src/InterpretedLightVirtualMachine.cpp
+++ b/src/InterpretedLightVirtualMachine.cpp
@@ -0,0 +1,47 @@
+/*
+Copyright (c) 2018 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "InterpretedLightVirtualMachine.hpp"
+#include "dataset.hpp"
+
+namespace randomx {
+
+	template<class Allocator, bool softAes>
+	void InterpretedLightVm<Allocator, softAes>::setCache(randomx_cache* cache) { // attaches a cache to this VM
+		mem.memory = cache->memory; // point the VM memory registers at the cache's memory
+		//datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
+		cachePtr = cache; // stored so datasetRead can pass it to initDatasetBlock
+	}
+
+	template<class Allocator, bool softAes>
+	void InterpretedLightVm<Allocator, softAes>::datasetRead(uint32_t address, int_reg_t(&r)[8]) { // XORs one block, built from the cache, into r
+		uint32_t blockNumber = address / CacheLineSize; // index of the CacheLineSize-sized block containing address
+		int_reg_t rl[8]; // scratch buffer filled by initDatasetBlock below
+		
+		initDatasetBlock(cachePtr, (uint8_t*)rl, blockNumber); // cachePtr is the cache stored by setCache
+
+		for (unsigned q = 0; q < 8; ++q)
+			r[q] ^= rl[q]; // mix the block into the caller's registers
+	}
+
+	template class InterpretedLightVm<AlignedAllocator<CacheLineSize>, false>;
+	template class InterpretedLightVm<AlignedAllocator<CacheLineSize>, true>;
+	template class InterpretedLightVm<LargePageAllocator, false>;
+	template class InterpretedLightVm<LargePageAllocator, true>;
+}
--- a/src/InterpretedLightVirtualMachine.hpp
+++ b/src/InterpretedLightVirtualMachine.hpp
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2018 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <new>
+#include "InterpretedVirtualMachine.hpp"
+#include "superscalar_program.hpp"
+
+namespace randomx {
+
+	template<class Allocator, bool softAes>
+	class InterpretedLightVm : public InterpretedVm<Allocator, softAes> { // interpreted VM whose datasetRead builds blocks from a randomx_cache
+	public:
+		using VmBase<Allocator, softAes>::mem; // lets member functions name mem without this->
+		void* operator new(size_t size) { // allocates the VM object via AlignedAllocator<CacheLineSize>
+			void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
+			if (ptr == nullptr)
+				throw std::bad_alloc(); // a null result is reported as std::bad_alloc
+			return ptr;
+		}
+		void operator delete(void* ptr) { // releases through the same allocator used by operator new
+			AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(InterpretedLightVm));
+		}
+		void setDataset(randomx_dataset* dataset) override { } // no-op: empty body, the argument is ignored
+		void setCache(randomx_cache* cache) override; // stores the cache pointer and its memory
+	protected:
+		virtual void datasetRead(uint32_t address, int_reg_t(&r)[8]); // XORs the block containing address into r
+	private:
+		randomx_cache* cachePtr; // written by setCache, read by datasetRead
+	};
+
+	using InterpretedLightVmDefault = InterpretedLightVm<AlignedAllocator<CacheLineSize>, true>; // softAes = true
+	using InterpretedLightVmHardAes = InterpretedLightVm<AlignedAllocator<CacheLineSize>, false>; // softAes = false
+	using InterpretedLightVmLargePage = InterpretedLightVm<LargePageAllocator, true>; // softAes = true, same as InterpretedLightVmDefault
+	using InterpretedLightVmLargePageHardAes = InterpretedLightVm<LargePageAllocator, false>; // softAes = false, same as InterpretedLightVmHardAes
+}
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -21,7 +21,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #define RANDOMX_JUMP
 #include "InterpretedVirtualMachine.hpp"
 #include "dataset.hpp"
-#include "Cache.hpp"
 #include <iostream>
 #include <iomanip>
 #include <stdexcept>
@@ -32,10 +31,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <climits>
 #include "intrinPortable.h"
 #include "reciprocal.h"
-#ifdef STATS
-#include <algorithm>
-#endif
-#include "superscalarGenerator.hpp"

 #ifdef FPUCHECK
 constexpr bool fpuCheck = true;
@@ -43,44 +38,31 @@ constexpr bool fpuCheck = true;
 constexpr bool fpuCheck = false;
 #endif

-namespace RandomX {
+namespace randomx {

 	static int_reg_t Zero = 0;

-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
-		mem.ds = ds;
-		readDataset = &datasetReadLight;
-		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
-		if(superscalar)
-			precompileSuperscalar(programs);
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::setDataset(randomx_dataset* dataset) {
+		mem.memory = dataset->memory;
 	}

-	template void InterpretedVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
-	template void InterpretedVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
-
-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::initialize() {
-		VirtualMachine::initialize();
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::initialize() {
+		randomx_vm::initialize();
 		for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) {
 			program(i).src %= RegistersCount;
 			program(i).dst %= RegistersCount;
 		}
 	}

-	template void InterpretedVirtualMachine<true>::initialize();
-	template void InterpretedVirtualMachine<false>::initialize();
-
-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) {
 			executeBytecode(ic, r, f, e, a);
 		}
 	}

-	template void InterpretedVirtualMachine<true>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
-	template void InterpretedVirtualMachine<false>::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
-
 	static void print(int_reg_t r) {
 		std::cout << std::hex << std::setw(16) << std::setfill('0') << r << std::endl;
 	}
@@ -110,14 +92,14 @@ namespace RandomX {
 		return std::fpclassify(x) == FP_SUBNORMAL;
 	}

-	template<bool superscalar>
-	FORCE_INLINE void* InterpretedVirtualMachine<superscalar>::getScratchpadAddress(InstructionByteCode& ibc) {
+	template<class Allocator, bool softAes>
+	FORCE_INLINE void* InterpretedVm<Allocator, softAes>::getScratchpadAddress(InstructionByteCode& ibc) {
 		uint32_t addr = (*ibc.isrc + ibc.imm) & ibc.memMask;
 		return scratchpad + addr;
 	}

-	template<bool superscalar>
-	FORCE_INLINE __m128d InterpretedVirtualMachine<superscalar>::maskRegisterExponentMantissa(__m128d x) {
+	template<class Allocator, bool softAes>
+	FORCE_INLINE __m128d InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(__m128d x) {
 		constexpr uint64_t mantissaMask64 = (1ULL << 52) - 1;
 		const __m128d mantissaMask = _mm_castsi128_pd(_mm_set_epi64x(mantissaMask64, mantissaMask64));
 		const __m128d exponentMask = _mm_load_pd((const double*)&config.eMask);
@@ -126,8 +108,8 @@ namespace RandomX {
 		return x;
 	}

-	 template<bool superscalar>
-	 FORCE_INLINE void InterpretedVirtualMachine<superscalar>::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		auto& ibc = byteCode[ic];
 		if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic);
 		//if(trace) printState(r, f, e, a);
@@ -318,8 +300,8 @@ namespace RandomX {
 #endif
 	}

-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::execute() {
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::execute() {
 		int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 		__m128d f[4];
 		__m128d e[4];
@@ -380,16 +362,8 @@ namespace RandomX {

 			mem.mx ^= r[config.readReg2] ^ r[config.readReg3];
 			mem.mx &= CacheLineAlignMask;
-			if (superscalar) {
-				executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r);
-			}
-			else {
-				Cache& cache = mem.ds.cache;
-				uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
-				initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
-				for (int i = 0; i < RegistersCount; ++i)
-					r[i] ^= datasetLine[i];
-			}
+			datasetRead(mem.ma, r);
+			//executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r);
 			std::swap(mem.mx, mem.ma);

 			if (trace) {
@@ -454,9 +428,6 @@ namespace RandomX {
 		_mm_store_pd(&reg.e[3].lo, e[3]);
 	}

-	template void InterpretedVirtualMachine<true>::execute();
-	template void InterpretedVirtualMachine<false>::execute();
-
 	static int getConditionRegister(int(&registerUsage)[8]) {
 		int min = INT_MAX;
 		int minIndex;
@ -469,108 +440,14 @@ namespace RandomX {
 		return minIndex;
 	}

-	constexpr uint64_t superscalarMul0 = 6364136223846793005ULL;
-	constexpr uint64_t superscalarAdd1 = 9298410992540426748ULL;
-	constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL;
-	constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL;
-	constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL;
-	constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL;
-	constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL;
-	constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL;
-
-	static uint8_t* getMixBlock(uint64_t registerValue, Cache& cache) {
-		uint8_t* mixBlock;
-		if (RANDOMX_ARGON_GROWTH == 0) {
-			constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
-			mixBlock = cache.memory + (registerValue & mask) * CacheLineSize;
-		}
-		else {
-			const uint32_t modulus = cache.size / CacheLineSize;
-			mixBlock = cache.memory + (registerValue % modulus) * CacheLineSize;
-		}
-		return mixBlock;
-	}
-
-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t>& reciprocals) {
-		for (unsigned j = 0; j < prog.getSize(); ++j) {
-			Instruction& instr = prog(j);
-			switch (instr.opcode)
-			{
-			case RandomX::SuperscalarInstructionType::ISUB_R:
-				r[instr.dst] -= r[instr.src];
-				break;
-			case RandomX::SuperscalarInstructionType::IXOR_R:
-				r[instr.dst] ^= r[instr.src];
-				break;
-			case RandomX::SuperscalarInstructionType::IADD_RS:
-				r[instr.dst] += r[instr.src] << instr.getModShift2();
-				break;
-			case RandomX::SuperscalarInstructionType::IMUL_R:
-				r[instr.dst] *= r[instr.src];
-				break;
-			case RandomX::SuperscalarInstructionType::IROR_C:
-				r[instr.dst] = rotr(r[instr.dst], instr.getImm32());
-				break;
-			case RandomX::SuperscalarInstructionType::IADD_C7:
-			case RandomX::SuperscalarInstructionType::IADD_C8:
-			case RandomX::SuperscalarInstructionType::IADD_C9:
-				r[instr.dst] += signExtend2sCompl(instr.getImm32());
-				break;
-			case RandomX::SuperscalarInstructionType::IXOR_C7:
-			case RandomX::SuperscalarInstructionType::IXOR_C8:
-			case RandomX::SuperscalarInstructionType::IXOR_C9:
-				r[instr.dst] ^= signExtend2sCompl(instr.getImm32());
-				break;
-			case RandomX::SuperscalarInstructionType::IMULH_R:
-				r[instr.dst] = mulh(r[instr.dst], r[instr.src]);
-				break;
-			case RandomX::SuperscalarInstructionType::ISMULH_R:
-				r[instr.dst] = smulh(r[instr.dst], r[instr.src]);
-				break;
-			case RandomX::SuperscalarInstructionType::IMUL_RCP:
-				if(superscalar)
-					r[instr.dst] *= reciprocals[instr.getImm32()];
-				else
-					r[instr.dst] *= reciprocal(instr.getImm32());
-				break;
-			default:
-				UNREACHABLE;
-			}
-		}
-	}
-
-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]) {
-		int_reg_t rl[8];
-		uint8_t* mixBlock;
-		uint64_t registerValue = blockNumber;
-		rl[0] = (blockNumber + 1) * superscalarMul0;
-		rl[1] = rl[0] ^ superscalarAdd1;
-		rl[2] = rl[0] ^ superscalarAdd2;
-		rl[3] = rl[0] ^ superscalarAdd3;
-		rl[4] = rl[0] ^ superscalarAdd4;
-		rl[5] = rl[0] ^ superscalarAdd5;
-		rl[6] = rl[0] ^ superscalarAdd6;
-		rl[7] = rl[0] ^ superscalarAdd7;
-		Cache& cache = mem.ds.cache;
-		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
-			mixBlock = getMixBlock(registerValue, cache);
-			SuperscalarProgram& prog = superScalarPrograms[i];
-			
-			executeSuperscalar(rl, prog, reciprocals);
-
-			for(unsigned q = 0; q < 8; ++q)
-				rl[q] ^= load64(mixBlock + 8 * q);
-
-			registerValue = rl[prog.getAddressRegister()];
-		}
-
-		for (unsigned q = 0; q < 8; ++q)
-			r[q] ^= rl[q];
+	// XORs RegistersCount consecutive 64-bit words, read from mem.memory at byte
+	// offset `address`, into the integer registers r[0..].
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::datasetRead(uint32_t address, int_reg_t(&r)[8]) {
+		const uint64_t* line = (const uint64_t*)(mem.memory + address);
+		int reg = 0;
+		while (reg < RegistersCount) {
+			r[reg] ^= line[reg];
+			++reg;
+		}
 	}

-	template<bool superscalar>
+	/*template<bool superscalar>
 	void InterpretedVirtualMachine<superscalar>::precompileSuperscalar(SuperscalarProgram* programs) {
 		memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms));
 		reciprocals.clear();
@ -584,12 +461,12 @@ namespace RandomX {
 				}	
 			}
 		}
-	}
+	}*/

 #include "instructionWeights.hpp"

-	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
+	template<class Allocator, bool softAes>
+	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		int registerUsage[8];
 		for (unsigned i = 0; i < 8; ++i) {
 			registerUsage[i] = -1;
@ -1007,4 +884,9 @@ namespace RandomX {
 			}
 		}
 	}
+
+	// Explicit instantiations: the member templates are defined in this .cpp file,
+	// so each allocator / softAes combination is instantiated here.
+	template class InterpretedVm<AlignedAllocator<CacheLineSize>, false>;
+	template class InterpretedVm<AlignedAllocator<CacheLineSize>, true>;
+	template class InterpretedVm<LargePageAllocator, false>;
+	template class InterpretedVm<LargePageAllocator, true>;
 }
--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@ -18,20 +18,14 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 #pragma once
-//#define STATS
+
 #include <new>
 #include "VirtualMachine.hpp"
 #include "Program.hpp"
 #include "intrinPortable.h"
 #include <vector>

-namespace RandomX {
-
-	struct InstructionByteCode;
-	template<bool superscalar> class InterpretedVirtualMachine;
-
-	template<bool superscalar>
-	using InstructionHandler = void(InterpretedVirtualMachine<superscalar>::*)(Instruction&);
+namespace randomx {

 	struct InstructionByteCode {
 		union {
@ -56,83 +50,40 @@ namespace RandomX {

 	constexpr int asedwfagdewsa = sizeof(InstructionByteCode);

-	template<bool superscalar>
-	class InterpretedVirtualMachine : public VirtualMachine {
+	template<class Allocator, bool softAes>
+	class InterpretedVm : public VmBase<Allocator, softAes> {
 	public:
+		using VmBase<Allocator, softAes>::mem;
+		using VmBase<Allocator, softAes>::scratchpad;
+		using VmBase<Allocator, softAes>::program;
+		using VmBase<Allocator, softAes>::config;
+		using VmBase<Allocator, softAes>::reg;
 		void* operator new(size_t size) {
-			void* ptr = _mm_malloc(size, 64);
+			void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
 			if (ptr == nullptr)
 				throw std::bad_alloc();
 			return ptr;
 		}
 		void operator delete(void* ptr) {
-			_mm_free(ptr);
+			AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(InterpretedVm));
 		}
-		InterpretedVirtualMachine(bool soft) : softAes(soft) {}
-		~InterpretedVirtualMachine() {}
-		void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
-		void initialize() override;
 		void execute() override;
-		static void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t>& reciprocals);
+		void setDataset(randomx_dataset* dataset) override;
+		void initialize() override;
+	protected:
+		virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[8]);
 	private:
-		static InstructionHandler<superscalar> engine[256];
-		DatasetReadFunc readDataset;
-		bool softAes;
-		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
-		std::vector<uint64_t> reciprocals;
-		alignas(64) SuperscalarProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES];
-#ifdef STATS
-		int count_ADD_64 = 0;
-		int count_ADD_32 = 0;
-		int count_SUB_64 = 0;
-		int count_SUB_32 = 0;
-		int count_MUL_64 = 0;
-		int count_MULH_64 = 0;
-		int count_MUL_32 = 0;
-		int count_IMUL_32 = 0;
-		int count_IMULH_64 = 0;
-		int count_DIV_64 = 0;
-		int count_IDIV_64 = 0;
-		int count_AND_64 = 0;
-		int count_AND_32 = 0;
-		int count_OR_64 = 0;
-		int count_OR_32 = 0;
-		int count_XOR_64 = 0;
-		int count_XOR_32 = 0;
-		int count_SHL_64 = 0;
-		int count_SHR_64 = 0;
-		int count_SAR_64 = 0;
-		int count_ROL_64 = 0;
-		int count_ROR_64 = 0;
-		int count_FADD = 0;
-		int count_FSUB = 0;
-		int count_FMUL = 0;
-		int count_FDIV = 0;
-		int count_FSQRT = 0;
-		int count_FPROUND = 0;
-		int count_JUMP_taken = 0;
-		int count_JUMP_not_taken = 0;
-		int count_jump_taken[8] = { 0 };
-		int count_jump_not_taken[8] = { 0 };
-		int count_max_stack = 0;
-		int count_retdepth = 0;
-		int count_retdepth_max = 0;
-		int count_endstack = 0;
-		int count_instructions[RANDOMX_PROGRAM_SIZE] = { 0 };
-		int count_FADD_nop = 0;
-		int count_FADD_nop2 = 0;
-		int count_FSUB_nop = 0;
-		int count_FSUB_nop2 = 0;
-		int count_FMUL_nop = 0;
-		int count_FMUL_nop2 = 0;
-		int datasetAccess[256] = { 0 };
-#endif
 		void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
-		void precompileSuperscalar(SuperscalarProgram*);
 		void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
 		void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
-		void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]);
 		void* getScratchpadAddress(InstructionByteCode& ibc);
 		__m128d maskRegisterExponentMantissa(__m128d);
+
+		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
 	};
+
+	// Convenience aliases. The second template argument is `softAes`: the default
+	// variants pass true and the *HardAes variants pass false. The large-page pair
+	// previously had the two values swapped.
+	using InterpretedVmDefault = InterpretedVm<AlignedAllocator<CacheLineSize>, true>;
+	using InterpretedVmHardAes = InterpretedVm<AlignedAllocator<CacheLineSize>, false>;
+	using InterpretedVmLargePage = InterpretedVm<LargePageAllocator, true>;
+	using InterpretedVmLargePageHardAes = InterpretedVm<LargePageAllocator, false>;
 }
--- a/src/JitCompilerX86-static.S
+++ b/src/JitCompilerX86-static.S
@ -97,7 +97,7 @@ DECL(randomx_dataset_init):
 	push r13
 	push r14
 	push r15
-	;# cache in rdi
+	mov rdi, qword ptr [rdi+8] ;# after virtual method table pointer
 	;# dataset in rsi
 	mov rbp, rdx  ;# block index
 	push rcx      ;# max. block index
--- a/src/JitCompilerX86-static.asm
+++ b/src/JitCompilerX86-static.asm
@ -100,7 +100,7 @@ randomx_dataset_init PROC
 	push r13
 	push r14
 	push r15
-	mov rdi, rcx ;# cache
+	mov rdi, qword ptr [rcx+8] ;# after virtual method table pointer
 	mov rsi, rdx ;# dataset
 	mov rbp, r8  ;# block index
 	push r9      ;# max. block index
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@ -28,7 +28,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 #define RANDOMX_JUMP

-namespace RandomX {
+namespace randomx {

 #if !defined(_M_X64) && !defined(__x86_64__)
 	JitCompilerX86::JitCompilerX86() {
@ -238,33 +238,29 @@ namespace RandomX {
 		generateProgramEpilogue(prog);
 	}

-	template<bool superscalar>
 	void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg) {
 		if (RANDOMX_CACHE_ACCESSES != 8)
 			throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_CACHE_ACCESSES");
 		if (RANDOMX_ARGON_GROWTH != 0)
 			throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_ARGON_GROWTH");
 		generateProgramPrologue(prog, pcfg);
-		if (superscalar) {
+		//if (superscalar) {
 			emit(codeReadDatasetLightSshInit, readDatasetLightInitSize);
 			emitByte(CALL);
 			emit32(superScalarHashOffset - (codePos + 4));
 			emit(codeReadDatasetLightSshFin, readDatasetLightFinSize);
-		}
+		/*}
 		else {
 			memcpy(code + codePos, codeReadDatasetLight, readDatasetLightSize);
 			codePos += readDatasetLightSize;
 			emitByte(CALL);
 			emit32(readDatasetLightSubOffset - (codePos + 4));
-		}
+		}*/
 		generateProgramEpilogue(prog);
 	}

-	template void JitCompilerX86::generateProgramLight<true>(Program& prog, ProgramConfiguration& pcfg);
-	template void JitCompilerX86::generateProgramLight<false>(Program& prog, ProgramConfiguration& pcfg);
-
 	template<size_t N>
-	void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[N]) {
+	void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache) {
 		memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
 		codePos = superScalarHashOffset + codeSshInitSize;
 		for (unsigned j = 0; j < N; ++j) {
@ -273,7 +269,7 @@ namespace RandomX {
 				Instruction& instr = prog(i);
 				instr.src %= RegistersCount;
 				instr.dst %= RegistersCount;
-				generateCode<SuperscalarProgram>(instr, i);
+				generateSuperscalarCode(instr, reciprocalCache);
 			}
 			emit(codeShhLoad, codeSshLoadSize);
 			if (j < N - 1) {
@ -293,7 +289,7 @@ namespace RandomX {
 	}

 	template
-	void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t> &reciprocalCache);

 	void JitCompilerX86::generateDatasetInitCode() {
 		memcpy(code, codeDatasetInit, datasetInitSize);
@ -314,7 +310,12 @@ namespace RandomX {
 		emitByte(0xc0 + pcfg.readReg1);
 		memcpy(code + codePos, codeLoopLoad, loopLoadSize);
 		codePos += loopLoadSize;
-		generateCode(prog);
+		for (unsigned i = 0; i < prog.getSize(); ++i) {
+			Instruction& instr = prog(i);
+			instr.src %= RegistersCount;
+			instr.dst %= RegistersCount;
+			generateCode(instr, i);
+		}
 		emit(REX_MOV_RR);
 		emitByte(0xc0 + pcfg.readReg2);
 		emit(REX_XOR_EAX);
@ -331,7 +332,6 @@ namespace RandomX {
 		emit32(epilogueOffset - codePos - 4);
 	}

-	template<class P>
 	void JitCompilerX86::generateCode(Instruction& instr, int i) {
 #ifdef RANDOMX_JUMP
 		instructionOffsets.push_back(codePos);
@ -340,67 +340,66 @@ namespace RandomX {
 		(this->*generator)(instr, i);
 	}

-	template<>
-	void JitCompilerX86::generateCode<SuperscalarProgram>(Instruction& instr, int i) {
+	void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector<uint64_t> &reciprocalCache) {
 		switch (instr.opcode)
 		{
-		case RandomX::SuperscalarInstructionType::ISUB_R:
+		case randomx::SuperscalarInstructionType::ISUB_R:
 			emit(REX_SUB_RR);
 			emitByte(0xc0 + 8 * instr.dst + instr.src);
 			break;
-		case RandomX::SuperscalarInstructionType::IXOR_R:
+		case randomx::SuperscalarInstructionType::IXOR_R:
 			emit(REX_XOR_RR);
 			emitByte(0xc0 + 8 * instr.dst + instr.src);
 			break;
-		case RandomX::SuperscalarInstructionType::IADD_RS:
+		case randomx::SuperscalarInstructionType::IADD_RS:
 			emit(REX_LEA);
 			emitByte(0x04 + 8 * instr.dst);
 			genSIB(instr.getModShift2(), instr.src, instr.dst);
 			break;
-		case RandomX::SuperscalarInstructionType::IMUL_R:
+		case randomx::SuperscalarInstructionType::IMUL_R:
 			emit(REX_IMUL_RR);
 			emitByte(0xc0 + 8 * instr.dst + instr.src);
 			break;
-		case RandomX::SuperscalarInstructionType::IROR_C:
+		case randomx::SuperscalarInstructionType::IROR_C:
 			emit(REX_ROT_I8);
 			emitByte(0xc8 + instr.dst);
 			emitByte(instr.getImm32() & 63);
 			break;
-		case RandomX::SuperscalarInstructionType::IADD_C7:
+		case randomx::SuperscalarInstructionType::IADD_C7:
 			emit(REX_81);
 			emitByte(0xc0 + instr.dst);
 			emit32(instr.getImm32());
 			break;
-		case RandomX::SuperscalarInstructionType::IXOR_C7:
+		case randomx::SuperscalarInstructionType::IXOR_C7:
 			emit(REX_XOR_RI);
 			emitByte(0xf0 + instr.dst);
 			emit32(instr.getImm32());
 			break;
-		case RandomX::SuperscalarInstructionType::IADD_C8:
+		case randomx::SuperscalarInstructionType::IADD_C8:
 			emit(REX_81);
 			emitByte(0xc0 + instr.dst);
 			emit32(instr.getImm32());
 			emit(NOP1);
 			break;
-		case RandomX::SuperscalarInstructionType::IXOR_C8:
+		case randomx::SuperscalarInstructionType::IXOR_C8:
 			emit(REX_XOR_RI);
 			emitByte(0xf0 + instr.dst);
 			emit32(instr.getImm32());
 			emit(NOP1);
 			break;
-		case RandomX::SuperscalarInstructionType::IADD_C9:
+		case randomx::SuperscalarInstructionType::IADD_C9:
 			emit(REX_81);
 			emitByte(0xc0 + instr.dst);
 			emit32(instr.getImm32());
 			emit(NOP2);
 			break;
-		case RandomX::SuperscalarInstructionType::IXOR_C9:
+		case randomx::SuperscalarInstructionType::IXOR_C9:
 			emit(REX_XOR_RI);
 			emitByte(0xf0 + instr.dst);
 			emit32(instr.getImm32());
 			emit(NOP2);
 			break;
-		case RandomX::SuperscalarInstructionType::IMULH_R:
+		case randomx::SuperscalarInstructionType::IMULH_R:
 			emit(REX_MOV_RR64);
 			emitByte(0xc0 + instr.dst);
 			emit(REX_MUL_R);
@ -408,7 +407,7 @@ namespace RandomX {
 			emit(REX_MOV_R64R);
 			emitByte(0xc2 + 8 * instr.dst);
 			break;
-		case RandomX::SuperscalarInstructionType::ISMULH_R:
+		case randomx::SuperscalarInstructionType::ISMULH_R:
 			emit(REX_MOV_RR64);
 			emitByte(0xc0 + instr.dst);
 			emit(REX_MUL_R);
@ -416,9 +415,9 @@ namespace RandomX {
 			emit(REX_MOV_R64R);
 			emitByte(0xc2 + 8 * instr.dst);
 			break;
-		case RandomX::SuperscalarInstructionType::IMUL_RCP:
+		case randomx::SuperscalarInstructionType::IMUL_RCP:
 			emit(MOV_RAX_I);
-			emit64(reciprocal(instr.getImm32()));
+			emit64(reciprocalCache[instr.getImm32()]);
 			emit(REX_IMUL_RM);
 			emitByte(0xc0 + 8 * instr.dst);
 			break;
@ -427,8 +426,6 @@ namespace RandomX {
 		}
 	}

-	template void JitCompilerX86::generateCode<Program>(Instruction& instr, int i);
-
 	void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) {
 		emit(LEA_32);
 		emitByte(0x80 + instr.src + (rax ? 0 : 8));
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@ -21,10 +21,11 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 #include "common.hpp"
 #include "Instruction.hpp"
+#include "superscalar_program.hpp"
 #include <cstring>
 #include <vector>

-namespace RandomX {
+namespace randomx {

 	class Program;
 	class ProgramConfiguration;
@ -40,15 +41,14 @@ namespace RandomX {
 		JitCompilerX86();
 		~JitCompilerX86();
 		void generateProgram(Program&, ProgramConfiguration&);
-		template<bool superscalar>
 		void generateProgramLight(Program&, ProgramConfiguration&);
 		template<size_t N>
-		void generateSuperScalarHash(SuperscalarProgram (&programs)[N]);
+		void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector<uint64_t> &);
+		void generateDatasetInitCode();
 		ProgramFunc getProgramFunc() {
 			return (ProgramFunc)code;
 		}
 		DatasetInitFunc getDatasetInitFunc() {
-			generateDatasetInitCode();
 			return (DatasetInitFunc)code;
 		}
 		uint8_t* getCode() {
@ -62,18 +62,6 @@ namespace RandomX {
 		uint8_t* code;
 		int32_t codePos;

-		template<class P>
-		void generateCode(P& prog) {
-			for (unsigned i = 0; i < prog.getSize(); ++i) {
-				Instruction& instr = prog(i);
-				instr.src %= RegistersCount;
-				instr.dst %= RegistersCount;
-				generateCode<P>(instr, i);
-			}
-		}
-
-		void generateDatasetInitCode();
-
 		void generateProgramPrologue(Program&, ProgramConfiguration&);
 		void generateProgramEpilogue(Program&);
 		int getConditionRegister();
@ -84,8 +72,8 @@ namespace RandomX {

 		void handleCondition(Instruction&, int);

-		template<class P>
 		void generateCode(Instruction&, int);
+		void generateSuperscalarCode(Instruction &, std::vector<uint64_t> &);

 		void emitByte(uint8_t val) {
 			code[codePos] = val;
--- a/src/Program.hpp
+++ b/src/Program.hpp
@ -25,7 +25,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "Instruction.hpp"
 #include "blake2/endian.h"

-namespace RandomX {
+namespace randomx {

 	struct ProgramConfiguration {
 		uint64_t eMask[2];
@ -59,46 +59,4 @@ namespace RandomX {
 	};

 	static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program");
-
-	class SuperscalarProgram {
-	public:
-		Instruction& operator()(int pc) {
-			return programBuffer[pc];
-		}
-		friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) {
-			p.print(os);
-			return os;
-		}
-		uint32_t getSize() {
-			return size;
-		}
-		void setSize(uint32_t val) {
-			size = val;
-		}
-		int getAddressRegister() {
-			return addrReg;
-		}
-		void setAddressRegister(uint32_t val) {
-			addrReg = val;
-		}
-		double ipc;
-		int codeSize;
-		int macroOps;
-		int decodeCycles;
-		int cpuLatency;
-		int asicLatency;
-		int mulCount;
-		int cpuLatencies[8];
-		int asicLatencies[8];
-	private:
-		void print(std::ostream& os) const {
-			for (unsigned i = 0; i < size; ++i) {
-				auto instr = programBuffer[i];
-				os << instr;
-			}
-		}
-		Instruction programBuffer[RANDOMX_SUPERSCALAR_MAX_SIZE];
-		uint32_t size;
-		int addrReg;
-	};
 }
--- a/src/VirtualMachine.cpp
+++ b/src/VirtualMachine.cpp
@ -24,9 +24,60 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <cstring>
 #include <iomanip>
 #include "intrinPortable.h"
+#include "allocator.hpp"

-std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
-	for (int i = 0; i < RandomX::RegistersCount; ++i)
+// The destructor is declared pure virtual, but derived destructors still call it,
+// so an out-of-line definition is required.
+randomx_vm::~randomx_vm() = default;
+
+// Delegates to initFpu().
+// NOTE(review): presumably this restores the default FPU rounding mode - confirm in intrinPortable.h.
+void randomx_vm::resetRoundingMode() {
+	initFpu();
+}
+
+// Bit layout used below: 52 mantissa bits, 11 exponent bits, exponent bias 1023.
+constexpr int mantissaSize = 52;
+constexpr int exponentSize = 11;
+constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1;
+constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1;
+constexpr int exponentBias = 1023;
+
+// Builds a 64-bit floating point bit pattern from 64 bits of entropy:
+// the top 5 bits (0..31) are added to the bias to form the exponent field,
+// the low 52 bits become the mantissa, and the sign bit is left clear.
+static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) {
+	const uint64_t fraction = entropy & mantissaMask;
+	const uint64_t biasedExponent = ((entropy >> 59) + exponentBias) & exponentMask;
+	return (biasedExponent << mantissaSize) | fraction;
+}
+
+// Derives the per-program VM state from the program's entropy words:
+// the "a" register group, the memory registers ma/mx, the four read-register
+// selectors and the two exponent masks.
+void randomx_vm::initialize() {
+	// a[0..3]: entropy words 0-7, even words fill the lo half, odd words the hi half
+	for (int i = 0; i < 4; ++i) {
+		store64(&reg.a[i].lo, getSmallPositiveFloatBits(program.getEntropy(2 * i)));
+		store64(&reg.a[i].hi, getSmallPositiveFloatBits(program.getEntropy(2 * i + 1)));
+	}
+
+	mem.ma = program.getEntropy(8) & randomx::CacheLineAlignMask;
+	mem.mx = program.getEntropy(10);
+
+	// bits 0..3 of word 12 each choose one register from the pairs (0,1), (2,3), (4,5), (6,7)
+	const auto addressRegisters = program.getEntropy(12);
+	config.readReg0 = 0 + (addressRegisters & 1);
+	config.readReg1 = 2 + ((addressRegisters >> 1) & 1);
+	config.readReg2 = 4 + ((addressRegisters >> 2) & 1);
+	config.readReg3 = 6 + ((addressRegisters >> 3) & 1);
+	//datasetBase = program.getEntropy(13) % datasetRange;
+
+	// eMask: low 22 bits come from entropy, combined with the fixed exponent mask
+	constexpr uint64_t mask22bit = (1ULL << 22) - 1;
+	constexpr uint64_t maskExp240 = ieee_get_exponent_mask<-240>();
+	for (int i = 0; i < 2; ++i)
+		store64(&config.eMask[i], (program.getEntropy(14 + i) & mask22bit) | maskExp240);
+}
+
+//TODO
+std::ostream& operator<<(std::ostream& os, const randomx::RegisterFile& rf) {
+	for (int i = 0; i < randomx::RegistersCount; ++i)
 		os << std::hex << "r" << i << " = " << rf.r[i] << std::endl << std::dec;
 	for (int i = 0; i < 4; ++i)
 		os << std::hex << "f" << i << " = " << *(uint64_t*)&rf.f[i].hi << " (" << rf.f[i].hi << ")" << std::endl
@ -40,66 +91,32 @@ std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
 	return os;
 }

-namespace RandomX {
-
-	constexpr int mantissaSize = 52;
-	constexpr int exponentSize = 11;
-	constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1;
-	constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1;
-	constexpr int exponentBias = 1023;
-
-	static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) {
-		auto exponent = entropy >> 59; //0..31
-		auto mantissa = entropy & mantissaMask;
-		exponent += exponentBias;
-		exponent &= exponentMask;
-		exponent <<= mantissaSize;
-		return exponent | mantissa;
-	}
+namespace randomx {

-	VirtualMachine::VirtualMachine() {
-		mem.ds.dataset.memory = nullptr;
+	template<class Allocator, bool softAes>
+	VmBase<Allocator, softAes>::~VmBase() {
+		Allocator::freeMemory(scratchpad, ScratchpadSize);
 	}

-	void VirtualMachine::resetRoundingMode() {
-		initFpu();
+	template<class Allocator, bool softAes>
+	bool VmBase<Allocator, softAes>::allocate() {
+		scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize);
+		return scratchpad != nullptr;
 	}

-	void VirtualMachine::initialize() {
-		store64(&reg.a[0].lo, getSmallPositiveFloatBits(program.getEntropy(0)));
-		store64(&reg.a[0].hi, getSmallPositiveFloatBits(program.getEntropy(1)));
-		store64(&reg.a[1].lo, getSmallPositiveFloatBits(program.getEntropy(2)));
-		store64(&reg.a[1].hi, getSmallPositiveFloatBits(program.getEntropy(3)));
-		store64(&reg.a[2].lo, getSmallPositiveFloatBits(program.getEntropy(4)));
-		store64(&reg.a[2].hi, getSmallPositiveFloatBits(program.getEntropy(5)));
-		store64(&reg.a[3].lo, getSmallPositiveFloatBits(program.getEntropy(6)));
-		store64(&reg.a[3].hi, getSmallPositiveFloatBits(program.getEntropy(7)));
-		mem.ma = program.getEntropy(8) & CacheLineAlignMask;
-		mem.mx = program.getEntropy(10);
-		auto addressRegisters = program.getEntropy(12);
-		config.readReg0 = 0 + (addressRegisters & 1);
-		addressRegisters >>= 1;
-		config.readReg1 = 2 + (addressRegisters & 1);
-		addressRegisters >>= 1;
-		config.readReg2 = 4 + (addressRegisters & 1);
-		addressRegisters >>= 1;
-		config.readReg3 = 6 + (addressRegisters & 1);
-		datasetBase = program.getEntropy(13) % datasetRange;
-		constexpr uint64_t mask22bit = (1ULL << 22) - 1;
-		constexpr uint64_t maskExp240 = ieee_get_exponent_mask<-240>();
-		store64(&config.eMask[0], (program.getEntropy(14) & mask22bit) | maskExp240);
-		store64(&config.eMask[1], (program.getEntropy(15) & mask22bit) | maskExp240);
+	template<class Allocator, bool softAes>
+	void VmBase<Allocator, softAes>::generate(void* seed, void* buffer, size_t bufferSize) {
+		fillAes1Rx4<softAes>(seed, bufferSize, buffer);
 	}

-	template<bool softAes>
-	void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash) {
-		if (scratchpadSize > 0) {
-			hashAes1Rx4<softAes>(scratchpad, scratchpadSize, &reg.a);
-		}
-		blake2b(outHash, ResultSize, &reg, sizeof(RegisterFile), nullptr, 0);
+	template<class Allocator, bool softAes>
+	void VmBase<Allocator, softAes>::getFinalResult(void* out, size_t outSize) {
+		hashAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a);
+		blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
 	}

-	template void VirtualMachine::getResult<false>(void* scratchpad, size_t scratchpadSize, void* outHash);
-	template void VirtualMachine::getResult<true>(void* scratchpad, size_t scratchpadSize, void* outHash);
-
+	// Explicit instantiations: the member templates are defined in this .cpp file,
+	// so each allocator / softAes combination is instantiated here.
+	template class VmBase<AlignedAllocator<CacheLineSize>, false>;
+	template class VmBase<AlignedAllocator<CacheLineSize>, true>;
+	template class VmBase<LargePageAllocator, false>;
+	template class VmBase<LargePageAllocator, true>;
 }
--- a/src/VirtualMachine.hpp
+++ b/src/VirtualMachine.hpp
@ -18,38 +18,40 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 #pragma once
+
 #include <cstdint>
 #include "common.hpp"
+#include "dataset.hpp"
 #include "Program.hpp"

-namespace RandomX {
+/* Global namespace for C binding */
+// Abstract base of every VM variant, kept in the global namespace so the C API
+// can refer to it by name. Concrete VMs implement allocation, scratchpad
+// generation, execution and result extraction.
+struct randomx_vm {
+	virtual ~randomx_vm() = 0;
+	// Acquires the scratchpad; returns false when the allocation fails.
+	virtual bool allocate() = 0;
+	virtual void generate(void* seed, void* buffer, size_t bufferSize) = 0;
+	void resetRoundingMode();
+	// Sets up registers and configuration from the entropy of `program`.
+	virtual void initialize();
+	virtual void execute() = 0;
+	virtual void getFinalResult(void* out, size_t outSize) = 0;
+	// The two setters default to no-ops; a VM overrides the one it needs.
+	virtual void setDataset(randomx_dataset* dataset) { }
+	virtual void setCache(randomx_cache* cache) { }
+
+	alignas(64) randomx::Program program;
+	alignas(64) randomx::RegisterFile reg;
+	alignas(16) randomx::ProgramConfiguration config;
+	randomx::MemoryRegisters mem;
+	// Null until allocate() succeeds. The derived destructor frees this pointer
+	// unconditionally, so it must never be left indeterminate.
+	uint8_t* scratchpad = nullptr;
+};

-	class VirtualMachine {
+namespace randomx {
+
+	template<class Allocator, bool softAes>
+	class VmBase : public randomx_vm {
 	public:
-		VirtualMachine();
-		virtual ~VirtualMachine() {}
-		virtual void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0;
-		void setScratchpad(void* ptr) {
-			scratchpad = (uint8_t*)ptr;
-		}
-		void resetRoundingMode();
-		virtual void initialize();
-		virtual void execute() = 0;
-		template<bool softAes>
-		void getResult(void* scratchpad, size_t scratchpadSize, void* outHash);
-		const RegisterFile& getRegisterFile() {
-			return reg;
-		}
-		Program* getProgramBuffer() {
-			return &program;
-		}
-	protected:
-		alignas(64) Program program;
-		alignas(64) RegisterFile reg;
-		alignas(16) ProgramConfiguration config;
-		MemoryRegisters mem;
-		uint8_t* scratchpad;
-		uint32_t datasetRange;
-		uint32_t datasetBase;
+		~VmBase() override;
+		bool allocate() override;
+		void generate(void* seed, void* buffer, size_t bufferSize) override;
+		void getFinalResult(void* out, size_t outSize) override;
 	};
+
 }
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@ -0,0 +1,52 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "allocator.hpp"
+#include "virtualMemory.hpp"
+#include "intrinPortable.h"
+#include "common.hpp"
+
+namespace randomx {
+
+	// Allocates `count` bytes aligned to `alignment` bytes via _mm_malloc.
+	template<size_t alignment>
+	void* AlignedAllocator<alignment>::allocMemory(size_t count) {
+		return _mm_malloc(count, alignment);
+	}
+
+	// Releases memory obtained from allocMemory. `count` is unused here; it is
+	// part of the signature so that both allocators share the same interface.
+	template<size_t alignment>
+	void AlignedAllocator<alignment>::freeMemory(void* ptr, size_t count) {
+		_mm_free(ptr);
+	}
+
+	// Explicit instantiations for the two alignments requested in this file.
+	template void* AlignedAllocator<CacheLineSize>::allocMemory(size_t count);
+	template void AlignedAllocator<CacheLineSize>::freeMemory(void* ptr, size_t count);
+	template void* AlignedAllocator<sizeof(__m128i)>::allocMemory(size_t count);
+	template void AlignedAllocator<sizeof(__m128i)>::freeMemory(void* ptr, size_t count);
+
+	// Allocates `count` bytes through allocLargePagesMemory.
+	void* LargePageAllocator::allocMemory(size_t count) {
+		return allocLargePagesMemory(count);
+	}
+
+	// Releases `count` bytes at `ptr` through freePagedMemory.
+	void LargePageAllocator::freeMemory(void* ptr, size_t count) {
+		freePagedMemory(ptr, count);
+	}
+
+}
--- a/src/allocator.hpp
+++ b/src/allocator.hpp
@ -0,0 +1,37 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <cstddef>
+
+namespace randomx {
+
+	// Stateless allocator whose blocks are aligned to `alignment` bytes.
+	template<size_t alignment>
+	struct AlignedAllocator {
+		// Returns a block of `count` bytes.
+		static void* allocMemory(size_t count);
+		// Releases a block of `count` bytes previously returned by allocMemory.
+		static void freeMemory(void* ptr, size_t count);
+	};
+
+	// Stateless allocator with the same interface as AlignedAllocator.
+	struct LargePageAllocator {
+		// Returns a block of `count` bytes.
+		static void* allocMemory(size_t count);
+		// Releases a block of `count` bytes previously returned by allocMemory.
+		static void freeMemory(void* ptr, size_t count);
+	};
+
+}
--- a/src/common.hpp
+++ b/src/common.hpp
@ -23,8 +23,9 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <iostream>
 #include "blake2/endian.h"
 #include "configuration.h"
+#include "randomx.h"

-namespace RandomX {
+namespace randomx {

 	static_assert((RANDOMX_ARGON_MEMORY & (RANDOMX_ARGON_MEMORY - 1)) == 0, "RANDOMX_ARGON_MEMORY must be a power of 2.");
 	static_assert((RANDOMX_DATASET_SIZE & (RANDOMX_DATASET_SIZE - 1)) == 0, "RANDOMX_DATASET_SIZE must be a power of 2.");
@ -58,6 +59,7 @@ namespace RandomX {
 	constexpr int ArgonBlockSize = 1024;
 	constexpr int ArgonSaltSize = sizeof(RANDOMX_ARGON_SALT) - 1;
 	constexpr int CacheLineSize = 64;
+	constexpr int ScratchpadSize = RANDOMX_SCRATCHPAD_L3;
 	constexpr uint32_t CacheLineAlignMask = (RANDOMX_DATASET_SIZE - 1) & ~(CacheLineSize - 1);
 	constexpr uint32_t CacheSize = RANDOMX_ARGON_MEMORY * 1024;
 	constexpr int CacheBlockCount = CacheSize / CacheLineSize;
@ -98,39 +100,9 @@ namespace RandomX {
 	constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register
 	constexpr int RegisterNeedsSib = 4; //x86 r12 register

-	struct Cache {
-		uint8_t* memory;
-		uint64_t size;
-	};
-
-	struct Dataset : public Cache {
-	};
-
-	class ILightClientAsyncWorker {
-	public:
-		virtual ~ILightClientAsyncWorker() {}
-		virtual void prepareBlock(addr_t) = 0;
-		virtual void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
-		virtual const uint64_t* getBlock(addr_t) = 0;
-		virtual void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
-		virtual void sync() = 0;
-		const Cache& getCache() {
-			return cache;
-		}
-	protected:
-		ILightClientAsyncWorker(const Cache& c) : cache(c) {}
-		const Cache& cache;
-	};
-
-	union dataset_t {
-		Dataset dataset;
-		Cache cache;
-		ILightClientAsyncWorker* asyncWorker;
-	};
-
 	struct MemoryRegisters {
 		addr_t mx, ma;
-		dataset_t ds;
+		uint8_t* memory = nullptr;
 	};

 	struct RegisterFile {
@ -141,9 +113,8 @@ namespace RandomX {
 	};

 	typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(&reg)[RegistersCount]);
-
 	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t);
-	typedef void(*DatasetInitFunc)(uint8_t* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
+	typedef void(*DatasetInitFunc)(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
 }

-std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf);
+std::ostream& operator<<(std::ostream& os, const randomx::RegisterFile& rf);
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@ -22,14 +22,17 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <stdexcept>
 #include <cstring>
 #include <limits>
+#include <cstring>

 #include "common.hpp"
 #include "dataset.hpp"
-#include "Cache.hpp"
 #include "virtualMemory.hpp"
-#include "softAes.h"
-#include "squareHash.h"
+#include "superscalarGenerator.hpp"
+#include "Blake2Generator.hpp"
+#include "reciprocal.h"
 #include "blake2/endian.h"
+#include "argon2.h"
+#include "argon2_core.h"

 #if defined(__SSE2__)
 #include <wmmintrin.h>
@ -38,113 +41,174 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #define PREFETCH(memory)
 #endif

-namespace RandomX {
+randomx_dataset::~randomx_dataset() {

-#if true //RANDOMX_ARGON_GROWTH != 0 || (!defined(_M_X64) && !defined(__x86_64__))
-	static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) {
-		uint8_t* mixBlock;
-		if (RANDOMX_ARGON_GROWTH == 0) {
-			constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
-			mixBlock = cache.memory + (currentIndex & mask) * CacheLineSize;
-		}
-		else {
-			const uint32_t modulus = cache.size / CacheLineSize;
-			mixBlock = cache.memory + (currentIndex % modulus) * CacheLineSize;
-		}
-		PREFETCHNTA(mixBlock);
-		nextIndex = squareHash(currentIndex + nextIndex);
-		return mixBlock;
-	}
+}

-	static FORCE_INLINE void mixCache(uint8_t* mixBlock, uint64_t& c0, uint64_t& c1, uint64_t& c2, uint64_t& c3, uint64_t& c4, uint64_t& c5, uint64_t& c6, uint64_t& c7) {
-		c0 ^= load64(mixBlock + 0);
-		c1 ^= load64(mixBlock + 8);
-		c2 ^= load64(mixBlock + 16);
-		c3 ^= load64(mixBlock + 24);
-		c4 ^= load64(mixBlock + 32);
-		c5 ^= load64(mixBlock + 40);
-		c6 ^= load64(mixBlock + 48);
-		c7 ^= load64(mixBlock + 56);
+static_assert(RANDOMX_ARGON_MEMORY % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_MEMORY - invalid value");
+
+void randomx_cache::initialize(const void *seed, size_t seedSize) { // Fills `memory` using Argon2d keyed by seed, then generates the superscalar programs and their reciprocal table.
+	uint32_t memory_blocks, segment_length;
+	argon2_instance_t instance;
+	argon2_context context;
+
+	context.out = nullptr;
+	context.outlen = 0;
+	context.pwd = CONST_CAST(uint8_t *)seed;
+	context.pwdlen = (uint32_t)seedSize;
+	context.salt = CONST_CAST(uint8_t *)RANDOMX_ARGON_SALT;
+	context.saltlen = (uint32_t)randomx::ArgonSaltSize;
+	context.secret = NULL;
+	context.secretlen = 0;
+	context.ad = NULL;
+	context.adlen = 0;
+	context.t_cost = RANDOMX_ARGON_ITERATIONS;
+	context.m_cost = RANDOMX_ARGON_MEMORY;
+	context.lanes = RANDOMX_ARGON_LANES;
+	context.threads = 1; // the fill below runs on the calling thread only
+	context.allocate_cbk = NULL;
+	context.free_cbk = NULL;
+	context.flags = ARGON2_DEFAULT_FLAGS;
+	context.version = ARGON2_VERSION_NUMBER;
+
+	/* 2. Align memory size */
+	/* Minimum memory_blocks = 8L blocks, where L is the number of lanes */
+	memory_blocks = context.m_cost; // no rounding needed: the static_assert above guarantees divisibility by lanes * ARGON2_SYNC_POINTS
+
+	segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS);
+
+	instance.version = context.version;
+	instance.memory = NULL;
+	instance.passes = context.t_cost;
+	instance.memory_blocks = memory_blocks;
+	instance.segment_length = segment_length;
+	instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
+	instance.lanes = context.lanes;
+	instance.threads = context.threads;
+	instance.type = Argon2_d;
+	instance.memory = (block*)memory; // Argon2 writes directly into this object's buffer; presumably allocate() must have succeeded first - confirm with callers
+
+	if (instance.threads > instance.lanes) {
+		instance.threads = instance.lanes;
 	}

-	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations) {
-		uint64_t c0, c1, c2, c3, c4, c5, c6, c7;
-
-		c0 = blockNumber;
-		c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0;
-
-		uint8_t* mixBlock;
-
-		for (auto i = 0; i < iterations; ++i) {
-			mixBlock = selectMixBlock(cache, c0, c1);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
-
-			mixBlock = selectMixBlock(cache, c1, c2);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+	/* 3. Initialization: Hashing inputs, allocating memory, filling first
+	 * blocks
+	 */
+	argon_initialize(&instance, &context);
+
+	fill_memory_blocks(&instance);
+
+	reciprocalCache.clear(); // start from an empty table so the indices stored below are valid if initialize() runs again
+	randomx::Blake2Generator gen(seed, 1000); // NOTE(review): seedSize is not forwarded here - confirm how many bytes Blake2Generator reads from seed
+	for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
+		randomx::generateSuperscalar(programs[i], gen);
+		for (unsigned j = 0; j < programs[i].getSize(); ++j) {
+			auto& instr = programs[i](j);
+			if (instr.opcode == randomx::SuperscalarInstructionType::IMUL_RCP) {
+				auto rcp = reciprocal(instr.getImm32()); // computed once here from the original immediate
+				instr.setImm32(reciprocalCache.size()); // the immediate now holds the index of rcp in reciprocalCache
+				reciprocalCache.push_back(rcp);
+			}
+		}
+	}
+}

-			mixBlock = selectMixBlock(cache, c2, c3);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+namespace randomx {

-			mixBlock = selectMixBlock(cache, c3, c4);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+	// Requests RANDOMX_DATASET_SIZE bytes from the Allocator policy and stores
+	// the result in `memory`.
+	// Returns true only when the allocator produced a non-null pointer, so a
+	// caller that checks the result can detect a failed allocation instead of
+	// dereferencing a null buffer later.
+	template<class Allocator>
+	bool Dataset<Allocator>::allocate() {
+		memory = static_cast<uint8_t*>(Allocator::allocMemory(RANDOMX_DATASET_SIZE));
+		return memory != nullptr;
+	}

-			mixBlock = selectMixBlock(cache, c4, c5);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+	template<class Allocator>
+	Dataset<Allocator>::~Dataset() { // Hands the dataset buffer back to the Allocator policy.
+		Allocator::freeMemory(memory, RANDOMX_DATASET_SIZE); // same size constant that allocate() requests; memory is still nullptr if allocate() never ran - NOTE(review): confirm every Allocator tolerates that
+	}

-			mixBlock = selectMixBlock(cache, c5, c6);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+	// Requests RANDOMX_ARGON_MEMORY * ARGON2_BLOCK_SIZE bytes from the Allocator
+	// policy and stores the result in `memory`.
+	// Returns true only when the allocator produced a non-null pointer;
+	// randomx_alloc_cache() relies on this result to discard a cache whose
+	// buffer could not be allocated.
+	template<class Allocator>
+	bool Cache<Allocator>::allocate() {
+		memory = static_cast<uint8_t*>(Allocator::allocMemory(RANDOMX_ARGON_MEMORY * ARGON2_BLOCK_SIZE));
+		return memory != nullptr;
+	}

-			mixBlock = selectMixBlock(cache, c6, c7);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+	template<class Allocator>
+	Cache<Allocator>::~Cache() { // Hands the cache buffer back to the Allocator policy.
+		Allocator::freeMemory(memory, RANDOMX_ARGON_MEMORY * ARGON2_BLOCK_SIZE); // same size expression that allocate() requests; memory is still nullptr if allocate() never ran - NOTE(review): confirm every Allocator tolerates that
+	}

-			mixBlock = selectMixBlock(cache, c7, c0);
-			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
-		}
+	template<class Allocator>
+	DatasetInitFunc Cache<Allocator>::getInitFunc() { // Dataset-initialisation routine for a cache without a JIT compiler.
+		return &initDataset; // randomx::initDataset, the plain C++ implementation declared in dataset.hpp
+	}

-		store64(out + 0, c0);
-		store64(out + 8, c1);
-		store64(out + 16, c2);
-		store64(out + 24, c3);
-		store64(out + 32, c4);
-		store64(out + 40, c5);
-		store64(out + 48, c6);
-		store64(out + 56, c7);
+	template<class Allocator>
+	DatasetInitFunc CacheWithJit<Allocator>::getInitFunc() { // JIT variant: overrides Cache::getInitFunc().
+		return jit.getDatasetInitFunc(); // routine owned by the jit member; initialize() asks jit to generate the dataset init code
 	}
-#endif

-	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
-		uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma);
-		memory.mx ^= addr;
-		memory.mx &= -64; //align to cache line
-		std::swap(memory.mx, memory.ma);
-		PREFETCHNTA(memory.ds.dataset.memory + memory.ma);
-		for (int i = 0; i < RegistersCount; ++i)
-			reg.r[i] ^= datasetLine[i];
+	template<class Allocator>
+	void CacheWithJit<Allocator>::initialize(const void *seed, size_t seedSize) { // Base-class initialisation followed by JIT code generation.
+		randomx_cache::initialize(seed, seedSize); // fills memory and produces programs / reciprocalCache
+		jit.generateSuperscalarHash(programs, reciprocalCache); // hands the freshly generated programs and reciprocal table to the JIT compiler
+		jit.generateDatasetInitCode(); // presumably emits the routine later returned by getInitFunc() - confirm in JitCompilerX86
 	}

-	void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t (&reg)[RegistersCount]) {
-		memory.mx ^= addr;
-		memory.mx &= CacheLineAlignMask; //align to cache line
-		Cache& cache = memory.ds.cache;
-		uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
-		initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
-		for (int i = 0; i < RegistersCount; ++i)
-			reg[i] ^= datasetLine[i];
-		std::swap(memory.mx, memory.ma);
+	template class Dataset<AlignedAllocator<CacheLineSize>>; // explicit instantiations: the member definitions above live in this .cpp, so each combination named by the aliases in dataset.hpp is instantiated here
+	template class Dataset<LargePageAllocator>;
+	template class Cache<AlignedAllocator<CacheLineSize>>;
+	template class Cache<LargePageAllocator>;
+	template class CacheWithJit<AlignedAllocator<CacheLineSize>>;
+	template class CacheWithJit<LargePageAllocator>;
+
+	constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; // multiplier applied to (blockNumber + 1) to seed register 0 in initDatasetBlock
+	constexpr uint64_t superscalarAdd1 = 9298410992540426748ULL; // superscalarAdd1..7: per-register constants for registers 1..7; initDatasetBlock combines them with register 0 using XOR, despite the "Add" in the name
+	constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL;
+	constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL;
+	constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL;
+	constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL;
+	constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL;
+	constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL;
+
+	static inline uint8_t* getMixBlock(uint64_t registerValue, uint8_t *memory) { // Maps a register value to the address of one CacheLineSize-byte line inside memory.
+		constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); // (number of cache lines) - 1; usable as a bit mask because common.hpp asserts RANDOMX_ARGON_MEMORY is a power of 2
+		return memory + (registerValue & mask) * CacheLineSize; // line index times line size gives the byte offset
 	}

-	void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(&reg)[RegistersCount]) {
-		ILightClientAsyncWorker* aw = memory.ds.asyncWorker;
-		const uint64_t* datasetLine = aw->getBlock(memory.ma);
-		for (int i = 0; i < RegistersCount; ++i)
-			reg[i] ^= datasetLine[i];
-		memory.mx ^= addr;
-		memory.mx &= CacheLineAlignMask; //align to cache line
-		std::swap(memory.mx, memory.ma);
-		aw->prepareBlock(memory.ma);
+	void initDatasetBlock(randomx_cache* cache, uint8_t* out, uint64_t blockNumber) { // Computes one dataset line for blockNumber from the cache and writes CacheLineSize bytes to out.
+		int_reg_t rl[8];
+		uint8_t* mixBlock;
+		uint64_t registerValue = blockNumber; // the first cache line is selected by the block number itself
+		rl[0] = (blockNumber + 1) * superscalarMul0; // seed the eight registers from the block number
+		rl[1] = rl[0] ^ superscalarAdd1;
+		rl[2] = rl[0] ^ superscalarAdd2;
+		rl[3] = rl[0] ^ superscalarAdd3;
+		rl[4] = rl[0] ^ superscalarAdd4;
+		rl[5] = rl[0] ^ superscalarAdd5;
+		rl[6] = rl[0] ^ superscalarAdd6;
+		rl[7] = rl[0] ^ superscalarAdd7;
+		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
+			mixBlock = getMixBlock(registerValue, cache->memory); // address is fixed before the program runs, using the previous round's register value
+			SuperscalarProgram& prog = cache->programs[i];
+
+			executeSuperscalar(rl, prog, &cache->reciprocalCache);
+
+			for (unsigned q = 0; q < 8; ++q)
+				rl[q] ^= load64(mixBlock + 8 * q); // mix the selected cache line into the registers, 8 bytes per register
+
+			registerValue = rl[prog.getAddressRegister()]; // the program chooses which register selects the next cache line
+		}
+
+		memcpy(out, &rl, CacheLineSize); // raw copy of the register array; assumes sizeof(rl) >= CacheLineSize (8 x 64-bit registers) - TODO confirm int_reg_t width
 	}

-	void datasetAlloc(dataset_t& ds, bool largePages) {
+	// Computes the dataset lines for the blocks in [startBlock, endBlock).
+	// `dataset` points at the output position for startBlock; every block
+	// advances the output position by CacheLineSize bytes.
+	void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock) {
+		uint8_t* outputLine = dataset;
+		uint32_t block = startBlock;
+		while (block < endBlock) {
+			initDatasetBlock(cache, outputLine, block);
+			outputLine += CacheLineSize;
+			++block;
+		}
+	}
+	
+	/*void datasetAlloc(dataset_t& ds, bool largePages) {
 		if (std::numeric_limits<size_t>::max() < RANDOMX_DATASET_SIZE)
 			throw std::runtime_error("Platform doesn't support enough memory for the dataset");
 		if (largePages) {
@ -158,14 +222,8 @@ namespace RandomX {
 		}
 	}

-	void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) {
-		for (uint64_t i = startBlock; i < startBlock + blockCount; ++i) {
-			initBlock(cache, ds.memory + i * CacheLineSize, i, RANDOMX_CACHE_ACCESSES / 8);
-		}
-	}
-
 	void datasetInitCache(const void* seed, dataset_t& ds, bool largePages) {
 		ds.cache.memory = allocCache(ds.cache.size, largePages);
 		argonFill(ds.cache, seed, SeedSize);
-	}
+	}*/
 }
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@ -20,26 +20,62 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once

 #include <cstdint>
+#include <vector>
 #include "intrinPortable.h"
 #include "common.hpp"
+#include "randomx.h"
+#include "Program.hpp"
+#include "superscalar_program.hpp"
+#include "JitCompilerX86.hpp"
+#include "allocator.hpp"

-namespace RandomX {
+struct randomx_dataset { // Polymorphic base of the dataset and cache objects; holds the raw memory buffer pointer.
+	virtual ~randomx_dataset() = 0; // pure virtual, but a definition exists in dataset.cpp so derived destructors can chain to it
+	virtual bool allocate() = 0; // implemented by randomx::Dataset / randomx::Cache through their Allocator policy
+	uint8_t* memory = nullptr; // assigned by allocate(); released by the derived class destructor
+};

-#if false //RANDOMX_ARGON_GROWTH == 0 && (defined(_M_X64) || defined(__x86_64__))
-	extern "C"
-#endif
-	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations);
+struct randomx_cache : public randomx_dataset { // Cache object: the memory buffer plus the superscalar programs used to derive dataset blocks from it.
+	virtual randomx::DatasetInitFunc getInitFunc() = 0; // returns the routine that computes dataset blocks from this cache
+	virtual void initialize(const void *seed, size_t seedSize); // Argon2 fill of memory, then superscalar program generation (extended by CacheWithJit)
+	randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; // one program per cache access, generated in initialize()
+	std::vector<uint64_t> reciprocalCache; // reciprocals computed in initialize(); IMUL_RCP instructions carry an index into this vector as their immediate
+};

-	void datasetAlloc(dataset_t& ds, bool largePages);

-	void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount);

-	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile&);
+namespace randomx {

-	void datasetInitCache(const void* seed, dataset_t& dataset, bool largePages);
+	template<class Allocator>
+	struct Dataset : public randomx_dataset { // Dataset whose buffer is obtained from, and returned to, the Allocator policy.
+		~Dataset() override; // frees memory via Allocator::freeMemory
+		bool allocate() override; // requests RANDOMX_DATASET_SIZE bytes via Allocator::allocMemory
+	};

-	void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t(&reg)[RegistersCount]);
+	using DatasetDefault = Dataset<AlignedAllocator<CacheLineSize>>;
+	using DatasetLargePage = Dataset<LargePageAllocator>;

-	void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(&reg)[RegistersCount]);
-}
+	template<class Allocator>
+	struct Cache : public randomx_cache { // Cache whose buffer is obtained from, and returned to, the Allocator policy; no JIT support.
+		~Cache() override; // frees memory via Allocator::freeMemory
+		bool allocate() override; // requests RANDOMX_ARGON_MEMORY * ARGON2_BLOCK_SIZE bytes via Allocator::allocMemory
+		DatasetInitFunc getInitFunc() override; // returns the address of randomx::initDataset
+	};
+
+	template<class Allocator>
+	struct CacheWithJit : public Cache<Allocator> { // Cache that additionally owns a JIT compiler for dataset initialisation.
+		using Cache<Allocator>::programs; // members of a dependent base class must be named explicitly to be usable unqualified
+		using Cache<Allocator>::reciprocalCache;
+		void initialize(const void *seed, size_t seedSize) override; // base initialisation, then JIT code generation
+		DatasetInitFunc getInitFunc() override; // returns the routine provided by jit instead of initDataset
+		JitCompilerX86 jit; // compiler instance that receives programs and reciprocalCache in initialize()
+	};

+	using CacheDefault = Cache<AlignedAllocator<CacheLineSize>>;
+	using CacheWithJitDefault = CacheWithJit<AlignedAllocator<CacheLineSize>>;
+	using CacheLargePage = Cache<LargePageAllocator>;
+	using CacheWithJitLargePage = CacheWithJit<LargePageAllocator>;
+
+	void initDatasetBlock(randomx_cache* cache, uint8_t* out, uint64_t blockNumber);
+	void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
+}
--- a/src/main.cpp
+++ b/src/main.cpp
@ -17,31 +17,28 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */
 //#define TRACE
-#include "InterpretedVirtualMachine.hpp"
-#include "CompiledVirtualMachine.hpp"
-#include "CompiledLightVirtualMachine.hpp"
-#include "AssemblyGeneratorX86.hpp"
+
+//#include "AssemblyGeneratorX86.hpp"
 #include "Stopwatch.hpp"
-#include "blake2/blake2.h"
+//#include "blake2/blake2.h"
 #include "blake2/endian.h"
 #include <fstream>
 #include <iostream>
 #include <iomanip>
 #include <exception>
 #include <cstring>
-#include "Program.hpp"
+//#include "Program.hpp"
 #include <string>
+#include <vector>
 #include <thread>
 #include <atomic>
-#include "dataset.hpp"
-#include "Cache.hpp"
-#include "hashAes1Rx4.hpp"
-#include "superscalarGenerator.hpp"
-#include "JitCompilerX86.hpp"
+//#include "hashAes1Rx4.hpp"
+//#include "JitCompilerX86.hpp"
+#include "randomx.h"

 const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };

-const uint8_t blockTemplate__[] = {
+const uint8_t blockTemplate_[] = {
 		0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
 		0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
 		0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca,
@ -131,77 +128,57 @@ void printUsage(const char* executable) {

 template<bool softAes>
 void generateAsm(uint32_t nonce) {
-	alignas(16) uint64_t hash[8];
-	uint8_t blockTemplate[sizeof(blockTemplate__)];
-	memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate));
+	/*alignas(16) uint64_t hash[8];
+	uint8_t blockTemplate[sizeof(blockTemplate_)];
+	memcpy(blockTemplate, blockTemplate_, sizeof(blockTemplate));
 	store32(blockTemplate + 39, nonce);
 	blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
 	uint8_t scratchpad[RANDOMX_SCRATCHPAD_L3];
 	fillAes1Rx4<softAes>((void*)hash, RANDOMX_SCRATCHPAD_L3, scratchpad);
-	RandomX::AssemblyGeneratorX86 asmX86;
-	RandomX::Program p;
+	randomx::AssemblyGeneratorX86 asmX86;
+	randomx::Program p;
 	fillAes1Rx4<softAes>(hash, sizeof(p), &p);
 	asmX86.generateProgram(p);
-	asmX86.printCode(std::cout);
+	asmX86.printCode(std::cout);*/
 }

 template<bool softAes>
 void generateNative(uint32_t nonce) {
-	alignas(16) uint64_t hash[8];
-	uint8_t blockTemplate[sizeof(blockTemplate__)];
-	memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate));
+	/*alignas(16) uint64_t hash[8];
+	uint8_t blockTemplate[sizeof(blockTemplate_)];
+	memcpy(blockTemplate, blockTemplate_, sizeof(blockTemplate));
 	store32(blockTemplate + 39, nonce);
 	blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
 	uint8_t scratchpad[RANDOMX_SCRATCHPAD_L3];
 	fillAes1Rx4<softAes>((void*)hash, RANDOMX_SCRATCHPAD_L3, scratchpad);
-	alignas(16) RandomX::Program prog;
+	alignas(16) randomx::Program prog;
 	fillAes1Rx4<softAes>((void*)hash, sizeof(prog), &prog);
 	for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) {
 		prog(i).dst %= 8;
 		prog(i).src %= 8;
 	}
-	std::cout << prog << std::endl;
+	std::cout << prog << std::endl;*/
 }

-template<bool softAes>
-void mine(RandomX::VirtualMachine* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, uint8_t* scratchpad) {
-	alignas(16) uint64_t hash[8];
-	uint8_t blockTemplate[sizeof(blockTemplate__)];
-	memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate));
+void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread) {
+	uint64_t hash[RANDOMX_HASH_SIZE / 4];
+	uint8_t blockTemplate[sizeof(blockTemplate_)];
+	memcpy(blockTemplate, blockTemplate_, sizeof(blockTemplate));
 	void* noncePtr = blockTemplate + 39;
 	auto nonce = atomicNonce.fetch_add(1);

 	while (nonce < noncesCount) {
 		//std::cout << "Thread " << thread << " nonce " << nonce << std::endl;
 		store32(noncePtr, nonce);
-		blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
-		fillAes1Rx4<softAes>((void*)hash, RANDOMX_SCRATCHPAD_L3, scratchpad);
-		//dump((char*)scratchpad, RANDOMX_SCRATCHPAD_L3, "spad-before.txt");
-		vm->resetRoundingMode();
-		vm->setScratchpad(scratchpad);
-		for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
-			fillAes1Rx4<softAes>((void*)hash, sizeof(RandomX::Program), vm->getProgramBuffer());
-			vm->initialize();
-			vm->execute();
-			vm->getResult<false>(nullptr, 0, hash);
-		}
-		fillAes1Rx4<softAes>((void*)hash, sizeof(RandomX::Program), vm->getProgramBuffer());
-		vm->initialize();
-		vm->execute();
-		/*if (RandomX::trace) {
-			for (int j = 0; j < RandomX::ProgramLength; ++j) {
-				uint64_t res = *(uint64_t*)(scratchpad + 8 * (RandomX::ProgramLength - 1 - j));
-				std::cout << std::hex << std::setw(16) << std::setfill('0') << res << std::endl;
-			}
-		}*/
-		vm->getResult<softAes>(scratchpad, RANDOMX_SCRATCHPAD_L3, hash);
-		//dump((char*)scratchpad, RANDOMX_SCRATCHPAD_L3, "spad-after.txt");
+		
+		randomx_calculate_hash(vm, blockTemplate, sizeof(blockTemplate), &hash);
+
 		result.xorWith(hash);
-		if (RandomX::trace) {
+		/*if (randomx::trace) {
 			std::cout << "Nonce: " << nonce << " ";
 			outputHex(std::cout, (char*)hash, 16);
 			std::cout << std::endl;
-		}
+		}*/
 		nonce = atomicNonce.fetch_add(1);
 	}
 }
@ -227,16 +204,16 @@ int main(int argc, char** argv) {
 	readOption("--genSuperscalar", argc, argv, genSuperscalar);
 	readOption("--legacy", argc, argv, legacy);

-	if (genSuperscalar) {
-		RandomX::SuperscalarProgram p;
-		RandomX::Blake2Generator gen(seed, programCount);
-		RandomX::generateSuperscalar(p, gen);
-		RandomX::AssemblyGeneratorX86 asmX86;
+	/*if (genSuperscalar) {
+		randomx::SuperscalarProgram p;
+		randomx::Blake2Generator gen(seed, programCount);
+		randomx::generateSuperscalar(p, gen);
+		randomx::AssemblyGeneratorX86 asmX86;
 		asmX86.generateAsm(p);
 		//std::ofstream file("lightProg2.asm");
 		asmX86.printCode(std::cout);
 		return 0;
-	}
+	}*/

 	if (genAsm) {
 		if (softAes)
@ -264,15 +241,42 @@ int main(int argc, char** argv) {

 	std::atomic<uint32_t> atomicNonce(0);
 	AtomicHash result;
-	std::vector<RandomX::VirtualMachine*> vms;
+	std::vector<randomx_vm*> vms;
 	std::vector<std::thread> threads;
-	RandomX::dataset_t dataset;
-	const uint64_t cacheSize = (RANDOMX_ARGON_MEMORY + RANDOMX_ARGON_GROWTH * epoch) * RandomX::ArgonBlockSize;
-	const uint64_t datasetSize = (RANDOMX_DATASET_SIZE + RANDOMX_DS_GROWTH * epoch);
-	dataset.cache.size = cacheSize;
-	RandomX::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES];
+	randomx_dataset* dataset;
+	randomx_cache* cache;
+	randomx_flags flags = RANDOMX_FLAG_DEFAULT;

-	std::cout << "RandomX - " << (miningMode ? "mining" : "verification") << " mode" << std::endl;
+	if (miningMode) {
+		flags = (randomx_flags)(flags | RANDOMX_FLAG_FULL_MEM);
+		std::cout << "RandomX - full memory mode (2 GiB)" << std::endl;
+	} else {
+		std::cout << "RandomX - light memory mode (256 MiB)" << std::endl;
+	}
+
+	if (jit) {
+		flags = (randomx_flags)(flags | RANDOMX_FLAG_JIT);
+		std::cout << "RandomX - JIT compiled mode" << std::endl;
+	}
+	else {
+		std::cout << "RandomX - interpreted mode" << std::endl;
+	}
+
+	if (softAes) {
+		std::cout << "RandomX - software AES mode" << std::endl;
+	}
+	else {
+		flags = (randomx_flags)(flags | RANDOMX_FLAG_HARD_AES);
+		std::cout << "RandomX - hardware AES mode" << std::endl;
+	}
+
+	if (largePages) {
+		flags = (randomx_flags)(flags | RANDOMX_FLAG_LARGE_PAGES);
+		std::cout << "RandomX - large pages mode" << std::endl;
+	}
+	else {
+		std::cout << "RandomX - small pages mode" << std::endl;
+	}

 	std::cout << "Initializing";
 	if(miningMode)
@ -281,116 +285,60 @@ int main(int argc, char** argv) {

 	try {
 		Stopwatch sw(true);
-		RandomX::datasetInitCache(seed, dataset, largePages);
-		if (RandomX::trace) {
+		cache = randomx_alloc_cache(flags);
+		randomx_init_cache(cache, seed, sizeof(seed));
+		/*if (randomx::trace) {
 			std::cout << "Cache: " << std::endl;
 			outputHex(std::cout, (char*)dataset.cache.memory, sizeof(__m128i));
 			std::cout << std::endl;
-		}
-		if (!legacy) {
-			RandomX::Blake2Generator gen(seed, programCount);
-			for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
-				RandomX::generateSuperscalar(programs[i], gen);
-			}
-		}
-		if (!miningMode) {
-			std::cout << "Cache (" << cacheSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl;
-		}
-		else {
-			auto cache = dataset.cache;
-			dataset.dataset.size = datasetSize;
-			RandomX::datasetAlloc(dataset, largePages);
-			const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize;
-			if (!legacy) {
-				RandomX::JitCompilerX86 jit86;
-				jit86.generateSuperScalarHash(programs);
-				RandomX::DatasetInitFunc dsfunc = jit86.getDatasetInitFunc();
-				if (initThreadCount > 1) {
-					auto perThread = datasetBlockCount / initThreadCount;
-					auto remainder = datasetBlockCount % initThreadCount;
-					uint32_t startBlock = 0;
-					uint32_t endBlock = 0;
-					for (int i = 0; i < initThreadCount; ++i) {
-						auto count = perThread + (i == initThreadCount - 1 ? remainder : 0);
-						endBlock += count;
-						threads.push_back(std::thread(dsfunc, cache.memory, dataset.dataset.memory + startBlock * RandomX::CacheLineSize, startBlock, endBlock));
-						startBlock += count;
-					}
-					for (unsigned i = 0; i < threads.size(); ++i) {
-						threads[i].join();
-					}
+		}*/
+		if (miningMode) {
+			dataset = randomx_alloc_dataset(flags);
+			if (initThreadCount > 1) {
+				auto perThread = RANDOMX_DATASET_BLOCKS / initThreadCount;
+				auto remainder = RANDOMX_DATASET_BLOCKS % initThreadCount;
+				uint32_t startBlock = 0;
+				for (int i = 0; i < initThreadCount; ++i) {
+					auto count = perThread + (i == initThreadCount - 1 ? remainder : 0);
+					threads.push_back(std::thread(&randomx_init_dataset, dataset, cache, startBlock, count));
+					startBlock += count;
 				}
-				else {
-					dsfunc(cache.memory, dataset.dataset.memory, 0, datasetBlockCount);
+				for (unsigned i = 0; i < threads.size(); ++i) {
+					threads[i].join();
 				}
-				//dump((const char*)dataset.dataset.memory, RANDOMX_DATASET_SIZE, "dataset.dat");
 			}
 			else {
-				if (initThreadCount > 1) {
-					auto perThread = datasetBlockCount / initThreadCount;
-					auto remainder = datasetBlockCount % initThreadCount;
-					for (int i = 0; i < initThreadCount; ++i) {
-						auto count = perThread + (i == initThreadCount - 1 ? remainder : 0);
-						threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count));
-					}
-					for (unsigned i = 0; i < threads.size(); ++i) {
-						threads[i].join();
-					}
-				}
-				else {
-					RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount);
-				}
+				randomx_init_dataset(dataset, cache, 0, RANDOMX_DATASET_BLOCKS);
 			}
-			RandomX::deallocCache(cache, largePages);
+			//dump((const char*)dataset.dataset.memory, RANDOMX_DATASET_SIZE, "dataset.dat");
+			randomx_release_cache(cache);
 			threads.clear();
-			std::cout << "Dataset (" << datasetSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl;
 		}
+		std::cout << "Memory initialized in " << sw.getElapsed() << " s" << std::endl;
 		std::cout << "Initializing " << threadCount << " virtual machine(s) ..." << std::endl;
 		for (int i = 0; i < threadCount; ++i) {
-			RandomX::VirtualMachine* vm;
-			if (miningMode) {
-				vm = new RandomX::CompiledVirtualMachine();
-			}
-			else {
-				if (jit && !legacy)
-					vm = new RandomX::CompiledLightVirtualMachine<true>();
-				else if (jit)
-					vm = new RandomX::CompiledLightVirtualMachine<false>();
-				else if (!legacy)
-					vm = new RandomX::InterpretedVirtualMachine<true>(softAes);
-				else
-					vm = new RandomX::InterpretedVirtualMachine<false>(softAes);
-			}
-			vm->setDataset(dataset, datasetSize, programs);
+			randomx_vm *vm = randomx_create_vm(flags);
+			if (miningMode)
+				randomx_vm_set_dataset(vm, dataset);
+			else
+				randomx_vm_set_cache(vm, cache);
 			vms.push_back(vm);
 		}
-		uint8_t* scratchpadMem;
-		if (largePages) {
-			scratchpadMem = (uint8_t*)allocLargePagesMemory(threadCount * RANDOMX_SCRATCHPAD_L3);
-		}
-		else {
-			scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RANDOMX_SCRATCHPAD_L3, RandomX::CacheLineSize);
-		}
 		std::cout << "Running benchmark (" << programCount << " nonces) ..." << std::endl;
 		sw.restart();
 		if (threadCount > 1) {
 			for (unsigned i = 0; i < vms.size(); ++i) {
 				if (softAes)
-					threads.push_back(std::thread(&mine<true>, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i, scratchpadMem + RANDOMX_SCRATCHPAD_L3 * i));
+					threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i));
 				else
-					threads.push_back(std::thread(&mine<false>, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i, scratchpadMem + RANDOMX_SCRATCHPAD_L3 * i));
+					threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i));
 			}
 			for (unsigned i = 0; i < threads.size(); ++i) {
 				threads[i].join();
 			}
 		}
 		else {
-			if(softAes)
-				mine<true>(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0, scratchpadMem);
-			else
-				mine<false>(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0, scratchpadMem);
-			/*if (miningMode)
-				std::cout << "Average program size: " << ((RandomX::CompiledVirtualMachine*)vms[0])->getTotalSize() / programCount / RandomX::ChainLength << std::endl;*/
+			mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0);
 		}
 		double elapsed = sw.getElapsed();
 		std::cout << "Calculated result: ";
--- a/src/randomx.cpp
+++ b/src/randomx.cpp
@ -0,0 +1,209 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "randomx.h"
+#include "dataset.hpp"
+#include "VirtualMachine.hpp"
+#include "./InterpretedVirtualMachine.hpp"
+#include "./InterpretedLightVirtualMachine.hpp"
+#include "./CompiledVirtualMachine.hpp"
+#include "./CompiledLightVirtualMachine.hpp"
+#include "virtualMemory.hpp"
+#include "blake2/blake2.h"
+
+extern "C" {
+
+	// Creates a cache object whose concrete type is chosen by the JIT and
+	// large-page bits of `flags` (every other bit is ignored), then asks it to
+	// allocate its memory. Returns nullptr when allocate() reports failure.
+	randomx_cache *randomx_alloc_cache(randomx_flags flags) {
+		const bool wantJit = (flags & RANDOMX_FLAG_JIT) != 0;
+		const bool wantLargePages = (flags & RANDOMX_FLAG_LARGE_PAGES) != 0;
+
+		randomx_cache *result;
+		if (wantJit) {
+			if (wantLargePages)
+				result = new randomx::CacheWithJitLargePage();
+			else
+				result = new randomx::CacheWithJitDefault();
+		}
+		else {
+			if (wantLargePages)
+				result = new randomx::CacheLargePage();
+			else
+				result = new randomx::CacheDefault();
+		}
+
+		// Hand the object out only if its backing memory could be obtained.
+		if (result->allocate())
+			return result;
+
+		delete result;
+		return nullptr;
+	}
+
+	// Initializes `cache` from the `seedSize` bytes at `seed` by forwarding to
+	// cache->initialize(). `cache` is dereferenced without a null check.
+	void randomx_init_cache(randomx_cache *cache, const void *seed, size_t seedSize) {
+		cache->initialize(seed, seedSize);
+	}
+
+	// Destroys a cache obtained from randomx_alloc_cache() with `delete`.
+	// Passing nullptr is a no-op.
+	void randomx_release_cache(randomx_cache* cache) {
+		delete cache;
+	}
+
+	// Creates a dataset object (large-page variant when RANDOMX_FLAG_LARGE_PAGES
+	// is set in `flags`, default variant otherwise) and asks it to allocate its
+	// memory. Returns nullptr when allocate() reports failure.
+	randomx_dataset *randomx_alloc_dataset(randomx_flags flags) {
+		const bool wantLargePages = (flags & RANDOMX_FLAG_LARGE_PAGES) != 0;
+
+		randomx_dataset *result;
+		if (!wantLargePages)
+			result = new randomx::DatasetDefault();
+		else
+			result = new randomx::DatasetLargePage();
+
+		if (result->allocate())
+			return result;
+
+		delete result;
+		return nullptr;
+	}
+
+	// Fills dataset blocks [startBlock, startBlock + blockCount) using the
+	// initialization routine supplied by `cache`. The destination address is
+	// offset by startBlock * CacheLineSize bytes from the dataset memory.
+	void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startBlock, unsigned long blockCount) {
+		auto *destination = dataset->memory + startBlock * randomx::CacheLineSize;
+		const unsigned long endBlock = startBlock + blockCount;
+		randomx::DatasetInitFunc initBlocks = cache->getInitFunc();
+		initBlocks(cache, destination, startBlock, endBlock);
+	}
+
+	// Destroys a dataset obtained from randomx_alloc_dataset() with `delete`.
+	// Passing nullptr is a no-op.
+	void randomx_release_dataset(randomx_dataset *dataset) {
+		delete dataset;
+	}
+
+	// Creates a virtual machine whose concrete class is selected by the
+	// FULL_MEM, JIT, HARD_AES and LARGE_PAGES bits of `flags`:
+	//   FULL_MEM    -> non-"Light" class, otherwise a "Light" class
+	//   JIT         -> "Compiled" class, otherwise "Interpreted"
+	//   HARD_AES    -> "HardAes" class variant
+	//   LARGE_PAGES -> "LargePage" class variant
+	// The machine is then asked to allocate its memory; nullptr is returned
+	// when allocate() reports failure.
+	randomx_vm *randomx_create_vm(randomx_flags flags) {
+		const int vmOptions = RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES;
+		randomx_vm *machine;
+
+		switch (flags & vmOptions) {
+		// --- default pages, default AES ---
+		case RANDOMX_FLAG_DEFAULT:
+			machine = new randomx::InterpretedLightVmDefault();
+			break;
+		case RANDOMX_FLAG_FULL_MEM:
+			machine = new randomx::InterpretedVmDefault();
+			break;
+		case RANDOMX_FLAG_JIT:
+			machine = new randomx::CompiledLightVmDefault();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT:
+			machine = new randomx::CompiledVmDefault();
+			break;
+
+		// --- default pages, hard AES ---
+		case RANDOMX_FLAG_HARD_AES:
+			machine = new randomx::InterpretedLightVmHardAes();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES:
+			machine = new randomx::InterpretedVmHardAes();
+			break;
+		case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES:
+			machine = new randomx::CompiledLightVmHardAes();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES:
+			machine = new randomx::CompiledVmHardAes();
+			break;
+
+		// --- large pages, default AES ---
+		case RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::InterpretedLightVmLargePage();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::InterpretedVmLargePage();
+			break;
+		case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::CompiledLightVmLargePage();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::CompiledVmLargePage();
+			break;
+
+		// --- large pages, hard AES ---
+		case RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::InterpretedLightVmLargePageHardAes();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::InterpretedVmLargePageHardAes();
+			break;
+		case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::CompiledLightVmLargePageHardAes();
+			break;
+		case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
+			machine = new randomx::CompiledVmLargePageHardAes();
+			break;
+
+		// The masked value can only take the sixteen combinations above.
+		default:
+			UNREACHABLE;
+		}
+
+		if (machine->allocate())
+			return machine;
+
+		delete machine;
+		return nullptr;
+	}
+
+	// Attaches `cache` to the virtual machine by forwarding to
+	// machine->setCache(). `machine` is dereferenced without a null check.
+	void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache) {
+		machine->setCache(cache);
+	}
+
+	// Attaches `dataset` to the virtual machine by forwarding to
+	// machine->setDataset(). `machine` is dereferenced without a null check.
+	void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset) {
+		machine->setDataset(dataset);
+	}
+
+	// Destroys a virtual machine obtained from randomx_create_vm() with
+	// `delete`. Passing nullptr is a no-op.
+	void randomx_destroy_vm(randomx_vm *machine) {
+		delete machine;
+	}
+
+	// Computes the hash of `inputSize` bytes at `input` on `machine` and
+	// stores it in `output`.
+	//
+	// `output` must provide room for RANDOMX_HASH_SIZE bytes, which is the
+	// buffer size the public header documents and its usage examples pass.
+	// `machine` must already have a cache or dataset attached; it is
+	// dereferenced without a null check.
+	void randomx_calculate_hash(randomx_vm *machine, void *input, size_t inputSize, void *output) {
+		// 64-byte working state, seeded with the Blake2b digest of the input.
+		alignas(16) uint64_t hash[8];
+		blake2b(hash, sizeof(hash), input, inputSize, nullptr, 0);
+
+		// The scratchpad contents are produced by generate() from the state.
+		machine->generate(&hash, machine->scratchpad, randomx::ScratchpadSize);
+		machine->resetRoundingMode();
+
+		// Every program except the last one feeds the Blake2b digest of the
+		// register file back into the state used to generate the next program.
+		for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
+			machine->generate(&hash, &machine->program, sizeof(randomx::Program));
+			machine->initialize();
+			machine->execute();
+			blake2b(hash, sizeof(hash), &machine->reg, sizeof(machine->reg), nullptr, 0);
+		}
+
+		// The last program is followed by getFinalResult() instead of the feedback step.
+		machine->generate((void*)hash, &machine->program, sizeof(randomx::Program));
+		machine->initialize();
+		machine->execute();
+
+		// Request exactly RANDOMX_HASH_SIZE bytes: the previous hard-coded 64
+		// was twice the buffer size the header documents for callers.
+		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
+	}
+
+}
--- a/src/randomx.h
+++ b/src/randomx.h
@ -0,0 +1,130 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#ifndef RANDOMX_H
+#define RANDOMX_H
+
+/*
+
+Minimal usage example:
+----------------------
+
+#include "randomx.h"
+#include <stdio.h>
+
+int main() {
+  const char mySeed[] = "RandomX example seed";
+  const char myInput[] = "RandomX example input";
+  char hash[RANDOMX_HASH_SIZE];
+
+  randomx_cache *myCache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
+  randomx_init_cache(myCache, mySeed, sizeof mySeed);
+  randomx_vm *myMachine = randomx_create_vm(RANDOMX_FLAG_DEFAULT);
+  randomx_vm_set_cache(myMachine, myCache);
+
+  randomx_calculate_hash(myMachine, myInput, sizeof myInput, hash);
+
+  randomx_destroy_vm(myMachine);
+  randomx_release_cache(myCache);
+
+  for (unsigned i = 0; i < RANDOMX_HASH_SIZE; ++i)
+    printf("%02x", hash[i] & 0xff);
+
+  printf("\n");
+
+  return 0;
+}
+
+Optimized usage example:
+------------------------
+
+#include "randomx.h"
+#include <stdio.h>
+
+int main() {
+  const char mySeed[] = "RandomX example seed";
+  const char myInput[] = "RandomX example input";
+  char hash[RANDOMX_HASH_SIZE];
+
+  randomx_cache *myCache = randomx_alloc_cache(RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES);
+  randomx_init_cache(myCache, mySeed, sizeof mySeed);
+
+  randomx_dataset *myDataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES);
+  randomx_init_dataset(myDataset, myCache, 0, RANDOMX_DATASET_BLOCKS);
+  randomx_release_cache(myCache);
+
+  randomx_vm *myMachine = randomx_create_vm(RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES);
+  randomx_vm_set_dataset(myMachine, myDataset);
+
+  randomx_calculate_hash(myMachine, myInput, sizeof myInput, hash);
+
+  randomx_destroy_vm(myMachine);
+  randomx_release_dataset(myDataset);
+
+  for (unsigned i = 0; i < RANDOMX_HASH_SIZE; ++i)
+    printf("%02x", hash[i] & 0xff);
+
+  printf("\n");
+
+  return 0;
+}
+  
+*/
+
+#include <stddef.h>
+
+#define RANDOMX_HASH_SIZE 32
+#define RANDOMX_DATASET_BLOCKS 33554432UL
+
+/* Option bits accepted by the randomx_alloc_* and randomx_create_vm functions; combine with '|'. */
+typedef enum {
+  RANDOMX_FLAG_DEFAULT = 0,      /* no option bits set */
+  RANDOMX_FLAG_FULL_MEM = 1,     /* randomx_create_vm: select a non-"Light" VM class */
+  RANDOMX_FLAG_JIT = 2,          /* select the "Compiled" VM classes / "WithJit" cache classes */
+  RANDOMX_FLAG_HARD_AES = 4,     /* randomx_create_vm: select a "HardAes" VM class */
+  RANDOMX_FLAG_LARGE_PAGES = 8,  /* select the "LargePage" cache, dataset and VM classes */
+} randomx_flags;
+
+typedef struct randomx_dataset randomx_dataset;
+typedef struct randomx_cache randomx_cache;
+typedef struct randomx_vm randomx_vm;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Creates a cache and allocates its memory. Only RANDOMX_FLAG_JIT and
+   RANDOMX_FLAG_LARGE_PAGES are examined. Returns NULL if allocation fails. */
+randomx_cache *randomx_alloc_cache(randomx_flags flags);
+/* Initializes the cache from seedSize bytes at seed. */
+void randomx_init_cache(randomx_cache *cache, const void *seed, size_t seedSize);
+/* Destroys a cache; NULL is accepted and ignored. */
+void randomx_release_cache(randomx_cache* cache);
+
+/* Creates a dataset and allocates its memory. Only RANDOMX_FLAG_LARGE_PAGES
+   is examined. Returns NULL if allocation fails. */
+randomx_dataset *randomx_alloc_dataset(randomx_flags flags);
+/* Initializes dataset blocks startBlock .. startBlock + blockCount - 1 using
+   the initialization routine provided by cache. */
+void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startBlock, unsigned long blockCount);
+/* Destroys a dataset; NULL is accepted and ignored. */
+void randomx_release_dataset(randomx_dataset *dataset);
+
+/* Creates a virtual machine selected by the FULL_MEM, JIT, HARD_AES and
+   LARGE_PAGES bits of flags. Returns NULL if allocation fails. */
+randomx_vm *randomx_create_vm(randomx_flags flags);
+/* Attaches a cache to the machine. */
+void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache);
+/* Attaches a dataset to the machine. */
+void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset);
+/* Destroys a virtual machine; NULL is accepted and ignored. */
+void randomx_destroy_vm(randomx_vm *machine);
+
+/* Hashes inputSize bytes at input on the given machine and stores the result
+   in output. NOTE(review): confirm the number of bytes written to output
+   against the implementation before sizing the buffer. */
+void randomx_calculate_hash(randomx_vm *machine, void *input, size_t inputSize, void *output);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/src/superscalarGenerator.cpp
+++ b/src/superscalarGenerator.cpp
@ -26,8 +26,10 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <stdexcept>
 #include <iomanip>
 #include "superscalarGenerator.hpp"
+#include "intrinPortable.h"
+#include "reciprocal.h"

-namespace RandomX {
+namespace randomx {

 	static bool isMultiplication(int type) {
 		return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP;
@ -842,4 +844,52 @@ namespace RandomX {
 			std::cout << std::endl;
 		}*/
 	}
+
+	// Interprets the instructions of `prog` in order, updating the eight
+	// integer registers in `r` in place. For IMUL_RCP the multiplier is read
+	// from `reciprocals` (indexed by the instruction's 32-bit immediate) when a
+	// table is supplied; otherwise it is computed by reciprocal() from that
+	// immediate.
+	void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t> *reciprocals) {
+		for (unsigned pc = 0; pc < prog.getSize(); ++pc) {
+			Instruction& instr = prog(pc);
+			// Every opcode below writes its result to the destination register.
+			int_reg_t& dst = r[instr.dst];
+
+			switch (instr.opcode)
+			{
+			case SuperscalarInstructionType::ISUB_R:
+				dst -= r[instr.src];
+				break;
+
+			case SuperscalarInstructionType::IXOR_R:
+				dst ^= r[instr.src];
+				break;
+
+			case SuperscalarInstructionType::IADD_RS:
+				dst += r[instr.src] << instr.getModShift2();
+				break;
+
+			case SuperscalarInstructionType::IMUL_R:
+				dst *= r[instr.src];
+				break;
+
+			case SuperscalarInstructionType::IROR_C:
+				dst = rotr(dst, instr.getImm32());
+				break;
+
+			// The three IADD_C* opcodes share one implementation.
+			case SuperscalarInstructionType::IADD_C7:
+			case SuperscalarInstructionType::IADD_C8:
+			case SuperscalarInstructionType::IADD_C9:
+				dst += signExtend2sCompl(instr.getImm32());
+				break;
+
+			// The three IXOR_C* opcodes share one implementation.
+			case SuperscalarInstructionType::IXOR_C7:
+			case SuperscalarInstructionType::IXOR_C8:
+			case SuperscalarInstructionType::IXOR_C9:
+				dst ^= signExtend2sCompl(instr.getImm32());
+				break;
+
+			case SuperscalarInstructionType::IMULH_R:
+				dst = mulh(dst, r[instr.src]);
+				break;
+
+			case SuperscalarInstructionType::ISMULH_R:
+				dst = smulh(dst, r[instr.src]);
+				break;
+
+			case SuperscalarInstructionType::IMUL_RCP:
+				if (reciprocals == nullptr)
+					dst *= reciprocal(instr.getImm32());
+				else
+					dst *= (*reciprocals)[instr.getImm32()];
+				break;
+
+			default:
+				UNREACHABLE;
+			}
+		}
+	}
 }
--- a/src/superscalarGenerator.hpp
+++ b/src/superscalarGenerator.hpp
@ -18,10 +18,11 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 #pragma once
-#include "Program.hpp"
+#include "superscalar_program.hpp"
 #include "Blake2Generator.hpp"
+#include <vector>

-namespace RandomX {
+namespace randomx {
 	                                              //                  Intel Ivy Bridge reference
 	namespace SuperscalarInstructionType {        //uOPs (decode)   execution ports         latency       code size
 		constexpr int ISUB_R = 0;                 //1               p015                    1               3 (sub)
@ -44,4 +45,5 @@ namespace RandomX {
 	}

 	void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen);
+	// Runs every instruction of `prog` on the eight registers in `r`. When `reciprocals` is
+	// non-null it supplies the IMUL_RCP multipliers, indexed by the instruction's immediate.
+	void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t> *reciprocals = nullptr);
 }
--- a/src/superscalar_program.hpp
+++ b/src/superscalar_program.hpp
@ -0,0 +1,70 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include "Instruction.hpp"
+#include "configuration.h"
+
+namespace randomx {
+
+	// Fixed-capacity container for up to RANDOMX_SUPERSCALAR_MAX_SIZE
+	// instructions, together with the number of instructions in use, an
+	// address register index and a set of public statistics fields.
+	class SuperscalarProgram {
+	public:
+		// Mutable access to the instruction at position `pc` (not bounds-checked).
+		Instruction& operator()(int pc) {
+			return programBuffer[pc];
+		}
+		// Read-only access to the instruction at position `pc` (not bounds-checked).
+		const Instruction& operator()(int pc) const {
+			return programBuffer[pc];
+		}
+		// Streams the first `size` instructions, one after another.
+		friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) {
+			p.print(os);
+			return os;
+		}
+		// Number of instructions currently in use.
+		uint32_t getSize() const {
+			return size;
+		}
+		void setSize(uint32_t val) {
+			size = val;
+		}
+		// Index of the address register associated with this program.
+		int getAddressRegister() const {
+			return addrReg;
+		}
+		void setAddressRegister(uint32_t val) {
+			addrReg = val;
+		}
+		// Statistics fields; presumably filled in by the program generator -- TODO confirm.
+		double ipc;
+		int codeSize;
+		int macroOps;
+		int decodeCycles;
+		int cpuLatency;
+		int asicLatency;
+		int mulCount;
+		int cpuLatencies[8];
+		int asicLatencies[8];
+	private:
+		void print(std::ostream& os) const {
+			for (unsigned i = 0; i < size; ++i) {
+				// Streamed through a local copy, exactly as before.
+				auto instr = programBuffer[i];
+				os << instr;
+			}
+		}
+		Instruction programBuffer[RANDOMX_SUPERSCALAR_MAX_SIZE];
+		uint32_t size;
+		int addrReg;
+	};
+
+}
--- a/vcxproj/randomx.vcxproj
+++ b/vcxproj/randomx.vcxproj
@ -124,20 +124,22 @@
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
+    <ClCompile Include="..\src\allocator.cpp" />
    <ClCompile Include="..\src\argon2_core.c" />
    <ClCompile Include="..\src\argon2_ref.c" />
    <ClCompile Include="..\src\AssemblyGeneratorX86.cpp" />
    <ClCompile Include="..\src\Blake2Generator.cpp" />
    <ClCompile Include="..\src\blake2\blake2b.c" />
-    <ClCompile Include="..\src\Cache.cpp" />
    <ClCompile Include="..\src\CompiledLightVirtualMachine.cpp" />
    <ClCompile Include="..\src\CompiledVirtualMachine.cpp" />
    <ClCompile Include="..\src\dataset.cpp" />
    <ClCompile Include="..\src\hashAes1Rx4.cpp" />
    <ClCompile Include="..\src\Instruction.cpp" />
    <ClCompile Include="..\src\instructionsPortable.cpp" />
+    <ClCompile Include="..\src\InterpretedLightVirtualMachine.cpp" />
    <ClCompile Include="..\src\InterpretedVirtualMachine.cpp" />
    <ClCompile Include="..\src\JitCompilerX86.cpp" />
+    <ClCompile Include="..\src\randomx.cpp" />
    <ClCompile Include="..\src\superscalarGenerator.cpp" />
    <ClCompile Include="..\src\main.cpp" />
    <ClCompile Include="..\src\reciprocal.c" />
@ -150,11 +152,11 @@
    <MASM Include="..\src\squareHash.asm" />
  </ItemGroup>
  <ItemGroup>
+    <ClInclude Include="..\src\allocator.hpp" />
    <ClInclude Include="..\src\argon2.h" />
    <ClInclude Include="..\src\argon2_core.h" />
    <ClInclude Include="..\src\AssemblyGeneratorX86.hpp" />
    <ClInclude Include="..\src\Blake2Generator.hpp" />
-    <ClInclude Include="..\src\Cache.hpp" />
    <ClInclude Include="..\src\catch.hpp" />
    <ClInclude Include="..\src\common.hpp" />
    <ClInclude Include="..\src\CompiledLightVirtualMachine.hpp" />
@ -164,16 +166,19 @@
    <ClInclude Include="..\src\hashAes1Rx4.hpp" />
    <ClInclude Include="..\src\Instruction.hpp" />
    <ClInclude Include="..\src\instructionWeights.hpp" />
+    <ClInclude Include="..\src\InterpretedLightVirtualMachine.hpp" />
    <ClInclude Include="..\src\InterpretedVirtualMachine.hpp" />
    <ClInclude Include="..\src\intrinPortable.h" />
    <ClInclude Include="..\src\JitCompilerX86-static.hpp" />
    <ClInclude Include="..\src\JitCompilerX86.hpp" />
+    <ClInclude Include="..\src\randomx.h" />
    <ClInclude Include="..\src\superscalarGenerator.hpp" />
    <ClInclude Include="..\src\Program.hpp" />
    <ClInclude Include="..\src\reciprocal.h" />
    <ClInclude Include="..\src\softAes.h" />
    <ClInclude Include="..\src\squareHash.h" />
    <ClInclude Include="..\src\Stopwatch.hpp" />
+    <ClInclude Include="..\src\superscalar_program.hpp" />
    <ClInclude Include="..\src\VirtualMachine.hpp" />
    <ClInclude Include="..\src\virtualMemory.hpp" />
  </ItemGroup>
--- a/vcxproj/randomx.vcxproj.filters
+++ b/vcxproj/randomx.vcxproj.filters
@ -27,9 +27,6 @@
    <ClCompile Include="..\src\AssemblyGeneratorX86.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\Cache.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
    <ClCompile Include="..\src\CompiledLightVirtualMachine.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
@ -75,6 +72,15 @@
    <ClCompile Include="..\src\superscalarGenerator.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\randomx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\allocator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\InterpretedLightVirtualMachine.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <MASM Include="..\src\JitCompilerX86-static.asm">
@ -94,9 +100,6 @@
    <ClInclude Include="..\src\AssemblyGeneratorX86.hpp">
      <Filter>Header Files</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\Cache.hpp">
-      <Filter>Header Files</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\catch.hpp">
      <Filter>Header Files</Filter>
    </ClInclude>
@ -163,5 +166,17 @@
    <ClInclude Include="..\src\superscalarGenerator.hpp">
      <Filter>Header Files</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\randomx.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\allocator.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\InterpretedLightVirtualMachine.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\superscalar_program.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
  </ItemGroup>
 </Project>