diff --git a/makefile b/makefile index 354d706..356180f 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS= TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o executeProgram-linux.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o) SRC1=$(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp) all: release test @@ -55,8 +55,8 @@ $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachin $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp argon2_core.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@ -$(OBJDIR)/executeProgram-linux.o: $(addprefix $(SRCDIR)/,executeProgram-linux.cpp common.hpp) | $(OBJDIR) - $(CXX) $(CXXFLAGS) -c $(SRCDIR)/executeProgram-linux.cpp -o $@ +$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index fadc2a6..f9535f3 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -218,10 +218,6 @@ namespace RandomX { } } - static inline int wrapi(int i) { - return i % RandomX::ProgramLength; - } - void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) { gena(instr); asmCode << "\tadd rax, 
"; @@ -468,14 +464,14 @@ namespace RandomX { asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm1 << std::endl; asmCode << "\tjbe short taken_call_" << i << std::endl; gencr(instr); - asmCode << "\tjmp rx_i_" << wrapi(i + 1) << std::endl; + asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl; asmCode << "taken_call_" << i << ":" << std::endl; } if (trace) { asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl; } asmCode << "\tpush rax" << std::endl; - asmCode << "\tcall rx_i_" << wrapi(i + (instr.imm0 & 127) + 2) << std::endl; + asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm0 & 127) + 2) << std::endl; } void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 79dac1d..fdb1498 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -21,23 +21,35 @@ along with RandomX. If not, see. #include "Pcg32.hpp" #include "common.hpp" #include "instructions.hpp" +#include namespace RandomX { + CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) { +#if !defined(_M_X64) && !defined(__x86_64__) + throw std::runtime_error("Compiled VM only supports x86-64 CPUs"); +#endif + } + + void CompiledVirtualMachine::initializeDataset(const void* seed, bool lightClient) { + if (lightClient) { + throw std::runtime_error("Compiled VM does not support light-client mode"); + } + VirtualMachine::initializeDataset(seed, lightClient); + } + void CompiledVirtualMachine::initializeProgram(const void* seed) { Pcg32 gen(seed); for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)®) + i) = gen(); } - for (unsigned i = 0; i < ProgramLength; ++i) { - gen(); gen(); gen(); gen(); - } + compiler.generateProgram(gen); mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.mx = *(((uint32_t*)seed) + 5); } void CompiledVirtualMachine::execute() { - executeProgram(reg, 
mem, readDataset, scratchpad); + compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACE for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index b2b7a1c..b5b1d63 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -20,17 +20,21 @@ along with RandomX. If not, see. #pragma once //#define TRACE #include "VirtualMachine.hpp" -#include "Program.hpp" -#include +#include "JitCompilerX86.hpp" namespace RandomX { class CompiledVirtualMachine : public VirtualMachine { public: - CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {} - virtual void initializeProgram(const void* seed) override; + CompiledVirtualMachine(bool softAes); + void initializeDataset(const void* seed, bool light = false) override; + void initializeProgram(const void* seed) override; virtual void execute() override; + void* getProgram() { + return compiler.getCode(); + } private: + JitCompilerX86 compiler; #ifdef TRACE convertible_t tracepad[InstructionCount]; #endif diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp new file mode 100644 index 0000000..82444d4 --- /dev/null +++ b/src/JitCompilerX86.cpp @@ -0,0 +1,747 @@ +/* +Copyright (c) 2018 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "JitCompilerX86.hpp" +#include "Pcg32.hpp" +#include +#include + +#ifdef _WIN32 +#include +#elif defined(__linux__) +#include +#include +#include +#else +#error "Unsupported operating system" +#endif + +namespace RandomX { + + /* + REGISTER ALLOCATION: + + rax -> temporary + rbx -> MemoryRegisters& memory + rcx -> temporary + rdx -> temporary + rsi -> convertible_t& scratchpad + rdi -> "ic" (instruction counter) + rbp -> beginning of VM stack + rsp -> end of VM stack + r8 -> "r0" + r9 -> "r1" + r10 -> "r2" + r11 -> "r3" + r12 -> "r4" + r13 -> "r5" + r14 -> "r6" + r15 -> "r7" + xmm0 -> temporary + xmm1 -> temporary + xmm2 -> "f2" + xmm3 -> "f3" + xmm4 -> "f4" + xmm5 -> "f5" + xmm6 -> "f6" + xmm7 -> "f7" + xmm8 -> "f0" + xmm9 -> "f1" + + STACK STRUCTURE: + + | + | + | saved registers + | + v + [rbp] RegisterFile& registerFile + | + | + | VM stack + | + v + [rsp] last element of VM stack + + */ + + const uint8_t prologue[] = { + 0x53, //push rbx + 0x55, //push rbp +#ifdef _WIN32 + 0x57, //push rdi + 0x56, //push rsi +#endif + 0x41, 0x54, //push r12 + 0x41, 0x55, //push r13 + 0x41, 0x56, //push r14 + 0x41, 0x57, //push r15 +#ifdef _WIN32 + 0x48, 0x83, 0xec, 0x48, //sub rsp,0x48 + 0xf3, 0x0f, 0x7f, 0x74, 0x24, 0x30, //movdqu XMMWORD PTR[rsp + 0x30],xmm6 + 0xf3, 0x0f, 0x7f, 0x7c, 0x24, 0x20, //movdqu XMMWORD PTR[rsp + 0x20],xmm7 + 0xf3, 0x44, 0x0f, 0x7f, 0x44, 0x24, 0x10, //movdqu XMMWORD PTR[rsp + 0x10],xmm8 + 0xf3, 0x44, 0x0f, 0x7f, 0x0c, 0x24, //movdqu XMMWORD PTR[rsp],xmm9 + 0x51, //push rcx + 0x48, 0x8b, 0xda, //mov rbx,rdx + 0x49, 0x8b, 0xf0, //mov rsi,r8 +#else + 0x57, //push rdi + 0x48, 0x8b, 0xde, //mov rbx, rsi + 0x48, 0x8b, 0xf2, //mov rsi, rdx + 0x48, 0x8b, 0xcf, //mov rcx, rdi +#endif + 0x48, 0x8b, 0xec, //mov rbp,rsp + 0x48, 0xc7, 0xc7, 0x00, 0x00, 0x10, 0x00, //mov rdi,0x100000 + 0x4c, 0x8b, 0x01, //mov r8,QWORD PTR[rcx] + 0x4c, 0x8b, 0x49, 0x08, //mov r9,QWORD PTR[rcx+0x8] + 0x4c, 0x8b, 0x51, 0x10, //mov r10,QWORD PTR[rcx+0x10] + 0x4c, 
0x8b, 0x59, 0x18, //mov r11,QWORD PTR[rcx+0x18] + 0x4c, 0x8b, 0x61, 0x20, //mov r12,QWORD PTR[rcx+0x20] + 0x4c, 0x8b, 0x69, 0x28, //mov r13,QWORD PTR[rcx+0x28] + 0x4c, 0x8b, 0x71, 0x30, //mov r14,QWORD PTR[rcx+0x30] + 0x4c, 0x8b, 0x79, 0x38, //mov r15,QWORD PTR[rcx+0x38] + 0xc7, 0x44, 0x24, 0xf8, 0xc0, 0x9f, 0x00, //mov DWORD PTR[rsp-0x8],0x9fc0 + 0x00, + 0x0f, 0xae, 0x54, 0x24, 0xf8, //ldmxcsr DWORD PTR[rsp-0x8] + 0xf2, 0x4c, 0x0f, 0x2a, 0x41, 0x40, //cvtsi2sd xmm8,QWORD PTR[rcx+0x40] + 0xf2, 0x4c, 0x0f, 0x2a, 0x49, 0x48, //cvtsi2sd xmm9,QWORD PTR[rcx+0x48] + 0xf2, 0x48, 0x0f, 0x2a, 0x51, 0x50, //cvtsi2sd xmm2,QWORD PTR[rcx+0x50] + 0xf2, 0x48, 0x0f, 0x2a, 0x59, 0x58, //cvtsi2sd xmm3,QWORD PTR[rcx+0x58] + 0xf2, 0x48, 0x0f, 0x2a, 0x61, 0x60, //cvtsi2sd xmm4,QWORD PTR[rcx+0x60] + 0xf2, 0x48, 0x0f, 0x2a, 0x69, 0x68, //cvtsi2sd xmm5,QWORD PTR[rcx+0x68] + 0xf2, 0x48, 0x0f, 0x2a, 0x71, 0x70, //cvtsi2sd xmm6,QWORD PTR[rcx+0x70] + 0xf2, 0x48, 0x0f, 0x2a, 0x79, 0x78, //cvtsi2sd xmm7,QWORD PTR[rcx+0x78] + }; + + const uint8_t epilogue[] = { + 0x48, 0x8b, 0xe5, //mov rsp,rbp + 0x59, //pop rcx + 0x4c, 0x89, 0x01, //mov QWORD PTR [rcx],r8 + 0x4c, 0x89, 0x49, 0x08, //mov QWORD PTR [rcx+0x8],r9 + 0x4c, 0x89, 0x51, 0x10, //mov QWORD PTR [rcx+0x10],r10 + 0x4c, 0x89, 0x59, 0x18, //mov QWORD PTR [rcx+0x18],r11 + 0x4c, 0x89, 0x61, 0x20, //mov QWORD PTR [rcx+0x20],r12 + 0x4c, 0x89, 0x69, 0x28, //mov QWORD PTR [rcx+0x28],r13 + 0x4c, 0x89, 0x71, 0x30, //mov QWORD PTR [rcx+0x30],r14 + 0x4c, 0x89, 0x79, 0x38, //mov QWORD PTR [rcx+0x38],r15 + 0x66, 0x4c, 0x0f, 0x7e, 0x41, 0x40, //movq QWORD PTR [rcx+0x40],xmm8 + 0x66, 0x4c, 0x0f, 0x7e, 0x49, 0x48, //movq QWORD PTR [rcx+0x48],xmm9 + 0x66, 0x48, 0x0f, 0x7e, 0x51, 0x50, //movq QWORD PTR [rcx+0x50],xmm2 + 0x66, 0x48, 0x0f, 0x7e, 0x59, 0x58, //movq QWORD PTR [rcx+0x58],xmm3 + 0x66, 0x48, 0x0f, 0x7e, 0x61, 0x60, //movq QWORD PTR [rcx+0x60],xmm4 + 0x66, 0x48, 0x0f, 0x7e, 0x69, 0x68, //movq QWORD PTR [rcx+0x68],xmm5 + 0x66, 0x48, 0x0f, 0x7e, 0x71, 
0x70, //movq QWORD PTR [rcx+0x70],xmm6 + 0x66, 0x48, 0x0f, 0x7e, 0x79, 0x78, //movq QWORD PTR [rcx+0x78],xmm7 +#ifdef _WIN32 + 0xf3, 0x44, 0x0f, 0x6f, 0x0c, 0x24, //movdqu xmm9,XMMWORD PTR [rsp] + 0xf3, 0x44, 0x0f, 0x6f, 0x44, 0x24, 0x10, //movdqu xmm8,XMMWORD PTR [rsp+0x10] + 0xf3, 0x0f, 0x6f, 0x7c, 0x24, 0x20, //movdqu xmm7,XMMWORD PTR [rsp+0x20] + 0xf3, 0x0f, 0x6f, 0x74, 0x24, 0x30, //movdqu xmm6,XMMWORD PTR [rsp+0x30] + 0x48, 0x83, 0xc4, 0x48, //add rsp,0x48 +#endif + 0x41, 0x5f, //pop r15 + 0x41, 0x5e, //pop r14 + 0x41, 0x5d, //pop r13 + 0x41, 0x5c, //pop r12 +#ifdef _WIN32 + 0x5e, //pop rsi + 0x5f, //pop rdi +#endif + 0x5d, //pop rbp + 0x5b, //pop rbx + 0xc3, //ret + }; + + //41 bytes -> 1 cache line + const uint8_t readDatasetSub[] = { + 0x8b, 0x13, //mov edx,DWORD PTR [rbx] + 0x48, 0x8b, 0x43, 0x08, //mov rax,QWORD PTR [rbx+0x8] + 0x48, 0x8b, 0x04, 0x10, //mov rax,QWORD PTR [rax+rdx*1] + 0x83, 0x03, 0x08, //add DWORD PTR [rbx],0x8 + 0x33, 0x4b, 0x04, //xor ecx,DWORD PTR [rbx+0x4] + 0x89, 0x4b, 0x04, //mov DWORD PTR [rbx+0x4],ecx + 0xf7, 0xc1, 0xf8, 0xff, 0x00, 0x00, //test ecx,0xfff8 + 0x75, 0x0d, //jne + 0x83, 0xe1, 0xf8, //and ecx,0xfffffff8 + 0x89, 0x0b, //mov DWORD PTR [rbx],ecx + 0x48, 0x8b, 0x53, 0x08, //mov rdx,QWORD PTR [rbx+0x8] + 0x0f, 0x18, 0x0c, 0x0a, //prefetcht0 BYTE PTR [rdx+rcx*1] + 0xc3, //ret + }; + + constexpr int getNumCacheLines(size_t size) { + return (size + (CacheLineSize - 1)) / CacheLineSize; + } + + constexpr int32_t align(int32_t pos, int32_t align) { + return ((pos - 1) / align + 1) * align; + } + + constexpr int32_t readDatasetSubOffset = CodeSize - CacheLineSize * getNumCacheLines(sizeof(readDatasetSub)); + constexpr int32_t epilogueOffset = readDatasetSubOffset - CacheLineSize * getNumCacheLines(sizeof(epilogue)); + constexpr int32_t startOffsetAligned = align(sizeof(prologue), CacheLineSize); + + JitCompilerX86::JitCompilerX86() { +#ifdef _WIN32 + code = (uint8_t*)VirtualAlloc(nullptr, CodeSize, MEM_COMMIT, 
PAGE_EXECUTE_READWRITE); + if (code == nullptr) + throw std::runtime_error("VirtualAlloc failed"); +#else + auto pagesize = sysconf(_SC_PAGE_SIZE); + if (pagesize == -1) + throw std::runtime_error("sysconf failed"); + + code = (uint8_t*)memalign(pagesize, CodeSize); + if (code == nullptr) + throw std::runtime_error("memalign failed"); + + if (mprotect(code, CodeSize, PROT_READ | PROT_WRITE | PROT_EXEC) == -1) + throw std::runtime_error("mprotect failed"); +#endif + memcpy(code, prologue, sizeof(prologue)); + if (startOffsetAligned - sizeof(prologue) > 4) { + codePos = sizeof(prologue); + emitByte(0xeb); //jmp short (rel8 follows) + emitByte(startOffsetAligned - (codePos + 1)); //rel8 to the cache-line-aligned program start + } + memcpy(code + readDatasetSubOffset, readDatasetSub, sizeof(readDatasetSub)); + memcpy(code + epilogueOffset, epilogue, sizeof(epilogue)); + } + + void JitCompilerX86::generateProgram(Pcg32& gen) { + instructionOffsets.clear(); + callOffsets.clear(); + codePos = startOffsetAligned; + Instruction instr; + for (unsigned i = 0; i < ProgramLength; ++i) { + for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { + *(((uint32_t*)&instr) + j) = gen(); + } + generateCode(instr, i); + } + emitByte(0xe9); //jmp rel32 back to the first VM instruction + emit(instructionOffsets[0] - (codePos + 4)); + fixCallOffsets(); + } + + void JitCompilerX86::generateCode(Instruction& instr, int i) { + instructionOffsets.push_back(codePos); + emit(0x880fcfff); //dec edi ("ic" instruction counter); js rel32 + emit(epilogueOffset - (codePos + 4)); //js target: epilogue (RIP-relative offset) + gena(instr); + auto generator = engine[instr.opcode]; + (this->*generator)(instr, i); + } + + void JitCompilerX86::fixCallOffsets() { + for (CallOffset& co : callOffsets) { + *reinterpret_cast(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4); + } + } + + void JitCompilerX86::gena(Instruction& instr) { + emit(uint16_t(0x8149)); //xor + emitByte(0xf0 + (instr.rega % RegistersCount)); + emit(instr.addr0); + int32_t pc; + switch (instr.loca & 7) + { + case 0: + case 1: + case 2: + case 3: + emit(uint16_t(0x8b41)); 
//mov + emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega + emitByte(0xe8); //call + emit(readDatasetSubOffset - (codePos + 4)); + return; + + case 4: + emit(uint16_t(0x8b41)); //mov + emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega + emitByte(0x25); //and + emit(ScratchpadL2 - 1); //whole scratchpad + emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8] + return; + + default: + emit(uint16_t(0x8b41)); //mov + emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega + emitByte(0x25); //and + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad + emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8] + return; + } + } + + void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8b49)); //mov + emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb + emitByte(0x48); //REX.W + emit(opcodeReg); //xxx rax, cl + } + else { + emitByte(0x48); //REX.W + emit(opcodeImm); //xxx rax, imm8 + emitByte((instr.imm0 & 63)); + } + } + + void JitCompilerX86::genbr1(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { + if ((instr.locb & 7) <= 5) { + emit(opcodeReg); // xxx rax, r64 + emitByte(0xc0 + (instr.regb % RegistersCount)); + } + else { + emit(opcodeImm); // xxx rax, imm32 + emit(instr.imm1); + } + } + + void JitCompilerX86::genbr132(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) { + if ((instr.locb & 7) <= 5) { + emit(opcodeReg); // xxx eax, r32 + emitByte(0xc0 + (instr.regb % RegistersCount)); + } + else { + emitByte(opcodeImm); // xxx eax, imm32 + emit(instr.imm1); + } + } + + void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) { + emit(0x48f2fffff8002548); //and rax,0xfffffffffffff800; cvtsi2sd xmm0,rax + emit(uint16_t(0x2a0f)); + emitByte(0xc0); + if ((instr.locb & 7) <= 5) { + int regb = (instr.regb % RegistersCount); + emitByte(0xf2); //xxxsd xmm0,regb + if (regb <= 1) { + emitByte(0x41); //REX + } + emitByte(0x0f); + 
emitByte(opcode); + emitByte(0xc0 + regb); + } + else { + convertible_t bimm; + bimm.f64 = (double)instr.imm1; + emit(uint16_t(0xb848)); //movabs rax,imm64 + emit(bimm.i64); + emitByte(0x66); //movq xmm1,rax + emit(0xc86e0f48); + emit(uint16_t(0x0ff2)); //xxxsd xmm0,xmm1 + emitByte(opcode); + emitByte(0xc1); + } + } + + void JitCompilerX86::gencr(Instruction& instr) { + switch (instr.locc & 7) + { + case 0: + emit(0x41c88b48); //mov rcx, rax; REX + emitByte(0x8b); // mov + emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc + emitByte(0x35); // xor eax + emit(instr.addr1); + emitByte(0x25); //and + emit(ScratchpadL2 - 1); //whole scratchpad + emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx + break; + + case 1: + case 2: + case 3: + emit(0x41c88b48); //mov rcx, rax; REX + emitByte(0x8b); // mov + emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc + emitByte(0x35); // xor eax + emit(instr.addr1); + emitByte(0x25); //and + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad + emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx + break; + + default: + emit(uint16_t(0x8b4c)); //mov + emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + break; + } + } + + void JitCompilerX86::gencf(Instruction& instr) { + int regc = (instr.regc % RegistersCount); + switch (instr.locc & 7) + { + case 0: + emit(uint16_t(0x8b41)); //mov + emitByte(0xc0 + regc); //eax, regc + emitByte(0x35); // xor eax + emit(instr.addr1); + emitByte(0x25); //and + emit(ScratchpadL2 - 1); //whole scratchpad + emit(uint16_t(0x4866)); //prefix + emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0 + break; + + case 1: + case 2: + case 3: + emit(uint16_t(0x8b41)); //mov + emitByte(0xc0 + regc); //eax, regc + emitByte(0x35); // xor eax + emit(instr.addr1); + emitByte(0x25); //and + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad + emit(uint16_t(0x4866)); //prefix + emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0 + break; + + default: + emitByte(0xf2); + if (regc <= 1) { 
+ emitByte(0x44); //REX + } + emit(uint16_t(0x100f)); //movsd + emitByte(0xc0 + 8 * regc); // regc, xmm0 + break; + } + } + + void JitCompilerX86::h_ADD_64(Instruction& instr, int i) { + genbr1(instr, 0x0349, 0x0548); + gencr(instr); + } + + void JitCompilerX86::h_ADD_32(Instruction& instr, int i) { + genbr132(instr, 0x0341, 0x05); + gencr(instr); + } + + void JitCompilerX86::h_SUB_64(Instruction& instr, int i) { + genbr1(instr, 0x2b49, 0x2d48); + gencr(instr); + } + + void JitCompilerX86::h_SUB_32(Instruction& instr, int i) { + genbr132(instr, 0x2b41, 0x2d); + gencr(instr); + } + + void JitCompilerX86::h_MUL_64(Instruction& instr, int i) { + if ((instr.locb & 7) <= 5) { + emitByte(0x49); //REX + emit(uint16_t(0xaf0f)); // imul rax, r64 + emitByte(0xc0 + (instr.regb % RegistersCount)); + } + else { + emitByte(0x48); //REX + emit(uint16_t(0xc069)); // imul rax, rax, imm32 + emit(instr.imm1); + } + gencr(instr); + } + + void JitCompilerX86::h_MULH_64(Instruction& instr, int i) { + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8b49)); //mov rcx, r64 + emitByte(0xc8 + (instr.regb % RegistersCount)); + } + else { + emitByte(0x48); + emit(uint16_t(0xc1c7)); // mov rcx, imm32 + emit(instr.imm1); + } + emitByte(0x48); + emit(uint16_t(0xe1f7)); // mul rcx + emitByte(0x48); + emit(uint16_t(0xc28b)); // mov rax,rdx + gencr(instr); + } + + void JitCompilerX86::h_MUL_32(Instruction& instr, int i) { + emit(uint16_t(0xc88b)); //mov ecx, eax + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8b41)); // mov eax, r32 + emitByte(0xc0 + (instr.regb % RegistersCount)); + } + else { + emitByte(0xb8); // mov eax, imm32 + emit(instr.imm1); + } + emit(0xc1af0f48); //imul rax,rcx + gencr(instr); + } + + void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) { + emitByte(0x48); + emit(uint16_t(0xc863)); //movsxd rcx,eax + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x6349)); //movsxd rax,r32 + emitByte(0xc0 + (instr.regb % RegistersCount)); + } + else { + emitByte(0x48); + 
emit(uint16_t(0xc0c7)); // mov rax, imm32 + emit(instr.imm1); + } + emit(0xc1af0f48); //imul rax,rcx + gencr(instr); + } + + void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) { + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8b49)); //mov rcx, r64 + emitByte(0xc8 + (instr.regb % RegistersCount)); + } + else { + emitByte(0x48); + emit(uint16_t(0xc1c7)); // mov rcx, imm32 + emit(instr.imm1); + } + emitByte(0x48); + emit(uint16_t(0xe9f7)); // imul rcx + emitByte(0x48); + emit(uint16_t(0xc28b)); // mov rax,rdx + gencr(instr); + } + + void JitCompilerX86::h_DIV_64(Instruction& instr, int i) { + if ((instr.locb & 7) <= 5) { + emitByte(0xb9); //mov ecx, 1 + emit(1); + emit(uint16_t(0x8b41)); //mov edx, r32 + emitByte(0xd0 + (instr.regb % RegistersCount)); + emit(0x450fd285); //test edx, edx; cmovne ecx,edx + emitByte(0xca); + } + else { + emitByte(0xb9); //mov ecx, imm32 + emit(instr.imm1 != 0 ? instr.imm1 : 1); + } + emit(0xf748d233); //xor edx,edx; div rcx + emitByte(0xf1); + gencr(instr); + } + + void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8b41)); //mov edx, r32 + emitByte(0xd0 + (instr.regb % RegistersCount)); + } + else { + emitByte(0xba); // mov edx, imm32 + emit(instr.imm1); + } + emit(0xc88b480b75fffa83); //cmp edx,-1; jne rel8 0x0b (to "mov ecx,1"); mov rcx,rax + emit(0x1274c9ff48c1d148); //rol rcx,1; dec rcx; je rel8 0x12 (skips past the idiv) + emit(0x0fd28500000001b9); //mov ecx,1; test edx,edx; first byte of cmovne + emit(0x489948c96348ca45); //rest of cmovne ecx,edx; movsxd rcx,ecx; cqo; REX.W prefix of idiv + emit(uint16_t(0xf9f7)); //idiv rcx + gencr(instr); + } + + void JitCompilerX86::h_AND_64(Instruction& instr, int i) { + genbr1(instr, 0x2349, 0x2548); + gencr(instr); + } + + void JitCompilerX86::h_AND_32(Instruction& instr, int i) { + genbr132(instr, 0x2341, 0x25); + gencr(instr); + } + + void JitCompilerX86::h_OR_64(Instruction& instr, int i) { + genbr1(instr, 0x0b49, 0x0d48); + gencr(instr); + } + + void JitCompilerX86::h_OR_32(Instruction& instr, int i) { + genbr132(instr, 0x0b41, 0x0d); + gencr(instr); + } + + void JitCompilerX86::h_XOR_64(Instruction& instr, int i) { + genbr1(instr, 0x3349, 0x3548); 
+ gencr(instr); + } + + void JitCompilerX86::h_XOR_32(Instruction& instr, int i) { + genbr132(instr, 0x3341, 0x35); + gencr(instr); + } + + void JitCompilerX86::h_SHL_64(Instruction& instr, int i) { + genbr0(instr, 0xe0d3, 0xe0c1); + gencr(instr); + } + + void JitCompilerX86::h_SHR_64(Instruction& instr, int i) { + genbr0(instr, 0xe8d3, 0xe8c1); + gencr(instr); + } + + void JitCompilerX86::h_SAR_64(Instruction& instr, int i) { + genbr0(instr, 0xf8d3, 0xf8c1); + gencr(instr); + } + + void JitCompilerX86::h_ROL_64(Instruction& instr, int i) { + genbr0(instr, 0xc0d3, 0xc0c1); + gencr(instr); + } + + void JitCompilerX86::h_ROR_64(Instruction& instr, int i) { + genbr0(instr, 0xc8d3, 0xc8c1); + gencr(instr); + } + + void JitCompilerX86::h_FPADD(Instruction& instr, int i) { + genbf(instr, 0x58); + gencf(instr); + } + + void JitCompilerX86::h_FPSUB(Instruction& instr, int i) { + genbf(instr, 0x5c); + gencf(instr); + } + + void JitCompilerX86::h_FPMUL(Instruction& instr, int i) { + emit(uint16_t(0x0d48)); //or rax,0x800 + emit(0x00000800); + genbf(instr, 0x59); + gencf(instr); + } + + void JitCompilerX86::h_FPDIV(Instruction& instr, int i) { + emit(uint16_t(0x0d48)); //or rax,0x800 + emit(0x00000800); + genbf(instr, 0x5e); + gencf(instr); + } + + void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) { + emit(uint16_t(0xb948)); //movabs rcx, imm64 + emit(0x7ffffffffffff800); + emit(0xc02a0f48f2c12348); //and rax,rcx; cvtsi2sd xmm0,rax + emit(0xc0510ff2); //sqrtsd xmm0,xmm0 + gencf(instr); + } + + void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { + emit(0x81480de0c1c88b48); //mov rcx,rax; shl eax,13; first two bytes of "and rcx,imm32" + emit(0x600025fffff800e1); //rest of and rcx,0xfffffffffffff800; start of and eax,0x6000 + emit(0x0dc12a0f48f20000); //rest of the 0x6000 immediate; cvtsi2sd xmm0,rcx; opcode of "or eax,imm32" + emit(0xf824448900009fc0); //immediate 0x9fc0 of the or; mov DWORD PTR [rsp-0x8],eax + emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8] + emitByte(0xf8); + gencf(instr); + } + + void JitCompilerX86::h_CALL(Instruction& instr, int i) { + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8141)); //cmp regb, imm32 + emitByte(0xf8 + (instr.regb % RegistersCount)); + emit(instr.imm1); + if 
((instr.locc & 7) <= 3) { + emit(uint16_t(0x1676)); //jbe rel8 0x16 (call taken: skip gencr and the short jmp) + } + else { + emit(uint16_t(0x0576)); //jbe rel8 0x05 (call taken: skip gencr and the short jmp) + } + gencr(instr); + emit(uint16_t(0x06eb)); //jmp to next + } + emitByte(0x50); //push rax + emitByte(0xe8); //call + i = wrapInstr(i + (instr.imm0 & 127) + 2); + if (i < instructionOffsets.size()) { + emit(instructionOffsets[i] - (codePos + 4)); + } + else { + callOffsets.push_back(CallOffset(codePos, i)); + codePos += 4; + } + } + + void JitCompilerX86::h_RET(Instruction& instr, int i) { + int crlen = 0; + int blen = 0; + if ((instr.locc & 7) <= 3) { + crlen = 17; + } + if ((instr.locb & 7) <= 5) { + blen = 9; + } + emit(0x74e53b48); //cmp rsp, rbp; je + emitByte(11 + blen + crlen); + if ((instr.locb & 7) <= 5) { + emit(uint16_t(0x8141)); //cmp regb, imm32 + emitByte(0xf8 + (instr.regb % RegistersCount)); + emit(instr.imm1); + emitByte(0x77); //ja (skip the return sequence) + emitByte(11 + crlen); + } + emitByte(0x48); + emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8] + gencr(instr); + emitByte(0xc2); //ret 8 + emit(uint16_t(0x0008)); + gencr(instr); + } + +#include "instructionWeights.hpp" +#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) + + InstructionGeneratorX86 JitCompilerX86::engine[256] = { + INST_HANDLE(ADD_64) + INST_HANDLE(ADD_32) + INST_HANDLE(SUB_64) + INST_HANDLE(SUB_32) + INST_HANDLE(MUL_64) + INST_HANDLE(MULH_64) + INST_HANDLE(MUL_32) + INST_HANDLE(IMUL_32) + INST_HANDLE(IMULH_64) + INST_HANDLE(DIV_64) + INST_HANDLE(IDIV_64) + INST_HANDLE(AND_64) + INST_HANDLE(AND_32) + INST_HANDLE(OR_64) + INST_HANDLE(OR_32) + INST_HANDLE(XOR_64) + INST_HANDLE(XOR_32) + INST_HANDLE(SHL_64) + INST_HANDLE(SHR_64) + INST_HANDLE(SAR_64) + INST_HANDLE(ROL_64) + INST_HANDLE(ROR_64) + INST_HANDLE(FPADD) + INST_HANDLE(FPSUB) + INST_HANDLE(FPMUL) + INST_HANDLE(FPDIV) + INST_HANDLE(FPSQRT) + INST_HANDLE(FPROUND) + INST_HANDLE(CALL) + INST_HANDLE(RET) + }; +} \ No newline at end of file diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp new file mode 100644 index 
0000000..c453ba1 --- /dev/null +++ b/src/JitCompilerX86.hpp @@ -0,0 +1,114 @@ +/* +Copyright (c) 2018 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#pragma once + +#include "common.hpp" +#include "Instruction.hpp" +#include +#include + +class Pcg32; + +namespace RandomX { + + class JitCompilerX86; + + typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); + + constexpr uint32_t CodeSize = 64 * 1024; + constexpr uint32_t CacheLineSize = 64; + + struct CallOffset { + CallOffset(int32_t p, int32_t i) : pos(p), index(i) {} + int32_t pos; + int32_t index; + }; + + class JitCompilerX86 { + public: + JitCompilerX86(); + void generateProgram(Pcg32&); + ProgramFunc getProgramFunc() { + return (ProgramFunc)code; + } + uint8_t* getCode() { + return code; + } + private: + static InstructionGeneratorX86 engine[256]; + uint8_t* code; + int32_t codePos; + std::vector instructionOffsets; + std::vector callOffsets; + + void gena(Instruction&); + void genbr0(Instruction&, uint16_t, uint16_t); + void genbr1(Instruction&, uint16_t, uint16_t); + void genbr132(Instruction&, uint16_t, uint8_t); + void genbf(Instruction&, uint8_t); + void gencr(Instruction&); + void gencf(Instruction&); + void generateCode(Instruction&, int); + void fixCallOffsets(); + + void emitByte(uint8_t val) { + code[codePos] = val; + codePos++; + } + + template + void emit(T val) { + *reinterpret_cast(code + codePos) = val; + 
codePos += sizeof(T); + } + + void h_ADD_64(Instruction&, int); + void h_ADD_32(Instruction&, int); + void h_SUB_64(Instruction&, int); + void h_SUB_32(Instruction&, int); + void h_MUL_64(Instruction&, int); + void h_MULH_64(Instruction&, int); + void h_MUL_32(Instruction&, int); + void h_IMUL_32(Instruction&, int); + void h_IMULH_64(Instruction&, int); + void h_DIV_64(Instruction&, int); + void h_IDIV_64(Instruction&, int); + void h_AND_64(Instruction&, int); + void h_AND_32(Instruction&, int); + void h_OR_64(Instruction&, int); + void h_OR_32(Instruction&, int); + void h_XOR_64(Instruction&, int); + void h_XOR_32(Instruction&, int); + void h_SHL_64(Instruction&, int); + void h_SHR_64(Instruction&, int); + void h_SAR_64(Instruction&, int); + void h_ROL_64(Instruction&, int); + void h_ROR_64(Instruction&, int); + void h_FPADD(Instruction&, int); + void h_FPSUB(Instruction&, int); + void h_FPMUL(Instruction&, int); + void h_FPDIV(Instruction&, int); + void h_FPSQRT(Instruction&, int); + void h_FPROUND(Instruction&, int); + void h_CALL(Instruction&, int); + void h_RET(Instruction&, int); + }; + +} \ No newline at end of file diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 3951a86..501c069 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -72,4 +72,8 @@ namespace RandomX { memcpy(scratchpad, mem.dataset + ScratchpadSize * index, ScratchpadSize); } } + + void VirtualMachine::getResult(void* out) { + + } } \ No newline at end of file diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 826ca2f..5c83fa5 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -27,10 +27,11 @@ namespace RandomX { public: VirtualMachine(bool softAes); virtual ~VirtualMachine() {} - void initializeDataset(const void* seed, bool light = false); + virtual void initializeDataset(const void* seed, bool light = false); void initializeScratchpad(uint32_t index); virtual void initializeProgram(const void* seed) = 0; virtual void 
execute() = 0; + void getResult(void*); const RegisterFile& getRegisterFile() const { return reg; } diff --git a/src/common.hpp b/src/common.hpp index 765a1fc..28c95cf 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -70,6 +70,10 @@ namespace RandomX { constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t); constexpr int RegistersCount = 8; + inline int wrapInstr(int i) { + return i % RandomX::ProgramLength; + } + struct LightClientMemory { uint8_t* cache; uint8_t* block; @@ -107,7 +111,9 @@ namespace RandomX { typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&); + typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*); + extern "C" { - void executeProgram(RegisterFile& registerFile, MemoryRegisters& memory, DatasetReadFunc readFunc, convertible_t* scratchpad); + void executeProgram(RegisterFile& registerFile, MemoryRegisters& memory, convertible_t* scratchpad); } } \ No newline at end of file diff --git a/src/executeProgram-linux.cpp b/src/executeProgram-linux.cpp deleted file mode 100644 index 91cbab9..0000000 --- a/src/executeProgram-linux.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include "common.hpp" -#include - -namespace RandomX { - extern "C" { - void executeProgram(RegisterFile& registerFile, MemoryRegisters& memory, DatasetReadFunc readFunc, convertible_t* scratchpad) { - throw std::runtime_error("not implemented"); - } - } -} \ No newline at end of file diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 476e6dc..a125409 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -54,8 +54,7 @@ executeProgram PROC ; | saved registers ; | ; v - ; [rbp+8] RegisterFile& registerFile - ; [rbp] DatasetReadFunc readFunc + ; [rbp] RegisterFile& registerFile ; | ; | ; | VM stack @@ -72,7 +71,7 @@ executeProgram PROC push r13 push r14 push r15 - sub rsp, 64 + sub rsp, 72 movdqu xmmword ptr [rsp+48], xmm6 movdqu xmmword ptr [rsp+32], xmm7 movdqu xmmword ptr [rsp+16], 
xmm8 @@ -81,8 +80,7 @@ executeProgram PROC ; function arguments push rcx ; RegisterFile& registerFile mov rbx, rdx ; MemoryRegisters& memory - push r8 ; DatasetReadFunc readFunc - mov rsi, r9 ; convertible_t& scratchpad + mov rsi, r8 ; convertible_t& scratchpad mov rbp, rsp ; beginning of VM stack mov rdi, 1048576 ; number of VM instructions to execute @@ -96,8 +94,8 @@ executeProgram PROC mov r13, qword ptr [rcx+40] mov r14, qword ptr [rcx+48] mov r15, qword ptr [rcx+56] - mov dword ptr [rsp - 8], 40896 - ldmxcsr dword ptr [rsp - 8] + mov dword ptr [rsp-8], 40896 + ldmxcsr dword ptr [rsp-8] cvtsi2sd xmm8, qword ptr [rcx+64] cvtsi2sd xmm9, qword ptr [rcx+72] cvtsi2sd xmm2, qword ptr [rcx+80] @@ -114,10 +112,9 @@ executeProgram PROC rx_finish: ; unroll the stack mov rsp, rbp - add rsp, 16 ; save VM register values - mov rcx, qword ptr [rbp+8] + pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 @@ -136,11 +133,11 @@ rx_finish: movd qword ptr [rcx+120], xmm7 ; load callee-saved registers - movdqu xmm9, xmmword ptr [rsp+0] + movdqu xmm9, xmmword ptr [rsp] movdqu xmm8, xmmword ptr [rsp+16] movdqu xmm7, xmmword ptr [rsp+32] movdqu xmm6, xmmword ptr [rsp+48] - add rsp, 64 + add rsp, 72 pop r15 pop r14 pop r13 diff --git a/src/main.cpp b/src/main.cpp index 0637176..f27407e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -109,13 +109,13 @@ int main(int argc, char** argv) { RandomX::VirtualMachine* vm; - if (compiled) { - vm = new RandomX::CompiledVirtualMachine(softAes); - } - else { - vm = new RandomX::InterpretedVirtualMachine(softAes); - } try { + if (compiled) { + vm = new RandomX::CompiledVirtualMachine(softAes); + } + else { + vm = new RandomX::InterpretedVirtualMachine(softAes); + } std::cout << "Initializing..." << std::endl; Stopwatch sw(true); vm->initializeDataset(seed, lightClient);