From bd0dba88a8f165e59633a2308845751c13a2c30e Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 20 Jan 2019 00:44:01 +0100 Subject: [PATCH] 4 scratchpad segments --- src/CompiledVirtualMachine.cpp | 7 +++- src/CompiledVirtualMachine.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 2 +- src/InterpretedVirtualMachine.hpp | 2 +- src/JitCompilerX86.cpp | 28 +++++++-------- src/VirtualMachine.cpp | 9 +++-- src/VirtualMachine.hpp | 9 +++-- src/asm/program_epilogue_store.inc | 16 ++++----- src/asm/program_prologue_load.inc | 56 ++++++------------------------ src/main.cpp | 26 +++++++++++--- 10 files changed, 75 insertions(+), 82 deletions(-) diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 28a3cca..5e87b50 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -33,7 +33,7 @@ namespace RandomX { mem.ds = ds; } - void CompiledVirtualMachine::initializeScratchpad(uint32_t index) { + void CompiledVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); } @@ -42,6 +42,11 @@ namespace RandomX { for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)&reg) + i) = gen(); } + FPINIT(); + for (int i = 0; i < RegistersCount; ++i) { + reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; + reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; + } compiler.generateProgram(gen); mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.mx = *(((uint32_t*)seed) + 5); diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index 98b0b78..f969732 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -39,7 +39,7 @@ namespace RandomX { } CompiledVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint32_t index) override; + void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; void initializeProgram(const void* seed) override; 
virtual void execute() override; void* getProgram() { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 54d2279..d7e4fc4 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -69,7 +69,7 @@ namespace RandomX { } } - void InterpretedVirtualMachine::initializeScratchpad(uint32_t index) { + void InterpretedVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; if (asyncWorker) { ILightClientAsyncWorker* worker = mem.ds.asyncWorker; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 7745cad..fba081a 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -42,7 +42,7 @@ namespace RandomX { InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} ~InterpretedVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint32_t index) override; + void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; void initializeProgram(const void* seed) override; void execute() override; const Program& getProgam() { diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 070d13a..ee91fc3 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -182,17 +182,17 @@ namespace RandomX { emitByte(0xe8); //xor rbp, rax } emitByte(0x25); //and eax, - if (instr.loca & 15) { + //if (instr.loca & 15) { if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad } else { emit(ScratchpadL2 - 1); //first 256 KiB of scratchpad } - } + /*} else { emit(ScratchpadL3 - 1); //whole scratchpad - } + }*/ } void JitCompilerX86::genar(Instruction& instr) { @@ -271,7 +271,7 @@ namespace RandomX { } void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { - if (instr.locc & 16) { //write to register + if (instr.locc & 8) { //write to register 
emit(uint16_t(0x8b4c)); //mov if (rax) { emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax @@ -281,17 +281,17 @@ namespace RandomX { } } else { - if (instr.locc & 15) { - if (instr.locc & 3) { + //if (instr.locc & 7) { + if (instr.locc & 1) { scratchpadStoreR(instr, ScratchpadL1, rax); } else { scratchpadStoreR(instr, ScratchpadL2, rax); } - } + /*} else { scratchpadStoreR(instr, ScratchpadL3, rax); - } + }*/ } } @@ -319,18 +319,18 @@ namespace RandomX { } emit(uint16_t(0x280f)); //movaps emitByte(0xc0 + 8 * regc); // regc, xmm0 - if (instr.locc & 16) { //write to scratchpad - if (instr.locc & 15) { - if (instr.locc & 3) { //C.LOC.W + if (instr.locc & 8) { //write to scratchpad + //if (instr.locc & 7) { + if (instr.locc & 1) { //C.LOC.W scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad } else { scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //first 256 KiB of scratchpad } - } - else { + //} + /*else { scratchpadStoreF(instr, regc, ScratchpadL3, (instr.locc & 128)); //whole scratchpad - } + }*/ } } diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 0cdc007..01de3d9 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -39,11 +39,16 @@ namespace RandomX { mem.ds.dataset = nullptr; } - void VirtualMachine::getResult(void* out) { + void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* out) { constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 8; alignas(16) uint64_t smallState[smallStateLength]; memcpy(smallState, &reg, sizeof(RegisterFile)); - hashAes1Rx4(scratchpad, ScratchpadSize, smallState + 24); + if (scratchpadSize > 0) { + hashAes1Rx4(scratchpad, scratchpadSize, smallState + 24); + } + else { + memset(smallState + 24, 0, 64); + } blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0); } } \ No newline at end of file diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 
78f7cf6..fe48e13 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -28,10 +28,13 @@ namespace RandomX { VirtualMachine(); virtual ~VirtualMachine() {} virtual void setDataset(dataset_t ds) = 0; - virtual void initializeScratchpad(uint32_t index) = 0; + virtual void initializeScratchpad(uint8_t* scratchpad, int32_t index) = 0; + void setScratchpad(void* ptr) { + scratchpad = (convertible_t*)ptr; + } virtual void initializeProgram(const void* seed) = 0; virtual void execute() = 0; - void getResult(void*); + void getResult(void*, size_t, void*); const RegisterFile& getRegisterFile() { return reg; } @@ -39,6 +42,6 @@ namespace RandomX { DatasetReadFunc readDataset; alignas(16) RegisterFile reg; MemoryRegisters mem; - alignas(64) convertible_t scratchpad[ScratchpadLength]; + convertible_t* scratchpad; }; } \ No newline at end of file diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index 90b26ce..95a4752 100644 --- a/src/asm/program_epilogue_store.inc +++ b/src/asm/program_epilogue_store.inc @@ -12,12 +12,12 @@ mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - movdqa xmmword ptr [rcx+64], xmm8 - movdqa xmmword ptr [rcx+80], xmm9 - movdqa xmmword ptr [rcx+96], xmm2 - movdqa xmmword ptr [rcx+112], xmm3 + movapd xmmword ptr [rcx+64], xmm8 + movapd xmmword ptr [rcx+80], xmm9 + movapd xmmword ptr [rcx+96], xmm2 + movapd xmmword ptr [rcx+112], xmm3 lea rcx, [rcx+64] - movdqa xmmword ptr [rcx+64], xmm4 - movdqa xmmword ptr [rcx+80], xmm5 - movdqa xmmword ptr [rcx+96], xmm6 - movdqa xmmword ptr [rcx+112], xmm7 \ No newline at end of file + movapd xmmword ptr [rcx+64], xmm4 + movapd xmmword ptr [rcx+80], xmm5 + movapd xmmword ptr [rcx+96], xmm6 + movapd xmmword ptr [rcx+112], xmm7 \ No newline at end of file diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index ef4f96e..9ceeed6 100644 --- a/src/asm/program_prologue_load.inc +++ 
b/src/asm/program_prologue_load.inc @@ -1,14 +1,10 @@ mov rdi, rsp ;# beginning of VM stack - mov ebx, 1048577 ;# number of VM instructions to execute + 1 + mov ebx, 262145 ;# number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff - ;# reset rounding mode - mov dword ptr [rsp-8], 40896 - ldmxcsr dword ptr [rsp-8] - ;# load integer registers mov r8, qword ptr [rcx+0] mov r9, qword ptr [rcx+8] @@ -19,45 +15,13 @@ mov r14, qword ptr [rcx+48] mov r15, qword ptr [rcx+56] - ;# initialize floating point registers - xorps xmm8, xmm8 - cvtsi2sd xmm8, qword ptr [rcx+72] - pslldq xmm8, 8 - cvtsi2sd xmm8, qword ptr [rcx+64] - - xorps xmm9, xmm9 - cvtsi2sd xmm9, qword ptr [rcx+88] - pslldq xmm9, 8 - cvtsi2sd xmm9, qword ptr [rcx+80] - - xorps xmm2, xmm2 - cvtsi2sd xmm2, qword ptr [rcx+104] - pslldq xmm2, 8 - cvtsi2sd xmm2, qword ptr [rcx+96] - - xorps xmm3, xmm3 - cvtsi2sd xmm3, qword ptr [rcx+120] - pslldq xmm3, 8 - cvtsi2sd xmm3, qword ptr [rcx+112] - + ;# load floating point registers + movapd xmm8, xmmword ptr [rcx+64] + movapd xmm9, xmmword ptr [rcx+80] + movapd xmm2, xmmword ptr [rcx+96] + movapd xmm3, xmmword ptr [rcx+112] lea rcx, [rcx+64] - - xorps xmm4, xmm4 - cvtsi2sd xmm4, qword ptr [rcx+72] - pslldq xmm4, 8 - cvtsi2sd xmm4, qword ptr [rcx+64] - - xorps xmm5, xmm5 - cvtsi2sd xmm5, qword ptr [rcx+88] - pslldq xmm5, 8 - cvtsi2sd xmm5, qword ptr [rcx+80] - - xorps xmm6, xmm6 - cvtsi2sd xmm6, qword ptr [rcx+104] - pslldq xmm6, 8 - cvtsi2sd xmm6, qword ptr [rcx+96] - - xorps xmm7, xmm7 - cvtsi2sd xmm7, qword ptr [rcx+120] - pslldq xmm7, 8 - cvtsi2sd xmm7, qword ptr [rcx+112] \ No newline at end of file + movapd xmm4, xmmword ptr [rcx+64] + movapd xmm5, xmmword ptr [rcx+80] + movapd xmm6, xmmword ptr [rcx+96] + movapd xmm7, xmmword ptr [rcx+112] diff --git a/src/main.cpp b/src/main.cpp index 5edb0df..84c76c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -130,7 +130,7 
@@ void generateAsm(int nonce) { asmX86.printCode(std::cout); } -void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread) { +void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) { uint64_t hash[4]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, @@ -146,11 +146,20 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); - vm->initializeScratchpad(spIndex); + vm->initializeScratchpad(scratchpad, spIndex); vm->initializeProgram(hash); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); + vm->setScratchpad(scratchpad + 3 * RandomX::ScratchpadSize / 4); vm->execute(); - vm->getResult(hash); + vm->setScratchpad(scratchpad + 2 * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->getResult(nullptr, 0, hash); + vm->initializeProgram(hash); + vm->setScratchpad(scratchpad + 1 * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->setScratchpad(scratchpad + 0 * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; @@ -274,18 +283,25 @@ int main(int argc, char** argv) { vm->setDataset(dataset); vms.push_back(vm); } + uint8_t* scratchpadMem; + if (largePages) { + scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2); + } + else { + scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize); + } std::cout << "Running benchmark (" << programCount << " programs) ..." 
<< std::endl; sw.restart(); if (threadCount > 1) { for (int i = 0; i < vms.size(); ++i) { - threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i)); + threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i, scratchpadMem + RandomX::ScratchpadSize * i)); } for (int i = 0; i < threads.size(); ++i) { threads[i].join(); } } else { - mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0); + mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0, scratchpadMem); if (compiled) std::cout << "Average program size: " << ((RandomX::CompiledVirtualMachine*)vms[0])->getTotalSize() / programCount << std::endl; }