From 6e3136b37fd9771d6683994c8767018d55257a5c Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 6 Apr 2019 17:07:40 +0200 Subject: [PATCH] Fixed cache alignment Performance tuning --- src/Cache.hpp | 2 +- src/JitCompilerX86.cpp | 2 +- src/LightProgramGenerator.cpp | 49 ++++++++++++++++++----------- src/asm/program_sshash_load.inc | 16 +++++----- src/asm/program_sshash_prefetch.inc | 2 +- src/configuration.h | 2 +- 6 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/Cache.hpp b/src/Cache.hpp index 5656baf..bfc7ddf 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -34,7 +34,7 @@ namespace RandomX { return (uint8_t*)allocLargePagesMemory(size); } else { - void* ptr = _mm_malloc(size, sizeof(__m128i)); + void* ptr = _mm_malloc(size, CacheLineSize); if (ptr == nullptr) throw std::bad_alloc(); return (uint8_t*)ptr; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8c49326..d6e27f1 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -628,7 +628,7 @@ namespace RandomX { emitByte(0xc8 + instr.dst); } else { - if (NOP_TEST) { + if (false && NOP_TEST) { emit(NOP4); return; } diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 900e2ae..d5ebadf 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -216,7 +216,7 @@ namespace RandomX { const MacroOp MacroOp::Sub_ri = MacroOp("sub r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); const MacroOp MacroOp::Imul_rri = MacroOp("imul r,r,i", 7, 3, ExecutionPort::P1); - const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, 
ExecutionPort::P015); @@ -357,9 +357,11 @@ namespace RandomX { const char* getName() const { return name_; } - const DecoderBuffer* fetchNext(int prevType, Blake2Generator& gen) const { - if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) + const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const { + if (instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) return &decodeBuffer3310; //2-1-1 decode + if (mulCount < cycle) + return &decodeBuffer4444_mul; if (index_ == 0) { return &decodeBuffer4444; //IMUL_RCP end } @@ -381,15 +383,16 @@ namespace RandomX { static const DecoderBuffer decodeBuffer7333; static const DecoderBuffer decodeBuffer3337; static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer4444_mul; static const DecoderBuffer decodeBuffer3733; static const DecoderBuffer decodeBuffer3373; static const DecoderBuffer decodeBuffer133; static const DecoderBuffer* decodeBuffers[7]; const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { int select; - do { - select = gen.getByte() & 7; - } while (select == 7); + //do { + select = gen.getByte() & 3; + //} while (select == 7); return decodeBuffers[select]; } }; @@ -397,17 +400,16 @@ namespace RandomX { const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 0, buffer0); const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); const DecoderBuffer DecoderBuffer::decodeBuffer3337 = DecoderBuffer("3,3,3,7", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer4444_mul = DecoderBuffer("4,4,4,4-MUL", 3, buffer4); const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 5, buffer5); const DecoderBuffer DecoderBuffer::decodeBuffer3373 = DecoderBuffer("3,3,7,3", 6, buffer6); const 
DecoderBuffer DecoderBuffer::decodeBuffer133 = DecoderBuffer("13,3", 7, buffer7); const DecoderBuffer* DecoderBuffer::decodeBuffers[7] = { &DecoderBuffer::decodeBuffer3310, - &DecoderBuffer::decodeBuffer7333, &DecoderBuffer::decodeBuffer3337, - &DecoderBuffer::decodeBuffer4444, - &DecoderBuffer::decodeBuffer4444, &DecoderBuffer::decodeBuffer3733, &DecoderBuffer::decodeBuffer3373, }; @@ -417,8 +419,8 @@ namespace RandomX { const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; const LightInstructionInfo* slot_3C[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R, &LightInstructionInfo::IXOR_R }; - const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS, &LightInstructionInfo::IMUL_R }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::ISUB_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::ISUB_C }; + const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS }; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IXOR_C, &LightInstructionInfo::ISUB_C }; const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; @@ -448,27 +450,34 @@ namespace RandomX { instr.setImm32(imm32_); } - static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool complex = false) { + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { switch (slotSize) { case 3: if (isLast) { return create(slot_3L[gen.getByte() & 3], gen); } - else if (complex) { + else if 
(false && isFirst && fetchType == 0) { return create(slot_3C[gen.getByte() & 3], gen); } else { return create(slot_3[gen.getByte() & 1], gen); } case 4: - return create(slot_4[gen.getByte() & 3], gen); + if (fetchType == 3 && !isLast) { + return create(&LightInstructionInfo::IMUL_R, gen); + } + else { + return create(slot_4[gen.getByte() & 1], gen); + } case 7: if (false && isLast) { return create(slot_7L, gen); } - else { - return create(slot_7[gen.getByte() & 3], gen); + if (false && isFirst) { + return create(&LightInstructionInfo::IMUL_C, gen); + } else { + return create(slot_7[gen.getByte() & 1], gen); } case 10: return create(slot_10, gen); @@ -664,7 +673,11 @@ namespace RandomX { constexpr int V4_SRC_INDEX_BITS = 3; constexpr int V4_DST_INDEX_BITS = 3; constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; +#ifndef _DEBUG constexpr bool TRACE = false; +#else + constexpr bool TRACE = true; +#endif static int blakeCounter = 0; @@ -803,7 +816,7 @@ namespace RandomX { constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { - fetchLine = fetchLine->fetchNext(currentInstruction.getType(), gen); + fetchLine = fetchLine->fetchNext(currentInstruction.getType(), cycle, mulCount, gen); if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; mopIndex = 0; @@ -813,7 +826,7 @@ namespace RandomX { if (instrIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; - currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getSize() == mopIndex + 1, fetchLine->getIndex() == 0 && mopIndex == 0); + currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getIndex(), fetchLine->getSize() == mopIndex + 1, mopIndex == 0); instrIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } diff --git a/src/asm/program_sshash_load.inc b/src/asm/program_sshash_load.inc index 
a9ae9a2..5351356 100644 --- a/src/asm/program_sshash_load.inc +++ b/src/asm/program_sshash_load.inc @@ -1,8 +1,8 @@ - ;xor r8, qword ptr [rbx+0] - ;xor r9, qword ptr [rbx+8] - ;xor r10, qword ptr [rbx+16] - ;xor r11, qword ptr [rbx+24] - ;xor r12, qword ptr [rbx+32] - ;xor r13, qword ptr [rbx+40] - ;xor r14, qword ptr [rbx+48] - ;xor r15, qword ptr [rbx+56] \ No newline at end of file + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/src/asm/program_sshash_prefetch.inc b/src/asm/program_sshash_prefetch.inc index 78faba4..96ec35a 100644 --- a/src/asm/program_sshash_prefetch.inc +++ b/src/asm/program_sshash_prefetch.inc @@ -1,4 +1,4 @@ and rbx, 4194303 shl rbx, 6 add rbx, rdi - ; prefetchnta byte ptr [rbx] \ No newline at end of file + prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/src/configuration.h b/src/configuration.h index 72e44a4..6d9912d 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -37,7 +37,7 @@ along with RandomX. If not, see <http://www.gnu.org/licenses/>. //Number of random Cache accesses per Dataset block. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 -#define RANDOMX_LPROG_LATENCY 130 +#define RANDOMX_LPROG_LATENCY 170 #define RANDOMX_LPROG_ASIC_LATENCY 84 #define RANDOMX_LPROG_MIN_SIZE 225 #define RANDOMX_LPROG_MAX_SIZE 512