diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index aeabe15..1fbe825 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -351,7 +351,7 @@ namespace RandomX { //mem.mx &= CacheLineAlignMask; Cache& cache = mem.ds.cache; uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize); + initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); for (int i = 0; i < RegistersCount; ++i) r[i] ^= datasetLine[i]; std::swap(mem.mx, mem.ma); diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp index 3d895b8..fbba713 100644 --- a/src/LightClientAsyncWorker.cpp +++ b/src/LightClientAsyncWorker.cpp @@ -54,7 +54,7 @@ namespace RandomX { #endif uint32_t currentBlock = addr / CacheLineSize; if (currentBlock != startBlock || output != currentLine.data()) { - initBlock(cache, (uint8_t*)currentLine.data(), currentBlock); + initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8); } else { sync(); @@ -81,7 +81,7 @@ namespace RandomX { void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = 0; i < blockCount; ++i) { - initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i); + initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, RANDOMX_CACHE_ACCESSES / 8); } } @@ -101,7 +101,7 @@ namespace RandomX { std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; #endif //getBlocks(output, startBlock, blockCount); - initBlock(cache, (uint8_t*)output, startBlock); + initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8); hasWork = false; #ifdef TRACE std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl; diff --git a/src/dataset.cpp 
b/src/dataset.cpp index 098a23c..0a96d86 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -40,34 +40,65 @@ along with RandomX. If not, see. namespace RandomX { - void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber) { +#if !defined(_M_X64) + static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) { /* Returns the cache line addressed by currentIndex (masked when RANDOMX_ARGON_GROWTH == 0, otherwise reduced modulo cache.size / CacheLineSize), prefetches it, and overwrites nextIndex with squareHash of the sum of both indices. */ + uint8_t* mixBlock; + if (RANDOMX_ARGON_GROWTH == 0) { + constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); + mixBlock = cache.memory + (currentIndex & mask) * CacheLineSize; + } + else { + const uint32_t modulus = cache.size / CacheLineSize; + mixBlock = cache.memory + (currentIndex % modulus) * CacheLineSize; + } + PREFETCHNTA(mixBlock); + nextIndex = squareHash(currentIndex + nextIndex); + return mixBlock; + } + + static FORCE_INLINE void mixCache(uint8_t* mixBlock, uint64_t& c0, uint64_t& c1, uint64_t& c2, uint64_t& c3, uint64_t& c4, uint64_t& c5, uint64_t& c6, uint64_t& c7) { /* XORs the eight values that load64 reads at byte offsets 0, 8, ... 56 of mixBlock into c0..c7. */ + c0 ^= load64(mixBlock + 0); + c1 ^= load64(mixBlock + 8); + c2 ^= load64(mixBlock + 16); + c3 ^= load64(mixBlock + 24); + c4 ^= load64(mixBlock + 32); + c5 ^= load64(mixBlock + 40); + c6 ^= load64(mixBlock + 48); + c7 ^= load64(mixBlock + 56); + } + + void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations) { /* Portable build (compiled only when _M_X64 is not defined): seeds c0 with blockNumber and c1..c7 with 0, runs `iterations` passes of eight select/mix rounds over the cache, then stores the registers to out. */ uint64_t c0, c1, c2, c3, c4, c5, c6, c7; - c0 = 4ULL * blockNumber; + c0 = blockNumber; c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0; - constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; + uint8_t* mixBlock; - for (auto i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { - const uint8_t* mixBlock; - if (RANDOMX_ARGON_GROWTH == 0) { - constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); - mixBlock = cache.memory + (c0 & mask) * CacheLineSize; - } - else { - const uint32_t modulus = cache.size / CacheLineSize; - mixBlock = cache.memory + (c0 % modulus) * CacheLineSize; - } - PREFETCHNTA(mixBlock); - c0 = 
squareHash(c0); - c0 ^= load64(mixBlock + 0); - c1 ^= load64(mixBlock + 8); - c2 ^= load64(mixBlock + 16); - c3 ^= load64(mixBlock + 24); - c4 ^= load64(mixBlock + 32); - c5 ^= load64(mixBlock + 40); - c6 ^= load64(mixBlock + 48); - c7 ^= load64(mixBlock + 56); + for (unsigned i = 0; i < iterations; ++i) { /* Honour the caller-supplied pass count, as the MASM initBlock does with its r9 argument; every call site in this patch passes RANDOMX_CACHE_ACCESSES / 8, so existing results are unchanged. */ + mixBlock = selectMixBlock(cache, c0, c1); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c1, c2); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c2, c3); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c3, c4); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c4, c5); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c5, c6); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c6, c7); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); + + mixBlock = selectMixBlock(cache, c7, c0); + mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); } store64(out + 0, c0); @@ -79,6 +110,7 @@ namespace RandomX { store64(out + 48, c6); store64(out + 56, c7); } +#endif void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma); @@ -95,7 +127,7 @@ namespace RandomX { memory.mx &= CacheLineAlignMask; //align to cache line Cache& cache = memory.ds.cache; uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize); + initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); for (int i = 0; i < RegistersCount; ++i) reg[i] ^= datasetLine[i]; std::swap(memory.mx, memory.ma); @@ -128,7 +160,7 @@ namespace RandomX { void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) { for (uint64_t i = startBlock; i < 
startBlock + blockCount; ++i) { - initBlock(cache, ds.memory + i * CacheLineSize, i); + initBlock(cache, ds.memory + i * CacheLineSize, i, RANDOMX_CACHE_ACCESSES / 8); } } diff --git a/src/dataset.hpp b/src/dataset.hpp index 8ad47f7..de6ac0b 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -25,7 +25,10 @@ along with RandomX. If not, see. namespace RandomX { - void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber); +#if defined(_M_X64) + extern "C" +#endif + void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations); void datasetAlloc(dataset_t& ds, bool largePages); diff --git a/src/squareHash.asm b/src/squareHash.asm index f7442ec..8917428 100644 --- a/src/squareHash.asm +++ b/src/squareHash.asm @@ -1,6 +1,7 @@ IFDEF RAX PUBLIC squareHash +PUBLIC initBlock .code @@ -8,6 +9,192 @@ squareHash PROC include asm/squareHash.inc squareHash ENDP +; rcx = cache +; rdx = out +; r8 = blockNumber +; r9 = iterations (32-bit unsigned: only r9d is defined on entry) +initBlock PROC + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + mov esi, r9d ; zero-extend the 32-bit count: the upper half of r9 is unspecified for a 32-bit argument + mov rdi, qword ptr [rcx] ; first qword of Cache, presumably cache.memory (confirm struct layout) + mov rbp, rdx + prefetcht0 byte ptr [rbp] + ; r8 = blockNumber + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + test esi, esi + jz initBlock_done ; iterations == 0: skip the rounds like the C++ loop does, instead of wrapping the counter +initBlock_loop: + ; c0 + mov rbx, r8 + and rbx, 4194303 ; NOTE(review): hard-coded mask, matches only the RANDOMX_ARGON_GROWTH == 0 branch of the C++ path; confirm it equals RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize minus 1 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r8+r9] + call squareHash ; NOTE(review): assumes squareHash returns in rax and leaves rbx, rsi, rdi, rbp and r8..r15 intact; confirm against asm/squareHash.inc + mov r9, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ; c1 + mov rbx, r9 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r9+r10] + call squareHash + mov r10, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword 
ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ; c2 + mov rbx, r10 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r10+r11] + call squareHash + mov r11, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ; c3 + mov rbx, r11 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r11+r12] + call squareHash + mov r12, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ; c4 + mov rbx, r12 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r12+r13] + call squareHash + mov r13, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ; c5 + mov rbx, r13 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r13+r14] + call squareHash + mov r14, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ; c6 + mov rbx, r14 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r14+r15] + call squareHash + mov r15, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword 
ptr [rbx+56] + ; c7 + mov rbx, r15 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r15+r8] + call squareHash + mov r8, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + sub rsi, 1 + jnz initBlock_loop +initBlock_done: + mov qword ptr [rbp+0], r8 + mov qword ptr [rbp+8], r9 + mov qword ptr [rbp+16], r10 + mov qword ptr [rbp+24], r11 + mov qword ptr [rbp+32], r12 + mov qword ptr [rbp+40], r13 + mov qword ptr [rbp+48], r14 + mov qword ptr [rbp+56], r15 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +initBlock ENDP + ENDIF END \ No newline at end of file