diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
index aeabe15..1fbe825 100644
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -351,7 +351,7 @@ namespace RandomX {
//mem.mx &= CacheLineAlignMask;
Cache& cache = mem.ds.cache;
uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
- initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize);
+ initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
for (int i = 0; i < RegistersCount; ++i)
r[i] ^= datasetLine[i];
std::swap(mem.mx, mem.ma);
diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp
index 3d895b8..fbba713 100644
--- a/src/LightClientAsyncWorker.cpp
+++ b/src/LightClientAsyncWorker.cpp
@@ -54,7 +54,7 @@ namespace RandomX {
#endif
uint32_t currentBlock = addr / CacheLineSize;
if (currentBlock != startBlock || output != currentLine.data()) {
- initBlock(cache, (uint8_t*)currentLine.data(), currentBlock);
+ initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8);
}
else {
sync();
@@ -81,7 +81,7 @@ namespace RandomX {
void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
for (uint32_t i = 0; i < blockCount; ++i) {
- initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i);
+ initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, RANDOMX_CACHE_ACCESSES / 8); // last argument is the round count; one initBlock round makes 8 cache accesses, hence the division by 8
}
}
@@ -101,7 +101,7 @@ namespace RandomX {
std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl;
#endif
//getBlocks(output, startBlock, blockCount);
- initBlock(cache, (uint8_t*)output, startBlock);
+ initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8);
hasWork = false;
#ifdef TRACE
std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl;
diff --git a/src/dataset.cpp b/src/dataset.cpp
index 098a23c..0a96d86 100644
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@@ -40,34 +40,65 @@ along with RandomX. If not, see.
namespace RandomX {
- void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber) {
+#if !defined(_M_X64)
+ // Chooses the cache line for the next mixing step and advances the index chain.
+ //   cache        - cache.memory is addressed in CacheLineSize-sized lines
+ //   currentIndex - selects the line; it is only read, so it is taken by value
+ //   nextIndex    - in/out: overwritten with squareHash(currentIndex + nextIndex)
+ // Returns the address of the selected line; a prefetch for it has already been issued.
+ static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t currentIndex, uint64_t& nextIndex) {
+ uint64_t lineIndex;
+ if (RANDOMX_ARGON_GROWTH == 0) {
+ // NOTE(review): masking equals a modulus only if the line count is a power of two - confirm.
+ constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
+ lineIndex = currentIndex & mask;
+ }
+ else {
+ // Line count is derived from cache.size at run time.
+ const uint32_t modulus = cache.size / CacheLineSize;
+ lineIndex = currentIndex % modulus;
+ }
+ uint8_t* const mixBlock = cache.memory + lineIndex * CacheLineSize;
+ // The prefetch is issued before the hash; presumably so the fetch overlaps with the computation.
+ PREFETCHNTA(mixBlock);
+ nextIndex = squareHash(currentIndex + nextIndex);
+ return mixBlock;
+ }
+
+ // XORs the eight 64-bit words of one cache line into the state words c0..c7.
+ //   mixBlock - line to read (load64 at offsets 0, 8, ..., 56); never written, hence const
+ //   c0..c7   - in/out state words, each XORed with the word at the matching offset
+ static FORCE_INLINE void mixCache(const uint8_t* mixBlock, uint64_t& c0, uint64_t& c1, uint64_t& c2, uint64_t& c3, uint64_t& c4, uint64_t& c5, uint64_t& c6, uint64_t& c7) {
+ c0 ^= load64(mixBlock + 0);
+ c1 ^= load64(mixBlock + 8);
+ c2 ^= load64(mixBlock + 16);
+ c3 ^= load64(mixBlock + 24);
+ c4 ^= load64(mixBlock + 32);
+ c5 ^= load64(mixBlock + 40);
+ c6 ^= load64(mixBlock + 48);
+ c7 ^= load64(mixBlock + 56);
+ }
+
+ // Computes one dataset block from the cache (portable C++ path; this definition is compiled
+ // only when _M_X64 is not defined - squareHash.asm provides initBlock otherwise).
+ //   cache       - cache whose lines are mixed into the block
+ //   out         - destination; receives the state words c0..c7 via store64 (offsets 0..56)
+ //   blockNumber - initial value of c0; c1..c7 start at zero
+ //   iterations  - number of rounds; every round performs 8 cache accesses, one per state word
+ void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations) {
uint64_t c0, c1, c2, c3, c4, c5, c6, c7;
- c0 = 4ULL * blockNumber;
+ c0 = blockNumber;
c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0;
- constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask;
+ uint8_t* mixBlock;
- for (auto i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
- const uint8_t* mixBlock;
- if (RANDOMX_ARGON_GROWTH == 0) {
- constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
- mixBlock = cache.memory + (c0 & mask) * CacheLineSize;
- }
- else {
- const uint32_t modulus = cache.size / CacheLineSize;
- mixBlock = cache.memory + (c0 % modulus) * CacheLineSize;
- }
- PREFETCHNTA(mixBlock);
- c0 = squareHash(c0);
- c0 ^= load64(mixBlock + 0);
- c1 ^= load64(mixBlock + 8);
- c2 ^= load64(mixBlock + 16);
- c3 ^= load64(mixBlock + 24);
- c4 ^= load64(mixBlock + 32);
- c5 ^= load64(mixBlock + 40);
- c6 ^= load64(mixBlock + 48);
- c7 ^= load64(mixBlock + 56);
+ // Loop on the caller-supplied round count instead of a hard-coded RANDOMX_CACHE_ACCESSES / 8:
+ // the assembly implementation of initBlock loops `iterations` times, and both must agree.
+ for (unsigned i = 0; i < iterations; ++i) {
+ mixBlock = selectMixBlock(cache, c0, c1);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c1, c2);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c2, c3);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c3, c4);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c4, c5);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c5, c6);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c6, c7);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+ mixBlock = selectMixBlock(cache, c7, c0);
+ mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
}
store64(out + 0, c0);
@@ -79,6 +110,7 @@ namespace RandomX {
store64(out + 48, c6);
store64(out + 56, c7);
}
+#endif
void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma);
@@ -95,7 +127,7 @@ namespace RandomX {
memory.mx &= CacheLineAlignMask; //align to cache line
Cache& cache = memory.ds.cache;
uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
- initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize);
+ initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
for (int i = 0; i < RegistersCount; ++i)
reg[i] ^= datasetLine[i];
std::swap(memory.mx, memory.ma);
@@ -128,7 +160,7 @@ namespace RandomX {
void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) {
for (uint64_t i = startBlock; i < startBlock + blockCount; ++i) {
- initBlock(cache, ds.memory + i * CacheLineSize, i);
+ initBlock(cache, ds.memory + i * CacheLineSize, i, RANDOMX_CACHE_ACCESSES / 8); // last argument is the round count; one initBlock round makes 8 cache accesses, hence the division by 8
}
}
diff --git a/src/dataset.hpp b/src/dataset.hpp
index 8ad47f7..de6ac0b 100644
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@@ -25,7 +25,10 @@ along with RandomX. If not, see.
namespace RandomX {
- void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber);
+#if defined(_M_X64) // x64 MSVC build: initBlock is implemented in squareHash.asm, which exports the undecorated name
+ extern "C" // C linkage so the declaration binds to the assembly symbol `initBlock`
+#endif
+ void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations); // iterations = number of rounds; each round makes 8 cache accesses
void datasetAlloc(dataset_t& ds, bool largePages);
diff --git a/src/squareHash.asm b/src/squareHash.asm
index f7442ec..8917428 100644
--- a/src/squareHash.asm
+++ b/src/squareHash.asm
@@ -1,6 +1,7 @@
IFDEF RAX
PUBLIC squareHash
+PUBLIC initBlock
.code
@@ -8,6 +9,189 @@ squareHash PROC
include asm/squareHash.inc
squareHash ENDP
+; initBlock - MASM x64 implementation of RandomX::initBlock (declared extern "C" when _M_X64 is defined).
+; Arguments as received by this PROC:
+;   rcx = cache       (the first qword at [rcx] is loaded and used as the base address of the cache lines;
+;                      NOTE(review): assumes the memory pointer is the first member of Cache - confirm)
+;   rdx = out         (destination; eight qwords are written at offsets 0..56 after the loop)
+;   r8  = blockNumber (initial value of c0)
+;   r9d = iterations  (number of rounds, 8 cache accesses each; a 32-bit `unsigned` in dataset.hpp)
+; Register use: r8..r15 = c0..c7, rbx = current mix line, rdi = cache base, rbp = out,
+; rsi = remaining rounds.
+; NOTE(review): squareHash is called without shadow space or 16-byte stack re-alignment, and r8-r11
+; are expected to survive the call; this relies on the body in asm/squareHash.inc - confirm.
+;
+; Mask applied to an index to select a cache line (2^22 - 1).
+; NOTE(review): the C++ path derives its mask from RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1
+; and uses a modulus when RANDOMX_ARGON_GROWTH != 0; neither is reflected here - confirm the configuration.
+INITBLOCK_LINE_MASK EQU 4194303
+initBlock PROC
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ mov esi, r9d ; iterations is 32-bit: the upper half of r9 is not guaranteed to be zero, so zero-extend explicitly
+ mov rdi, qword ptr [rcx]
+ mov rbp, rdx
+ prefetcht0 byte ptr [rbp]
+ ; r8 = blockNumber (c0); c1..c7 start at zero
+ xor r9, r9
+ xor r10, r10
+ xor r11, r11
+ xor r12, r12
+ xor r13, r13
+ xor r14, r14
+ xor r15, r15
+ ; iterations == 0: skip the loop (the counter would otherwise wrap around) and store the initial state
+ test rsi, rsi
+ jz initBlock_store
+initBlock_loop:
+ ; c0: line selected by c0, then c1 = squareHash(c0 + c1), then XOR the line into c0..c7
+ mov rbx, r8
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6 ; line index * 64 (mirrors `* CacheLineSize` in the C++ path; assumes CacheLineSize == 64)
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r8+r9]
+ call squareHash
+ mov r9, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c1
+ mov rbx, r9
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r9+r10]
+ call squareHash
+ mov r10, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c2
+ mov rbx, r10
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r10+r11]
+ call squareHash
+ mov r11, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c3
+ mov rbx, r11
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r11+r12]
+ call squareHash
+ mov r12, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c4
+ mov rbx, r12
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r12+r13]
+ call squareHash
+ mov r13, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c5
+ mov rbx, r13
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r13+r14]
+ call squareHash
+ mov r14, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c6
+ mov rbx, r14
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r14+r15]
+ call squareHash
+ mov r15, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ ; c7 (wraps around: the hash result replaces c0)
+ mov rbx, r15
+ and rbx, INITBLOCK_LINE_MASK
+ shl rbx, 6
+ add rbx, rdi
+ prefetchnta byte ptr [rbx]
+ lea rcx, [r15+r8]
+ call squareHash
+ mov r8, rax
+ xor r8, qword ptr [rbx+0]
+ xor r9, qword ptr [rbx+8]
+ xor r10, qword ptr [rbx+16]
+ xor r11, qword ptr [rbx+24]
+ xor r12, qword ptr [rbx+32]
+ xor r13, qword ptr [rbx+40]
+ xor r14, qword ptr [rbx+48]
+ xor r15, qword ptr [rbx+56]
+ sub rsi, 1
+ jnz initBlock_loop
+initBlock_store:
+ ; write c0..c7 to the output block
+ mov qword ptr [rbp+0], r8
+ mov qword ptr [rbp+8], r9
+ mov qword ptr [rbp+16], r10
+ mov qword ptr [rbp+24], r11
+ mov qword ptr [rbp+32], r12
+ mov qword ptr [rbp+40], r13
+ mov qword ptr [rbp+48], r14
+ mov qword ptr [rbp+56], r15
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
+initBlock ENDP
+
ENDIF
END
\ No newline at end of file