initBlock: cycle columns, asm implementation

5 years ago · edde7672e0
parent 55a22febbd
commit edde7672e0
5 changed files with 249 additions and 30 deletions
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@ -351,7 +351,7 @@ namespace RandomX {
 				//mem.mx &= CacheLineAlignMask;
 				Cache& cache = mem.ds.cache;
 				uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
-				initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize);
+				initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
 				for (int i = 0; i < RegistersCount; ++i)
 					r[i] ^= datasetLine[i];
 				std::swap(mem.mx, mem.ma);
--- a/src/LightClientAsyncWorker.cpp
+++ b/src/LightClientAsyncWorker.cpp
@ -54,7 +54,7 @@ namespace RandomX {
 #endif
 		uint32_t currentBlock = addr / CacheLineSize;
 		if (currentBlock != startBlock || output != currentLine.data()) {
-			initBlock(cache, (uint8_t*)currentLine.data(), currentBlock);
+			initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8);
 		}
 		else {
 			sync();
@ -81,7 +81,7 @@ namespace RandomX {

 	void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
 		for (uint32_t i = 0; i < blockCount; ++i) {
-			initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i);
+			initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, RANDOMX_CACHE_ACCESSES / 8);
 		}
 	}

@ -101,7 +101,7 @@ namespace RandomX {
 			std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl;
 #endif
 			//getBlocks(output, startBlock, blockCount);
-			initBlock(cache, (uint8_t*)output, startBlock);
+			initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8);
 			hasWork = false;
 #ifdef TRACE
 			std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl;
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@ -40,34 +40,65 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 namespace RandomX {

-	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber) {
+#if !defined(_M_X64)
+	// Resolves the cache line addressed by currentIndex, issues a non-temporal
+	// prefetch for it and replaces nextIndex with squareHash(currentIndex + nextIndex).
+	// Returns the address of the resolved line.
+	static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) {
+		uint64_t lineIndex;
+		if (RANDOMX_ARGON_GROWTH != 0) {
+			// non-zero growth: wrap the index with the runtime line count
+			const uint32_t lineCount = cache.size / CacheLineSize;
+			lineIndex = currentIndex % lineCount;
+		}
+		else {
+			// zero growth: mask with the compile-time line count minus one
+			constexpr uint32_t lineMask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
+			lineIndex = currentIndex & lineMask;
+		}
+		uint8_t* const selected = cache.memory + lineIndex * CacheLineSize;
+		PREFETCHNTA(selected);
+		// the prefetch is issued before the hash is computed, as in the original order
+		nextIndex = squareHash(currentIndex + nextIndex);
+		return selected;
+	}
+
+	// XORs eight consecutive 64-bit words read from mixBlock (via load64) into c0..c7,
+	// word k going into lane ck.
+	static FORCE_INLINE void mixCache(uint8_t* mixBlock, uint64_t& c0, uint64_t& c1, uint64_t& c2, uint64_t& c3, uint64_t& c4, uint64_t& c5, uint64_t& c6, uint64_t& c7) {
+		constexpr int laneBytes = sizeof(uint64_t); // byte stride between lanes
+		c0 ^= load64(mixBlock + 0 * laneBytes);
+		c1 ^= load64(mixBlock + 1 * laneBytes);
+		c2 ^= load64(mixBlock + 2 * laneBytes);
+		c3 ^= load64(mixBlock + 3 * laneBytes);
+		c4 ^= load64(mixBlock + 4 * laneBytes);
+		c5 ^= load64(mixBlock + 5 * laneBytes);
+		c6 ^= load64(mixBlock + 6 * laneBytes);
+		c7 ^= load64(mixBlock + 7 * laneBytes);
+	}
+
+	/**
+	 * Builds one dataset block from the cache and stores it to `out`.
+	 *
+	 * Lane c0 starts at `blockNumber`, c1..c7 start at zero. Every iteration runs
+	 * eight rounds; round k selects a cache line from lane ck, replaces the next
+	 * lane (wrapping from c7 back to c0) via selectMixBlock and then XORs the
+	 * selected line into all eight lanes via mixCache.
+	 *
+	 * @param cache       cache the mix blocks are read from
+	 * @param out         destination; the lanes are written with store64 at 8-byte offsets
+	 * @param blockNumber initial value of lane c0
+	 * @param iterations  number of 8-round passes (callers pass RANDOMX_CACHE_ACCESSES / 8)
+	 */
+	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations) {
 		uint64_t c0, c1, c2, c3, c4, c5, c6, c7;

-		c0 = 4ULL * blockNumber;
+		c0 = blockNumber;
 		c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0;

-		constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask;
+		uint8_t* mixBlock;

-		for (auto i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
-			const uint8_t* mixBlock;
-			if (RANDOMX_ARGON_GROWTH == 0) {
-				constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
-				mixBlock = cache.memory + (c0 & mask) * CacheLineSize;
-			}
-			else {
-				const uint32_t modulus = cache.size / CacheLineSize;
-				mixBlock = cache.memory + (c0 % modulus) * CacheLineSize;
-			}
-			PREFETCHNTA(mixBlock);
-			c0 = squareHash(c0);
-			c0 ^= load64(mixBlock + 0);
-			c1 ^= load64(mixBlock + 8);
-			c2 ^= load64(mixBlock + 16);
-			c3 ^= load64(mixBlock + 24);
-			c4 ^= load64(mixBlock + 32);
-			c5 ^= load64(mixBlock + 40);
-			c6 ^= load64(mixBlock + 48);
-			c7 ^= load64(mixBlock + 56);
+		// Honor the caller-supplied pass count instead of a hard-coded constant, so this
+		// fallback agrees with the _M_X64 assembly routine, which loops on its 4th argument.
+		for (unsigned i = 0; i < iterations; ++i) {
+			mixBlock = selectMixBlock(cache, c0, c1);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c1, c2);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c2, c3);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c3, c4);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c4, c5);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c5, c6);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c6, c7);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
+
+			mixBlock = selectMixBlock(cache, c7, c0);
+			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
 		}

 		store64(out + 0, c0);
@ -79,6 +110,7 @@ namespace RandomX {
 		store64(out + 48, c6);
 		store64(out + 56, c7);
 	}
+#endif

 	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
 		uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma);
@ -95,7 +127,7 @@ namespace RandomX {
 		memory.mx &= CacheLineAlignMask; //align to cache line
 		Cache& cache = memory.ds.cache;
 		uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
-		initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize);
+		initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
 		for (int i = 0; i < RegistersCount; ++i)
 			reg[i] ^= datasetLine[i];
 		std::swap(memory.mx, memory.ma);
@ -128,7 +160,7 @@ namespace RandomX {

 	void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) {
 		for (uint64_t i = startBlock; i < startBlock + blockCount; ++i) {
-			initBlock(cache, ds.memory + i * CacheLineSize, i);
+			initBlock(cache, ds.memory + i * CacheLineSize, i, RANDOMX_CACHE_ACCESSES / 8);
 		}
 	}

--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@ -25,7 +25,10 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 namespace RandomX {

-	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber);
+#if defined(_M_X64)
+	extern "C"
+#endif
+	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations);

 	void datasetAlloc(dataset_t& ds, bool largePages);

--- a/src/squareHash.asm
+++ b/src/squareHash.asm
@ -1,6 +1,7 @@
 IFDEF RAX

 PUBLIC squareHash
+PUBLIC initBlock

 .code

@ -8,6 +9,189 @@ squareHash PROC
 	include asm/squareHash.inc
 squareHash ENDP

+; initBlock - assembly implementation of RandomX::initBlock (declared extern "C" for _M_X64)
+; rcx = cache (the first qword it points to is loaded as the cache memory base;
+;       NOTE(review): assumes the memory pointer is the first member of Cache - confirm)
+; rdx = out
+; r8 = blockNumber
+; r9d = iterations (32-bit 'unsigned'; the upper half of r9 is not guaranteed to be zero)
+;
+; Lanes c0..c7 live in r8..r15. Each round masks the current lane with 4194303 (2^22 - 1)
+; and shifts left by 6 to form a byte offset into the cache.
+; NOTE(review): the mask and shift are hard-coded; presumably they must match
+; RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1 and CacheLineSize of the
+; C++ version, and only the RANDOMX_ARGON_GROWTH == 0 case is covered - confirm.
+; NOTE(review): relies on squareHash preserving rbx, rsi, rdi, rbp and r8-r15 - confirm
+; against asm/squareHash.inc.
+initBlock PROC
+	push rbx
+	push rbp
+	push rsi
+	push rdi
+	push r12
+	push r13
+	push r14
+	push r15
+	; take only the low 32 bits of the counter; writing esi zero-extends into rsi
+	mov esi, r9d
+	mov rdi, qword ptr [rcx]
+	mov rbp, rdx
+	prefetcht0 byte ptr [rbp]
+	; r8 = blockNumber
+	xor r9, r9
+	xor r10, r10
+	xor r11, r11
+	xor r12, r12
+	xor r13, r13
+	xor r14, r14
+	xor r15, r15
+	; the loop below is do-while shaped; skip it when no iterations are requested
+	; so the counter cannot wrap around
+	test esi, esi
+	jz initBlock_store
+initBlock_loop:
+	; c0
+	; rbx = address of the cache line selected by the current lane
+	mov rbx, r8
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	; next lane = squareHash(current lane + next lane)
+	lea rcx, [r8+r9]
+	call squareHash
+	mov r9, rax
+	; mix the selected line into all eight lanes
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c1
+	mov rbx, r9
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r9+r10]
+	call squareHash
+	mov r10, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c2
+	mov rbx, r10
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r10+r11]
+	call squareHash
+	mov r11, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c3
+	mov rbx, r11
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r11+r12]
+	call squareHash
+	mov r12, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c4
+	mov rbx, r12
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r12+r13]
+	call squareHash
+	mov r13, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c5
+	mov rbx, r13
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r13+r14]
+	call squareHash
+	mov r14, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c6
+	mov rbx, r14
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r14+r15]
+	call squareHash
+	mov r15, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	; c7 (the "next" lane wraps around to c0)
+	mov rbx, r15
+	and rbx, 4194303
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rcx, [r15+r8]
+	call squareHash
+	mov r8, rax
+	xor r8, qword ptr [rbx+0]
+	xor r9, qword ptr [rbx+8]
+	xor r10, qword ptr [rbx+16]
+	xor r11, qword ptr [rbx+24]
+	xor r12, qword ptr [rbx+32]
+	xor r13, qword ptr [rbx+40]
+	xor r14, qword ptr [rbx+48]
+	xor r15, qword ptr [rbx+56]
+	sub rsi, 1
+	jnz initBlock_loop
+initBlock_store:
+	; write lanes c0..c7 to the output buffer
+	mov qword ptr [rbp+0], r8
+	mov qword ptr [rbp+8], r9
+	mov qword ptr [rbp+16], r10
+	mov qword ptr [rbp+24], r11
+	mov qword ptr [rbp+32], r12
+	mov qword ptr [rbp+40], r13
+	mov qword ptr [rbp+48], r14
+	mov qword ptr [rbp+56], r15
+	; restore the saved registers in reverse push order
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rdi
+	pop rsi
+	pop rbp
+	pop rbx
+	ret
+initBlock ENDP
+
 ENDIF

 END