Optimized dataset read (#211)

* Optimized dataset read

  There was a false dependency on readReg2 and readReg3 (caused by the `xor rbp, rax` instruction) when reading a dataset item (see design.md, section 4.6.2 "Loop execution", steps 5 and 7). This change uses the `ma` register to read the dataset item before the whole of `rbp` (`ma` and `mx`) is changed, so superscalar and out-of-order CPUs can start executing the read earlier.

  Results: https://i.imgur.com/Bpeq9mx.png (~1% speedup on modern Intel/AMD CPUs).

* ARMv8: optimized dataset read

  Break the dependency on readReg2 and readReg3.

* Fixed light mode hashing
3 years ago · 3c8c7ee097
parent c12097400b
commit 3c8c7ee097
5 changed files with 20 additions and 18 deletions
--- a/src/asm/program_prologue_linux.inc
+++ b/src/asm/program_prologue_linux.inc
@ -15,6 +15,7 @@
 	mov rsi, rdx                ;# uint8_t* scratchpad

 	mov rax, rbp
+	ror rbp, 32                 ;# swap the 32-bit halves of rbp ("ma"/"mx"); presumably so the dataset read finds "ma" in ebp

 	;# zero integer registers
 	xor r8, r8
--- a/src/asm/program_prologue_win64.inc
+++ b/src/asm/program_prologue_win64.inc
@ -28,6 +28,7 @@
 	mov rbx, r9                 ;# loop counter

 	mov rax, rbp
+	ror rbp, 32                 ;# swap the 32-bit halves of rbp ("ma"/"mx"); presumably so the dataset read finds "ma" in ebp

 	;# zero integer registers
 	xor r8, r8
--- a/src/asm/program_read_dataset.inc
+++ b/src/asm/program_read_dataset.inc
@ -1,17 +1,16 @@
+	mov ecx, ebp                       ;# ecx = ma
+	and ecx, RANDOMX_DATASET_BASE_MASK ;# ecx = masked offset, added to rdi below
+	xor r8, qword ptr [rdi+rcx]        ;# first qword of the line; issued before rbp is modified below
+	ror rbp, 32                        ;# swap "ma" and "mx"
 	xor rbp, rax                       ;# modify "mx"
 	mov edx, ebp                       ;# edx = mx
 	and edx, RANDOMX_DATASET_BASE_MASK
 	prefetchnta byte ptr [rdi+rdx]
-	ror rbp, 32                        ;# swap "ma" and "mx"
-	mov edx, ebp                       ;# edx = ma
-	and edx, RANDOMX_DATASET_BASE_MASK
-	lea rcx, [rdi+rdx]                 ;# dataset cache line
-	xor r8,  qword ptr [rcx+0]
-	xor r9,  qword ptr [rcx+8]
-	xor r10, qword ptr [rcx+16]
-	xor r11, qword ptr [rcx+24]
-	xor r12, qword ptr [rcx+32]
-	xor r13, qword ptr [rcx+40]
-	xor r14, qword ptr [rcx+48]
-	xor r15, qword ptr [rcx+56]
+	xor r9,  qword ptr [rdi+rcx+8]     ;# remaining qwords of the same line (offsets 8..56) go into r9..r15
+	xor r10, qword ptr [rdi+rcx+16]
+	xor r11, qword ptr [rdi+rcx+24]
+	xor r12, qword ptr [rdi+rcx+32]
+	xor r13, qword ptr [rdi+rcx+40]
+	xor r14, qword ptr [rdi+rcx+48]
+	xor r15, qword ptr [rdi+rcx+56]
 	
--- a/src/asm/program_read_dataset_sshash_init.inc
+++ b/src/asm/program_read_dataset_sshash_init.inc
@ -8,10 +8,10 @@
 	mov qword ptr [rsp+16], r13
 	mov qword ptr [rsp+8], r14
 	mov qword ptr [rsp+0], r15
-	xor rbp, rax                       ;# modify "mx"
 	ror rbp, 32                        ;# swap "ma" and "mx"
-	mov ebx, ebp                       ;# ecx = ma
-	and ebx, RANDOMX_DATASET_BASE_MASK
-	shr ebx, 6                         ;# ebx = Dataset block number
+	xor rbp, rax                       ;# modify "mx"
+	mov rbx, rbp                       ;# rbx = whole rbp; "ma" is now in the upper 32 bits
+	shr rbx, 38                        ;# 32 + 6: drop the low ("mx") half, then divide "ma" by 64
+	and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
 	;# add ebx, datasetOffset / 64
 	;# call 32768
--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0
 literal_v15: .fill 2,8,0

 DECL(randomx_program_aarch64_vm_instructions_end):
+	# Calculate dataset pointer for dataset read
+	# Do it here to break false dependency from readReg2 and readReg3 (see next line)
+	lsr	x10, x9, 32

 	# mx ^= r[readReg2] ^ r[readReg3];
 	eor	x9, x9, x18
@ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
 	# mx <-> ma
 	ror	x9, x9, 32

-	# Calculate dataset pointer for dataset read
-	mov	w10, w9
 DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
 	and	x10, x10, 1