Optimized dataset read (#211)

* Optimized dataset read

There was a false dependency on readReg2 and readReg3 (caused by the `xor rbp, rax` instruction) when reading a dataset item (see design.md - 4.6.2 Loop execution, steps 5 and 7). This change uses the `ma` register to read the dataset item before the whole `rbp` register (`ma` and `mx`) is modified, so superscalar, out-of-order CPUs can start executing the read earlier.

Results: https://i.imgur.com/Bpeq9mx.png

~1% speedup on modern Intel/AMD CPUs.

* ARMv8: optimized dataset read

Break the dependency on readReg2 and readReg3.

* Fixed light mode hashing
pull/220/head
SChernykh 3 years ago committed by GitHub
parent c12097400b
commit 3c8c7ee097
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -15,6 +15,7 @@
mov rsi, rdx ;# uint8_t* scratchpad mov rsi, rdx ;# uint8_t* scratchpad
mov rax, rbp mov rax, rbp
ror rbp, 32
;# zero integer registers ;# zero integer registers
xor r8, r8 xor r8, r8

@ -28,6 +28,7 @@
mov rbx, r9 ;# loop counter mov rbx, r9 ;# loop counter
mov rax, rbp mov rax, rbp
ror rbp, 32
;# zero integer registers ;# zero integer registers
xor r8, r8 xor r8, r8

@ -1,17 +1,16 @@
mov ecx, ebp ;# ecx = ma
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx" xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx] prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx" xor r9, qword ptr [rdi+rcx+8]
mov edx, ebp ;# edx = ma xor r10, qword ptr [rdi+rcx+16]
and edx, RANDOMX_DATASET_BASE_MASK xor r11, qword ptr [rdi+rcx+24]
lea rcx, [rdi+rdx] ;# dataset cache line xor r12, qword ptr [rdi+rcx+32]
xor r8, qword ptr [rcx+0] xor r13, qword ptr [rdi+rcx+40]
xor r9, qword ptr [rcx+8] xor r14, qword ptr [rdi+rcx+48]
xor r10, qword ptr [rcx+16] xor r15, qword ptr [rdi+rcx+56]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]

@ -8,10 +8,10 @@
mov qword ptr [rsp+16], r13 mov qword ptr [rsp+16], r13
mov qword ptr [rsp+8], r14 mov qword ptr [rsp+8], r14
mov qword ptr [rsp+0], r15 mov qword ptr [rsp+0], r15
xor rbp, rax ;# modify "mx"
ror rbp, 32 ;# swap "ma" and "mx" ror rbp, 32 ;# swap "ma" and "mx"
mov ebx, ebp ;# ecx = ma xor rbp, rax ;# modify "mx"
and ebx, RANDOMX_DATASET_BASE_MASK mov rbx, rbp ;# ebx = ma
shr ebx, 6 ;# ebx = Dataset block number shr rbx, 38
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
;# add ebx, datasetOffset / 64 ;# add ebx, datasetOffset / 64
;# call 32768 ;# call 32768

@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0 literal_v15: .fill 2,8,0
DECL(randomx_program_aarch64_vm_instructions_end): DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
lsr x10, x9, 32
# mx ^= r[readReg2] ^ r[readReg3]; # mx ^= r[readReg2] ^ r[readReg3];
eor x9, x9, x18 eor x9, x9, x18
@ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
# mx <-> ma # mx <-> ma
ror x9, x9, 32 ror x9, x9, 32
# Calculate dataset pointer for dataset read
mov w10, w9
DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler # Actual mask will be inserted by JIT compiler
and x10, x10, 1 and x10, x10, 1

Loading…
Cancel
Save