From d991de4f796647bbca084fd0cbbc7d9a2648abe1 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 15 Sep 2019 20:30:39 +0200 Subject: [PATCH] Store 32-bit literals in unused SIMD registers --- src/jit_compiler_a64.cpp | 44 ++++++++++---- src/jit_compiler_a64.hpp | 1 + src/jit_compiler_a64_static.S | 101 +++++++++++++++++++------------- src/jit_compiler_a64_static.hpp | 1 + 4 files changed, 95 insertions(+), 52 deletions(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index ddbeca6..07a8114 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -66,6 +66,7 @@ namespace randomx { static const size_t CodeSize = ((uint8_t*)randomx_program_aarch64_end) - ((uint8_t*)randomx_program_aarch64); static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64); static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64); +static const size_t ImulRcpLiteralsEnd = ((uint8_t*)randomx_program_aarch64_imul_rcp_literals_end) - ((uint8_t*)randomx_program_aarch64); static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64); constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -74,7 +75,8 @@ template<typename T> static constexpr size_t Log2(T value) { return (value > 1) JitCompilerA64::JitCompilerA64() : code((uint8_t*) allocMemoryPages(CodeSize)) - , literalPos(InstructionsEnd) + , literalPos(ImulRcpLiteralsEnd) + , num32bitLiterals(0) { memset(reg_changed_offset, 0, sizeof(reg_changed_offset)); memcpy(code, (void*) randomx_program_aarch64, CodeSize); @@ -102,7 +104,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; - literalPos = InstructionsEnd; + literalPos = ImulRcpLiteralsEnd; + num32bitLiterals = 
0; for (uint32_t i = 0; i < RegistersCount; ++i) reg_changed_offset[i] = codePos; @@ -157,19 +160,38 @@ void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, } else { - if (static_cast<int32_t>(imm) < 0) + if (num32bitLiterals < 64) { - // movn tmp_reg, ~imm32 (16 high bits) - emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k); + if (static_cast<int32_t>(imm) < 0) + { + // smov dst, vN.s[M] + emit32(0x4E042C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k); + } + else + { + // umov dst, vN.s[M] + emit32(0x0E043C00 | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, k); + } + + ((uint32_t*)(code + ImulRcpLiteralsEnd))[num32bitLiterals] = imm; + ++num32bitLiterals; } else { - // movz tmp_reg, imm32 (16 high bits) - emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k); + if (static_cast<int32_t>(imm) < 0) + { + // movn tmp_reg, ~imm32 (16 high bits) + emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k); + } + else + { + // movz tmp_reg, imm32 (16 high bits) + emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k); + } + + // movk tmp_reg, imm32 (16 low bits) + emit32(ARMV8A::MOVK | dst | ((imm & 0xFFFF) << 5), code, k); } - - // movk tmp_reg, imm32 (16 low bits) - emit32(ARMV8A::MOVK | dst | ((imm & 0xFFFF) << 5), code, k); } codePos = k; @@ -472,7 +494,7 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) --shift; #endif - const uint32_t literal_id = (InstructionsEnd - literalPos) / sizeof(uint64_t); + const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); literalPos -= sizeof(uint64_t); *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp index 6d67b30..9c6fe11 100644 --- a/src/jit_compiler_a64.hpp +++ b/src/jit_compiler_a64.hpp @@ -81,6 +81,7 @@ namespace randomx { uint32_t reg_changed_offset[8]; uint8_t* 
code; uint32_t literalPos; + uint32_t num32bitLiterals; static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos) { diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index 0504904..8ce200a 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -30,6 +30,7 @@ .global randomx_program_aarch64 .global randomx_program_aarch64_main_loop .global randomx_program_aarch64_vm_instructions + .global randomx_program_aarch64_imul_rcp_literals_end .global randomx_program_aarch64_vm_instructions_end .global randomx_program_aarch64_cacheline_align_mask1 .global randomx_program_aarch64_cacheline_align_mask2 @@ -70,7 +71,7 @@ # x29 -> literal for IMUL_RCP # x30 -> literal for IMUL_RCP -# v0-v15 -> not used +# v0-v15 -> store 32-bit literals # v16 -> "f0" # v17 -> "f1" # v18 -> "f2" @@ -90,7 +91,7 @@ randomx_program_aarch64: # Save callee-saved registers - sub sp, sp, 128 + sub sp, sp, 192 stp x16, x17, [sp] stp x18, x19, [sp, 16] stp x20, x21, [sp, 32] @@ -99,6 +100,10 @@ randomx_program_aarch64: stp x26, x27, [sp, 80] stp x28, x29, [sp, 96] stp x8, x30, [sp, 112] + stp d8, d9, [sp, 128] + stp d10, d11, [sp, 144] + stp d12, d13, [sp, 160] + stp d14, d15, [sp, 176] # Zero integer registers mov x4, xzr @@ -155,6 +160,23 @@ randomx_program_aarch64: ldr x29, literal_x29 ldr x30, literal_x30 + ldr q0, literal_v0 + ldr q1, literal_v1 + ldr q2, literal_v2 + ldr q3, literal_v3 + ldr q4, literal_v4 + ldr q5, literal_v5 + ldr q6, literal_v6 + ldr q7, literal_v7 + ldr q8, literal_v8 + ldr q9, literal_v9 + ldr q10, literal_v10 + ldr q11, literal_v11 + ldr q12, literal_v12 + ldr q13, literal_v13 + ldr q14, literal_v14 + ldr q15, literal_v15 + randomx_program_aarch64_main_loop: # spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; @@ -233,44 +255,37 @@ randomx_program_aarch64_vm_instructions: # 12 KB buffer for generated instructions .fill 3072,4,0 -literal_x0: - .fill 1,8,0 - -literal_x11: - .fill 
1,8,0 - -literal_x20: - .fill 1,8,0 - -literal_x21: - .fill 1,8,0 - -literal_x22: - .fill 1,8,0 - -literal_x23: - .fill 1,8,0 - -literal_x24: - .fill 1,8,0 - -literal_x25: - .fill 1,8,0 - -literal_x26: - .fill 1,8,0 - -literal_x27: - .fill 1,8,0 - -literal_x28: - .fill 1,8,0 - -literal_x29: - .fill 1,8,0 - -literal_x30: - .fill 1,8,0 +literal_x0: .fill 1,8,0 +literal_x11: .fill 1,8,0 +literal_x20: .fill 1,8,0 +literal_x21: .fill 1,8,0 +literal_x22: .fill 1,8,0 +literal_x23: .fill 1,8,0 +literal_x24: .fill 1,8,0 +literal_x25: .fill 1,8,0 +literal_x26: .fill 1,8,0 +literal_x27: .fill 1,8,0 +literal_x28: .fill 1,8,0 +literal_x29: .fill 1,8,0 +literal_x30: .fill 1,8,0 +randomx_program_aarch64_imul_rcp_literals_end: + +literal_v0: .fill 2,8,0 +literal_v1: .fill 2,8,0 +literal_v2: .fill 2,8,0 +literal_v3: .fill 2,8,0 +literal_v4: .fill 2,8,0 +literal_v5: .fill 2,8,0 +literal_v6: .fill 2,8,0 +literal_v7: .fill 2,8,0 +literal_v8: .fill 2,8,0 +literal_v9: .fill 2,8,0 +literal_v10: .fill 2,8,0 +literal_v11: .fill 2,8,0 +literal_v12: .fill 2,8,0 +literal_v13: .fill 2,8,0 +literal_v14: .fill 2,8,0 +literal_v15: .fill 2,8,0 randomx_program_aarch64_vm_instructions_end: @@ -357,7 +372,11 @@ randomx_program_aarch64_update_spMix1: ldp x26, x27, [sp, 80] ldp x28, x29, [sp, 96] ldp x8, x30, [sp, 112] - add sp, sp, 128 + ldp d8, d9, [sp, 128] + ldp d10, d11, [sp, 144] + ldp d12, d13, [sp, 160] + ldp d14, d15, [sp, 176] + add sp, sp, 192 ret diff --git a/src/jit_compiler_a64_static.hpp b/src/jit_compiler_a64_static.hpp index a8b0459..0065ffd 100644 --- a/src/jit_compiler_a64_static.hpp +++ b/src/jit_compiler_a64_static.hpp @@ -33,6 +33,7 @@ extern "C" { void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); void randomx_program_aarch64_main_loop(); void randomx_program_aarch64_vm_instructions(); + void randomx_program_aarch64_imul_rcp_literals_end(); void randomx_program_aarch64_vm_instructions_end(); void 
randomx_program_aarch64_cacheline_align_mask1(); void randomx_program_aarch64_cacheline_align_mask2();