You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
347 lines
8.0 KiB
347 lines
8.0 KiB
# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
|
# Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
|
|
#
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of the copyright holder nor the
|
|
# names of its contributors may be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
# Assembler setup: target the ARMv8-A architecture and emit into .text.
    .arch armv8-a
    .text

# Exported symbols. All eight labels are defined further down in this file:
# the entry point, the loop head, the bounds of the generated-instruction
# buffer, three instructions marked for patching, and the end marker.
    .global randomx_program_aarch64
    .global randomx_program_aarch64_main_loop
    .global randomx_program_aarch64_vm_instructions
    .global randomx_program_aarch64_vm_instructions_end
    .global randomx_program_aarch64_cacheline_align_mask1
    .global randomx_program_aarch64_cacheline_align_mask2
    .global randomx_program_aarch64_update_spMix1
    .global randomx_program_aarch64_end

# Register allocation
|
|
|
|
# x0 -> pointer to reg buffer
|
|
# x1 -> pointer to mem buffer and then to dataset
|
|
# x2 -> pointer to scratchpad
|
|
# x3 -> loop counter
|
|
# x4 -> "r0"
|
|
# x5 -> "r1"
|
|
# x6 -> "r2"
|
|
# x7 -> "r3"
|
|
# x8 -> fpcr (reversed bits)
|
|
# x9 -> mx, ma
|
|
# x10 -> spMix1
|
|
# x11 -> spMix2
|
|
# x12 -> "r4"
|
|
# x13 -> "r5"
|
|
# x14 -> "r6"
|
|
# x15 -> "r7"
|
|
# x16 -> spAddr0
|
|
# x17 -> spAddr1
|
|
# x18 -> temporary
|
|
# x19 -> temporary
|
|
# x20 -> temporary
|
|
# x21 -> literal for IMUL_RCP
|
|
# x22 -> literal for IMUL_RCP
|
|
# x23 -> literal for IMUL_RCP
|
|
# x24 -> literal for IMUL_RCP
|
|
# x25 -> literal for IMUL_RCP
|
|
# x26 -> literal for IMUL_RCP
|
|
# x27 -> literal for IMUL_RCP
|
|
# x28 -> literal for IMUL_RCP
|
|
# x29 -> literal for IMUL_RCP
|
|
# x30 -> literal for IMUL_RCP
|
|
|
|
# v0-v15 -> not used
|
|
# v16 -> "f0"
|
|
# v17 -> "f1"
|
|
# v18 -> "f2"
|
|
# v19 -> "f3"
|
|
# v20 -> "e0"
|
|
# v21 -> "e1"
|
|
# v22 -> "e2"
|
|
# v23 -> "e3"
|
|
# v24 -> "a0"
|
|
# v25 -> "a1"
|
|
# v26 -> "a2"
|
|
# v27 -> "a3"
|
|
# v28 -> temporary
|
|
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
|
|
# v30 -> E 'or' mask = 0x3*00000000******3*00000000******
|
|
# v31 -> scale mask = 0x80f000000000000080f0000000000000
|
|
|
|
randomx_program_aarch64:
    # Entry point. Inputs, as listed in the register allocation table at the
    # top of this file:
    #   x0 = pointer to reg buffer
    #   x1 = pointer to mem buffer (overwritten below with the dataset pointer)
    #   x2 = pointer to scratchpad
    #   x3 = loop counter
    # This part runs once and then falls through into
    # randomx_program_aarch64_main_loop.

    # Save callee-saved registers
    # The 128-byte frame holds x16-x30 and x8. That is more than the AAPCS64
    # callee-saved set (x19-x28 plus x29/x30): x8 and x16-x18 are preserved
    # as well. sp stays 16-byte aligned.
    #   [sp +   0] x16, x17      [sp +  64] x24, x25
    #   [sp +  16] x18, x19      [sp +  80] x26, x27
    #   [sp +  32] x20, x21      [sp +  96] x28, x29
    #   [sp +  48] x22, x23      [sp + 112] x8,  x30
    sub     sp, sp, 128
    stp     x16, x17, [sp]
    stp     x18, x19, [sp, 16]
    stp     x20, x21, [sp, 32]
    stp     x22, x23, [sp, 48]
    stp     x24, x25, [sp, 64]
    stp     x26, x27, [sp, 80]
    stp     x28, x29, [sp, 96]
    stp     x8, x30, [sp, 112]

    # Zero integer registers
    # ("r0".."r7" live in x4-x7 and x12-x15)
    mov     x4, xzr
    mov     x5, xzr
    mov     x6, xzr
    mov     x7, xzr
    mov     x12, xzr
    mov     x13, xzr
    mov     x14, xzr
    mov     x15, xzr

    # Load ma, mx and dataset pointer
    # x9 receives the first 8 bytes of the mem buffer (mx and ma share this
    # one register); x1 receives the next 8 bytes and is used as the dataset
    # base address from here on.
    ldp     x9, x1, [x1]

    # Load initial spMix value
    # spMix1 (x10) starts out as a copy of x9.
    mov     x10, x9

    # Load group A registers
    # a0-a3 (v24-v27) come from bytes 192..255 of the reg buffer.
    ldp     q24, q25, [x0, 192]
    ldp     q26, q27, [x0, 224]

    # Load E 'and' mask
    # Same 64-bit pattern in both lanes of v29; x16 is only a scratch here.
    mov     x16, 0x00FFFFFFFFFFFFFF
    ins     v29.d[0], x16
    ins     v29.d[1], x16

    # Load E 'or' mask (stored in reg.f[0])
    # i.e. the 16 bytes at offset 64 of the reg buffer.
    ldr     q30, [x0, 64]

    # Load scale mask
    # 0x80f0000000000000 in both lanes of v31.
    mov     x16, 0x80f0000000000000
    ins     v31.d[0], x16
    ins     v31.d[1], x16

    # Read fpcr
    # x8 keeps the FPCR value with its bit order reversed (rbit).
    # NOTE(review): nothing in the static code reads x8 again before it is
    # restored on exit; presumably the generated code uses it - confirm.
    mrs     x8, fpcr
    rbit    x8, x8

    # Read literals
    # PC-relative loads from the ten 8-byte literal_x21..literal_x30 slots
    # defined after the generated-instruction buffer. They hold zero in this
    # file; presumably the JIT compiler fills them in - confirm.
    ldr     x21, literal_x21
    ldr     x22, literal_x22
    ldr     x23, literal_x23
    ldr     x24, literal_x24
    ldr     x25, literal_x25
    ldr     x26, literal_x26
    ldr     x27, literal_x27
    ldr     x28, literal_x28
    ldr     x29, literal_x29
    ldr     x30, literal_x30

randomx_program_aarch64_main_loop:
    # Head of the per-iteration loop; the bne near the end of the routine
    # branches back here. It derives two scratchpad addresses from spMix1
    # (x10), xors scratchpad data into r0-r7 and reloads the F and E register
    # groups. Execution then continues into the generated-instruction buffer.
    # x18/x19 are scratch throughout.

    # spAddr0 = spMix1 & ScratchpadL3Mask64;
    # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
    lsr     x18, x10, 32

    # Actual mask will be inserted by JIT compiler
    # (the immediate 1 is a placeholder). Writing w16/w17 clears the upper
    # 32 bits of x16/x17, so both offsets are zero-extended.
    and     w16, w10, 1
    and     w17, w18, 1

    # x16 = scratchpad + spAddr0
    # x17 = scratchpad + spAddr1
    add     x16, x16, x2
    add     x17, x17, x2

    # xor integer registers with scratchpad data (spAddr0)
    # 64 bytes at x16, eight 64-bit words, one per "r" register.
    ldp     x18, x19, [x16]
    eor     x4, x4, x18
    eor     x5, x5, x19
    ldp     x18, x19, [x16, 16]
    eor     x6, x6, x18
    eor     x7, x7, x19
    ldp     x18, x19, [x16, 32]
    eor     x12, x12, x18
    eor     x13, x13, x19
    ldp     x18, x19, [x16, 48]
    eor     x14, x14, x18
    eor     x15, x15, x19

    # Load group F registers (spAddr1)
    # Each ldpsw reads two adjacent 32-bit words and sign-extends them to
    # 64 bits; the pair becomes the two lanes of one vector register.
    # Bytes 0..31 at x17 feed f0-f3 (v16-v19).
    ldpsw   x18, x19, [x17]
    ins     v16.d[0], x18
    ins     v16.d[1], x19
    ldpsw   x18, x19, [x17, 8]
    ins     v17.d[0], x18
    ins     v17.d[1], x19
    ldpsw   x18, x19, [x17, 16]
    ins     v18.d[0], x18
    ins     v18.d[1], x19
    ldpsw   x18, x19, [x17, 24]
    ins     v19.d[0], x18
    ins     v19.d[1], x19
    # Convert the signed 64-bit integers in both lanes to double precision.
    scvtf   v16.2d, v16.2d
    scvtf   v17.2d, v17.2d
    scvtf   v18.2d, v18.2d
    scvtf   v19.2d, v19.2d

    # Load group E registers (spAddr1)
    # Same scheme as group F, using bytes 32..63 at x17 for e0-e3 (v20-v23).
    ldpsw   x18, x19, [x17, 32]
    ins     v20.d[0], x18
    ins     v20.d[1], x19
    ldpsw   x18, x19, [x17, 40]
    ins     v21.d[0], x18
    ins     v21.d[1], x19
    ldpsw   x18, x19, [x17, 48]
    ins     v22.d[0], x18
    ins     v22.d[1], x19
    ldpsw   x18, x19, [x17, 56]
    ins     v23.d[0], x18
    ins     v23.d[1], x19
    scvtf   v20.2d, v20.2d
    scvtf   v21.2d, v21.2d
    scvtf   v22.2d, v22.2d
    scvtf   v23.2d, v23.2d
    # Apply the E 'and' mask (v29), then the E 'or' mask (v30), to the
    # converted values.
    and     v20.16b, v20.16b, v29.16b
    and     v21.16b, v21.16b, v29.16b
    and     v22.16b, v22.16b, v29.16b
    and     v23.16b, v23.16b, v29.16b
    orr     v20.16b, v20.16b, v30.16b
    orr     v21.16b, v21.16b, v30.16b
    orr     v22.16b, v22.16b, v30.16b
    orr     v23.16b, v23.16b, v30.16b

# Execute VM instructions
randomx_program_aarch64_vm_instructions:

    # 12 KB buffer for generated instructions
    # 3072 four-byte words, all zero in this static image. The loop code in
    # front of this label falls straight through into the buffer.
    .fill 3072,4,0

# Literal pool: ten 8-byte slots, one per register x21-x30. The entry code
# loads each with "ldr xN, literal_xN", and the register table at the top of
# the file lists these registers as literals for IMUL_RCP. All slots are zero
# here.
# NOTE(review): these data words lie between the instruction buffer and
# randomx_program_aarch64_vm_instructions_end, so the generated code
# presumably branches over them - confirm against the JIT compiler.
literal_x21:
    .fill 1,8,0

literal_x22:
    .fill 1,8,0

literal_x23:
    .fill 1,8,0

literal_x24:
    .fill 1,8,0

literal_x25:
    .fill 1,8,0

literal_x26:
    .fill 1,8,0

literal_x27:
    .fill 1,8,0

literal_x28:
    .fill 1,8,0

literal_x29:
    .fill 1,8,0

literal_x30:
    .fill 1,8,0

randomx_program_aarch64_vm_instructions_end:
    # First static instruction after the generated-code buffer and its
    # literal pool. This section updates mx, prefetches one dataset location,
    # swaps mx and ma, then xors 64 bytes of dataset data into r0-r7.
    # x20 holds the dataset address; x18/x19 are scratch.

    # mx ^= r[readReg2] ^ r[readReg3];
    # NOTE(review): x11 ("spMix2" in the register table) is not written by
    # any static instruction in this file; presumably the generated code
    # leaves the xor of the two source registers there - confirm.
    eor     x9, x9, x11

    # Calculate dataset pointer for dataset prefetch
    # Only the low 32 bits of x9 are used (w-register move zero-extends).
    mov     w20, w9
randomx_program_aarch64_cacheline_align_mask1:
    # Actual mask will be inserted by JIT compiler
    # (the immediate 1 is a placeholder; the label marks this instruction).
    and     x20, x20, 1
    add     x20, x20, x1

    # Prefetch dataset data
    prfm    pldl2strm, [x20]

    # mx <-> ma
    # Rotating by 32 exchanges the two 32-bit halves of x9.
    ror     x9, x9, 32

    # Calculate dataset pointer for dataset read
    # Low 32 bits of the swapped x9 this time.
    mov     w20, w9
randomx_program_aarch64_cacheline_align_mask2:
    # Actual mask will be inserted by JIT compiler
    # (the immediate 1 is a placeholder; the label marks this instruction).
    and     x20, x20, 1
    add     x20, x20, x1

    # xor integer registers with dataset data
    # 64 bytes at x20, eight 64-bit words, one per "r" register.
    ldp     x18, x19, [x20]
    eor     x4, x4, x18
    eor     x5, x5, x19
    ldp     x18, x19, [x20, 16]
    eor     x6, x6, x18
    eor     x7, x7, x19
    ldp     x18, x19, [x20, 32]
    eor     x12, x12, x18
    eor     x13, x13, x19
    ldp     x18, x19, [x20, 48]
    eor     x14, x14, x18
    eor     x15, x15, x19

randomx_program_aarch64_update_spMix1:
    # Loop tail and function exit. Recomputes spMix1, writes the register
    # state back to the scratchpad, and repeats the main loop until the
    # counter in x3 reaches zero. After the last iteration it stores the
    # registers to the reg buffer, restores the saved registers and returns.

    # As assembled, this sets x10 (spMix1) to x0 ^ x0 = 0.
    # NOTE(review): the exported label suggests the JIT compiler rewrites the
    # source operands of this instruction - confirm.
    eor     x10, x0, x0

    # Store integer registers to scratchpad (spAddr1)
    # x17 still holds scratchpad + spAddr1 from the top of the loop.
    stp     x4, x5, [x17, 0]
    stp     x6, x7, [x17, 16]
    stp     x12, x13, [x17, 32]
    stp     x14, x15, [x17, 48]

    # xor group F and group E registers
    # The result replaces f0-f3; e0-e3 are left unchanged.
    eor     v16.16b, v16.16b, v20.16b
    eor     v17.16b, v17.16b, v21.16b
    eor     v18.16b, v18.16b, v22.16b
    eor     v19.16b, v19.16b, v23.16b

    # Store FP registers to scratchpad (spAddr0)
    # x16 still holds scratchpad + spAddr0; 64 bytes are written.
    stp     q16, q17, [x16, 0]
    stp     q18, q19, [x16, 32]

    # Count down the loop counter and repeat while it is non-zero.
    subs    x3, x3, 1
    bne     randomx_program_aarch64_main_loop

    # Store integer registers
    # r0-r7 go to bytes 0..63 of the reg buffer.
    stp     x4, x5, [x0, 0]
    stp     x6, x7, [x0, 16]
    stp     x12, x13, [x0, 32]
    stp     x14, x15, [x0, 48]

    # Store FP registers
    # f0-f3 (already xored with group E) go to bytes 64..127,
    # e0-e3 go to bytes 128..191.
    stp     q16, q17, [x0, 64]
    stp     q18, q19, [x0, 96]
    stp     q20, q21, [x0, 128]
    stp     q22, q23, [x0, 160]

    # Restore callee-saved registers
    # Mirrors the 128-byte frame written on entry (x16-x30 and x8).
    # NOTE(review): the static code never writes fpcr back; if generated code
    # changes it, the caller has to restore it - confirm.
    ldp     x16, x17, [sp]
    ldp     x18, x19, [sp, 16]
    ldp     x20, x21, [sp, 32]
    ldp     x22, x23, [sp, 48]
    ldp     x24, x25, [sp, 64]
    ldp     x26, x27, [sp, 80]
    ldp     x28, x29, [sp, 96]
    ldp     x8, x30, [sp, 112]
    add     sp, sp, 128

    ret

# Marks the first byte after the routine.
randomx_program_aarch64_end: