/*
Copyright (c) 2023 tevador

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

#define DECL(x) x

.text
.option rvc

#include "configuration.h"

.global DECL(randomx_riscv64_literals)
.global DECL(randomx_riscv64_literals_end)
.global DECL(randomx_riscv64_data_init)
.global DECL(randomx_riscv64_fix_data_call)
.global DECL(randomx_riscv64_prologue)
.global DECL(randomx_riscv64_loop_begin)
.global DECL(randomx_riscv64_data_read)
.global DECL(randomx_riscv64_data_read_light)
.global DECL(randomx_riscv64_fix_loop_call)
.global DECL(randomx_riscv64_spad_store)
.global DECL(randomx_riscv64_spad_store_hardaes)
.global DECL(randomx_riscv64_spad_store_softaes)
.global DECL(randomx_riscv64_loop_end)
.global DECL(randomx_riscv64_fix_continue_loop)
.global DECL(randomx_riscv64_epilogue)
.global DECL(randomx_riscv64_softaes)
.global DECL(randomx_riscv64_program_end)
.global DECL(randomx_riscv64_ssh_init)
.global DECL(randomx_riscv64_ssh_load)
.global DECL(randomx_riscv64_ssh_prefetch)
.global DECL(randomx_riscv64_ssh_end)

/* The literal pool can fit at most 494 IMUL_RCP literals */
#if RANDOMX_PROGRAM_SIZE > 494
#error RANDOMX_PROGRAM_SIZE larger than 494 is not supported.
#endif

#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1)

/* shared literal pool: 4 KB */

/* space for 256 IMUL_RCP literals -2048 */
/* filled by JIT compiler */
DECL(randomx_riscv64_literals):
literal_pool:
    /* SuperscalarHash constants +0 */
    .dword 6364136223846793005
    .dword 9298411001130361340
    .dword 12065312585734608966
    .dword 9306329213124626780
    .dword 5281919268842080866
    .dword 10536153434571861004
    .dword 3398623926847679864
    .dword 9549104520008361294
    /* CFROUND lookup table +64 */
    .word 0x00000000 /* RTN */
    .word 0x00000002 /* RDN */
    .word 0x00000003 /* RUP */
    .word 0x00000001 /* RTZ */
    /* mask literals +80,+84,+88,+92,+96,+104 */
    .word (RANDOMX_SCRATCHPAD_L1-8)
    .word (RANDOMX_SCRATCHPAD_L2-8)
    .word (RANDOMX_SCRATCHPAD_L3-64)
    .word (RANDOMX_DATASET_BASE_SIZE-64)
    .dword 0x80f0000000000000
    .dword 0x00ffffffffffffff
DECL(randomx_riscv64_literals_end):
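/*
    For orientation, the shared pool can be viewed through the following
    C overlay (illustrative only: the struct and field names are not part
    of this source, but the offsets match the comments above and below):

    struct literal_pool_t {                  // offset from literal_pool
        uint64_t superscalar_consts[8];      //    +0
        uint32_t cfround_lut[4];             //   +64
        uint32_t mask_l1, mask_l2;           //   +80, +84
        uint32_t mask_l3, mask_dataset;      //   +88, +92
        uint64_t fscal_mask;                 //   +96
        uint64_t e_clear_mask;               //  +104
        uint64_t e_set_masks[2];             //  +112
        uint64_t softaes_tables[2];          //  +128
        uint64_t imul_rcp[238];              //  +144
    };

    Note on RANDOMX_CACHE_MASK: RANDOMX_ARGON_MEMORY counts 1 KiB Argon2
    blocks and each cache item is 64 bytes, so the cache holds
    RANDOMX_ARGON_MEMORY*16 items; the mask selects one item index.
*/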
    /* E reg. set masks, +112,+120 */
    .dword 0 /* filled by JIT compiler */
    .dword 0 /* filled by JIT compiler */
    /* soft AES table addresses, +128,+136 */
    .dword 0 /* filled by JIT compiler */
    .dword 0 /* filled by JIT compiler */
    /* space for 238 IMUL_RCP literals, +144 */
    .fill 238,8,0 /* filled by JIT compiler */

/* ================================= */
/* Dataset init function entry point */
/* ================================= */

/* Register allocation:
   ----------------------
   x0  -> zero
   x1  -> temp/return address
   x2  -> stack pointer (sp)
   x3  -> literal pool pointer
   x5  -> dataset pointer
   x6  -> cache pointer
   x7  -> temp/itemNumber
   x8-x15 -> SuperscalarHash registers
   x16 -> itemNumber
   x17 -> endItem
   x28-x31 -> temp

   Stack layout:
   ------------------------
   sp+ 0 -> return address
       8 -> saved x3
      16 -> saved x8-x9
      32 -> caller stack
*/
DECL(randomx_riscv64_data_init):
    addi sp, sp, -32
    /* dataset ptr */
    mv x5, x11
    /* cache->memory */
    ld x6, 0(x10)
    /* callee saved registers */
    sd x1, 0(sp)
    sd x3, 8(sp)
    /* literal pool */
    lla x3, literal_pool
    sd x8, 16(sp)
    sd x9, 24(sp)
    /* startItem */
    mv x16, x12
    /* endItem */
    mv x17, x13
init_item:
    mv x7, x16
DECL(randomx_riscv64_fix_data_call):
    jal superscalar_hash /* JIT compiler will adjust the offset */
    sd x8, 0(x5)
    sd x9, 8(x5)
    sd x10, 16(x5)
    sd x11, 24(x5)
    sd x12, 32(x5)
    sd x13, 40(x5)
    sd x14, 48(x5)
    sd x15, 56(x5)
    addi x5, x5, 64
    addi x16, x16, 1
    bltu x16, x17, init_item
    ld x1, 0(sp)
    ld x3, 8(sp)
    ld x8, 16(sp)
    ld x9, 24(sp)
    addi sp, sp, 32
    ret
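/*
    The routine above is roughly the following C function. The prototype
    is an assumption inferred from the argument registers actually used
    (x10 = cache, x11 = dataset write pointer, x12 = startItem,
    x13 = endItem); "cache->memory" reflects only that the memory pointer
    sits at offset 0 of the structure x10 points to. superscalar_hash is
    the JIT-patched call below, shown here as an ordinary function:

    void randomx_riscv64_data_init(cache_t* cache, uint8_t* dataset,
                                   uint64_t startItem, uint64_t endItem) {
        for (uint64_t i = startItem; i < endItem; i++) {
            uint64_t r[8];
            superscalar_hash(cache->memory, i, r);
            memcpy(dataset + 64 * (i - startItem), r, 64);
        }
    }
*/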
/* ====================================== */
/* Program execution function entry point */
/* ====================================== */

/* Register allocation:
   ----------------------
   x0  -> zero
   x1  -> temp/scratchpad L3 mask
   x2  -> stack pointer (sp)
   x3  -> literal pool pointer
   x5  -> scratchpad pointer
   x6  -> dataset/cache pointer
   x7  -> temp/next dataset access
   x8  -> temp
   x9  -> temp
   x10 -> scratchpad L1 mask (0x0000000000003ff8)
   x11 -> scratchpad L2 mask (0x000000000003fff8)
   x12 -> FSCAL_R mask (0x80f0000000000000)
   x13 -> E reg. clear mask (0x00ffffffffffffff)
   x14 -> E reg. set mask (0x3*00000000******)
   x15 -> E reg. set mask (0x3*00000000******)
   x16-x23 -> VM registers "r0"-"r7"
   x24 -> iteration counter "ic"
   x25 -> VM registers "mx", "ma"
   x26 -> spAddr0
   x27 -> spAddr1
   x28-x31 -> temp/literals for IMUL_RCP (4x)

   (Note: We avoid using x4 because it breaks debugging with gdb.)

   f0-f7   -> VM registers "f0"-"f3"
   f8-f15  -> VM registers "e0"-"e3"
   f16-f23 -> VM registers "a0"-"a3"
   f24-f25 -> temp
   f26-f31 -> literals for IMUL_RCP (6x)

   Stack layout:
   ------------------------
   sp+ 0 -> return address
       8 -> register file ptr
      16 -> saved x3-x4
      32 -> saved x8-x9
      48 -> saved x18-x27
     128 -> saved f8-f9
     144 -> saved f18-f27
     224 -> caller stack
*/
DECL(randomx_riscv64_prologue):
    addi sp, sp, -224
    /* scratchpad pointer */
    mv x5, x12
    /* register file pointer */
    sd x10, 8(sp)
    /* callee saved registers */
    sd x3, 16(sp)
    sd x8, 32(sp)
    sd x9, 40(sp)
    sd x18, 48(sp)
    sd x19, 56(sp)
    sd x20, 64(sp)
    sd x21, 72(sp)
    sd x22, 80(sp)
    sd x23, 88(sp)
    sd x24, 96(sp)
    sd x25, 104(sp)
    sd x26, 112(sp)
    sd x27, 120(sp)
    fsd f8, 128(sp)
    fsd f9, 136(sp)
    fsd f18, 144(sp)
    fsd f19, 152(sp)
    fsd f20, 160(sp)
    fsd f21, 168(sp)
    fsd f22, 176(sp)
    fsd f23, 184(sp)
    fsd f24, 192(sp)
    fsd f25, 200(sp)
    fsd f26, 208(sp)
    fsd f27, 216(sp)
    /* iteration counter */
    mv x24, x13
    /* return address */
    sd x1, 0(sp)
    /* literal pool */
    lla x3, literal_pool
    /* load (ma, mx) */
    ld x25, 0(x11)
    /* dataset ptr */
    ld x6, 8(x11)
    /* load dataset mask */
    lwu x1, 92(x3)
    /* zero registers r0-r3, load a0-a1 */
    li x16, 0
    fld f16, 192(x10)
    li x17, 0
    fld f17, 200(x10)
    srli x7, x25, 32 /* x7 = ma */
    li x18, 0
    fld f18, 208(x10)
    mv x27, x7 /* x27 = ma */
    li x19, 0
    fld f19, 216(x10)
    /* set dataset read address */
    and x7, x7, x1
    add x7, x7, x6
    /* zero registers r4-r7, load a2-a3 */
    li x20, 0
    fld f20, 224(x10)
    li x21, 0
    fld f21, 232(x10)
    li x22, 0
    fld f22, 240(x10)
    li x23, 0
    fld f23, 248(x10)
    /* load L3 mask */
    lwu x1, 88(x3)
    /* load scratchpad masks */
    lwu x10, 80(x3)
    lwu x11, 84(x3)
    /* set spAddr0, spAddr1 */
    and x26, x25, x1
    and x27, x27, x1
    add x26, x26, x5
    add x27, x27, x5
    /* align L3 mask */
    addi x1, x1, 56
    /* FSCAL, E reg. masks */
    ld x12, 96(x3)
    ld x13, 104(x3)
    ld x14, 112(x3)
    ld x15, 120(x3)
    /* IMUL_RCP literals */
    fld f26, 176(x3)
    fld f27, 184(x3)
    fld f28, 192(x3)
    fld f29, 200(x3)
    fld f30, 208(x3)
    fld f31, 216(x3)
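/*
    At this point the loop state is fully set up. In a C-like sketch
    (names are illustrative; x11 on entry points to a structure holding
    the packed (ma, mx) word at offset 0 and the dataset pointer at
    offset 8, per the two loads above; "ma" is the upper 32 bits):

    uint64_t ma_mx = mem->ma_mx;                                  // x25
    const uint8_t* datasetLine =
        dataset + ((ma_mx >> 32) & maskDataset);                  // x7
    uint8_t* spAddr0 = scratchpad + ((uint32_t)ma_mx & maskL3);   // x26
    uint8_t* spAddr1 = scratchpad + ((ma_mx >> 32) & maskL3);     // x27
*/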
.balign 4
DECL(randomx_riscv64_loop_begin):
loop_begin:
    /* mix integer registers */
    ld x8, 0(x26)
    ld x9, 8(x26)
    ld x30, 16(x26)
    ld x31, 24(x26)
    xor x16, x16, x8
    ld x8, 32(x26)
    xor x17, x17, x9
    ld x9, 40(x26)
    xor x18, x18, x30
    ld x30, 48(x26)
    xor x19, x19, x31
    ld x31, 56(x26)
    xor x20, x20, x8
    lw x8, 0(x27)
    xor x21, x21, x9
    lw x9, 4(x27)
    xor x22, x22, x30
    lw x30, 8(x27)
    xor x23, x23, x31
    lw x31, 12(x27)
    /* load F registers */
    fcvt.d.w f0, x8
    lw x8, 16(x27)
    fcvt.d.w f1, x9
    lw x9, 20(x27)
    fcvt.d.w f2, x30
    lw x30, 24(x27)
    fcvt.d.w f3, x31
    lw x31, 28(x27)
    fcvt.d.w f4, x8
    lw x8, 32(x27)
    fcvt.d.w f5, x9
    lw x9, 36(x27)
    fcvt.d.w f6, x30
    lw x30, 40(x27)
    fcvt.d.w f7, x31
    lw x31, 44(x27)
    /* load E registers */
    fcvt.d.w f8, x8
    lw x8, 48(x27)
    fcvt.d.w f9, x9
    lw x9, 52(x27)
    fcvt.d.w f10, x30
    lw x30, 56(x27)
    fcvt.d.w f11, x31
    lw x31, 60(x27)
    fcvt.d.w f12, x8
    fmv.x.d x8, f8
    fcvt.d.w f13, x9
    fmv.x.d x9, f9
    fcvt.d.w f14, x30
    fmv.x.d x30, f10
    fcvt.d.w f15, x31
    fmv.x.d x31, f11
    and x8, x8, x13
    and x9, x9, x13
    or x8, x8, x14
    or x9, x9, x15
    and x30, x30, x13
    and x31, x31, x13
    or x30, x30, x14
    or x31, x31, x15
    fmv.d.x f8, x8
    fmv.d.x f9, x9
    fmv.d.x f10, x30
    fmv.d.x f11, x31
    fmv.x.d x8, f12
    fmv.x.d x9, f13
    fmv.x.d x30, f14
    fmv.x.d x31, f15
    and x8, x8, x13
    and x9, x9, x13
    or x8, x8, x14
    or x9, x9, x15
    fmv.d.x f12, x8
    fmv.d.x f13, x9
    and x30, x30, x13
    and x31, x31, x13
    or x30, x30, x14
    or x31, x31, x15
    fmv.d.x f14, x30
    fmv.d.x f15, x31
    /* reload clobbered IMUL_RCP regs */
    ld x28, 144(x3)
    ld x29, 152(x3)
    ld x30, 160(x3)
    ld x31, 168(x3)

DECL(randomx_riscv64_data_read):
    xor x8, x20, x22 /* JIT compiler will adjust the registers */
    /* load dataset mask */
    lwu x1, 92(x3)
    /* zero-extend x8 */
#ifdef __riscv_zba
    zext.w x8, x8
#else
    slli x8, x8, 32
    srli x8, x8, 32
#endif
    /* update "mx" */
    xor x25, x25, x8
    /* read dataset and update registers */
    ld x8, 0(x7)
    ld x9, 8(x7)
    ld x30, 16(x7)
    ld x31, 24(x7)
    xor x16, x16, x8
    ld x8, 32(x7)
    xor x17, x17, x9
    ld x9, 40(x7)
    xor x18, x18, x30
    ld x30, 48(x7)
    xor x19, x19, x31
    ld x31, 56(x7)
    xor x20, x20, x8
    /* calculate the next dataset address */
    and x7, x25, x1
    xor x21, x21, x9
    add x7, x7, x6
    xor x22, x22, x30
    /* prefetch - doesn't seem to have any effect */
    /* ld x0, 0(x7) */
    xor x23, x23, x31
    /* swap mx <-> ma */
#ifdef __riscv_zbb
    rori x25, x25, 32
#else
    srli x9, x25, 32
    slli x25, x25, 32
    or x25, x25, x9
#endif

DECL(randomx_riscv64_data_read_light):
    xor x8, x20, x22 /* JIT compiler will adjust the registers */
    /* load dataset offset */
    lui x9, 0x02000 /* JIT compiler will adjust the immediate */
    addi x9, x9, -64
    /* load dataset mask */
    lwu x1, 92(x3)
    /* swap mx <-> ma */
#ifdef __riscv_zbb
    rori x25, x25, 32
#else
    srli x31, x25, 32
    slli x25, x25, 32
    or x25, x25, x31
#endif
    slli x8, x8, 32
    /* update "mx" */
    xor x25, x25, x8
    /* the next dataset item */
    and x7, x25, x1
    srli x7, x7, 6
    add x7, x7, x9
DECL(randomx_riscv64_fix_loop_call):
    jal superscalar_hash /* JIT compiler will adjust the offset */
    xor x16, x16, x8
    xor x17, x17, x9
    xor x18, x18, x10
    xor x19, x19, x11
    xor x20, x20, x12
    xor x21, x21, x13
    xor x22, x22, x14
    xor x23, x23, x15
    /* restore clobbered registers */
    lwu x10, 80(x3)
    lwu x11, 84(x3)
    ld x12, 96(x3)
    ld x13, 104(x3)
    ld x14, 112(x3)
    ld x15, 120(x3)
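/*
    The fast-mode dataset access above follows the RandomX per-iteration
    pattern; as a C-like sketch (the two mixed registers are chosen by
    the JIT per program, hence "JIT compiler will adjust the registers";
    the template defaults to r4 ^ r6, i.e. x20 ^ x22):

    mx ^= (uint32_t)(r[a] ^ r[b]);               // update "mx"
    for (int i = 0; i < 8; i++)                  // 64-byte read at "ma"
        r[i] ^= load64(datasetLine + 8 * i);
    datasetLine = dataset + (mx & maskDataset);  // next line address
    swap(ma, mx);                                // rori x25, 32

    The light-mode variant below computes the item number instead and
    calls superscalar_hash to materialize the dataset line on the fly.
*/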
DECL(randomx_riscv64_spad_store):
    /* store integer registers */
    sd x16, 0(x27)
    sd x17, 8(x27)
    sd x18, 16(x27)
    sd x19, 24(x27)
    sd x20, 32(x27)
    sd x21, 40(x27)
    sd x22, 48(x27)
    sd x23, 56(x27)
    /* XOR and store f0,e0 */
    fmv.x.d x8, f0
    fmv.x.d x9, f8
    fmv.x.d x30, f1
    fmv.x.d x31, f9
    xor x8, x8, x9
    xor x30, x30, x31
    sd x8, 0(x26)
    fmv.d.x f0, x8
    sd x30, 8(x26)
    fmv.d.x f1, x30
    /* XOR and store f1,e1 */
    fmv.x.d x8, f2
    fmv.x.d x9, f10
    fmv.x.d x30, f3
    fmv.x.d x31, f11
    xor x8, x8, x9
    xor x30, x30, x31
    sd x8, 16(x26)
    fmv.d.x f2, x8
    sd x30, 24(x26)
    fmv.d.x f3, x30
    /* XOR and store f2,e2 */
    fmv.x.d x8, f4
    fmv.x.d x9, f12
    fmv.x.d x30, f5
    fmv.x.d x31, f13
    xor x8, x8, x9
    xor x30, x30, x31
    sd x8, 32(x26)
    fmv.d.x f4, x8
    sd x30, 40(x26)
    fmv.d.x f5, x30
    /* XOR and store f3,e3 */
    fmv.x.d x8, f6
    fmv.x.d x9, f14
    fmv.x.d x30, f7
    fmv.x.d x31, f15
    xor x8, x8, x9
    xor x30, x30, x31
    sd x8, 48(x26)
    fmv.d.x f6, x8
    sd x30, 56(x26)
    fmv.d.x f7, x30

DECL(randomx_riscv64_spad_store_hardaes):
    nop /* not implemented */

DECL(randomx_riscv64_spad_store_softaes):
    /* store integer registers */
    sd x16, 0(x27)
    sd x17, 8(x27)
    sd x18, 16(x27)
    sd x19, 24(x27)
    sd x20, 32(x27)
    sd x21, 40(x27)
    sd x22, 48(x27)
    sd x23, 56(x27)
    /* process f0 with 4 AES rounds */
    fmv.x.d x8, f8
    fmv.x.d x10, f9
    fmv.x.d x30, f0
    fmv.x.d x31, f1
    jal softaes_enc
    fmv.x.d x8, f10
    fmv.x.d x10, f11
    jal softaes_enc
    fmv.x.d x8, f12
    fmv.x.d x10, f13
    jal softaes_enc
    fmv.x.d x8, f14
    fmv.x.d x10, f15
    jal softaes_enc
    sd x30, 0(x26)
    fmv.d.x f0, x30
    sd x31, 8(x26)
    fmv.d.x f1, x31
    /* process f1 with 4 AES rounds */
    fmv.x.d x8, f8
    fmv.x.d x10, f9
    fmv.x.d x30, f2
    fmv.x.d x31, f3
    jal softaes_dec
    fmv.x.d x8, f10
    fmv.x.d x10, f11
    jal softaes_dec
    fmv.x.d x8, f12
    fmv.x.d x10, f13
    jal softaes_dec
    fmv.x.d x8, f14
    fmv.x.d x10, f15
    jal softaes_dec
    sd x30, 16(x26)
    fmv.d.x f2, x30
    sd x31, 24(x26)
    fmv.d.x f3, x31
    /* process f2 with 4 AES rounds */
    fmv.x.d x8, f8
    fmv.x.d x10, f9
    fmv.x.d x30, f4
    fmv.x.d x31, f5
    jal softaes_enc
    fmv.x.d x8, f10
    fmv.x.d x10, f11
    jal softaes_enc
    fmv.x.d x8, f12
    fmv.x.d x10, f13
    jal softaes_enc
    fmv.x.d x8, f14
    fmv.x.d x10, f15
    jal softaes_enc
    sd x30, 32(x26)
    fmv.d.x f4, x30
    sd x31, 40(x26)
    fmv.d.x f5, x31
    /* process f3 with 4 AES rounds */
    fmv.x.d x8, f8
    fmv.x.d x10, f9
    fmv.x.d x30, f6
    fmv.x.d x31, f7
    jal softaes_dec
    fmv.x.d x8, f10
    fmv.x.d x10, f11
    jal softaes_dec
    fmv.x.d x8, f12
    fmv.x.d x10, f13
    jal softaes_dec
    fmv.x.d x8, f14
    fmv.x.d x10, f15
    jal softaes_dec
    sd x30, 48(x26)
    fmv.d.x f6, x30
    sd x31, 56(x26)
    fmv.d.x f7, x31
    /* restore clobbered registers */
    lwu x10, 80(x3)
    lwu x11, 84(x3)
    ld x12, 96(x3)
    ld x13, 104(x3)
    ld x14, 112(x3)
    ld x15, 120(x3)

DECL(randomx_riscv64_loop_end):
    xor x26, x16, x18 /* JIT compiler will adjust the registers */
    /* load L3 mask */
    lwu x1, 88(x3)
    addi x24, x24, -1
    srli x27, x26, 32
    /* set spAddr0, spAddr1 */
    and x26, x26, x1
    and x27, x27, x1
    add x26, x26, x5
    add x27, x27, x5
    /* align L3 mask */
    addi x1, x1, 56
    /* conditional branch doesn't have sufficient range */
    j condition_check
DECL(randomx_riscv64_fix_continue_loop):
continue_loop:
    .word 0 /* JIT compiler will write a jump to loop_begin */
condition_check:
    bnez x24, continue_loop
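/*
    continue_loop holds a placeholder word; the JIT overwrites it with a
    "jal x0, loop_begin" whose +/-1 MiB range covers the whole generated
    program, while bnez alone only reaches +/-4 KiB. A sketch of the
    J-type word the JIT has to produce (standard RV64I encoding; the
    function name is illustrative):

    uint32_t encode_jal_x0(int32_t offset) { // offset from the jal itself
        uint32_t imm = (uint32_t)offset;
        return (((imm >> 20) & 1) << 31)     // imm[20]
             | (((imm >> 1) & 0x3ff) << 21)  // imm[10:1]
             | (((imm >> 11) & 1) << 20)     // imm[11]
             | (((imm >> 12) & 0xff) << 12)  // imm[19:12]
             | 0x6f;                         // rd = x0, opcode = JAL
    }
*/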
DECL(randomx_riscv64_epilogue):
    /* restore callee saved registers */
    ld x10, 8(sp)
    ld x1, 0(sp)
    ld x3, 16(sp)
    ld x8, 32(sp)
    ld x9, 40(sp)
    ld x24, 96(sp)
    ld x25, 104(sp)
    ld x26, 112(sp)
    ld x27, 120(sp)
    fld f18, 144(sp)
    fld f19, 152(sp)
    fld f20, 160(sp)
    fld f21, 168(sp)
    fld f22, 176(sp)
    fld f23, 184(sp)
    fld f24, 192(sp)
    fld f25, 200(sp)
    fld f26, 208(sp)
    fld f27, 216(sp)
    /* save VM registers */
    sd x16, 0(x10)
    sd x17, 8(x10)
    sd x18, 16(x10)
    sd x19, 24(x10)
    sd x20, 32(x10)
    sd x21, 40(x10)
    sd x22, 48(x10)
    sd x23, 56(x10)
    fsd f0, 64(x10)
    fsd f1, 72(x10)
    fsd f2, 80(x10)
    fsd f3, 88(x10)
    fsd f4, 96(x10)
    fsd f5, 104(x10)
    fsd f6, 112(x10)
    fsd f7, 120(x10)
    fsd f8, 128(x10)
    fsd f9, 136(x10)
    fsd f10, 144(x10)
    fsd f11, 152(x10)
    fsd f12, 160(x10)
    fsd f13, 168(x10)
    fsd f14, 176(x10)
    fsd f15, 184(x10)
    /* restore callee saved registers */
    ld x18, 48(sp)
    ld x19, 56(sp)
    ld x20, 64(sp)
    ld x21, 72(sp)
    ld x22, 80(sp)
    ld x23, 88(sp)
    fld f8, 128(sp)
    fld f9, 136(sp)
    /* restore stack pointer */
    addi sp, sp, 224
    /* return */
    ret
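/*
    The two subroutines below perform one software AES round per 16-byte
    column using 32-bit lookup tables: x13 points into a table area and
    the lwu offsets (-2048, -1024 via x12 = x13 - 2048 plus 1024, 0 and
    +1024) appear to address four consecutive 1 KiB word tables around
    x13. Conceptually each output word is the classic T-table form (a
    sketch of the formulation, not the exact scheduling used below):

    uint32_t aes_column(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3,
                        const uint32_t t[4][256], uint32_t key) {
        return t[0][b0] ^ t[1][b1] ^ t[2][b2] ^ t[3][b3] ^ key;
    }
*/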
/* Soft AES subroutines
   in:
       x3 = literal pool
       x8, x10 = round key
       x30, x31 = plaintext
   out:
       x30, x31 = ciphertext
   clobbers:
       x8-x11 (limbs)
       x12-x13 (LUTs)
       x14-x15 (temp)
*/
DECL(randomx_riscv64_softaes):
softaes_enc:
    /* enc. lookup table */
    ld x13, 128(x3)
    /* load the round key into x8, x9, x10, x11 */
    srli x9, x8, 32
    srli x11, x10, 32
#ifdef __riscv_zba
    zext.w x8, x8
    zext.w x10, x10
#else
    slli x8, x8, 32
    slli x10, x10, 32
    srli x8, x8, 32
    srli x10, x10, 32
#endif
    /* byte 0 */
    andi x14, x30, 255
    srli x30, x30, 8
    addi x12, x13, -2048
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, -2048(x14)
    /* byte 1 */
    andi x15, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x8, x8, x14
    /* byte 2 */
    andi x14, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x11, x11, x15
    /* byte 3 */
    andi x15, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x10, x10, x14
    /* byte 4 */
    andi x14, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x14, x14, x12
#else
    slli x14, x14, 2
    add x14, x14, x12
#endif
    lwu x14, 0(x14)
    xor x9, x9, x15
    /* byte 5 */
    andi x15, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x9, x9, x14
    /* byte 6 */
    andi x14, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x8, x8, x15
    /* byte 7 */
    andi x15, x30, 255
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x11, x11, x14
    /* byte 8 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x12
#else
    slli x14, x14, 2
    add x14, x14, x12
#endif
    lwu x14, 0(x14)
    xor x10, x10, x15
    /* byte 9 */
    andi x15, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x10, x10, x14
    /* byte 10 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x9, x9, x15
    /* byte 11 */
    andi x15, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x8, x8, x14
    /* byte 12 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x12
#else
    slli x14, x14, 2
    add x14, x14, x12
#endif
    lwu x14, 0(x14)
    xor x11, x11, x15
    /* byte 13 */
    andi x15, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x11, x11, x14
    /* byte 14 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x10, x10, x15
    /* byte 15 */
    andi x15, x31, 255
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x9, x9, x14
    slli x11, x11, 32
    slli x9, x9, 32
    or x30, x8, x9
    or x31, x10, x11
    xor x30, x30, x15
    ret

softaes_dec:
    /* dec. lookup table */
    ld x13, 136(x3)
    /* load the round key into x8, x9, x10, x11 */
    srli x9, x8, 32
    srli x11, x10, 32
#ifdef __riscv_zba
    zext.w x8, x8
    zext.w x10, x10
#else
    slli x8, x8, 32
    slli x10, x10, 32
    srli x8, x8, 32
    srli x10, x10, 32
#endif
    /* byte 0 */
    andi x14, x30, 255
    srli x30, x30, 8
    addi x12, x13, -2048
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, -2048(x14)
    /* byte 1 */
    andi x15, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x8, x8, x14
    /* byte 2 */
    andi x14, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x9, x9, x15
    /* byte 3 */
    andi x15, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x10, x10, x14
    /* byte 4 */
    andi x14, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x14, x14, x12
#else
    slli x14, x14, 2
    add x14, x14, x12
#endif
    lwu x14, 0(x14)
    xor x11, x11, x15
    /* byte 5 */
    andi x15, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x9, x9, x14
    /* byte 6 */
    andi x14, x30, 255
    srli x30, x30, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x10, x10, x15
    /* byte 7 */
    andi x15, x30, 255
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x11, x11, x14
    /* byte 8 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x12
#else
    slli x14, x14, 2
    add x14, x14, x12
#endif
    lwu x14, 0(x14)
    xor x8, x8, x15
    /* byte 9 */
    andi x15, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x10, x10, x14
    /* byte 10 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x11, x11, x15
    /* byte 11 */
    andi x15, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x8, x8, x14
    /* byte 12 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x12
#else
    slli x14, x14, 2
    add x14, x14, x12
#endif
    lwu x14, 0(x14)
    xor x9, x9, x15
    /* byte 13 */
    andi x15, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x15, x15, x12
#else
    slli x15, x15, 2
    add x15, x15, x12
#endif
    lwu x15, 1024(x15)
    xor x11, x11, x14
    /* byte 14 */
    andi x14, x31, 255
    srli x31, x31, 8
#ifdef __riscv_zba
    sh2add x14, x14, x13
#else
    slli x14, x14, 2
    add x14, x14, x13
#endif
    lwu x14, 0(x14)
    xor x8, x8, x15
    /* byte 15 */
    andi x15, x31, 255
#ifdef __riscv_zba
    sh2add x15, x15, x13
#else
    slli x15, x15, 2
    add x15, x15, x13
#endif
    lwu x15, 1024(x15)
    xor x9, x9, x14
    slli x11, x11, 32
    slli x9, x9, 32
    or x30, x8, x9
    or x31, x10, x11
    xor x31, x31, x15
    ret

DECL(randomx_riscv64_program_end):
    nop

/* literal pool for SuperscalarHash */
/* space for remaining IMUL_RCP literals */
ssh_literal_pool:
    /* space for 256 IMUL_RCP literals */
    .fill 256,8,0
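/*
    The subroutine below starts each dataset item from a fixed affine
    state derived from the item number and the eight constants at
    literal_pool+0. As a sketch (the constant array name is illustrative;
    the values are the .dword constants at the top of this file):

    r[0] = (itemNumber + 1) * superscalarConst[0];
    for (int i = 1; i < 8; i++)
        r[i] = superscalarConst[i] ^ r[0];
*/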
/* SuperscalarHash subroutine
   in:
       x3 = literal pool
       x6 = cache
       x7 = itemNumber
   out:
       x8-x15 = 64-byte hash
   clobbers:
       x7, x28-x31
*/
DECL(randomx_riscv64_ssh_init):
superscalar_hash:
    ld x30, 0(x3) /* superscalarMul0 */
    addi x8, x7, 1
    ld x9, 8(x3)
    li x31, RANDOMX_CACHE_MASK
    ld x10, 16(x3)
    ld x11, 24(x3)
    mul x8, x8, x30
    ld x12, 32(x3)
    ld x13, 40(x3)
    lla x30, ssh_literal_pool
    ld x14, 48(x3)
    and x7, x7, x31
    ld x15, 56(x3)
    slli x7, x7, 6
    xor x9, x9, x8
    add x7, x7, x6
    xor x10, x10, x8
    /* load the first IMUL_RCP literal */
    ld x31, 2040(x30)
    xor x11, x11, x8
    xor x12, x12, x8
    xor x13, x13, x8
    xor x14, x14, x8
    xor x15, x15, x8
DECL(randomx_riscv64_ssh_load):
    ld x28, 0(x7)
    ld x29, 8(x7)
    xor x8, x8, x28
    ld x28, 16(x7)
    xor x9, x9, x29
    ld x29, 24(x7)
    xor x10, x10, x28
    ld x28, 32(x7)
    xor x11, x11, x29
    ld x29, 40(x7)
    xor x12, x12, x28
    ld x28, 48(x7)
    xor x13, x13, x29
    ld x29, 56(x7)
    xor x14, x14, x28
    li x7, RANDOMX_CACHE_MASK
    xor x15, x15, x29
DECL(randomx_riscv64_ssh_prefetch):
    and x7, x8, x7 /* JIT compiler will adjust the register */
    slli x7, x7, 6
    add x7, x7, x6
    /* prefetch - doesn't seem to have any effect */
    /* ld x0, 0(x7) */
DECL(randomx_riscv64_ssh_end):
    nop
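/*
    Putting the template pieces together, one dataset item is produced
    roughly as follows (a sketch following the RandomX dataset
    construction; ssh_init_registers, superscalar_program, load64 and
    k(pass) are illustrative names for the pieces above, with the
    JIT-generated SuperscalarHash programs running between ssh_load and
    ssh_prefetch on each of the RANDOMX_CACHE_ACCESSES passes):

    void dataset_item(uint64_t r[8], const uint8_t* cache,
                      uint64_t itemNumber) {
        ssh_init_registers(r, itemNumber);               // ssh_init
        uint64_t addr = (itemNumber & RANDOMX_CACHE_MASK) * 64;
        for (int pass = 0; pass < RANDOMX_CACHE_ACCESSES; pass++) {
            superscalar_program(pass, r);                // JIT-generated
            for (int i = 0; i < 8; i++)                  // ssh_load
                r[i] ^= load64(cache + addr + 8 * i);
            addr = (r[k(pass)] & RANDOMX_CACHE_MASK) * 64; // ssh_prefetch
        }
        // the final r[0..7] is the 64-byte dataset item
    }
*/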