ALU and FPU tests

random-access
tevador 6 years ago
parent ec2d378fce
commit f19995d4c5

@ -205,7 +205,7 @@ The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`im
|22|FSUB|A - B|
|22|FMUL|A * B|
|8|FDIV|A / B|
|6|FSQRT|sqrt(A)|
|6|FABSQRT|sqrt(A)|
|2|FROUND|A|
FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is RN (Round to Nearest). Denormal values may not be produced by any operation.
@ -214,8 +214,8 @@ FPU instructions conform to the IEEE-754 specification, so they must give correc
Operands loaded from memory are treated as signed 64-bit integers and converted to double precision floating point format. Operands loaded from floating point registers are used directly.
##### FSQRT
The sign bit of the FSQRT operand is always cleared first, so only non-negative values are used.
##### FABSQRT
The sign bit of the FABSQRT operand is always cleared first, so only non-negative values are used.
*In x86, the `SQRTSD` instruction must be used. The legacy `FSQRT` instruction doesn't produce correctly rounded results in all cases.*
@ -225,11 +225,11 @@ The FROUND instruction changes the rounding mode for all subsequent FPU operatio
|A[1:0]|rounding mode|
|-------|------------|
|00|Round to Nearest (RN) mode|
|01|Round towards Plus Infinity (RP) mode
|10|Round towards Minus Infinity (RM) mode
|01|Round towards Minus Infinity (RM) mode
|10|Round towards Plus Infinity (RP) mode
|11|Round towards Zero (RZ) mode
*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 22-23 of the ARM `FPSCR` register.*
*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 23 and 22 (reversed) of the ARM `FPSCR` register.*
### Control flow instructions
The following 2 control flow instructions are supported:

@ -0,0 +1,69 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include <cstdint>
namespace RandomX {
// Rounding-mode selector values consumed by FROUND. They must match the
// two-bit encoding of the RandomX specification (0 = nearest, 1 = down,
// 2 = up, 3 = toward zero).
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
// A single 64-bit machine word viewable as a double or as signed/unsigned
// integers of either width without explicit casting.
typedef union {
double f64;
int64_t i64;
uint64_t u64;
int32_t i32;
uint32_t u32;
} convertible_t;
// Instruction implementations. Each takes operands A and B and writes the
// result to C. Declared extern "C" so the names and calling convention
// match the hand-written assembly implementations; signatures must stay
// in sync with those implementations.
extern "C" {
void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c);
void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c);
void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c);
void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c);
void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c);
void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
void AND_64(convertible_t& a, convertible_t& b, convertible_t& c);
void AND_32(convertible_t& a, convertible_t& b, convertible_t& c);
void OR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void OR_32(convertible_t& a, convertible_t& b, convertible_t& c);
void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c);
void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c);
void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c);
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
// Resets the FPU to the initial state (round-to-nearest).
void FPINIT();
// FPU primitives: the second operand is already a double.
void FADD_64(convertible_t& a, double b, convertible_t& c);
void FSUB_64(convertible_t& a, double b, convertible_t& c);
void FMUL_64(convertible_t& a, double b, convertible_t& c);
void FDIV_64(convertible_t& a, double b, convertible_t& c);
void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
void FROUND(convertible_t& a, convertible_t& b, convertible_t& c);
// Convenience wrappers: convert operand B from a signed 64-bit integer
// to double (per the spec, memory operands are treated as signed
// integers) and forward to the *_64 primitive.
inline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
FADD_64(a, (double)b.i64, c);
}
inline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
FSUB_64(a, (double)b.i64, c);
}
inline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
FMUL_64(a, (double)b.i64, c);
}
inline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
FDIV_64(a, (double)b.i64, c);
}
}
}

@ -0,0 +1,248 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include "Instructions.h"
#include <cfenv>
#include <cmath>
#if defined(__SIZEOF_INT128__)
// Use the compiler-provided 128-bit integers to get the high half of a
// 64x64-bit product directly.
typedef unsigned __int128 uint128_t;
typedef __int128 int128_t;
// High 64 bits of the unsigned product a * b.
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
	return ((uint128_t)a * b) >> 64;
}
// High 64 bits of the signed product a * b.
// Fixed: returns int64_t (the result is signed and every caller stores it
// into an int64_t); the original declared uint64_t, forcing a signed ->
// unsigned -> signed round trip at the call sites.
static inline int64_t __imulhi64(int64_t a, int64_t b) {
	return ((int128_t)a * b) >> 64;
}
#define umulhi64 __umulhi64
#define imulhi64 __imulhi64
#endif
#if defined(_MSC_VER)
#include <intrin.h>
#include <stdlib.h>
// MSVC provides 64-bit rotate helpers directly.
#define ror64 _rotr64
#define rol64 _rotl64
// NOTE(review): __MACHINEARM64_X64 / __MACHINEX64 / __MACHINEX86_X64 are
// internal intrin.h feature macros -- confirm they are defined on the
// intended MSVC toolchains.
#ifdef __MACHINEARM64_X64
#define umulhi64 __umulh
#endif
#ifdef __MACHINEX64
// Signed high multiply via _mul128: the intrinsic returns the low half
// and stores the high half through the out parameter.
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
#define imulhi64 __imulhi64
#endif
#ifdef __MACHINEX86_X64
// Arithmetic (sign-extending) 64-bit right-shift intrinsic.
#define sar64 __ll_rshift
#endif
#endif
#ifndef ror64
// Portable right-rotate fallback. The count is reduced mod 64 and the
// zero case is handled explicitly: the naive (a >> b) | (a << (64 - b))
// form shifts by 64 when b == 0, which is undefined behavior in C++.
static inline uint64_t __ror64(uint64_t a, int b) {
	b &= 63;
	return b != 0 ? (a >> b) | (a << (64 - b)) : a;
}
#define ror64 __ror64
#endif
#ifndef rol64
// Portable left-rotate fallback; same mod-64 reduction and explicit
// zero-count handling as __ror64 to avoid the undefined 64-bit shift.
static inline uint64_t __rol64(uint64_t a, int b) {
	b &= 63;
	return b != 0 ? (a << b) | (a >> (64 - b)) : a;
}
#define rol64 __rol64
#endif
#ifndef sar64
#include <type_traits>
// Signed right shift; usable in constant expressions so we can probe the
// implementation's behavior at compile time.
constexpr int64_t builtintShr64(int64_t value, int shift) noexcept {
	return value >> shift;
}
// True when the implementation sign-extends (arithmetic shift) on
// negative signed operands; >> on negative values is
// implementation-defined before C++20.
using usesArithmeticShift = std::integral_constant<bool, builtintShr64(-1LL, 1) == -1LL>;
// Arithmetic right shift: use the native shift when it already
// sign-extends, otherwise emulate the sign extension.
static inline int64_t __sar64(int64_t a, int b) {
	if (usesArithmeticShift::value)
		return builtintShr64(a, b);
	return a < 0 ? ~(~a >> b) : a >> b;
}
#define sar64 __sar64
#endif
#ifndef umulhi64
// Portable 64x64 -> high-64-bit unsigned multiply built from four
// 32x32 -> 64-bit partial products (schoolbook method).
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
uint64_t ah = HI(a), al = LO(a);
uint64_t bh = HI(b), bl = LO(b);
uint64_t x00 = al * bl;
uint64_t x01 = al * bh;
uint64_t x10 = ah * bl;
uint64_t x11 = ah * bh;
// Carry propagation is order-sensitive: m1 sums the middle 32-bit
// column, m2 folds m1's carry into the next column, m3 carries into
// the top word.
uint64_t m1 = LO(x10) + LO(x01) + HI(x00);
uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1);
uint64_t m3 = HI(x11) + HI(m2);
return (m3 << 32) + LO(m2);
}
#define umulhi64 __umulhi64
#endif
#ifndef imulhi64
// Signed high multiply built on the unsigned one: the unsigned high half
// is corrected by subtracting the other operand once for each negative
// input (two's-complement identity).
static inline int64_t __imulhi64(int64_t a, int64_t b) {
	int64_t result = umulhi64(a, b);
	if (a < 0LL)
		result -= b;
	if (b < 0LL)
		result -= a;
	return result;
}
#define imulhi64 __imulhi64
#endif
// Emulates flush-to-zero: subnormal doubles become +0.0; every other
// value (including zero, infinities and NaN) passes through unchanged.
static double FlushDenormal(double x) {
	return std::fpclassify(x) == FP_SUBNORMAL ? 0.0 : x;
}
#define FTZ(x) FlushDenormal(x)
namespace RandomX {
extern "C" {
// 64-bit wrapping addition.
void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	uint64_t sum = a.u64 + b.u64;
	c.u64 = sum;
}
// 32-bit wrapping addition; the 32-bit result is zero-extended to 64 bits.
void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
	uint32_t sum = a.u32 + b.u32;
	c.u64 = sum;
}
// 64-bit wrapping subtraction.
void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	uint64_t diff = a.u64 - b.u64;
	c.u64 = diff;
}
// 32-bit wrapping subtraction, zero-extended to 64 bits.
void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
	uint32_t diff = a.u32 - b.u32;
	c.u64 = diff;
}
// Low 64 bits of the 64x64-bit product.
void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	uint64_t product = a.u64 * b.u64;
	c.u64 = product;
}
// High 64 bits of the unsigned 64x64-bit product.
void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	c.u64 = umulhi64(a.u64, b.u64);
}
// Full 64-bit product of two unsigned 32-bit operands.
void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
	uint64_t lhs = a.u32;
	uint64_t rhs = b.u32;
	c.u64 = lhs * rhs;
}
// Full 64-bit product of two signed 32-bit operands.
void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
	int64_t lhs = a.i32;
	int64_t rhs = b.i32;
	c.i64 = lhs * rhs;
}
// High 64 bits of the signed 64x64-bit product.
void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	c.i64 = imulhi64(a.i64, b.i64);
}
// Unsigned division: 64-bit dividend / 32-bit divisor.
// A zero divisor is replaced by 1, so the result is the dividend.
void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
}
// Signed division: 64-bit dividend / 32-bit divisor (zero -> 1).
// Fixed: the INT64_MIN / -1 overflow guard must test the 32-bit divisor
// actually used (b.i32). The original compared b.i64 == -1, so a divisor
// word of -1 with nonzero upper bits (e.g. b.u64 == 0xFFFFFFFF) still
// executed INT64_MIN / -1, which is undefined behavior (hardware #DE).
void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
	int32_t divisor = (b.i32 != 0 ? b.i32 : 1);
	if (a.i64 == INT64_MIN && divisor == -1)
		c.i64 = INT64_MIN; // saturate instead of trapping
	else
		c.i64 = a.i64 / divisor;
}
void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 & b.u64;
}
void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 & b.u32;
}
void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 | b.u64;
}
void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 | b.u32;
}
void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 ^ b.u64;
}
void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 ^ b.u32;
}
void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 << (b.u64 & 63);
}
void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 >> (b.u64 & 63);
}
void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = sar64(a.i64, b.u64 & 63);
}
void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = rol64(a.u64, (b.u64 & 63));
}
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = ror64(a.u64, (b.u64 & 63));
}
// Restore the initial FPU state: round-to-nearest. (Flush-to-zero is
// emulated in software here via FTZ, not via an FPU flag.)
void FPINIT() {
	fesetround(FE_TONEAREST);
}
// FPU primitives: operand A is converted from a signed 64-bit integer;
// operand B is already a double; denormal results are flushed to zero.
void FADD_64(convertible_t& a, double b, convertible_t& c) {
	double lhs = (double)a.i64;
	c.f64 = FTZ(lhs + b);
}
void FSUB_64(convertible_t& a, double b, convertible_t& c) {
	double lhs = (double)a.i64;
	c.f64 = FTZ(lhs - b);
}
void FMUL_64(convertible_t& a, double b, convertible_t& c) {
	double lhs = (double)a.i64;
	c.f64 = FTZ(lhs * b);
}
void FDIV_64(convertible_t& a, double b, convertible_t& c) {
	double lhs = (double)a.i64;
	c.f64 = FTZ(lhs / b);
}
// Square root of the absolute value: the sign bit is cleared first, so
// the operand is always non-negative.
void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
	double operand = fabs((double)a.i64);
	c.f64 = FTZ(sqrt(operand));
}
// Convert operand A to double (in the current rounding mode), then switch
// the rounding mode for all subsequent FPU operations according to the
// two lowest bits of A (0 = nearest, 1 = down, 2 = up, 3 = toward zero).
void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
	c.f64 = (double)a.i64;
	int mode;
	switch (a.u64 & 3) {
	case RoundDown:
		mode = FE_DOWNWARD;
		break;
	case RoundUp:
		mode = FE_UPWARD;
		break;
	case RoundToZero:
		mode = FE_TOWARDZERO;
		break;
	default:
		mode = FE_TONEAREST;
		break;
	}
	fesetround(mode);
}
}
}

@ -0,0 +1,276 @@
;RandomX ALU + FPU test
;https://github.com/tevador/RandomX
;License: GPL v3
; Exported entry points. Microsoft x64 calling convention: operand
; pointers arrive in rcx and rdx, the result pointer in r8 (the double
; operand of the F*_64 routines arrives in xmm1).
PUBLIC ADD_64
PUBLIC ADD_32
PUBLIC SUB_64
PUBLIC SUB_32
PUBLIC MUL_64
PUBLIC MULH_64
PUBLIC MUL_32
PUBLIC IMUL_32
PUBLIC IMULH_64
PUBLIC DIV_64
PUBLIC IDIV_64
PUBLIC AND_64
PUBLIC AND_32
PUBLIC OR_64
PUBLIC OR_32
PUBLIC XOR_64
PUBLIC XOR_32
PUBLIC SHL_64
PUBLIC SHR_64
PUBLIC SAR_64
PUBLIC ROL_64
PUBLIC ROR_64
PUBLIC FPINIT
PUBLIC FADD_64
PUBLIC FSUB_64
PUBLIC FMUL_64
PUBLIC FDIV_64
PUBLIC FABSQRT
PUBLIC FROUND
; 128-bit mask with the sign bit of both double lanes cleared
; (7FFFFFFFFFFFFFFFh twice); used by FABSQRT via andps.
CONST SEGMENT
__XMMABS DB 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH
CONST ENDS
.code
; [r8] = [rcx] + [rdx], 64-bit wrapping add.
ADD_64 PROC
mov rax, QWORD PTR [rcx]
add rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
ADD_64 ENDP
; [r8] = dword [rcx] + dword [rdx]; writing eax zero-extends rax, so the
; stored qword has the upper 32 bits clear.
ADD_32 PROC
mov eax, DWORD PTR [rcx]
add eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
ADD_32 ENDP
; [r8] = [rcx] - [rdx], 64-bit wrapping subtract.
SUB_64 PROC
mov rax, QWORD PTR [rcx]
sub rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
SUB_64 ENDP
; [r8] = dword [rcx] - dword [rdx], zero-extended to 64 bits.
SUB_32 PROC
mov eax, DWORD PTR [rcx]
sub eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
SUB_32 ENDP
; [r8] = low 64 bits of [rcx] * [rdx].
MUL_64 PROC
mov rax, QWORD PTR [rcx]
imul rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
MUL_64 ENDP
; [r8] = high 64 bits of the unsigned product; one-operand mul leaves
; the 128-bit product in rdx:rax, high half in rdx.
MULH_64 PROC
mov rax, QWORD PTR [rdx]
mul QWORD PTR [rcx]
mov QWORD PTR [r8], rdx
ret 0
MULH_64 ENDP
; [r8] = full 64-bit product of the zero-extended dwords at [rcx], [rdx]
; (32-bit moves zero-extend into r9/rax).
MUL_32 PROC
mov r9d, DWORD PTR [rcx]
mov eax, DWORD PTR [rdx]
imul r9, rax
mov QWORD PTR [r8], r9
ret 0
MUL_32 ENDP
; [r8] = full 64-bit product of the sign-extended dwords (movsxd).
IMUL_32 PROC
movsxd r9, DWORD PTR [rcx]
movsxd rax, DWORD PTR [rdx]
imul r9, rax
mov QWORD PTR [r8], r9
ret 0
IMUL_32 ENDP
; [r8] = high 64 bits of the signed product; one-operand imul leaves the
; high half in rdx.
IMULH_64 PROC
mov rax, QWORD PTR [rdx]
imul QWORD PTR [rcx]
mov QWORD PTR [r8], rdx
ret 0
IMULH_64 ENDP
; Unsigned division: qword [rcx] / dword [rdx] -> [r8].
; A zero divisor is replaced by 1 (cmovne keeps eax = 1 only when the
; divisor word is zero), so division by zero returns the dividend.
DIV_64 PROC
mov r9d, DWORD PTR [rdx]
mov eax, 1
test r9d, r9d
cmovne eax, r9d
xor edx, edx
mov r9d, eax
mov rax, QWORD PTR [rcx]
div r9
mov QWORD PTR [r8], rax
ret 0
DIV_64 ENDP
; Signed division: qword [rcx] / dword [rdx] -> [r8].
; A zero divisor word is replaced by 1. The INT64_MIN / -1 overflow case
; (which would raise #DE in idiv) saturates to INT64_MIN.
; Fixed: the overflow guard now tests the sign-extended 32-bit divisor
; actually passed to idiv. The original compared the full qword at [rdx]
; with -1, so a divisor word of -1 with nonzero upper bits still executed
; idiv INT64_MIN / -1 and faulted.
IDIV_64 PROC
	mov rax, QWORD PTR [rcx]
	; build the effective divisor in rcx (zero -> 1, then sign-extend)
	mov ecx, DWORD PTR [rdx]
	test ecx, ecx
	mov edx, 1
	cmovne edx, ecx
	movsxd rcx, edx
	; overflow guard: INT64_MIN / -1 saturates
	mov rdx, -9223372036854775808
	cmp rax, rdx
	jne SHORT SAFE_IDIV_64
	cmp rcx, -1
	jne SHORT SAFE_IDIV_64
	mov QWORD PTR [r8], rax
	ret 0
SAFE_IDIV_64:
	cqo
	idiv rcx
	mov QWORD PTR [r8], rax
	ret 0
IDIV_64 ENDP
; [r8] = [rcx] AND [rdx] (64-bit).
AND_64 PROC
mov rax, QWORD PTR [rcx]
and rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
AND_64 ENDP
; [r8] = dword AND, zero-extended to 64 bits.
AND_32 PROC
mov eax, DWORD PTR [rcx]
and eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
AND_32 ENDP
; [r8] = [rcx] OR [rdx] (64-bit).
OR_64 PROC
mov rax, QWORD PTR [rcx]
or rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
OR_64 ENDP
; [r8] = dword OR, zero-extended to 64 bits.
OR_32 PROC
mov eax, DWORD PTR [rcx]
or eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
OR_32 ENDP
; [r8] = [rcx] XOR [rdx] (64-bit).
XOR_64 PROC
mov rax, QWORD PTR [rcx]
xor rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
XOR_64 ENDP
; [r8] = dword XOR, zero-extended to 64 bits.
XOR_32 PROC
mov eax, DWORD PTR [rcx]
xor eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
XOR_32 ENDP
; Shifts/rotates take the count from cl; the hardware masks the count
; to the low 6 bits for 64-bit operands, matching the spec's "bottom
; 6 bits of B" rule.
SHL_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
shl rax, cl
mov QWORD PTR [r8], rax
ret 0
SHL_64 ENDP
; Logical right shift.
SHR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
shr rax, cl
mov QWORD PTR [r8], rax
ret 0
SHR_64 ENDP
; Arithmetic (sign-extending) right shift.
SAR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
sar rax, cl
mov QWORD PTR [r8], rax
ret 0
SAR_64 ENDP
; Rotate left.
ROL_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
rol rax, cl
mov QWORD PTR [r8], rax
ret 0
ROL_64 ENDP
; Rotate right.
ROR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
ror rax, cl
mov QWORD PTR [r8], rax
ret 0
ROR_64 ENDP
; Set MXCSR to 40896 = 9FC0h: round-to-nearest (RC = 00), all exceptions
; masked, and FTZ + DAZ set (denormals flushed in hardware). [rsp+8] is
; the caller-allocated home space used as scratch for ldmxcsr.
FPINIT PROC
mov DWORD PTR [rsp+8], 40896
ldmxcsr DWORD PTR [rsp+8]
ret 0
FPINIT ENDP
; F*_64: convert the signed qword at [rcx] to double (in the current
; rounding mode), combine with the double operand in xmm1, store at [r8].
FADD_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
addsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FADD_64 ENDP
FSUB_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
subsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FSUB_64 ENDP
FMUL_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
mulsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FMUL_64 ENDP
FDIV_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
divsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FDIV_64 ENDP
; Clear the sign bit with the __XMMABS mask, then take the correctly
; rounded square root (sqrtsd, per the spec note about legacy FSQRT).
FABSQRT PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
andps xmm0, XMMWORD PTR __XMMABS
sqrtsd xmm1, xmm0
movsd QWORD PTR [r8], xmm1
ret 0
FABSQRT ENDP
; Store (double)[rcx] to [r8], then program the MXCSR rounding control
; from the low two bits of the operand: shl 13 moves bits 1:0 into the
; RC field (bits 13-14), and eax, 24576 (= 6000h) isolates it, and
; or eax, 40896 (= 9FC0h) restores the base flags set by FPINIT.
FROUND PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
movsd QWORD PTR [r8], xmm0
mov rax, QWORD PTR [rcx]
shl rax, 13
and eax, 24576
or eax, 40896
mov DWORD PTR [rsp+8], eax
ldmxcsr DWORD PTR [rsp+8]
ret 0
FROUND ENDP
END

@ -0,0 +1,283 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include <iostream>
#include <iomanip>
#include <limits>
#include "Instructions.h"
using namespace RandomX;
typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
double rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
convertible_t a, b, c;
a.u64 = mode;
FROUND(a, b, c);
a.i64 = x;
b.i64 = y;
op(a, b, c);
return c.f64;
}
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
// Test helpers: load the operands into locals a and b (as unsigned or
// signed 64-bit values) and invoke the instruction under test; the
// result is left in c. a, b, c must be convertible_t locals in the
// calling scope.
#define RX_EXECUTE_U64(va, vb, INST) do { \
	a.u64 = va; \
	b.u64 = vb; \
	INST(a, b, c); \
	} while(false)
#define RX_EXECUTE_I64(va, vb, INST) do { \
	a.i64 = va; \
	b.i64 = vb; \
	INST(a, b, c); \
	} while(false)
// 64-bit addition wraps modulo 2^64.
TEST_CASE("Integer addition (64-bit)", "[ADD_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_64);
REQUIRE(c.u64 == 0x100000000);
RX_EXECUTE_U64(0x8000000000000000, 0x8000000000000000, ADD_64);
REQUIRE(c.u64 == 0x0);
}
// 32-bit addition wraps modulo 2^32; upper operand bits are ignored and
// the result is zero-extended.
TEST_CASE("Integer addition (32-bit)", "[ADD_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_32);
REQUIRE(c.u64 == 0);
RX_EXECUTE_U64(0xFF00000000000001, 0x0000000100000001, ADD_32);
REQUIRE(c.u64 == 2);
}
// 64-bit subtraction wraps (borrow produces a large unsigned value).
TEST_CASE("Integer subtraction (64-bit)", "[SUB_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_64);
REQUIRE(c.u64 == 0xFFFFFFFF00000002);
}
// 32-bit subtraction wraps within 32 bits before zero-extension.
TEST_CASE("Integer subtraction (32-bit)", "[SUB_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_32);
REQUIRE(c.u64 == 2);
}
// Low 64 bits of the product (identical for signed and unsigned).
TEST_CASE("Unsigned multiplication (64-bit, low half)", "[MUL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_64);
REQUIRE(c.u64 == 0x28723424A9108E51);
}
// High 64 bits of the unsigned 128-bit product.
TEST_CASE("Unsigned multiplication (64-bit, high half)", "[MULH_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MULH_64);
REQUIRE(c.u64 == 0xB4676D31D2B34883);
}
// Zero-extended 32-bit operands, full 64-bit product.
TEST_CASE("Unsigned multiplication (32-bit x 32-bit -> 64-bit)", "[MUL_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_32);
REQUIRE(c.u64 == 0xB001AA5FA9108E51);
}
// Sign-extended 32-bit operands, full 64-bit product.
TEST_CASE("Signed multiplication (32-bit x 32-bit -> 64-bit)", "[IMUL_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMUL_32);
REQUIRE(c.u64 == 0x03EBA0C1A9108E51);
}
// High 64 bits of the signed 128-bit product.
TEST_CASE("Signed multiplication (64-bit, high half)", "[IMULH_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMULH_64);
REQUIRE(c.u64 == 0x02D93EF1269D3EE5);
}
// Division uses only the low 32 bits of the divisor; a zero divisor
// yields the dividend (divide by 1).
TEST_CASE("Unsigned division (64-bit / 32-bit -> 32-bit)", "[DIV_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(8774217225983458895, 3014068202, DIV_64);
REQUIRE(c.u64 == 2911087818);
RX_EXECUTE_U64(8774217225983458895, 0, DIV_64);
REQUIRE(c.u64 == 8774217225983458895);
RX_EXECUTE_U64(3014068202, 8774217225983458895, DIV_64);
REQUIRE(c.u64 == 2);
}
// Signed division; the INT64_MIN / -1 overflow case saturates.
// NOTE(review): the overflow case is only exercised with a full-width -1
// divisor (b.i64 == -1); a divisor word of -1 with nonzero upper bits is
// not covered here -- confirm the implementations agree on that input.
TEST_CASE("Signed division (64-bit / 32-bit -> 32-bit)", "[IDIV_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(8774217225983458895, 3014068202, IDIV_64);
REQUIRE(c.u64 == 0xFFFFFFFE67B4994E);
RX_EXECUTE_U64(8774217225983458895, 0, IDIV_64);
REQUIRE(c.u64 == 8774217225983458895);
RX_EXECUTE_U64(0x8000000000000000, 0xFFFFFFFFFFFFFFFF, IDIV_64);
REQUIRE(c.u64 == 0x8000000000000000);
RX_EXECUTE_U64(0xFFFFFFFFB3A707EA, 8774217225983458895, IDIV_64);
REQUIRE(c.u64 == 0xFFFFFFFFFFFFFFFF);
}
// Bitwise operations; the 32-bit variants zero-extend their result.
TEST_CASE("Bitwise AND (64-bit)", "[AND_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_64);
REQUIRE(c.u64 == 0x8888888888888888);
}
TEST_CASE("Bitwise AND (32-bit)", "[AND_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_32);
REQUIRE(c.u64 == 0x88888888);
}
TEST_CASE("Bitwise OR (64-bit)", "[OR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_64);
REQUIRE(c.u64 == 0xEEEEEEEEEEEEEEEE);
}
TEST_CASE("Bitwise OR (32-bit)", "[OR_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_32);
REQUIRE(c.u64 == 0xEEEEEEEE);
}
TEST_CASE("Bitwise XOR (64-bit)", "[XOR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_64);
REQUIRE(c.u64 == 0x2222222222222222);
}
TEST_CASE("Bitwise XOR (32-bit)", "[XOR_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_32);
REQUIRE(c.u64 == 0x22222222);
}
// Shift/rotate counts use only the bottom 6 bits of operand B; the large
// second operands below exercise exactly that masking.
TEST_CASE("Logical left shift (64-bit)", "[SHL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, SHL_64);
REQUIRE(c.u64 == 0x30000000000000);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHL_64);
REQUIRE(c.u64 == 6978065200108797952);
RX_EXECUTE_U64(0x8000000000000000, 1, SHL_64);
REQUIRE(c.u64 == 0);
}
TEST_CASE("Logical right shift (64-bit)", "[SHR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, SHR_64);
REQUIRE(c.u64 == 0);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHR_64);
REQUIRE(c.u64 == 110985711);
RX_EXECUTE_U64(0x8000000000000000, 1, SHR_64);
REQUIRE(c.u64 == 0x4000000000000000);
}
// Arithmetic shift preserves the sign bit.
TEST_CASE("Arithmetic right shift (64-bit)", "[SAR_64]") {
convertible_t a, b, c;
RX_EXECUTE_I64(-9, 2, SAR_64);
REQUIRE(c.i64 == -3);
RX_EXECUTE_I64(INT64_MIN, 63, SAR_64);
REQUIRE(c.i64 == -1);
RX_EXECUTE_I64(INT64_MAX, 163768499474606398, SAR_64);
REQUIRE(c.i64 == 1);
}
TEST_CASE("Circular left shift (64-bit)", "[ROL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, ROL_64);
REQUIRE(c.u64 == 0x30000000000000);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROL_64);
REQUIRE(c.u64 == 6978065200552740799);
RX_EXECUTE_U64(0x8000000000000000, 1, ROL_64);
REQUIRE(c.u64 == 1);
}
TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, ROR_64);
REQUIRE(c.u64 == 12288);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROR_64);
REQUIRE(c.u64 == 0xD835C455069D81EF);
RX_EXECUTE_U64(0x8000000000000000, 1, ROR_64);
REQUIRE(c.u64 == 0x4000000000000000);
}
// 1 / DBL_MAX is subnormal, so the FTZ behavior must flush it to zero.
TEST_CASE("Denormal numbers are flushed to zero", "[FTZ]") {
FPINIT();
convertible_t a, c;
a.i64 = 1;
FDIV_64(a, std::numeric_limits<double>::max(), c);
REQUIRE(c.f64 == 0.0);
}
// Correct rounding in all four modes, signed-zero division to +/-inf,
// and FABSQRT's sign-clearing, per the IEEE-754 requirement in the spec.
// Expected values are exact doubles produced by correctly rounded ops.
TEST_CASE("IEEE-754 compliance", "[FPU]") {
FPINIT();
convertible_t a, c;
a.i64 = 1;
FDIV_64(a, 0, c);
REQUIRE(c.f64 == std::numeric_limits<double>::infinity());
a.i64 = -1;
FDIV_64(a, 0, c);
REQUIRE(c.f64 == -std::numeric_limits<double>::infinity());
REQUIRE(rxRound(RoundToNearest, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
REQUIRE(rxRound(RoundDown, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
REQUIRE(rxRound(RoundUp, 33073499373184121, -37713516328519941, &FADD) == -4640016955335812.0);
REQUIRE(rxRound(RoundToZero, 33073499373184121, -37713516328519941, &FADD) == -4640016955335816.0);
REQUIRE(rxRound(RoundToNearest, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107858e+18);
REQUIRE(rxRound(RoundDown, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107868e+18);
REQUIRE(rxRound(RoundUp, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
REQUIRE(rxRound(RoundToZero, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
REQUIRE(rxRound(RoundToNearest, 1, -10, &FDIV) == -0.10000000000000001);
REQUIRE(rxRound(RoundDown, 1, -10, &FDIV) == -0.10000000000000001);
REQUIRE(rxRound(RoundUp, 1, -10, &FDIV) == -0.099999999999999992);
REQUIRE(rxRound(RoundToZero, 1, -10, &FDIV) == -0.099999999999999992);
REQUIRE(rxRound(RoundToNearest, -2, 0, &FABSQRT) == 1.4142135623730951);
REQUIRE(rxRound(RoundDown, -2, 0, &FABSQRT) == 1.4142135623730949);
REQUIRE(rxRound(RoundUp, -2, 0, &FABSQRT) == 1.4142135623730951);
REQUIRE(rxRound(RoundToZero, -2, 0, &FABSQRT) == 1.4142135623730949);
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,10 @@
CXXFLAGS=-Wall -std=c++17 -O0

# Link step now passes $(CXXFLAGS) so the standard/optimization flags
# used for compilation also apply at link time.
TestAluFpu: TestAluFpu.o InstructionsPortable.o
	$(CXX) $(CXXFLAGS) TestAluFpu.o InstructionsPortable.o -o $@

TestAluFpu.o: TestAluFpu.cpp
InstructionsPortable.o: InstructionsPortable.cpp

# clean is not a file; mark it phony so a file named "clean" cannot
# shadow the target.
.PHONY: clean
clean:
	rm -f TestAluFpu TestAluFpu.o InstructionsPortable.o
Loading…
Cancel
Save