ALU and FPU tests

random-access
tevador 6 years ago
parent ec2d378fce
commit f19995d4c5

@ -205,7 +205,7 @@ The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`im
|22|FSUB|A - B| |22|FSUB|A - B|
|22|FMUL|A * B| |22|FMUL|A * B|
|8|FDIV|A / B| |8|FDIV|A / B|
|6|FSQRT|sqrt(A)| |6|FABSQRT|sqrt(A)|
|2|FROUND|A| |2|FROUND|A|
FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is RN (Round to Nearest). Denormal values may not be produced by any operation. FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is RN (Round to Nearest). Denormal values may not be produced by any operation.
@ -214,8 +214,8 @@ FPU instructions conform to the IEEE-754 specification, so they must give correc
Operands loaded from memory are treated as signed 64-bit integers and converted to double precision floating point format. Operands loaded from floating point registers are used directly. Operands loaded from memory are treated as signed 64-bit integers and converted to double precision floating point format. Operands loaded from floating point registers are used directly.
##### FSQRT ##### FABSQRT
The sign bit of the FSQRT operand is always cleared first, so only non-negative values are used. The sign bit of the FABSQRT operand is always cleared first, so only non-negative values are used.
*In x86, the `SQRTSD` instruction must be used. The legacy `FSQRT` instruction doesn't produce correctly rounded results in all cases.* *In x86, the `SQRTSD` instruction must be used. The legacy `FSQRT` instruction doesn't produce correctly rounded results in all cases.*
@ -225,11 +225,11 @@ The FROUND instruction changes the rounding mode for all subsequent FPU operatio
|A[1:0]|rounding mode| |A[1:0]|rounding mode|
|-------|------------| |-------|------------|
|00|Round to Nearest (RN) mode| |00|Round to Nearest (RN) mode|
|01|Round towards Plus Infinity (RP) mode |01|Round towards Minus Infinity (RM) mode
|10|Round towards Minus Infinity (RM) mode |10|Round towards Plus Infinity (RP) mode
|11|Round towards Zero (RZ) mode |11|Round towards Zero (RZ) mode
*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 22-23 of the ARM `FPSCR` register.* *The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 23 and 22 (reversed) of the ARM `FPSCR` register.*
### Control flow instructions ### Control flow instructions
The following 2 control flow instructions are supported: The following 2 control flow instructions are supported:

@ -0,0 +1,69 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include <cstdint>
namespace RandomX {
// Rounding-mode selector values used by FROUND (low 2 bits of operand A).
// The encoding matches bits 13-14 of the x86 MXCSR register:
// 00 = round to nearest, 01 = round down, 10 = round up, 11 = toward zero.
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
// Type-punning view of one 64-bit machine word: the same bits can be
// read as a double, as a signed/unsigned 64-bit integer, or as the
// low 32 bits (signed/unsigned). Used as the universal operand type
// for all ALU/FPU test primitives.
typedef union {
double f64;
int64_t i64;
uint64_t u64;
int32_t i32;
uint32_t u32;
} convertible_t;
extern "C" {
// ALU primitives: compute c = a OP b. Each has two implementations —
// a portable one in InstructionsPortable.cpp and a hand-written x64
// assembly reference. The *_32 variants operate on the low 32 bits
// and zero-extend the result into c.
void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c);
void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c);
void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c);
void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c);
void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c);
void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
// Division uses only the low 32 bits of b as the divisor; a zero
// divisor is replaced by 1 (see implementations).
void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
void AND_64(convertible_t& a, convertible_t& b, convertible_t& c);
void AND_32(convertible_t& a, convertible_t& b, convertible_t& c);
void OR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void OR_32(convertible_t& a, convertible_t& b, convertible_t& c);
void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c);
// Shifts and rotates use the bottom 6 bits of b as the count.
void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c);
void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c);
void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c);
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
// FPINIT sets the initial FPU state (round-to-nearest; see implementations).
void FPINIT();
// FPU primitives: operand A is reinterpreted as int64 and converted to
// double; operand B is already a double.
void FADD_64(convertible_t& a, double b, convertible_t& c);
void FSUB_64(convertible_t& a, double b, convertible_t& c);
void FMUL_64(convertible_t& a, double b, convertible_t& c);
void FDIV_64(convertible_t& a, double b, convertible_t& c);
// FABSQRT: sqrt(|A|); FROUND: converts A and reprograms the rounding
// mode from A's low 2 bits.
void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
void FROUND(convertible_t& a, convertible_t& b, convertible_t& c);
// Convenience wrappers giving the FPU ops the same 3-operand signature
// as the ALU ops: b is treated as a signed 64-bit integer and converted
// to double before the operation.
inline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
FADD_64(a, (double)b.i64, c);
}
inline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
FSUB_64(a, (double)b.i64, c);
}
inline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
FMUL_64(a, (double)b.i64, c);
}
inline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
FDIV_64(a, (double)b.i64, c);
}
}
}

@ -0,0 +1,248 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include "Instructions.h"
#include <cfenv>
#include <cmath>
#if defined(__SIZEOF_INT128__)
// Compilers with a native 128-bit integer type (gcc/clang): obtain the
// high 64 bits of a 64x64 multiplication directly from the 128-bit product.
typedef unsigned __int128 uint128_t;
typedef __int128 int128_t;
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
return ((uint128_t)a * b) >> 64;
}
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
return ((int128_t)a * b) >> 64;
}
#define umulhi64 __umulhi64
#define imulhi64 __imulhi64
#endif
#if defined(_MSC_VER)
// MSVC: map the primitives to compiler intrinsics where available.
#include <intrin.h>
#include <stdlib.h>
#define ror64 _rotr64
#define rol64 _rotl64
// NOTE(review): __MACHINEARM64_X64 / __MACHINEX64 / __MACHINEX86_X64 are
// internal <intrin.h> helper macros, not documented feature-test macros —
// confirm they are actually defined in this TU. Also, __umulh is available
// on native x64 MSVC too; guarding it with __MACHINEARM64_X64 only looks
// suspicious — verify plain x64 builds don't silently fall back to the
// portable umulhi64.
#ifdef __MACHINEARM64_X64
#define umulhi64 __umulh
#endif
#ifdef __MACHINEX64
// _mul128 returns the low half and stores the high half through the
// out-parameter; we want the high half.
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
#define imulhi64 __imulhi64
#endif
#ifdef __MACHINEX86_X64
#define sar64 __ll_rshift
#endif
#endif
#ifndef ror64
// Portable rotate-right fallback. Callers pass a count already reduced
// mod 64, which includes 0 — the original `a << (64 - b)` would then
// shift by 64 (undefined behavior). Masking the complementary count
// with 63 makes b == 0 well-defined (returns a unchanged) without
// altering the result for 1..63.
static inline uint64_t __ror64(uint64_t a, int b) {
	return (a >> b) | (a << ((64 - b) & 63));
}
#define ror64 __ror64
#endif
#ifndef rol64
// Portable rotate-left fallback. As with __ror64, a count of 0 made the
// original `a >> (64 - b)` shift by 64 (undefined behavior); masking the
// complementary count with 63 fixes b == 0 and is identical for 1..63.
static inline uint64_t __rol64(uint64_t a, int b) {
	return (a << b) | (a >> ((64 - b) & 63));
}
#define rol64 __rol64
#endif
#ifndef sar64
#include <type_traits>
// Compile-time probe: does this compiler implement `>>` on a negative
// int64_t as an arithmetic (sign-preserving) shift? The behavior is
// implementation-defined, so we test it with a constexpr evaluation.
constexpr int64_t shiftRight64(int64_t value, int count) noexcept {
	return value >> count;
}
using hasArithmeticShift = std::integral_constant<bool, shiftRight64(-1LL, 1) == -1LL>;
// Arithmetic shift right: use the native operator when the probe says it
// is arithmetic, otherwise emulate it by shifting the one's complement so
// the vacated bits are filled with copies of the sign bit.
static inline int64_t __sar64(int64_t a, int b) {
	if (hasArithmeticShift::value)
		return shiftRight64(a, b);
	return a < 0 ? ~(~a >> b) : a >> b;
}
#define sar64 __sar64
#endif
#ifndef umulhi64
// Portable high-half 64x64 unsigned multiply: schoolbook multiplication
// with four 32x32->64 partial products.
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
uint64_t ah = HI(a), al = LO(a);
uint64_t bh = HI(b), bl = LO(b);
uint64_t x00 = al * bl;
uint64_t x01 = al * bh;
uint64_t x10 = ah * bl;
uint64_t x11 = ah * bh;
// m1/m2/m3 accumulate the middle and top 32-bit columns, propagating
// carries via HI(...) of the previous column.
uint64_t m1 = LO(x10) + LO(x01) + HI(x00);
uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1);
uint64_t m3 = HI(x11) + HI(m2);
// Bits 127..64 of the product: top column in the high word, carry-adjusted
// middle column in the low word.
return (m3 << 32) + LO(m2);
}
#define umulhi64 __umulhi64
#endif
#ifndef imulhi64
// Signed high-half multiply derived from the unsigned one. In two's
// complement, a signed value equals its unsigned bits minus 2^64 when
// negative, so the signed high half is the unsigned high half minus b
// (when a < 0) and minus a (when b < 0).
static inline int64_t __imulhi64(int64_t a, int64_t b) {
int64_t hi = umulhi64(a, b);
if (a < 0LL) hi -= b;
if (b < 0LL) hi -= a;
return hi;
}
#define imulhi64 __imulhi64
#endif
// Replaces a subnormal (denormal) result with zero; every other value —
// including zero, infinities and NaN — passes through unchanged. Mirrors
// the FTZ behavior the assembly version gets from the MXCSR FTZ bit.
static double FlushDenormal(double x) {
	return std::fpclassify(x) == FP_SUBNORMAL ? 0 : x;
}
#define FTZ(x) FlushDenormal(x)
namespace RandomX {

	extern "C" {

		// c = a + b, 64-bit wrap-around.
		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 + b.u64;
		}
		// 32-bit add; the 32-bit sum is zero-extended into c.
		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 + b.u32;
		}
		// c = a - b, 64-bit wrap-around.
		void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 - b.u64;
		}
		// 32-bit subtract; the 32-bit difference is zero-extended into c.
		void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 - b.u32;
		}
		// Low 64 bits of the 64x64 product (same for signed/unsigned).
		void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 * b.u64;
		}
		// High 64 bits of the unsigned 64x64 product.
		void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = umulhi64(a.u64, b.u64);
		}
		// Full 64-bit product of the two unsigned low 32-bit halves.
		void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = (uint64_t)a.u32 * b.u32;
		}
		// Full 64-bit product of the two signed low 32-bit halves.
		void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.i64 = (int64_t)a.i32 * b.i32;
		}
		// High 64 bits of the signed 64x64 product.
		void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.i64 = imulhi64(a.i64, b.i64);
		}
		// Unsigned a.u64 / b.u32; a zero divisor is replaced by 1 so the
		// operation is total.
		void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
		}
		// Signed a.i64 / b.i32 with the same divisor-zero substitution.
		// The divisor actually used is b.i32, so the INT64_MIN / -1
		// overflow guard must test b.i32 (the previous b.i64 == -1 check
		// missed divisors like 0x00000000FFFFFFFF, leaving UB overflow).
		void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			if (a.i64 == INT64_MIN && b.i32 == -1)
				c.i64 = INT64_MIN;
			else
				c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
		}
		void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 & b.u64;
		}
		void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 & b.u32;
		}
		void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 | b.u64;
		}
		void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 | b.u32;
		}
		void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 ^ b.u64;
		}
		void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 ^ b.u32;
		}
		// Shifts/rotates use the bottom 6 bits of b, matching x86 behavior.
		void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 << (b.u64 & 63);
		}
		void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 >> (b.u64 & 63);
		}
		void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = sar64(a.i64, b.u64 & 63);
		}
		void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = rol64(a.u64, (b.u64 & 63));
		}
		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = ror64(a.u64, (b.u64 & 63));
		}
		// Initial FPU state: round to nearest (denormal flushing is done in
		// software via FTZ in this portable build).
		void FPINIT() {
			fesetround(FE_TONEAREST);
		}
		// FPU ops: A is reinterpreted as int64 and converted to double under
		// the current rounding mode; results have denormals flushed to zero.
		void FADD_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 + b);
		}
		void FSUB_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 - b);
		}
		void FMUL_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 * b);
		}
		void FDIV_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 / b);
		}
		// sqrt of |A|: the sign is cleared first so the operand is always
		// non-negative, per the spec.
		void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
			double d = fabs((double)a.i64);
			c.f64 = FTZ(sqrt(d));
		}
		// Converts A under the *current* rounding mode first (matching the
		// assembly version, which issues cvtsi2sd before ldmxcsr), then
		// reprograms the rounding mode from A's low 2 bits.
		void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.f64 = (double)a.i64;
			switch (a.u64 & 3) {
			case RoundDown:
				fesetround(FE_DOWNWARD);
				break;
			case RoundUp:
				fesetround(FE_UPWARD);
				break;
			case RoundToZero:
				fesetround(FE_TOWARDZERO);
				break;
			default:
				fesetround(FE_TONEAREST);
				break;
			}
		}
	}
}

@ -0,0 +1,276 @@
;RandomX ALU + FPU test
;https://github.com/tevador/RandomX
;License: GPL v3
PUBLIC ADD_64
PUBLIC ADD_32
PUBLIC SUB_64
PUBLIC SUB_32
PUBLIC MUL_64
PUBLIC MULH_64
PUBLIC MUL_32
PUBLIC IMUL_32
PUBLIC IMULH_64
PUBLIC DIV_64
PUBLIC IDIV_64
PUBLIC AND_64
PUBLIC AND_32
PUBLIC OR_64
PUBLIC OR_32
PUBLIC XOR_64
PUBLIC XOR_32
PUBLIC SHL_64
PUBLIC SHR_64
PUBLIC SAR_64
PUBLIC ROL_64
PUBLIC ROR_64
PUBLIC FPINIT
PUBLIC FADD_64
PUBLIC FSUB_64
PUBLIC FMUL_64
PUBLIC FDIV_64
PUBLIC FABSQRT
PUBLIC FROUND
CONST SEGMENT
; 128-bit mask 0x7FFFFFFFFFFFFFFF:7FFFFFFFFFFFFFFF (little-endian bytes):
; the sign bit of each 64-bit lane is clear, so ANDing a double with it
; computes fabs(). Used by FABSQRT.
__XMMABS DB 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH
CONST ENDS
.code
; Windows x64 calling convention: rcx = &a, rdx = &b, r8 = &c.
; c = a + b (64-bit).
ADD_64 PROC
mov rax, QWORD PTR [rcx]
add rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
ADD_64 ENDP
; 32-bit add; writing eax zeroes the upper half of rax, so the stored
; 64-bit result is the zero-extended 32-bit sum.
ADD_32 PROC
mov eax, DWORD PTR [rcx]
add eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
ADD_32 ENDP
; c = a - b (64-bit).
SUB_64 PROC
mov rax, QWORD PTR [rcx]
sub rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
SUB_64 ENDP
; 32-bit subtract, zero-extended (same eax trick as ADD_32).
SUB_32 PROC
mov eax, DWORD PTR [rcx]
sub eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
SUB_32 ENDP
; Low 64 bits of the product (identical for signed/unsigned).
MUL_64 PROC
mov rax, QWORD PTR [rcx]
imul rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
MUL_64 ENDP
; Unsigned high half: one-operand MUL leaves rdx:rax = product; b is
; loaded into rax first because MUL clobbers rdx.
MULH_64 PROC
mov rax, QWORD PTR [rdx]
mul QWORD PTR [rcx]
mov QWORD PTR [r8], rdx
ret 0
MULH_64 ENDP
; Full 64-bit product of the two zero-extended low 32-bit halves.
MUL_32 PROC
mov r9d, DWORD PTR [rcx]
mov eax, DWORD PTR [rdx]
imul r9, rax
mov QWORD PTR [r8], r9
ret 0
MUL_32 ENDP
; Full 64-bit product of the two sign-extended low 32-bit halves.
IMUL_32 PROC
movsxd r9, DWORD PTR [rcx]
movsxd rax, DWORD PTR [rdx]
imul r9, rax
mov QWORD PTR [r8], r9
ret 0
IMUL_32 ENDP
; Signed high half: one-operand IMUL leaves rdx:rax = signed product.
IMULH_64 PROC
mov rax, QWORD PTR [rdx]
imul QWORD PTR [rcx]
mov QWORD PTR [r8], rdx
ret 0
IMULH_64 ENDP
; Unsigned a.u64 / b.u32. A zero divisor is replaced by 1 via cmovne so
; the operation never faults. The divisor is widened to r9 and a full
; 64-bit DIV is used because the quotient can exceed 32 bits.
DIV_64 PROC
mov r9d, DWORD PTR [rdx]
mov eax, 1
test r9d, r9d
cmovne eax, r9d
xor edx, edx
mov r9d, eax
mov rax, QWORD PTR [rcx]
div r9
mov QWORD PTR [r8], rax
ret 0
DIV_64 ENDP
; Signed a.i64 / b.i32 with divisor-zero -> 1 substitution and an
; explicit INT64_MIN / -1 guard (hardware IDIV raises #DE on overflow).
; NOTE(review): the guard compares the full QWORD of b against -1, but
; the divisor actually used below is only the sign-extended low DWORD
; (ecx). A value like b = 0x00000000FFFFFFFF has b.i32 == -1 yet fails
; this compare, so INT64_MIN / such b would still raise #DE — confirm
; and consider comparing DWORD PTR [rdx] instead.
IDIV_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, -9223372036854775808
cmp rax, rcx
jne SHORT SAFE_IDIV_64
cmp QWORD PTR [rdx], -1
jne SHORT SAFE_IDIV_64
mov QWORD PTR [r8], rcx
ret 0
SAFE_IDIV_64:
mov ecx, DWORD PTR [rdx]
test ecx, ecx
mov edx, 1
cmovne edx, ecx
movsxd rcx, edx
; cqo sign-extends rax into rdx:rax before the signed divide.
cqo
idiv rcx
mov QWORD PTR [r8], rax
ret 0
IDIV_64 ENDP
; Bitwise operations; the 32-bit variants zero-extend via the eax write.
AND_64 PROC
mov rax, QWORD PTR [rcx]
and rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
AND_64 ENDP
AND_32 PROC
mov eax, DWORD PTR [rcx]
and eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
AND_32 ENDP
OR_64 PROC
mov rax, QWORD PTR [rcx]
or rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
OR_64 ENDP
OR_32 PROC
mov eax, DWORD PTR [rcx]
or eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
OR_32 ENDP
XOR_64 PROC
mov rax, QWORD PTR [rcx]
xor rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
XOR_64 ENDP
XOR_32 PROC
mov eax, DWORD PTR [rcx]
xor eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
XOR_32 ENDP
; Shifts and rotates: the hardware masks cl to its low 6 bits for 64-bit
; operands, which implements the spec's "bottom 6 bits of B" for free.
SHL_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
shl rax, cl
mov QWORD PTR [r8], rax
ret 0
SHL_64 ENDP
SHR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
shr rax, cl
mov QWORD PTR [r8], rax
ret 0
SHR_64 ENDP
SAR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
sar rax, cl
mov QWORD PTR [r8], rax
ret 0
SAR_64 ENDP
ROL_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
rol rax, cl
mov QWORD PTR [r8], rax
ret 0
ROL_64 ENDP
ROR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
ror rax, cl
mov QWORD PTR [r8], rax
ret 0
ROR_64 ENDP
; 40896 = 9FC0h: all exception masks set (bits 7-12), DAZ (bit 6) and
; FTZ (bit 15) enabled, rounding control (bits 13-14) = 00 (nearest).
; [rsp+8] is the caller-provided shadow space, used as scratch.
FPINIT PROC
mov DWORD PTR [rsp+8], 40896
ldmxcsr DWORD PTR [rsp+8]
ret 0
FPINIT ENDP
; FPU ops: A (int64 at [rcx]) is converted to double; B arrives in xmm1.
FADD_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
addsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FADD_64 ENDP
FSUB_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
subsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FSUB_64 ENDP
FMUL_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
mulsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FMUL_64 ENDP
FDIV_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
divsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FDIV_64 ENDP
; sqrt(|A|): the AND with __XMMABS clears the sign bit before SQRTSD
; (SQRTSD is required — legacy FSQRT is not correctly rounded).
FABSQRT PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
andps xmm0, XMMWORD PTR __XMMABS
sqrtsd xmm1, xmm0
movsd QWORD PTR [r8], xmm1
ret 0
FABSQRT ENDP
; Converts and stores A under the *current* rounding mode, then moves
; A[1:0] into MXCSR bits 13-14 (shl 13; and 6000h) and ORs in the base
; state 9FC0h before loading the new control word.
FROUND PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
movsd QWORD PTR [r8], xmm0
mov rax, QWORD PTR [rcx]
shl rax, 13
and eax, 24576
or eax, 40896
mov DWORD PTR [rsp+8], eax
ldmxcsr DWORD PTR [rsp+8]
ret 0
FROUND ENDP
END

@ -0,0 +1,283 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include <iostream>
#include <iomanip>
#include <limits>
#include "Instructions.h"
using namespace RandomX;
typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
double rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
convertible_t a, b, c;
a.u64 = mode;
FROUND(a, b, c);
a.i64 = x;
b.i64 = y;
op(a, b, c);
return c.f64;
}
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
// Test drivers: load the operands into a/b (as unsigned or signed 64-bit
// values) and invoke the instruction; the result is left in c. Expects
// convertible_t variables named a, b, c in the enclosing scope.
#define RX_EXECUTE_U64(va, vb, INST) do { \
a.u64 = va; \
b.u64 = vb; \
INST(a, b, c); \
} while(false)
#define RX_EXECUTE_I64(va, vb, INST) do { \
a.i64 = va; \
b.i64 = vb; \
INST(a, b, c); \
} while(false)
// Arithmetic ALU cases: each drives one extern "C" primitive through the
// RX_EXECUTE_* macros and pins the exact 64-bit result, including
// wrap-around and zero-extension of the *_32 variants.
TEST_CASE("Integer addition (64-bit)", "[ADD_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_64);
REQUIRE(c.u64 == 0x100000000);
// Overflow wraps modulo 2^64.
RX_EXECUTE_U64(0x8000000000000000, 0x8000000000000000, ADD_64);
REQUIRE(c.u64 == 0x0);
}
TEST_CASE("Integer addition (32-bit)", "[ADD_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_32);
REQUIRE(c.u64 == 0);
// Upper 32 bits of the inputs are ignored; result is zero-extended.
RX_EXECUTE_U64(0xFF00000000000001, 0x0000000100000001, ADD_32);
REQUIRE(c.u64 == 2);
}
TEST_CASE("Integer subtraction (64-bit)", "[SUB_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_64);
REQUIRE(c.u64 == 0xFFFFFFFF00000002);
}
TEST_CASE("Integer subtraction (32-bit)", "[SUB_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_32);
REQUIRE(c.u64 == 2);
}
TEST_CASE("Unsigned multiplication (64-bit, low half)", "[MUL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_64);
REQUIRE(c.u64 == 0x28723424A9108E51);
}
TEST_CASE("Unsigned multiplication (64-bit, high half)", "[MULH_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MULH_64);
REQUIRE(c.u64 == 0xB4676D31D2B34883);
}
TEST_CASE("Unsigned multiplication (32-bit x 32-bit -> 64-bit)", "[MUL_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_32);
REQUIRE(c.u64 == 0xB001AA5FA9108E51);
}
TEST_CASE("Signed multiplication (32-bit x 32-bit -> 64-bit)", "[IMUL_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMUL_32);
REQUIRE(c.u64 == 0x03EBA0C1A9108E51);
}
TEST_CASE("Signed multiplication (64-bit, high half)", "[IMULH_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMULH_64);
REQUIRE(c.u64 == 0x02D93EF1269D3EE5);
}
TEST_CASE("Unsigned division (64-bit / 32-bit -> 32-bit)", "[DIV_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(8774217225983458895, 3014068202, DIV_64);
REQUIRE(c.u64 == 2911087818);
// Division by zero substitutes divisor 1, returning the dividend.
RX_EXECUTE_U64(8774217225983458895, 0, DIV_64);
REQUIRE(c.u64 == 8774217225983458895);
// Only the low 32 bits of b are used as the divisor.
RX_EXECUTE_U64(3014068202, 8774217225983458895, DIV_64);
REQUIRE(c.u64 == 2);
}
TEST_CASE("Signed division (64-bit / 32-bit -> 32-bit)", "[IDIV_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(8774217225983458895, 3014068202, IDIV_64);
REQUIRE(c.u64 == 0xFFFFFFFE67B4994E);
RX_EXECUTE_U64(8774217225983458895, 0, IDIV_64);
REQUIRE(c.u64 == 8774217225983458895);
// INT64_MIN / -1 must not trap/overflow; result is defined as INT64_MIN.
RX_EXECUTE_U64(0x8000000000000000, 0xFFFFFFFFFFFFFFFF, IDIV_64);
REQUIRE(c.u64 == 0x8000000000000000);
RX_EXECUTE_U64(0xFFFFFFFFB3A707EA, 8774217225983458895, IDIV_64);
REQUIRE(c.u64 == 0xFFFFFFFFFFFFFFFF);
}
// Bitwise and shift/rotate cases. The *_32 variants must zero-extend;
// the shift/rotate count is the bottom 6 bits of b.
TEST_CASE("Bitwise AND (64-bit)", "[AND_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_64);
REQUIRE(c.u64 == 0x8888888888888888);
}
TEST_CASE("Bitwise AND (32-bit)", "[AND_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_32);
REQUIRE(c.u64 == 0x88888888);
}
TEST_CASE("Bitwise OR (64-bit)", "[OR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_64);
REQUIRE(c.u64 == 0xEEEEEEEEEEEEEEEE);
}
TEST_CASE("Bitwise OR (32-bit)", "[OR_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_32);
REQUIRE(c.u64 == 0xEEEEEEEE);
}
TEST_CASE("Bitwise XOR (64-bit)", "[XOR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_64);
REQUIRE(c.u64 == 0x2222222222222222);
}
TEST_CASE("Bitwise XOR (32-bit)", "[XOR_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_32);
REQUIRE(c.u64 == 0x22222222);
}
TEST_CASE("Logical left shift (64-bit)", "[SHL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, SHL_64);
REQUIRE(c.u64 == 0x30000000000000);
// Large counts are reduced mod 64.
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHL_64);
REQUIRE(c.u64 == 6978065200108797952);
// Bits shifted out are discarded (logical shift).
RX_EXECUTE_U64(0x8000000000000000, 1, SHL_64);
REQUIRE(c.u64 == 0);
}
TEST_CASE("Logical right shift (64-bit)", "[SHR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, SHR_64);
REQUIRE(c.u64 == 0);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHR_64);
REQUIRE(c.u64 == 110985711);
RX_EXECUTE_U64(0x8000000000000000, 1, SHR_64);
REQUIRE(c.u64 == 0x4000000000000000);
}
TEST_CASE("Arithmetic right shift (64-bit)", "[SAR_64]") {
convertible_t a, b, c;
// Sign bit is replicated: -9 >> 2 rounds toward minus infinity.
RX_EXECUTE_I64(-9, 2, SAR_64);
REQUIRE(c.i64 == -3);
RX_EXECUTE_I64(INT64_MIN, 63, SAR_64);
REQUIRE(c.i64 == -1);
RX_EXECUTE_I64(INT64_MAX, 163768499474606398, SAR_64);
REQUIRE(c.i64 == 1);
}
TEST_CASE("Circular left shift (64-bit)", "[ROL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, ROL_64);
REQUIRE(c.u64 == 0x30000000000000);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROL_64);
REQUIRE(c.u64 == 6978065200552740799);
// Rotation preserves all bits: the top bit wraps to bit 0.
RX_EXECUTE_U64(0x8000000000000000, 1, ROL_64);
REQUIRE(c.u64 == 1);
}
TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, ROR_64);
REQUIRE(c.u64 == 12288);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROR_64);
REQUIRE(c.u64 == 0xD835C455069D81EF);
RX_EXECUTE_U64(0x8000000000000000, 1, ROR_64);
REQUIRE(c.u64 == 0x4000000000000000);
}
// FPU cases: denormal flushing, signed infinities from division by zero,
// and correctly rounded results (per IEEE-754) in all four rounding modes.
// Expected values are the exact correctly rounded doubles for each mode.
TEST_CASE("Denormal numbers are flushed to zero", "[FTZ]") {
FPINIT();
convertible_t a, c;
a.i64 = 1;
// 1 / DBL_MAX is subnormal, so the FTZ behavior must yield exactly 0.
FDIV_64(a, std::numeric_limits<double>::max(), c);
REQUIRE(c.f64 == 0.0);
}
TEST_CASE("IEEE-754 compliance", "[FPU]") {
FPINIT();
convertible_t a, c;
a.i64 = 1;
FDIV_64(a, 0, c);
REQUIRE(c.f64 == std::numeric_limits<double>::infinity());
a.i64 = -1;
FDIV_64(a, 0, c);
REQUIRE(c.f64 == -std::numeric_limits<double>::infinity());
REQUIRE(rxRound(RoundToNearest, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
REQUIRE(rxRound(RoundDown, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
REQUIRE(rxRound(RoundUp, 33073499373184121, -37713516328519941, &FADD) == -4640016955335812.0);
REQUIRE(rxRound(RoundToZero, 33073499373184121, -37713516328519941, &FADD) == -4640016955335816.0);
REQUIRE(rxRound(RoundToNearest, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107858e+18);
REQUIRE(rxRound(RoundDown, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107868e+18);
REQUIRE(rxRound(RoundUp, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
REQUIRE(rxRound(RoundToZero, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
REQUIRE(rxRound(RoundToNearest, 1, -10, &FDIV) == -0.10000000000000001);
REQUIRE(rxRound(RoundDown, 1, -10, &FDIV) == -0.10000000000000001);
REQUIRE(rxRound(RoundUp, 1, -10, &FDIV) == -0.099999999999999992);
REQUIRE(rxRound(RoundToZero, 1, -10, &FDIV) == -0.099999999999999992);
// FABSQRT clears the sign bit first, so sqrt(-2) is sqrt(2).
REQUIRE(rxRound(RoundToNearest, -2, 0, &FABSQRT) == 1.4142135623730951);
REQUIRE(rxRound(RoundDown, -2, 0, &FABSQRT) == 1.4142135623730949);
REQUIRE(rxRound(RoundUp, -2, 0, &FABSQRT) == 1.4142135623730951);
REQUIRE(rxRound(RoundToZero, -2, 0, &FABSQRT) == 1.4142135623730949);
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,10 @@
# Build for the RandomX ALU/FPU test (portable implementation).
CXXFLAGS=-Wall -std=c++17 -O0

# Link step now also passes $(CXXFLAGS) so flags stay consistent
# between compile and link.
TestAluFpu: TestAluFpu.o InstructionsPortable.o
	$(CXX) $(CXXFLAGS) TestAluFpu.o InstructionsPortable.o -o $@

# Object files build via the implicit .cpp -> .o rule.
TestAluFpu.o: TestAluFpu.cpp
InstructionsPortable.o: InstructionsPortable.cpp

# Declare clean phony so a file named "clean" cannot shadow the target.
.PHONY: clean
clean:
	rm -f TestAluFpu TestAluFpu.o InstructionsPortable.o
Loading…
Cancel
Save