shithub: amd64-simd

Download patch

ref: 7cf4634e668730749aa8b7fa9ff16cf4234958fa
parent: d850c3b7f47e58556c160f9d03ea20aa52452020
author: rodri <[email protected]>
date: Fri Nov 24 11:48:14 EST 2023

clean and organize things up. implement VZEROUPPER.

--- /dev/null
+++ b/avx.h
@@ -1,0 +1,40 @@
+#define VEX_m_0F	(0x1)	/* mmmmm: 0F opcode map */
+#define VEX_m_0F38	(0x2)	/* mmmmm: 0F 38 opcode map */
+#define VEX_m_0F3A	(0x3)	/* mmmmm: 0F 3A opcode map */
+#define VEX_L_128	(0x0)	/* L: 128-bit vector length */
+#define VEX_L_256	(0x1)	/* L: 256-bit vector length */
+#define VEX_p_NO	(0x0)	/* pp: no implied SIMD prefix */
+#define VEX_p_66	(0x1)	/* pp: implied 66 prefix */
+#define VEX_p_F3	(0x2)	/* pp: implied F3 prefix */
+#define VEX_p_F2	(0x3)	/* pp: implied F2 prefix */
+
+#define VEX3(r, x, b, m, w, v, l, p)	BYTE $0xC4;	/* 3-byte VEX. byte 1: ~R ~X ~B mmmmm; byte 2: W ~vvvv L pp */	\
+				BYTE $((((~(r))&1)<<7)|(((~(x))&1)<<6)|(((~(b))&1)<<5)|(m));	/* &1 keeps each complemented bit inside its own position */	\
+				BYTE $(((w)<<7)|(((~(v))&0xF)<<3)|((l)<<2)|(p))	/* &0xF stops ~v from spilling into the W bit */
+#define VEX2(r, v, l, p)	BYTE $0xC5;	/* 2-byte VEX: second byte is ~R ~vvvv L pp (second argument is vvvv) */	\
+			BYTE $((((~(r))&1)<<7)|(((~(v))&0xF)<<3)|((l)<<2)|(p))	/* complements masked to their field widths */
+#define VOP(o, m, ro, rm)	BYTE $(o);	/* opcode byte, then ModRM = m:ro:rm (2:3:3 bits) */	\
+			BYTE $((rm)|((ro)<<3)|((m)<<6))
+#define VOPi(o, m, ro, rm, i)	VOP(o, m, ro, rm);	/* VOP followed by one extra byte i */	\
+			BYTE $(i)
+
+
+/* VZEROUPPER: 3-byte VEX prefix (0F map, L=128, no SIMD prefix) followed by opcode 0x77; takes no operands */
+#define VZEROUPPER	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_NO); BYTE $0x77
+
+/* VMOVUPD (opcode 0x10, mod=1: reg=d, rm=s, then the byte off) and VMOVAPD register form (opcode 0x28, mod=3: reg=d, rm=s) */
+#define VMOVUPD_128mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);	\
+				VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVAPD_128rr(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);		\
+			VOP(0x28, 0x3, (d), (s))
+/* VDPPD: 0F3A map, opcode 0x41; vvvv=s0, reg=d, rm=s1 (mod=3, register form); the immediate byte is fixed at 0x31 */
+#define VDPPD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66);	\
+			VOPi(0x41, 0x3, (d), (s1), 0x31)
+
+/* VFMADD231SD (128 bit): 0F38 map, W=1, opcode 0xB9; vvvv=s0, reg=d, rm=s1 (mod=3, register form) */
+#define VFMADD231SD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
+			VOP(0xB9, 0x3, (d), (s1))
+
+/* VFMADD231PD (128 bit): 0F38 map, W=1, opcode 0xB8; vvvv=s0, reg=d, rm=s1 (mod=3, register form) */
+#define VFMADD231PD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
+			VOP(0xB8, 0x3, (d), (s1))
--- a/dppd.s
+++ b/dppd.s
@@ -1,4 +1,6 @@
+#include "regs.h"
 #include "sse.h"
+#include "avx.h"
 
 DATA one(SB)/8,$1.0
 GLOBL one(SB), $8
--- a/mkfile
+++ b/mkfile
@@ -9,6 +9,8 @@
 	nanosec.$O\
 
 HFILES=\
+	regs.h\
 	sse.h\
+	avx.h\
 
 </sys/src/cmd/mkone
--- /dev/null
+++ b/regs.h
@@ -1,0 +1,19 @@
+/* GPRs: register numbers (AX=0 ... DI=7) for use as operands of the encoding macros */
+#define rAX	0
+#define rCX	1
+#define rDX	2
+#define rBX	3
+#define rSP	4
+#define rBP	5
+#define rSI	6
+#define rDI	7
+
+/* SSE and AVX register numbers (the same number names the XMM, YMM or ZMM register); trailing comments list the other register sharing that number */
+#define rX0	0	/* X8 */
+#define rX1	1	/* X9 */
+#define rX2	2	/* X10 */
+#define rX3	3	/* X11 */
+#define rX4	4	/* X12 */
+#define rX5	5	/* X13 */
+#define rX6	6	/* X14 */
+#define rX7	7	/* X15 */
--- a/sse.h
+++ b/sse.h
@@ -1,30 +1,3 @@
-#define rAX	0
-#define rCX	1
-#define rDX	2
-#define rBX	3
-#define rSP	4
-#define rBP	5
-#define rSI	6
-#define rDI	7
-
-#define rX0	0
-#define rX1	1
-#define rX2	2
-#define rX3	3
-#define rX4	4
-#define rX5	5
-#define rX6	6
-
-#define VEX_m_0F	(1)
-#define VEX_m_0F38	(2)
-#define VEX_m_0F3A	(3)
-#define VEX_L_128	(0)
-#define VEX_L_256	(1)
-#define VEX_p_NO	(0)
-#define VEX_p_66	(1)
-#define VEX_p_F3	(2)
-#define VEX_p_F2	(3)
-
 #define OP(o, m, ro, rm)	WORD $0x0F66; BYTE $(o);	\
 			BYTE $(((m)<<6)|((ro)<<3)|(rm))
 #define OPi(o, m, ro, rm, i)	OP((o), (m), (ro), (rm));	\
@@ -34,15 +7,6 @@
 #define OP4i(o, m, ro, rm, i)	OP4((o), (m), (ro), (rm));	\
 			BYTE $(i)
 
-#define VEX3(r, x, b, m, w, v, l, p)	BYTE $0xC4;				\
-				BYTE $(((~r)<<7)|((~x)<<6)|((~b)<<5)|(m));	\
-				BYTE $(((w)<<7)|((~v)<<3)|((l)<<2)|(p))
-#define VEX2(r, b, l, p)	BYTE $0xC5;					\
-			BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p))
-#define VOP(o, m, ro, rm)	BYTE $(o);	\
-			BYTE $(((m)<<6)|((ro)<<3)|(rm))
-#define VOPi(o, m, ro, rm, i)	VOP((o), (m), (ro), (rm));	\
-			BYTE $(i)
 
 /* MOVLPD */
 //opcode = 660F12
@@ -66,20 +30,3 @@
 //modrm  = 11 000 001 [X1 → X0]
 //imm8   = 0011 0001
 #define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)
-
-/* VMOVAPD */
-#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);			\
-			VOPi(0x10, 0x1, (d), (s), (off))
-#define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);			\
-			VOP(0x28, 0x3, (d), (s))
-/* VDPPD */
-#define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66);		\
-			VOPi(0x41, 0x3, (d), (s1), 0x31)
-
-/* VFMADD231SD (128 bit) */
-#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
-			VOP(0xB9, 0x3, (d), (s1))
-
-/* VFMADD231PD (128 bit) */
-#define VFMADD231PD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
-			VOP(0xB8, 0x3, (d), (s1))