shithub: amd64-simd

Download patch

ref: 96ccc85d6a58b7ebfcb69d6d470ff209668f6187
author: rodri <[email protected]>
date: Thu Nov 23 06:43:50 EST 2023

initial SIMD trials of some libgeometry functions.

--- /dev/null
+++ b/dppd.s
@@ -1,0 +1,71 @@
+#include "sse.h"
+
+DATA one(SB)/8,$1.0
+GLOBL one(SB), $8
+
+TEXT dppd(SB), 1, $0
+	MOVQ SP, AX
+	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
+	MOVHPD(16, rAX, rX0)	/* MOVHPD a+8(FP), X0 */
+	MOVLPD(32, rAX, rX1)	/* MOVLPD b+0(FP), X1 */
+	MOVHPD(40, rAX, rX1)	/* MOVHPD b+8(FP), X1*/
+	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
+	RET
+
+TEXT dppd3(SB), 1, $0
+	MOVQ SP, AX
+	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
+	MOVHPD(16, rAX, rX0)	/* MOVHPD a+8(FP), X0 */
+	MOVLPD(40, rAX, rX1)	/* MOVLPD b+0(FP), X1 */
+	MOVHPD(48, rAX, rX1)	/* MOVHPD b+8(FP), X1 */
+	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
+	MOVSD one(SB), X1
+	MOVHPD(24, rAX, rX0)	/* MOVHPD a+16(FP), X0 */
+	MOVHPD(56, rAX, rX1)	/* MOVHPD b+16(FP), X1 */
+	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
+	RET
+
+TEXT Pt2b(SB), 1, $0
+	MOVQ BP, DI
+	MOVSD x+8(FP), X0
+	MOVSD X0, 0(DI)
+	MOVSD y+16(FP), X0
+	MOVSD X0, 8(DI)
+	MOVSD w+24(FP), X0
+	MOVSD X0, 16(DI)
+	RET
+
+TEXT hsubpd(SB), 1, $0
+	MOVQ SP, AX
+	MOVLPD(8, rAX, rX0)
+	MOVHPD(16, rAX, rX0)
+	HSUBPD(rX0, rX0)
+	RET
+
+TEXT xvec3(SB), 1, $0
+	MOVQ SP, AX
+	ADDQ $8, AX
+	MOVLPD(40, rAX, rX0)
+	MOVHPD(8, rAX, rX0)
+	MOVLPD(16, rAX, rX1)
+	MOVHPD(48, rAX, rX1)
+	MOVLPD(56, rAX, rX2)
+	MOVHPD(24, rAX, rX2)
+	MOVAPD X1, X3
+	MULPD X2, X3
+	HSUBPD(rX3, rX3)	/* x */
+	MOVAPD X2, X4
+	SHUFPD $0x1, X4, X4
+	MULPD X0, X4
+	HSUBPD(rX4, rX4)	/* y */
+	MOVAPD X0, X5
+	MULPD X1, X5
+	SHUFPD $0x1, X5, X5
+	HSUBPD(rX5, rX5)	/* z */
+	MOVQ BP, DI
+	MOVSD X3, 0(DI)
+	MOVSD X4, 8(DI)
+	MOVSD X5, 16(DI)
+	XORPD X0, X0
+	MOVSD X0, 24(DI)
+	RET
--- /dev/null
+++ b/main.c
@@ -1,0 +1,82 @@
+#include <u.h>
+#include <libc.h>
+#include <geometry.h>
+
+uvlong nanosec(void);
+double min(double, double);
+double dppd(Point2, Point2);
+double dppd3(Point3, Point3);
+Point2 Pt2b(double, double, double);
+Point3 xvec3(Point3, Point3);
+double hsubpd(double, double);
+
+double
+fmin(double a, double b)
+{
+	return a<b? a: b;
+}
+
+void
+main(int argc, char *argv[])
+{
+	uvlong t0, t1;
+	double a, b, r;
+	Point2 p0, p1;
+	Point3 p0t, p1t, pr;
+
+	GEOMfmtinstall();
+	ARGBEGIN{default:sysfatal("shit");}ARGEND
+	if(argc != 2)
+		sysfatal("shit");
+	a = strtod(argv[0], nil);
+	b = strtod(argv[1], nil);
+
+	t0 = nanosec();
+	r = fmin(a, b);
+	t1 = nanosec();
+	print("fmin(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+	t0 = nanosec();
+	r = min(a, b);
+	t1 = nanosec();
+	print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+
+	p0 = Pt2b(a, 1, 1);
+	p1 = Pt2b(b, 3, 1);
+	t0 = nanosec();
+	r = dppd(p0, p1);
+	t1 = nanosec();
+	print("dppd(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+	t0 = nanosec();
+	r = dotvec2(p0, p1);
+	t1 = nanosec();
+	print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+
+	p0t = Pt3(a, 1, 9, 1);
+	p1t = Pt3(b, 3, 4, 1);
+	t0 = nanosec();
+	r = dppd3(p0t, p1t);
+	t1 = nanosec();
+	print("dppd3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+	t0 = nanosec();
+	r = dotvec3(p0t, p1t);
+	t1 = nanosec();
+	print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+
+	t0 = nanosec();
+	r = hsubpd(a, b);
+	t1 = nanosec();
+	print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+
+	p0t = Pt3(a, 1, 9, 1);
+	p1t = Pt3(b, 3, 4, 1);
+	t0 = nanosec();
+	pr = xvec3(p0t, p1t);
+	t1 = nanosec();
+	print("xvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+	t0 = nanosec();
+	pr = crossvec3(p0t, p1t);
+	t1 = nanosec();
+	print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+
+	exits(nil);
+}
--- /dev/null
+++ b/min.s
@@ -1,0 +1,4 @@
+TEXT min(SB), 1, $0
+	MOVSD a+0(FP), X0
+	MINSD b+8(FP), X0
+	RET
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,14 @@
+</$objtype/mkfile
+
+BIN=/$objtype/bin
+TARG=stuff
+OFILES=\
+	main.$O\
+	min.$O\
+	dppd.$O\
+	nanosec.$O\
+
+HFILES=\
+	sse.h\
+
+</sys/src/cmd/mkone
--- /dev/null
+++ b/nanosec.c
@@ -1,0 +1,109 @@
+#include <u.h>
+#include <libc.h>
+#include <tos.h>
+
+/*
+ * This code is a mixture of cpuid(1) and the nanosec() found in vmx,
+ * in order to force the use of nsec(2) in case we are running in a
+ * virtualized environment where the clock is mis-bhyve-ing.
+ */
+
+typedef struct Res {
+	ulong ax, bx, cx, dx;
+} Res;
+
+static uchar _cpuid[] = {
+	0x5E,			/* POP SI (PC) */
+	0x5D,			/* POP BP (Res&) */
+	0x58,			/* POP AX */
+	0x59,			/* POP CX */
+
+	0x51,			/* PUSH CX */
+	0x50,			/* PUSH AX */
+	0x55,			/* PUSH BP */
+	0x56,			/* PUSH SI */
+
+	0x31, 0xDB,		/* XOR BX, BX */
+	0x31, 0xD2,		/* XOR DX, DX */
+
+	0x0F, 0xA2,		/* CPUID */
+
+	0x89, 0x45, 0x00,	/* MOV AX, 0(BP) */
+	0x89, 0x5d, 0x04,	/* MOV BX, 4(BP) */
+	0x89, 0x4d, 0x08,	/* MOV CX, 8(BP) */
+	0x89, 0x55, 0x0C,	/* MOV DX, 12(BP) */
+	0xC3,			/* RET */
+};
+
+static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid;
+
+/*
+ * nsec() is wallclock and can be adjusted by timesync
+ * so need to use cycles() instead, but fall back to
+ * nsec() in case we can't
+ */
+uvlong
+nanosec(void)
+{
+	static uvlong fasthz, xstart;
+	char buf[13], path[128];
+	ulong w;
+	uvlong x, div;
+	int fd;
+	Res r;
+
+	if(fasthz == ~0ULL)
+		return nsec() - xstart;
+
+	if(fasthz == 0){
+		/* first long in a.out header */
+		snprint(path, sizeof path, "/proc/%d/text", getpid());
+		fd = open(path, OREAD);
+		if(fd < 0)
+			goto Wallclock;
+		if(read(fd, buf, 4) != 4){
+			close(fd);
+			goto Wallclock;
+		}
+		close(fd);
+
+		w = ((ulong *) buf)[0];
+
+		switch(w){
+		default:
+			goto Wallclock;
+		case 0x978a0000:	/* amd64 */
+			/* patch out POP BP -> POP AX */
+			_cpuid[1] = 0x58;
+		case 0xeb010000:	/* 386 */
+			break;
+		}
+		segflush(_cpuid, sizeof(_cpuid));
+
+		r = cpuid(0x40000000, 0);
+		((ulong *) buf)[0] = r.bx;
+		((ulong *) buf)[1] = r.cx;
+		((ulong *) buf)[2] = r.dx;
+		buf[12] = 0;
+
+		if(strstr(buf, "bhyve") != nil)
+			goto Wallclock;
+
+		if(_tos->cyclefreq){
+			fasthz = _tos->cyclefreq;
+			cycles(&xstart);
+		} else {
+Wallclock:
+			fasthz = ~0ULL;
+			xstart = nsec();
+		}
+		return 0;
+	}
+	cycles(&x);
+	x -= xstart;
+
+	/* this is ugly */
+	for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL);
+
+	return x / (fasthz / div);
+}
--- /dev/null
+++ b/sse.h
@@ -1,0 +1,50 @@
+#define rAX	0
+#define rCX	1
+#define rDX	2
+#define rBX	3
+#define rSP	4
+#define rBP	5
+#define rSI	6
+#define rDI	7
+
+#define rX0	0
+#define rX1	1
+#define rX2	2
+#define rX3	3
+#define rX4	4
+#define rX5	5
+#define rX6	6
+
+#define OP(o, m, ro, rm)	WORD $0x0F66;	/* op + modr/m byte */	\
+			BYTE $(o);					\
+			BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define OPi(o, m, ro, rm, i)	OP((o), (m), (ro), (rm));		\
+			BYTE $(i)
+#define OP4(o, m, ro, rm)	WORD $0x0F66;				\
+			WORD $(o);					\
+			BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define OP4i(o, m, ro, rm, i)	OP4((o), (m), (ro), (rm));		\
+			BYTE $(i)
+
+/* MOVLPD */
+//opcode = 660F12
+//modrm  = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
+//disp8 = 8 / 32
+#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off))
+
+/* MOVHPD */
+//opcode = 660F16
+//modrm  = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
+//disp8 = 16 / 40
+#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off))
+
+/* HSUBPD */
+//opcode = 660F7D = 01100110 00001111 01111101
+//modrm = 11 000 000 [X0 → X0]
+#define HSUBPD(s, d) OP(0x7D, 0x3, (d), (s))
+
+/* DPPD */
+//opcode = 660F3A41 = 01100110 00001111 00111010 01000001
+//modrm  = 11 000 001 [X1 → X0]
+//imm8   = 0011 0001
+#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)