shithub: amd64-simd

Download patch

ref: cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5
parent: a0b600a89c2e6e636579fe727235d036c08c7a9d
author: rodri <[email protected]>
date: Fri Dec 1 16:58:15 EST 2023

implement memory-aligned versions of some functions.

--- a/bench/main.c
+++ b/bench/main.c
@@ -8,8 +8,12 @@
 double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
 double dotvec3_sse4(Point3, Point3);
 double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
 Point2 Pt2b(double, double, double);
 Point3 crossvec3_sse(Point3, Point3);
 double hsubpd(double, double);
@@ -18,6 +22,21 @@
 Point2 addpt2_avx(Point2, Point2);
 Point3 addpt3_avx(Point3, Point3);
 
+void *
+amalloc(ulong n, ulong a)
+{
+	void *p;
+
+	assert(a > 1 && (a&1) == 0);
+
+	a--;
+	p = malloc(n+a);
+	if(p == nil)
+		sysfatal("malloc: %r");
+	p = (void*)(((uintptr)p + a)&~a);
+	return p;
+}
+
 double
 fmin(double a, double b)
 {
@@ -65,8 +84,9 @@
 bdotvec2(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2, *b3;
+	B *b0, *b1, *b2, *b3, *b4, *b5, *b6;
 	Point2 a, b;
+	Point2 *aa, *bb;
 	int i;
 
 	benchinitgr(&g, "2d dot product");
@@ -74,10 +94,17 @@
 	b1 = benchadd(&g, "dotvec2_sse");
 	b2 = benchadd(&g, "dotvec2_sse4");
 	b3 = benchadd(&g, "dotvec2_avx");
+	b4 = benchadd(&g, "dotvec2_sse_a");
+	b5 = benchadd(&g, "dotvec2_sse4_a");
+	b6 = benchadd(&g, "dotvec2_avx_a");
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec2(truerand()*frand(), truerand()*frand());
 		b = Vec2(truerand()*frand(), truerand()*frand());
+		aa = amalloc(sizeof(Point2), 16);
+		bb = amalloc(sizeof(Point2), 16);
+		*aa = a;
+		*bb = b;
 
 		benchin(b0);
 		for(i = 0; i < 1e6; i++)
@@ -98,6 +125,21 @@
 		for(i = 0; i < 1e6; i++)
 			dotvec2_avx(a, b);
 		benchout(b3);
+
+		benchin(b4);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_sse_a(aa, bb);
+		benchout(b4);
+
+		benchin(b5);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_sse4_a(aa, bb);
+		benchout(b5);
+
+		benchin(b6);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_avx_a(aa, bb);
+		benchout(b6);
 	}
 
 	benchprintgr(&g, fd);
@@ -108,8 +150,9 @@
 bdotvec3(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2;
+	B *b0, *b1, *b2, *b3;
 	Point3 a, b;
+	Point3 *aa, *bb;
 	int i;
 
 	benchinitgr(&g, "3d dot product");
@@ -116,10 +159,15 @@
 	b0 = benchadd(&g, "dotvec3");
 	b1 = benchadd(&g, "dotvec3_sse4");
 	b2 = benchadd(&g, "dotvec3_avx");
+	b3 = benchadd(&g, "dotvec3_sse4_a");
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
 		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		aa = amalloc(sizeof(Point3), 16);
+		bb = amalloc(sizeof(Point3), 16);
+		*aa = a;
+		*bb = b;
 
 		benchin(b0);
 		for(i = 0; i < 1e6; i++)
@@ -135,6 +183,11 @@
 		for(i = 0; i < 1e6; i++)
 			dotvec3_avx(a, b);
 		benchout(b2);
+
+		benchin(b3);
+		for(i = 0; i < 1e6; i++)
+			dotvec3_sse4_a(aa, bb);
+		benchout(b3);
 	}
 
 	benchprintgr(&g, fd);
--- a/dppd.s
+++ b/dppd.s
@@ -41,6 +41,29 @@
 	VZEROUPPER
 	RET
 
+TEXT dotvec2_sse_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	MOVAPD 0(DX), X1
+	MOVAPD 0(BP), X0
+	MULPD X1, X0
+	HADDPD X0, X0
+	RET
+
+TEXT dotvec2_sse4_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	MOVAPD 0(DX), X1
+	MOVAPD 0(BP), X0
+	DPPD $0x31, X1, X0
+	RET
+
+TEXT dotvec2_avx_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	VMOVAPD_128mr(0, rDX, rX0)
+	VMOVAPD_128mr(0, rBP, rX1)
+	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
+	VZEROUPPER
+	RET
+
 TEXT dotvec3_sse4(SB), 1, $0
 	MOVUPD a+0(FP), X0
 	MOVUPD b+32(FP), X1
@@ -61,6 +84,16 @@
 	MOVSD b+48(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
 	VZEROUPPER
+	RET
+
+TEXT dotvec3_sse4_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	MOVAPD 0(DX), X0
+	MOVAPD 0(BP), X1
+	DPPD $0x31, X1, X0
+	MOVSD 16(DX), X1
+	MULSD 16(BP), X1
+	ADDSD X1, X0
 	RET
 
 TEXT Pt2b(SB), 1, $0
--- a/main.c
+++ b/main.c
@@ -6,8 +6,12 @@
 double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
 double dotvec3_sse4(Point3, Point3);
 double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
 Point2 Pt2b(double, double, double);
 Point3 crossvec3_sse(Point3, Point3);
 double hsubpd(double, double);
@@ -18,6 +22,21 @@
 void addsub_sse(double*,double*);
 double round(double);
 
+void *
+amalloc(ulong n, ulong a)
+{
+	void *p;
+
+	assert(a > 1 && (a&1) == 0);
+
+	a--;
+	p = malloc(n+a);
+	if(p == nil)
+		sysfatal("malloc: %r");
+	p = (void*)(((uintptr)p + a)&~a);
+	return p;
+}
+
 void
 addsub(double *a, double *b)
 {
@@ -44,6 +63,8 @@
 	double va[2], vb[2];
 	Point2 p0, p1, pr;
 	Point3 p0t, p1t, prt;
+	Point2 *ap0, *ap1, *apr;
+	Point3 *ap0t, *ap1t, *aprt;
 
 	GEOMfmtinstall();
 	ARGBEGIN{default:sysfatal("shit");}ARGEND
@@ -52,6 +73,14 @@
 	a = strtod(argv[0], nil);
 	b = strtod(argv[1], nil);
 
+	ap0 = amalloc(sizeof(Point2), 16);
+	ap1 = amalloc(sizeof(Point2), 16);
+	apr = amalloc(sizeof(Point2), 16);
+
+	ap0t = amalloc(sizeof(Point3), 16);
+	ap1t = amalloc(sizeof(Point3), 16);
+	aprt = amalloc(sizeof(Point3), 16);
+
 	r = 0;
 	r = fmin(a, b);
 	print("fmin(%g, %g) = %g\n", a, b, r);
@@ -78,6 +107,20 @@
 
 	print("\n");
 
+	*ap0 = Pt2b(a, 1, 1);
+	*ap1 = Pt2b(b, 3, 1);
+	r = 0;
+	r = dotvec2_sse_a(ap0, ap1);
+	print("dotvec2_sse_a(%v, %v) = %g\n", *ap0, *ap1, r);
+	r = 0;
+	r = dotvec2_sse4_a(ap0, ap1);
+	print("dotvec2_sse4_a(%v, %v) = %g\n", *ap0, *ap1, r);
+	r = 0;
+	r = dotvec2_avx_a(ap0, ap1);
+	print("dotvec2_avx_a(%v, %v) = %g\n", *ap0, *ap1, r);
+
+	print("\n");
+
 	p0t = Pt3(a, 1, 9, 1);
 	p1t = Pt3(b, 3, 4, 1);
 	r = 0;
@@ -89,6 +132,14 @@
 	r = 0;
 	r = dotvec3_avx(p0t, p1t);
 	print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r);
+
+	print("\n");
+
+	*ap0t = Pt3(a, 1, 9, 1);
+	*ap1t = Pt3(b, 3, 4, 1);
+	r = 0;
+	r = dotvec3_sse4_a(ap0t, ap1t);
+	print("dotvec3_sse4_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
 
 	print("\n");