ref: cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5
parent: a0b600a89c2e6e636579fe727235d036c08c7a9d
author: rodri <[email protected]>
date: Fri Dec 1 16:58:15 EST 2023
implement memory-aligned versions of some of the dot product routines.

add dotvec2_sse_a, dotvec2_sse4_a, dotvec2_avx_a and dotvec3_sse4_a, which
take pointers to 16-byte aligned Point2/Point3 operands so the loads can use
MOVAPD instead of the unaligned forms; add an amalloc helper that returns
suitably aligned storage; and wire the new routines into the benchmarks and
the test program.
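
For reference, a minimal sketch of how the aligned entry points are meant to
be called (illustration only: the example() wrapper is hypothetical and the
includes assume the usual u.h/libc.h/geometry.h chain; amalloc and
dotvec2_sse_a are the routines added by this patch):

	#include <u.h>
	#include <libc.h>
	#include <geometry.h>

	double dotvec2_sse_a(Point2*, Point2*);
	void *amalloc(ulong, ulong);

	void
	example(void)
	{
		Point2 *a, *b;

		/* MOVAPD faults on operands that are not 16-byte aligned, so
		 * the Point2s come from amalloc; the pointer it returns may
		 * not be the start of the malloc'd block, so it cannot safely
		 * be free()d. */
		a = amalloc(sizeof(Point2), 16);
		b = amalloc(sizeof(Point2), 16);
		*a = Vec2(1, 2);
		*b = Vec2(3, 4);
		print("%g\n", dotvec2_sse_a(a, b));	/* 1*3 + 2*4 = 11 */
	}
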
--- a/bench/main.c
+++ b/bench/main.c
@@ -8,8 +8,12 @@
double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
double dotvec3_sse4(Point3, Point3);
double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
Point2 Pt2b(double, double, double);
Point3 crossvec3_sse(Point3, Point3);
double hsubpd(double, double);
@@ -18,6 +22,21 @@
Point2 addpt2_avx(Point2, Point2);
Point3 addpt3_avx(Point3, Point3);
+void *
+amalloc(ulong n, ulong a)
+{
+ void *p;
+
+	assert(a > 1 && (a & (a-1)) == 0);	/* the alignment must be a power of two */
+
+	a--;
+	p = malloc(n+a);	/* over-allocate to leave room for the adjustment */
+	if(p == nil)
+		sysfatal("malloc: %r");
+	p = (void*)(((uintptr)p + a) & ~(uintptr)a);	/* round up to the alignment boundary; the adjusted pointer cannot safely be free()d */
+ return p;
+}
+
double
fmin(double a, double b)
{
@@ -65,8 +84,9 @@
bdotvec2(int fd)
{
Bgr g;
- B *b0, *b1, *b2, *b3;
+ B *b0, *b1, *b2, *b3, *b4, *b5, *b6;
Point2 a, b;
+ Point2 *aa, *bb;
int i;
benchinitgr(&g, "2d dot product");
@@ -74,10 +94,17 @@
b1 = benchadd(&g, "dotvec2_sse");
b2 = benchadd(&g, "dotvec2_sse4");
b3 = benchadd(&g, "dotvec2_avx");
+ b4 = benchadd(&g, "dotvec2_sse_a");
+ b5 = benchadd(&g, "dotvec2_sse4_a");
+ b6 = benchadd(&g, "dotvec2_avx_a");
while(b0->n > 0 || b1->n > 0){
a = Vec2(truerand()*frand(), truerand()*frand());
b = Vec2(truerand()*frand(), truerand()*frand());
+ aa = amalloc(sizeof(Point2), 16);
+ bb = amalloc(sizeof(Point2), 16);
+ *aa = a;
+ *bb = b;
benchin(b0);
for(i = 0; i < 1e6; i++)
@@ -98,6 +125,21 @@
for(i = 0; i < 1e6; i++)
dotvec2_avx(a, b);
benchout(b3);
+
+ benchin(b4);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_sse_a(aa, bb);
+ benchout(b4);
+
+ benchin(b5);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_sse4_a(aa, bb);
+ benchout(b5);
+
+ benchin(b6);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_avx_a(aa, bb);
+ benchout(b6);
}
benchprintgr(&g, fd);
@@ -108,8 +150,9 @@
bdotvec3(int fd)
{
Bgr g;
- B *b0, *b1, *b2;
+ B *b0, *b1, *b2, *b3;
Point3 a, b;
+ Point3 *aa, *bb;
int i;
benchinitgr(&g, "3d dot product");
@@ -116,10 +159,15 @@
b0 = benchadd(&g, "dotvec3");
b1 = benchadd(&g, "dotvec3_sse4");
b2 = benchadd(&g, "dotvec3_avx");
+ b3 = benchadd(&g, "dotvec3_sse4_a");
while(b0->n > 0 || b1->n > 0){
a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+ aa = amalloc(sizeof(Point3), 16);
+ bb = amalloc(sizeof(Point3), 16);
+ *aa = a;
+ *bb = b;
benchin(b0);
for(i = 0; i < 1e6; i++)
@@ -135,6 +183,11 @@
for(i = 0; i < 1e6; i++)
dotvec3_avx(a, b);
benchout(b2);
+
+ benchin(b3);
+ for(i = 0; i < 1e6; i++)
+ dotvec3_sse4_a(aa, bb);
+ benchout(b3);
}
benchprintgr(&g, fd);
--- a/dppd.s
+++ b/dppd.s
@@ -41,6 +41,29 @@
VZEROUPPER
RET
+TEXT dotvec2_sse_a(SB), 1, $0
+	MOVQ b+8(FP), DX		/* b; a is passed in BP (RARG) */
+	MOVAPD 0(DX), X1		/* aligned loads: operands must be 16-byte aligned */
+	MOVAPD 0(BP), X0
+	MULPD X1, X0			/* a.x*b.x, a.y*b.y */
+	HADDPD X0, X0			/* sum the two products; the result is returned in X0[0] */
+ RET
+
+TEXT dotvec2_sse4_a(SB), 1, $0
+ MOVQ b+8(FP), DX
+ MOVAPD 0(DX), X1
+ MOVAPD 0(BP), X0
+	DPPD $0x31, X1, X0		/* X0[0] = a.x*b.x + a.y*b.y */
+ RET
+
+TEXT dotvec2_avx_a(SB), 1, $0
+ MOVQ b+8(FP), DX
+	VMOVAPD_128mr(0, rDX, rX0)	/* aligned 128-bit loads (macro-encoded VMOVAPD) */
+	VMOVAPD_128mr(0, rBP, rX1)
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ VZEROUPPER
+ RET
+
TEXT dotvec3_sse4(SB), 1, $0
MOVUPD a+0(FP), X0
MOVUPD b+32(FP), X1
@@ -61,6 +84,16 @@
MOVSD b+48(FP), X2
VFMADD231SD(rX1, rX2, rX0)
VZEROUPPER
+ RET
+
+TEXT dotvec3_sse4_a(SB), 1, $0
+	MOVQ b+8(FP), DX		/* b; a is passed in BP (RARG) */
+	MOVAPD 0(DX), X0		/* b.x, b.y (aligned load) */
+	MOVAPD 0(BP), X1		/* a.x, a.y */
+	DPPD $0x31, X1, X0		/* X0[0] = a.x*b.x + a.y*b.y */
+	MOVSD 16(DX), X1		/* b.z */
+	MULSD 16(BP), X1		/* a.z*b.z */
+	ADDSD X1, X0
RET
TEXT Pt2b(SB), 1, $0
--- a/main.c
+++ b/main.c
@@ -6,8 +6,12 @@
double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
double dotvec3_sse4(Point3, Point3);
double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
Point2 Pt2b(double, double, double);
Point3 crossvec3_sse(Point3, Point3);
double hsubpd(double, double);
@@ -18,6 +22,21 @@
void addsub_sse(double*,double*);
double round(double);
+void *
+amalloc(ulong n, ulong a)
+{
+ void *p;
+
+	assert(a > 1 && (a & (a-1)) == 0);	/* the alignment must be a power of two */
+
+	a--;
+	p = malloc(n+a);	/* over-allocate to leave room for the adjustment */
+	if(p == nil)
+		sysfatal("malloc: %r");
+	p = (void*)(((uintptr)p + a) & ~(uintptr)a);	/* round up to the alignment boundary; the adjusted pointer cannot safely be free()d */
+ return p;
+}
+
void
addsub(double *a, double *b)
{
@@ -44,6 +63,8 @@
double va[2], vb[2];
Point2 p0, p1, pr;
Point3 p0t, p1t, prt;
+ Point2 *ap0, *ap1, *apr;
+ Point3 *ap0t, *ap1t, *aprt;
GEOMfmtinstall();
ARGBEGIN{default:sysfatal("shit");}ARGEND
@@ -52,6 +73,14 @@
a = strtod(argv[0], nil);
b = strtod(argv[1], nil);
+ ap0 = amalloc(sizeof(Point2), 16);
+ ap1 = amalloc(sizeof(Point2), 16);
+ apr = amalloc(sizeof(Point2), 16);
+
+ ap0t = amalloc(sizeof(Point3), 16);
+ ap1t = amalloc(sizeof(Point3), 16);
+ aprt = amalloc(sizeof(Point3), 16);
+
r = 0;
r = fmin(a, b);
print("fmin(%g, %g) = %g\n", a, b, r);
@@ -78,6 +107,20 @@
print("\n");
+ *ap0 = Pt2b(a, 1, 1);
+ *ap1 = Pt2b(b, 3, 1);
+ r = 0;
+ r = dotvec2_sse_a(ap0, ap1);
+ print("dotvec2_sse_a(%v, %v) = %g\n", *ap0, *ap1, r);
+ r = 0;
+ r = dotvec2_sse4_a(ap0, ap1);
+ print("dotvec2_sse4_a(%v, %v) = %g\n", *ap0, *ap1, r);
+ r = 0;
+ r = dotvec2_avx_a(ap0, ap1);
+ print("dotvec2_avx_a(%v, %v) = %g\n", *ap0, *ap1, r);
+
+ print("\n");
+
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
r = 0;
@@ -89,6 +132,14 @@
r = 0;
r = dotvec3_avx(p0t, p1t);
print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r);
+
+ print("\n");
+
+ *ap0t = Pt3(a, 1, 9, 1);
+ *ap1t = Pt3(b, 3, 4, 1);
+ r = 0;
+ r = dotvec3_sse4_a(ap0t, ap1t);
+ print("dotvec3_sse4_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
print("\n");