shithub: opus

Download patch

ref: b518b56fe11bf53f88fe30d57ea9d668337983a9
parent: 9880c4cdebf7e4db5616546e801749d36fdd7202
author: Timothy B. Terriberry <[email protected]>
date: Mon May 20 11:29:04 EDT 2013

Clean up register constraints.

http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0068b/CIHBJEHG.html
 says (of MUL and MLA) that "Rd cannot be the same as Rm."
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0068b/CIHBJEHG.html
 says (of the long multiplies, e.g., SMULL and SMLAL) that "RdLo, RdHi, and
 Rm must all be different registers."
This means that some of the early clobbers I removed really should
 have been there (to prevent aliasing Rd, RdLo, or RdHi with Rm).
It also means that we should swap the two source operands of some of
 the smull instructions in the FFT's complex multiplies, so that Rm no
 longer aliases RdLo.
This should only affect the ARMv4 optimizations.

Thanks to Nils Wallménius for the report.

While we're here, audit the commutative pair flags again, since I
 screwed up at least one of them, and eliminate some dead code.

--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -110,7 +110,7 @@
             "smull %[tt], %[mi], r1, %[br]\n\t" \
             "smlal %[tt], %[mi], r0, %[bi]\n\t" \
             "rsb %[bi], %[bi], #0\n\t" \
-            "smull r0, %[mr], r0, %[br]\n\t" \
+            "smull r0, %[mr], %[br], r0\n\t" \
             "mov %[tt], %[tt], lsr #15\n\t" \
             "smlal r0, %[mr], r1, %[bi]\n\t" \
             "orr %[mi], %[tt], %[mi], lsl #17\n\t" \
@@ -138,7 +138,7 @@
             "smull %[tt], %[mi], r1, %[br]\n\t" \
             "smlal %[tt], %[mi], r0, %[bi]\n\t" \
             "rsb %[bi], %[bi], #0\n\t" \
-            "smull r0, %[mr], r0, %[br]\n\t" \
+            "smull r0, %[mr], %[br], r0\n\t" \
             "mov %[tt], %[tt], lsr #17\n\t" \
             "smlal r0, %[mr], r1, %[bi]\n\t" \
             "orr %[mi], %[tt], %[mi], lsl #15\n\t" \
@@ -166,7 +166,7 @@
             "smull %[tt], %[mr], r0, %[br]\n\t" \
             "smlal %[tt], %[mr], r1, %[bi]\n\t" \
             "rsb %[bi], %[bi], #0\n\t" \
-            "smull r1, %[mi], r1, %[br]\n\t" \
+            "smull r1, %[mi], %[br], r1\n\t" \
             "mov %[tt], %[tt], lsr #15\n\t" \
             "smlal r1, %[mi], r0, %[bi]\n\t" \
             "orr %[mr], %[tt], %[mr], lsl #17\n\t" \
--- a/celt/fixed_armv4.h
+++ b/celt/fixed_armv4.h
@@ -36,8 +36,8 @@
   __asm__(
       "#MULT16_32_Q16\n\t"
       "smull %0, %1, %2, %3\n\t"
-      : "=r"(rd_lo), "=r"(rd_hi)
-      : "r"(b),"r"(a<<16)
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b),"r"(a<<16)
   );
   return rd_hi;
 }
@@ -53,7 +53,7 @@
   __asm__(
       "#MULT16_32_Q15\n\t"
       "smull %0, %1, %2, %3\n\t"
-      : "=r"(rd_lo), "=r"(rd_hi)
+      : "=&r"(rd_lo), "=&r"(rd_hi)
       : "%r"(b), "r"(a<<16)
   );
   /*We intentionally don't OR in the high bit of rd_lo for speed.*/
--- a/celt/fixed_armv5e.h
+++ b/celt/fixed_armv5e.h
@@ -52,26 +52,14 @@
 #undef MULT16_32_Q15
 static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
 {
-#if 0
-  unsigned rd_lo;
-  int rd_hi;
-  __asm__(
-      "#MULT16_32_Q15\n\t"
-      "smull %0, %1, %2, %3\n\t"
-      : "=r"(rd_lo), "=r"(rd_hi)
-      : "%r"(b), "r"(a<<16)
-  );
-  return (rd_lo>>31)|(rd_hi<<1);
-#else
   int res;
   __asm__(
       "#MULT16_32_Q15\n\t"
       "smulwb %0, %1, %2\n\t"
       : "=r"(res)
-      : "%r"(b), "r"(a)
+      : "r"(b), "r"(a)
   );
   return res<<1;
-#endif
 }
 #define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
 
--- a/silk/SigProc_FIX_armv4.h
+++ b/silk/SigProc_FIX_armv4.h
@@ -37,7 +37,7 @@
   __asm__(
       "#silk_MLA\n\t"
       "mla %0, %1, %2, %3\n\t"
-      : "=r"(res)
+      : "=&r"(res)
       : "r"(b), "r"(c), "r"(a)
   );
   return res;
--- a/silk/SigProc_FIX_armv5e.h
+++ b/silk/SigProc_FIX_armv5e.h
@@ -37,7 +37,7 @@
       "#silk_SMULTT\n\t"
       "smultt %0, %1, %2\n\t"
       : "=r"(res)
-      : "r"(a), "r"(b)
+      : "%r"(a), "r"(b)
   );
   return res;
 }
@@ -52,7 +52,7 @@
       "#silk_SMLATT\n\t"
       "smlatt %0, %1, %2, %3\n\t"
       : "=r"(res)
-      : "r"(b), "r"(c), "r"(a)
+      : "%r"(b), "r"(c), "r"(a)
   );
   return res;
 }
--- a/silk/macros_armv4.h
+++ b/silk/macros_armv4.h
@@ -37,7 +37,7 @@
   __asm__(
       "#silk_SMULWB\n\t"
       "smull %0, %1, %2, %3\n\t"
-      : "=r"(rd_lo), "=r"(rd_hi)
+      : "=&r"(rd_lo), "=&r"(rd_hi)
       : "%r"(a), "r"(b<<16)
   );
   return rd_hi;
@@ -57,7 +57,7 @@
   __asm__(
       "#silk_SMULWT\n\t"
       "smull %0, %1, %2, %3\n\t"
-      : "=r"(rd_lo), "=r"(rd_hi)
+      : "=&r"(rd_lo), "=&r"(rd_hi)
       : "%r"(a), "r"(b&~0xFFFF)
   );
   return rd_hi;
@@ -77,10 +77,10 @@
   __asm__(
     "#silk_SMULWW\n\t"
     "smull %0, %1, %2, %3\n\t"
-    : "=r"(rd_lo), "=r"(rd_hi)
+    : "=&r"(rd_lo), "=&r"(rd_hi)
     : "%r"(a), "r"(b)
   );
-  return (rd_lo>>16)|(rd_hi<<16);
+  return (rd_hi<<16)+(rd_lo>>16);
 }
 #define silk_SMULWW(a, b) (silk_SMULWW_armv4(a, b))
 
@@ -91,12 +91,12 @@
   unsigned rd_lo;
   int rd_hi;
   __asm__(
-    "#silk_SMULWW\n\t"
+    "#silk_SMLAWW\n\t"
     "smull %0, %1, %2, %3\n\t"
-    : "=r"(rd_lo), "=r"(rd_hi)
+    : "=&r"(rd_lo), "=&r"(rd_hi)
     : "%r"(b), "r"(c)
   );
-  return a+((rd_lo>>16)|(rd_hi<<16));
+  return a+(rd_hi<<16)+(rd_lo>>16);
 }
 #define silk_SMLAWW(a, b, c) (silk_SMLAWW_armv4(a, b, c))
 
--- a/silk/macros_armv5e.h
+++ b/silk/macros_armv5e.h
@@ -203,7 +203,7 @@
   __asm__(
       "#silk_CLZ32\n\t"
       "clz %0, %1\n\t"
-      : "=&r"(res)
+      : "=r"(res)
       : "r"(in32)
   );
   return res;