ref: 972a34ec2c79d241318af24389b8ee042d10556a
parent: b7bd4c20acfd951ba46647e07411285997d952f4
author: Timothy B. Terriberry <[email protected]>
date: Sun May 19 13:11:17 EDT 2013

Add ARMv4/ARMv5E macros.

Original patch by Aurélien Zanelli <[email protected]>:
 http://lists.xiph.org/pipermail/opus/2013-May/002078.html

Revised version:
- Add autoconf detection (ported from libtheora).
- Rename ARM5E to ARMv5E (an ARM5 is not the same thing as ARMv5!).
- Use actual macros so they can still be selectively overridden.
- Split out ARMv4 parts and add a few more ARMv4 macros.
- Label blocks to make them easy to find in generated assembly.
- Fix MULT16_32_Q15() so we can pass make check.
  The MDCT test passes in values larger than 2**30 for b.
  The new version should be just as fast (or faster, since it's
   easier to merge the shift with following instructions), and
   there's no appreciable impact on accuracy (FFT/MDCT SNR actually
   goes up in most cases). A portable-C sketch of the exact
   semantics follows this list.
- Fix register constraints.
  We were using early-clobber flags in a bunch of places that
   didn't need them, and commutative-pair flags in a bunch of
   places that weren't actually commutative; both flags are
   illustrated in a sketch after this list.
  This was Jean-Marc's fault (the original code came from Speex).
- Simplify silk_CLZ16().
- Port over iFFT C_MULC asm by Andree Buschmann
   <[email protected]> from Rockbox.
- Speed up the C_MULC asm by using LDRD, allowing more flexible
   addressing, re-ordering instructions to avoid some stalls,
   allowing more flexible register allocation, and getting things
   out of the inline asm block so the compiler can schedule them
   better. (A C sketch of what C_MULC computes follows this list.)
- Add C_MUL and C_MUL4 asm for the FFT to the encoder, based on the
   new C_MULC.
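
For orientation, here is a portable-C sketch (mine, not part of the
patch) of the exact Q15 semantics MULT16_32_Q15() targets; the 64-bit
intermediate keeps it exact even when |b| exceeds 2**30, the case the
old asm got wrong. Note that the asm versions in this patch
deliberately drop the lowest result bit instead of OR-ing in the high
bit of the low word, trading a tiny amount of accuracy for speed:

    /* Sketch only: assumes the opus_val16/opus_val32/opus_int64 types
       from opus_types.h and arch.h. */
    static inline opus_val32 MULT16_32_Q15_ref(opus_val16 a, opus_val32 b)
    {
        return (opus_val32)(((opus_int64)a*(opus_int64)b) >> 15);
    }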
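
Both constraint flags, shown in a standalone sketch (a hypothetical
example, not code from the patch): '&' marks an early-clobber output,
i.e. one written before the asm has consumed all of its inputs, and
'%' marks an input that may be swapped with the operand following it:

    static inline int constraint_demo(int a, int b)
    {
        int t, r;
        /* t is written by the first instruction before b is read by the
           second, so t must not share a register with b: early-clobber. */
        __asm__("mov %0, %1\n\t"
                "add %0, %0, %2"
                : "=&r"(t)
                : "r"(a), "r"(b));
        /* ADD really is commutative, so '%' lets the register allocator
           swap t and b if that saves a move. */
        __asm__("add %0, %1, %2"
                : "=r"(r)
                : "%r"(t), "r"(b));
        return r;
    }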
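
For reading the C_MULC blocks below: C_MULC computes a times the
conjugate of b. A fixed-point C sketch of those semantics, written
with the S_MUL/ADD32/SUB32 helpers from the surrounding header (S_MUL
is a Q15 multiply), consistent with the generic kiss_fft definition:

    /* m = a * conj(b) in Q15; sketch of what the asm implements. */
    #define C_MULC_ref(m,a,b) \
        do{ (m).r = ADD32(S_MUL((a).r,(b).r), S_MUL((a).i,(b).i)); \
            (m).i = SUB32(S_MUL((a).i,(b).r), S_MUL((a).r,(b).i)); \
        }while(0)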

In total, this patch gives a 22.3% speed-up on test_opus_encoder on
 a 600 MHz Cortex A8 using gcc 4.2.1.
When restricted to ARMv4 optimizations, it gives a 9.6% speed-up
 on the same processor/compiler.
On the conformance test vectors:
 Average mono quality is 97.0583 %
 Average stereo quality is 97.775 %

--- a/autogen.sh
+++ b/autogen.sh
@@ -135,6 +135,7 @@
 
 echo "Generating configuration files for $package, please wait...."
 
+ACLOCAL_FLAGS="-I m4"
 echo "  $ACLOCAL $ACLOCAL_FLAGS"
 $ACLOCAL $ACLOCAL_FLAGS || exit 1
 echo "  autoheader"
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -94,6 +94,179 @@
     do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
     }while(0)
 
+#if defined(ARMv4_ASM)
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull r0, %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal r0, %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #17\n\t" \
+            "mov r0, r0, lsr #15\n\t" \
+            "orr %[mr], r0, %[mr], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL4\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull r0, %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #17\n\t" \
+            "smlal r0, %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #15\n\t" \
+            "mov r0, r0, lsr #17\n\t" \
+            "orr %[mr], r0, %[mr], lsl #15\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MULC\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mr], r0, %[br]\n\t" \
+            "smlal %[tt], %[mr], r1, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull r1, %[mi], r1, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal r1, %[mi], r0, %[bi]\n\t" \
+            "orr %[mr], %[tt], %[mr], lsl #17\n\t" \
+            "mov r1, r1, lsr #15\n\t" \
+            "orr %[mi], r1, %[mi], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#endif /* ARMv4_ASM */
+
+#if defined(ARMv5E_ASM)
+
+#if defined(__thumb__)||defined(__thumb2__)
+#define LDRD_CONS "Q"
+#else
+#define LDRD_CONS "Uq"
+#endif
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+        int mr1__; \
+        int mr2__; \
+        int mi__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MUL\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mi], %H[aval], %[bval]\n\t" \
+            "smulwb %[mr1], %[aval], %[bval]\n\t" \
+            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
+            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
+            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(SUB32(mr1__, mr2__), 1); \
+        (m).i = SHL32(mi__, 1); \
+    } \
+    while(0)
+
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+        int mr1__; \
+        int mr2__; \
+        int mi__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MUL4\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mi], %H[aval], %[bval]\n\t" \
+            "smulwb %[mr1], %[aval], %[bval]\n\t" \
+            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
+            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
+            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHR32(SUB32(mr1__, mr2__), 1); \
+        (m).i = SHR32(mi__, 1); \
+    } \
+    while(0)
+
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+        int mr__; \
+        int mi1__; \
+        int mi2__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MULC\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mr], %[aval], %[bval]\n\t" \
+            "smulwb %[mi1], %H[aval], %[bval]\n\t" \
+            "smulwt %[mi2], %[aval], %[bval]\n\t" \
+            "smlawt %[mr], %H[aval], %[bval], %[mr]\n\t" \
+            : [mr]"=r"(mr__), [mi1]"=r"(mi1__), [mi2]"=r"(mi2__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(mr__, 1); \
+        (m).i = SHL32(SUB32(mi1__, mi2__), 1); \
+    } \
+    while(0)
+
+#endif /* ARMv5E_ASM */
+
 #else  /* not FIXED_POINT*/
 
 #   define S_MUL(a,b) ( (a)*(b) )
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -112,10 +112,10 @@
 
 #include "fixed_generic.h"
 
-#ifdef ARM5E_ASM
-#include "fixed_arm5e.h"
-#elif defined (ARM4_ASM)
-#include "fixed_arm4.h"
+#ifdef ARMv5E_ASM
+#include "fixed_armv5e.h"
+#elif defined (ARMv4_ASM)
+#include "fixed_armv4.h"
 #elif defined (BFIN_ASM)
 #include "fixed_bfin.h"
 #elif defined (TI_C5X_ASM)
--- /dev/null
+++ b/celt/fixed_armv4.h
@@ -1,0 +1,71 @@
+/* Copyright (C) 2013 Xiph.Org Foundation and contributors */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv4_H
+#define FIXED_ARMv4_H
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
+#undef MULT16_32_Q16
+static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=r"(rd_lo), "=r"(rd_hi)
+      : "r"(b),"r"(a<<16)
+  );
+  return rd_hi;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv4(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
+#undef MULT16_32_Q15
+static inline opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=r"(rd_lo), "=r"(rd_hi)
+      : "%r"(b), "r"(a<<16)
+  );
+  /*We intentionally don't OR in the high bit of rd_lo for speed.*/
+  return rd_hi<<1;
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
+
+#endif
--- /dev/null
+++ b/celt/fixed_armv5e.h
@@ -1,0 +1,127 @@
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+   Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2008 CSIRO
+   Copyright (C) 2013      Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv5E_H
+#define FIXED_ARMv5E_H
+
+#include "fixed_armv4.h"
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
+#undef MULT16_32_Q16
+static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(b),"r"(a)
+  );
+  return res;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv5e(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
+#undef MULT16_32_Q15
+static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
+{
+#if 0
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=r"(rd_lo), "=r"(rd_hi)
+      : "%r"(b), "r"(a<<16)
+  );
+  return (rd_lo>>31)|(rd_hi<<1);
+#else
+  int res;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "%r"(b), "r"(a)
+  );
+  return res<<1;
+#endif
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_32_Q15\n\t"
+      "smlawb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(b<<1), "r"(a), "r"(c)
+  );
+  return res;
+}
+#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
+
+/** 16x16 multiply-add where the result fits in 32 bits */
+#undef MAC16_16
+static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a, opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_16\n\t"
+      "smlabb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(a), "r"(b), "r"(c)
+  );
+  return res;
+}
+#define MAC16_16(c, a, b) (MAC16_16_armv5e(c, a, b))
+
+/** 16x16 multiplication where the result fits in 32 bits */
+#undef MULT16_16
+static inline opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_16\n\t"
+      "smulbb %0, %1, %2;\n"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
+
+#endif
--- a/configure.ac
+++ b/configure.ac
@@ -18,7 +18,6 @@
 dnl enable silent rules on automake 1.11 and later
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
-
 # For libtool.
 dnl Please update these for releases.
 OPUS_LT_CURRENT=4
@@ -155,6 +154,36 @@
     AC_DEFINE([FLOAT_APPROX], , [Float approximations])
 fi
 
+cpu_arm=no
+AC_ARG_ENABLE(asm,
+    AS_HELP_STRING([--disable-asm], [Disable assembly optimizations]),
+    [ ac_enable_asm=$enableval ], [ ac_enable_asm=yes] )
+if test "x${ac_enable_asm}" = xyes ; then
+    asm_optimization="no asm for your platform, please send patches"
+    case $host_cpu in
+    arm*)
+        cpu_arm=yes
+        AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"],
+            [asm_optimization="disabled"])
+        if test "x${asm_optimization}" = "xARM" ; then
+            AC_DEFINE([ARMv4_ASM], [], [Use generic ARMv4 asm optimizations])
+            AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0])
+            if test "x${ARMv5E_ASM}" = "x1" ; then
+                AC_DEFINE(ARMv5E_ASM, 1, [Use ARMv5E asm optimizations])
+                asm_optimization="${asm_optimization} (EDSP)"
+            fi
+            AS_ASM_ARM_MEDIA([ARMv6_ASM=1],[ARMv6_ASM=0])
+            if test "x${ARMv6_ASM}" = "x1" ; then
+                AC_DEFINE(ARMv6_ASM, 1, [Use ARMv6 asm optimizations])
+                asm_optimization="${asm_optimization} (Media)"
+            fi
+        fi
+        ;;
+    esac
+else
+    asm_optimization="disabled"
+fi
+
 ac_enable_assertions="no"
 AC_ARG_ENABLE(assertions, [  --enable-assertions        enable additional software error checking],
 [if test "$enableval" = yes; then
@@ -281,6 +310,7 @@
       Floating point support: ........ ${ac_enable_float}
       Fast float approximations: ..... ${float_approx}
       Fixed point debugging: ......... ${ac_enable_fixed_debug}
+      Assembly optimization: ......... ${asm_optimization}
       Custom modes: .................. ${ac_enable_custom_modes}
       Assertion checking: ............ ${ac_enable_assertions}
       Fuzzing: ....................... ${ac_enable_fuzzing}
--- /dev/null
+++ b/m4/as-gcc-inline-assembly.m4
@@ -1,0 +1,106 @@
+dnl as-gcc-inline-assembly.m4 0.1.0
+
+dnl autostars m4 macro for detection of gcc inline assembly
+
+dnl David Schleef <[email protected]>
+
+dnl $Id$
+
+dnl AS_GCC_INLINE_ASSEMBLY(ACTION-IF-SUPPORTED, [ACTION-IF-NOT-SUPPORTED])
+dnl Checks whether the compiler supports gcc-style inline assembly.
+dnl Runs ACTION-IF-SUPPORTED if it does, and ACTION-IF-NOT-SUPPORTED
+dnl otherwise.
+
+AC_DEFUN([AS_GCC_INLINE_ASSEMBLY],
+[
+  AC_MSG_CHECKING([if compiler supports gcc-style inline assembly])
+
+  AC_TRY_COMPILE([], [
+#ifdef __GNUC_MINOR__
+#if (__GNUC__ * 1000 + __GNUC_MINOR__) < 3004
+#error GCC before 3.4 has critical bugs compiling inline assembly
+#endif
+#endif
+__asm__ (""::) ], [flag_ok=yes], [flag_ok=no])
+
+  if test "X$flag_ok" = Xyes ; then
+    $1
+    true
+  else
+    $2
+    true
+  fi
+  AC_MSG_RESULT([$flag_ok])
+])
+
+AC_DEFUN([AC_TRY_ASSEMBLE],
+[ac_c_ext=$ac_ext
+ ac_ext=${ac_s_ext-s}
+ cat > conftest.$ac_ext <<EOF
+ .file "configure"
+[$1]
+EOF
+if AC_TRY_EVAL(ac_compile); then
+  ac_ext=$ac_c_ext
+  ifelse([$2], , :, [  $2
+  rm -rf conftest*])
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.$ac_ext >&AC_FD_CC
+  ac_ext=$ac_c_ext
+ifelse([$3], , , [  rm -rf conftest*
+  $3
+])dnl
+fi
+rm -rf conftest*])
+
+
+AC_DEFUN([AS_ASM_ARM_NEON],
+[
+  AC_MSG_CHECKING([if assembler supports NEON instructions on ARM])
+
+  AC_TRY_ASSEMBLE([vorr d0,d0,d0], [flag_ok=yes], [flag_ok=no])
+
+  if test "X$flag_ok" = Xyes ; then
+    $1
+    true
+  else
+    $2
+    true
+  fi
+  AC_MSG_RESULT([$flag_ok])
+])
+
+
+AC_DEFUN([AS_ASM_ARM_MEDIA],
+[
+  AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])
+
+  AC_TRY_ASSEMBLE([shadd8 r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+  if test "X$flag_ok" = Xyes ; then
+    $1
+    true
+  else
+    $2
+    true
+  fi
+  AC_MSG_RESULT([$flag_ok])
+])
+
+
+AC_DEFUN([AS_ASM_ARM_EDSP],
+[
+  AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])
+
+  AC_TRY_ASSEMBLE([qadd r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+  if test "X$flag_ok" = Xyes ; then
+    $1
+    true
+  else
+    $2
+    true
+  fi
+  AC_MSG_RESULT([$flag_ok])
+])
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -576,6 +576,14 @@
 #include "MacroCount.h"
 #include "MacroDebug.h"
 
+#ifdef ARMv4_ASM
+#include "SigProc_FIX_armv4.h"
+#endif
+
+#ifdef ARMv5E_ASM
+#include "SigProc_FIX_armv5e.h"
+#endif
+
 #ifdef  __cplusplus
 }
 #endif
--- /dev/null
+++ b/silk/SigProc_FIX_armv4.h
@@ -1,0 +1,47 @@
+/***********************************************************************
+Copyright (C) 2013 Xiph.Org Foundation and contributors
+Copyright (c) 2013       Parrot
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_ARMv4_H
+#define SILK_SIGPROC_FIX_ARMv4_H
+
+#undef silk_MLA
+static inline opus_int32 silk_MLA_armv4(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  opus_int32 res;
+  __asm__(
+      "#silk_MLA\n\t"
+      "mla %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_MLA(a, b, c) (silk_MLA_armv4(a, b, c))
+
+#endif
--- /dev/null
+++ b/silk/SigProc_FIX_armv5e.h
@@ -1,0 +1,61 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Copyright (c) 2013       Parrot
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_ARMv5E_H
+#define SILK_SIGPROC_FIX_ARMv5E_H
+
+#undef silk_SMULTT
+static inline opus_int32 silk_SMULTT_armv5e(opus_int32 a, opus_int32 b)
+{
+  opus_int32 res;
+  __asm__(
+      "#silk_SMULTT\n\t"
+      "smultt %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_SMULTT(a, b) (silk_SMULTT_armv5e(a, b))
+
+#undef silk_SMLATT
+static inline opus_int32 silk_SMLATT_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  opus_int32 res;
+  __asm__(
+      "#silk_SMLATT\n\t"
+      "smlatt %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_SMLATT(a, b, c) (silk_SMLATT_armv5e(a, b, c))
+
+#endif
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -134,5 +134,13 @@
     (*((Matrix_base_adr) + ((row)+(M)*(column))))
 #endif
 
+#ifdef ARMv4_ASM
+#include "macros_armv4.h"
+#endif
+
+#ifdef ARMv5E_ASM
+#include "macros_armv5e.h"
+#endif
+
 #endif /* SILK_MACROS_H */
 
--- /dev/null
+++ b/silk/macros_armv4.h
@@ -1,0 +1,103 @@
+/***********************************************************************
+Copyright (C) 2013 Xiph.Org Foundation and contributors.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_MACROS_ARMv4_H
+#define SILK_MACROS_ARMv4_H
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; output has to be a 32-bit int */
+#undef silk_SMULWB
+static inline opus_int32 silk_SMULWB_armv4(opus_int32 a, opus_int16 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#silk_SMULWB\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=r"(rd_lo), "=r"(rd_hi)
+      : "%r"(a), "r"(b<<16)
+  );
+  return rd_hi;
+}
+#define silk_SMULWB(a, b) (silk_SMULWB_armv4(a, b))
+
+/* a32 + (b32 * (opus_int32)((opus_int16)(c32))) >> 16; output has to be a 32-bit int */
+#undef silk_SMLAWB
+#define silk_SMLAWB(a, b, c) ((a) + silk_SMULWB(b, c))
+
+/* (a32 * (b32 >> 16)) >> 16 */
+#undef silk_SMULWT
+static inline opus_int32 silk_SMULWT_armv4(opus_int32 a, opus_int32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#silk_SMULWT\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=r"(rd_lo), "=r"(rd_hi)
+      : "%r"(a), "r"(b&~0xFFFF)
+  );
+  return rd_hi;
+}
+#define silk_SMULWT(a, b) (silk_SMULWT_armv4(a, b))
+
+/* a32 + (b32 * (c32 >> 16)) >> 16 */
+#undef silk_SMLAWT
+#define silk_SMLAWT(a, b, c) ((a) + silk_SMULWT(b, c))
+
+/* (a32 * b32) >> 16 */
+#undef silk_SMULWW
+static inline opus_int32 silk_SMULWW_armv4(opus_int32 a, opus_int32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+    "#silk_SMULWW\n\t"
+    "smull %0, %1, %2, %3\n\t"
+    : "=r"(rd_lo), "=r"(rd_hi)
+    : "%r"(a), "r"(b)
+  );
+  return (rd_lo>>16)|(rd_hi<<16);
+}
+#define silk_SMULWW(a, b) (silk_SMULWW_armv4(a, b))
+
+#undef silk_SMLAWW
+static inline opus_int32 silk_SMLAWW_armv4(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+    "#silk_SMULWW\n\t"
+    "smull %0, %1, %2, %3\n\t"
+    : "=r"(rd_lo), "=r"(rd_hi)
+    : "%r"(b), "r"(c)
+  );
+  return a+((rd_lo>>16)|(rd_hi<<16));
+}
+#define silk_SMLAWW(a, b, c) (silk_SMLAWW_armv4(a, b, c))
+
+#endif /* SILK_MACROS_ARMv4_H */
--- /dev/null
+++ b/silk/macros_armv5e.h
@@ -1,0 +1,213 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Copyright (c) 2013       Parrot
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_MACROS_ARMv5E_H
+#define SILK_MACROS_ARMv5E_H
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; output has to be a 32-bit int */
+#undef silk_SMULWB
+static inline opus_int32 silk_SMULWB_armv5e(opus_int32 a, opus_int16 b)
+{
+  int res;
+  __asm__(
+      "#silk_SMULWB\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_SMULWB(a, b) (silk_SMULWB_armv5e(a, b))
+
+/* a32 + (b32 * (opus_int32)((opus_int16)(c32))) >> 16; output has to be a 32-bit int */
+#undef silk_SMLAWB
+static inline opus_int32 silk_SMLAWB_armv5e(opus_int32 a, opus_int32 b,
+ opus_int16 c)
+{
+  int res;
+  __asm__(
+      "#silk_SMLAWB\n\t"
+      "smlawb %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_SMLAWB(a, b, c) (silk_SMLAWB_armv5e(a, b, c))
+
+/* (a32 * (b32 >> 16)) >> 16 */
+#undef silk_SMULWT
+static inline opus_int32 silk_SMULWT_armv5e(opus_int32 a, opus_int32 b)
+{
+  int res;
+  __asm__(
+      "#silk_SMULWT\n\t"
+      "smulwt %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_SMULWT(a, b) (silk_SMULWT_armv5e(a, b))
+
+/* a32 + (b32 * (c32 >> 16)) >> 16 */
+#undef silk_SMLAWT
+static inline opus_int32 silk_SMLAWT_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  int res;
+  __asm__(
+      "#silk_SMLAWT\n\t"
+      "smlawt %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_SMLAWT(a, b, c) (silk_SMLAWT_armv5e(a, b, c))
+
+/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)); output has to be a 32-bit int */
+#undef silk_SMULBB
+static inline opus_int32 silk_SMULBB_armv5e(opus_int32 a, opus_int32 b)
+{
+  int res;
+  __asm__(
+      "#silk_SMULBB\n\t"
+      "smulbb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "%r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_SMULBB(a, b) (silk_SMULBB_armv5e(a, b))
+
+/* a32 + (opus_int32)((opus_int16)(b32)) * (opus_int32)((opus_int16)(c32)); output has to be a 32-bit int */
+#undef silk_SMLABB
+static inline opus_int32 silk_SMLABB_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  int res;
+  __asm__(
+      "#silk_SMLABB\n\t"
+      "smlabb %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "%r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_SMLABB(a, b, c) (silk_SMLABB_armv5e(a, b, c))
+
+/* (opus_int32)((opus_int16)(a32)) * (b32 >> 16) */
+#undef silk_SMULBT
+static inline opus_int32 silk_SMULBT_armv5e(opus_int32 a, opus_int32 b)
+{
+  int res;
+  __asm__(
+      "#silk_SMULBT\n\t"
+      "smulbt %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_SMULBT(a, b) (silk_SMULBT_armv5e(a, b))
+
+/* a32 + (opus_int32)((opus_int16)(b32)) * (c32 >> 16) */
+#undef silk_SMLABT
+static inline opus_int32 silk_SMLABT_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  int res;
+  __asm__(
+      "#silk_SMLABT\n\t"
+      "smlabt %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_SMLABT(a, b, c) (silk_SMLABT_armv5e(a, b, c))
+
+/* add/subtract with output saturated */
+#undef silk_ADD_SAT32
+static inline opus_int32 silk_ADD_SAT32_armv5e(opus_int32 a, opus_int32 b)
+{
+  int res;
+  __asm__(
+      "#silk_ADD_SAT32\n\t"
+      "qadd %0, %1, %2\n\t"
+      : "=r"(res)
+      : "%r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_ADD_SAT32(a, b) (silk_ADD_SAT32_armv5e(a, b))
+
+#undef silk_SUB_SAT32
+static inline opus_int32 silk_SUB_SAT32_armv5e(opus_int32 a, opus_int32 b)
+{
+  int res;
+  __asm__(
+      "#silk_SUB_SAT32\n\t"
+      "qsub %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define silk_SUB_SAT32(a, b) (silk_SUB_SAT32_armv5e(a, b))
+
+#undef silk_CLZ16
+static inline opus_int32 silk_CLZ16_armv5(opus_int16 in16)
+{
+  int res;
+  __asm__(
+      "#silk_CLZ16\n\t"
+      "clz %0, %1;\n"
+      : "=r"(res)
+      : "r"(in16<<16|0x8000)
+  );
+  return res;
+}
+#define silk_CLZ16(in16) (silk_CLZ16_armv5(in16))
+
+#undef silk_CLZ32
+static inline opus_int32 silk_CLZ32_armv5(opus_int32 in32)
+{
+  int res;
+  __asm__(
+      "#silk_CLZ32\n\t"
+      "clz %0, %1\n\t"
+      : "=&r"(res)
+      : "r"(in32)
+  );
+  return res;
+}
+#define silk_CLZ32(in32) (silk_CLZ32_armv5(in32))
+
+#endif /* SILK_MACROS_ARMv5E_H */