ref: 972a34ec2c79d241318af24389b8ee042d10556a
parent: b7bd4c20acfd951ba46647e07411285997d952f4
author: Timothy B. Terriberry <[email protected]>
date: Sun May 19 13:11:17 EDT 2013
Add ARMv4/ARMv5E macros. Original patch by Aurélien Zanelli <[email protected]>: http://lists.xiph.org/pipermail/opus/2013-May/002078.html Revised version: - Add autoconf detection (ported from libtheora). - Rename ARM5E to ARMv5E (an ARM5 is not the same thing as ARMv5!). - Use actual macros so they can still be selectively overridden. - Split out ARMv4 parts and add a few more ARMv4 macros. - Label blocks to make them easy to find in generated assembly. - Fix MULT16_32_Q15() so we can pass make check. The MDCT test passes in values larger than 2**30 for b. The new version should be just as fast (or faster, since it's easier to merge the shift with following instructions), and there's no appreciable impact on accuracy (FFT/MDCT SNR actually goes up in most cases). - Fix register constraints. We were using early-clobber flags in a bunch of places that didn't need them, and commutative-pair flags in a bunch of places that weren't actually commutative. This was Jean-Marc's fault (the original code came from Speex). - Simplify silk_CLZ16(). - Port over iFFT C_MULC asm by Andree Buschmann <[email protected]> from Rockbox. - Speed up the C_MULC asm by using LDRD, allowing more flexible addressing, re-ordering instructions to avoid some stalls, allowing more flexible register allocation, and getting things out of the inline asm block so the compiler can schedule them better. - Add C_MUL and C_MUL4 asm for the FFT to the encoder, based on the new C_MULC. In total, this patch gives a 22.3% speed-up on test_opus_encoder on a 600 MHz Cortex A8 using gcc 4.2.1. When restricted to ARMv4 optimizations, it gives a 9.6% speed-up on the same processor/compiler. On the conformance test vectors: Average mono quality is 97.0583 % Average stereo quality is 97.775 %
--- a/autogen.sh
+++ b/autogen.sh
@@ -135,6 +135,7 @@
echo "Generating configuration files for $package, please wait...."
+# Search the local m4/ directory first, but keep any flags the caller already
+# supplied through the ACLOCAL_FLAGS environment variable.
+ACLOCAL_FLAGS="-I m4${ACLOCAL_FLAGS:+ $ACLOCAL_FLAGS}"
echo " $ACLOCAL $ACLOCAL_FLAGS"
$ACLOCAL $ACLOCAL_FLAGS || exit 1
echo " autoheader"
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -94,6 +94,179 @@
do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \
}while(0)
+#if defined(ARMv4_ASM)
+
+/* ARMv4 complex multiply: m = a*b, with each 64-bit sum of products shifted
+   right by 15.
+   a is read as two 32-bit words {r,i} (LDM into r0,r1) and b as two signed
+   halfwords at byte offsets 0 and 2 (LDRSH).
+   Before ARMv6, SMULL/SMLAL require RdLo, RdHi and Rm to be distinct
+   registers, so the second product accumulates into %[br] (RdLo==Rs is
+   allowed) rather than using r0 as both RdLo and Rm.
+   The two trailing "m" inputs are never referenced by the template; they only
+   tell the compiler that the objects behind %[ap] and %[bp] are read. */
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+        int br__; \
+        int bi__; \
+        int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal %[br], %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #17\n\t" \
+            "mov %[br], %[br], lsr #15\n\t" \
+            "orr %[mr], %[br], %[mr], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)), "m"(a), "m"(b) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+/* ARMv4 complex multiply with extra scaling: m = a*b, with each 64-bit sum of
+   products shifted right by 17 (two bits more than C_MUL).
+   a is read as two 32-bit words {r,i} (LDM into r0,r1) and b as two signed
+   halfwords at byte offsets 0 and 2 (LDRSH).
+   Before ARMv6, SMULL/SMLAL require RdLo, RdHi and Rm to be distinct
+   registers, so the second product accumulates into %[br] (RdLo==Rs is
+   allowed) rather than using r0 as both RdLo and Rm.
+   The two trailing "m" inputs are never referenced by the template; they only
+   tell the compiler that the objects behind %[ap] and %[bp] are read. */
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+        int br__; \
+        int bi__; \
+        int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL4\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #17\n\t" \
+            "smlal %[br], %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #15\n\t" \
+            "mov %[br], %[br], lsr #17\n\t" \
+            "orr %[mr], %[br], %[mr], lsl #15\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)), "m"(a), "m"(b) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+/* ARMv4 complex multiply by the conjugate: m = a*conj(b), with each 64-bit sum
+   of products shifted right by 15.
+   a is read as two 32-bit words {r,i} (LDM into r0,r1) and b as two signed
+   halfwords at byte offsets 0 and 2 (LDRSH).
+   Before ARMv6, SMULL/SMLAL require RdLo, RdHi and Rm to be distinct
+   registers, so the second product accumulates into %[br] (RdLo==Rs is
+   allowed) rather than using r1 as both RdLo and Rm.
+   The two trailing "m" inputs are never referenced by the template; they only
+   tell the compiler that the objects behind %[ap] and %[bp] are read. */
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+        int br__; \
+        int bi__; \
+        int tt__; \
+        __asm__ __volatile__( \
+            "#C_MULC\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mr], r0, %[br]\n\t" \
+            "smlal %[tt], %[mr], r1, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mi], r1, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal %[br], %[mi], r0, %[bi]\n\t" \
+            "orr %[mr], %[tt], %[mr], lsl #17\n\t" \
+            "mov %[br], %[br], lsr #15\n\t" \
+            "orr %[mi], %[br], %[mi], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)), "m"(a), "m"(b) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#endif /* ARMv4_ASM */
+
+#if defined(ARMv5E_ASM)
+
+/* Constraint string for the memory operand handed to LDRD in the ARMv5E
+   macros: "Uq" when not compiling for Thumb/Thumb-2, "Q" otherwise. */
+#if !defined(__thumb__)&&!defined(__thumb2__)
+#define LDRD_CONS "Uq"
+#else
+#define LDRD_CONS "Q"
+#endif
+
+/* ARMv5E complex multiply: m = a*b.
+   a is fetched with one LDRD into a 64-bit register pair and b with one LDR,
+   so SMULWB/SMULWT can pick the bottom/top halfword of b directly.
+   NOTE(review): which half of each loaded value holds .r and which holds .i
+   depends on byte order; this looks written for little-endian -- confirm.
+   The SMULWx products carry a 16-bit right shift, so the final scaling is
+   left to C (SHL32 by 1) where the compiler can schedule it. */
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+        long long a64__; \
+        int b32__; \
+        int re_pos__; \
+        int re_neg__; \
+        int im__; \
+        __asm__( \
+            "#C_MUL\n\t" \
+            "ldrd %[a64], %H[a64], %[ap]\n\t" \
+            "ldr %[b32], %[bp]\n\t" \
+            "smulwb %[im], %H[a64], %[b32]\n\t" \
+            "smulwb %[re_pos], %[a64], %[b32]\n\t" \
+            "smulwt %[re_neg], %H[a64], %[b32]\n\t" \
+            "smlawt %[im], %[a64], %[b32], %[im]\n\t" \
+            : [re_pos]"=r"(re_pos__), [re_neg]"=r"(re_neg__), [im]"=r"(im__), \
+              [a64]"=&r"(a64__), [b32]"=r"(b32__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(SUB32(re_pos__, re_neg__), 1); \
+        (m).i = SHL32(im__, 1); \
+    } \
+    while(0)
+
+/* ARMv5E complex multiply with extra scaling: same instruction sequence as
+   C_MUL, but the results are shifted right by 1 in C (SHR32) instead of left.
+   a is fetched with one LDRD into a 64-bit register pair and b with one LDR.
+   NOTE(review): which half of each loaded value holds .r and which holds .i
+   depends on byte order; this looks written for little-endian -- confirm. */
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+        long long a64__; \
+        int b32__; \
+        int re_pos__; \
+        int re_neg__; \
+        int im__; \
+        __asm__( \
+            "#C_MUL4\n\t" \
+            "ldrd %[a64], %H[a64], %[ap]\n\t" \
+            "ldr %[b32], %[bp]\n\t" \
+            "smulwb %[im], %H[a64], %[b32]\n\t" \
+            "smulwb %[re_pos], %[a64], %[b32]\n\t" \
+            "smulwt %[re_neg], %H[a64], %[b32]\n\t" \
+            "smlawt %[im], %[a64], %[b32], %[im]\n\t" \
+            : [re_pos]"=r"(re_pos__), [re_neg]"=r"(re_neg__), [im]"=r"(im__), \
+              [a64]"=&r"(a64__), [b32]"=r"(b32__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHR32(SUB32(re_pos__, re_neg__), 1); \
+        (m).i = SHR32(im__, 1); \
+    } \
+    while(0)
+
+/* ARMv5E complex multiply by the conjugate: m = a*conj(b).
+   a is fetched with one LDRD into a 64-bit register pair and b with one LDR.
+   The real part is accumulated inside the asm (SMULWB + SMLAWT); the
+   imaginary part is the difference of two products, taken in C.
+   NOTE(review): which half of each loaded value holds .r and which holds .i
+   depends on byte order; this looks written for little-endian -- confirm. */
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+        long long a64__; \
+        int b32__; \
+        int re__; \
+        int im_pos__; \
+        int im_neg__; \
+        __asm__( \
+            "#C_MULC\n\t" \
+            "ldrd %[a64], %H[a64], %[ap]\n\t" \
+            "ldr %[b32], %[bp]\n\t" \
+            "smulwb %[re], %[a64], %[b32]\n\t" \
+            "smulwb %[im_pos], %H[a64], %[b32]\n\t" \
+            "smulwt %[im_neg], %[a64], %[b32]\n\t" \
+            "smlawt %[re], %H[a64], %[b32], %[re]\n\t" \
+            : [re]"=r"(re__), [im_pos]"=r"(im_pos__), [im_neg]"=r"(im_neg__), \
+              [a64]"=&r"(a64__), [b32]"=r"(b32__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(re__, 1); \
+        (m).i = SHL32(SUB32(im_pos__, im_neg__), 1); \
+    } \
+    while(0)
+
+#endif /* ARMv5E_ASM */
+
#else /* not FIXED_POINT*/
# define S_MUL(a,b) ( (a)*(b) )
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -112,10 +112,10 @@
#include "fixed_generic.h"
-#ifdef ARM5E_ASM
-#include "fixed_arm5e.h"
-#elif defined (ARM4_ASM)
-#include "fixed_arm4.h"
+#ifdef ARMv5E_ASM
+#include "fixed_armv5e.h"
+#elif defined (ARMv4_ASM)
+#include "fixed_armv4.h"
#elif defined (BFIN_ASM)
#include "fixed_bfin.h"
#elif defined (TI_C5X_ASM)
--- /dev/null
+++ b/celt/fixed_armv4.h
@@ -1,0 +1,71 @@
+/* Copyright (C) 2013 Xiph.Org Foundation and contributors */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv4_H
+#define FIXED_ARMv4_H
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits.
+    Computed as the high word of the 64-bit product b*(a<<16). */
+#undef MULT16_32_Q16
+static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  /*Before ARMv6, SMULL requires RdLo, RdHi, and Rm to be distinct registers,
+     so both outputs are early-clobber to keep them away from the inputs.
+    The shift is done on an unsigned value to avoid left-shifting a negative
+     number.*/
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b), "r"((unsigned)a<<16)
+  );
+  return rd_hi;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv4(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits.
+    Computed as the high word of the 64-bit product b*(a<<16), doubled; the
+    least significant bit of the result is therefore always zero. */
+#undef MULT16_32_Q15
+static inline opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  /*Before ARMv6, SMULL requires RdLo, RdHi, and Rm to be distinct registers,
+     so both outputs are early-clobber to keep them away from the inputs.
+    Shifts are done on unsigned values to avoid left-shifting a negative
+     number.*/
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b), "r"((unsigned)a<<16)
+  );
+  /*We intentionally don't OR in the high bit of rd_lo for speed.*/
+  return (opus_val32)((unsigned)rd_hi<<1);
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
+
+
+/** 16x32 multiply-accumulate: c plus the Q15 product of a and b (a 16x32
+     multiply followed by a 15-bit shift right), added as 32-bit values.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+#define MAC16_32_Q15(c, a, b) (ADD32((c), MULT16_32_Q15((a), (b))))
+
+#endif
--- /dev/null
+++ b/celt/fixed_armv5e.h
@@ -1,0 +1,127 @@
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+ Copyright (C) 2003-2008 Jean-Marc Valin
+ Copyright (C) 2007-2008 CSIRO
+ Copyright (C) 2013 Parrot */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv5E_H
+#define FIXED_ARMv5E_H
+
+#include "fixed_armv4.h"
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits.
+    A single SMULWB does the whole job: 32-bit b times the bottom halfword of
+    a, keeping the top 32 bits of the 48-bit product. */
+#undef MULT16_32_Q16
+static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
+{
+  int prod;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smulwb %[rd], %[rm], %[rs]\n\t"
+      : [rd]"=r"(prod)
+      : [rm]"r"(b), [rs]"r"(a)
+  );
+  return prod;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv5e(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits.
+    SMULWB yields (b*a)>>16, which is then doubled; the least significant bit
+    of the result is therefore always zero. */
+#undef MULT16_32_Q15
+static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
+{
+  int res;
+  /*The two source operands of SMULWB are NOT interchangeable (the first is
+     used as a full word, the second only as a bottom halfword), so they must
+     not be marked as a commutative pair.*/
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(a)
+  );
+  /*Shift as unsigned to avoid left-shifting a negative number.*/
+  return (opus_val32)((unsigned)res<<1);
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits, because it is doubled before the SMLAWB (which
+     itself shifts the product right by 16).
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+  int res;
+  /*b is doubled as an unsigned value to avoid left-shifting a negative
+     number.*/
+  __asm__(
+      "#MAC16_32_Q15\n\t"
+      "smlawb %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"((unsigned)b<<1), "r"(a), "r"(c)
+  );
+  return res;
+}
+#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
+
+/** 16x16 multiply-add where the result fits in 32 bits.
+    Returns c+a*b using a single SMLABB (bottom halfword times bottom
+     halfword, plus accumulator).
+    The helper carries the _armv5e suffix like the other helpers in this file,
+     so it cannot be confused with the MAC16_16 macro that wraps it. */
+#undef MAC16_16
+static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
+ opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_16\n\t"
+      "smlabb %0, %1, %2, %3\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b), "r"(c)
+  );
+  return res;
+}
+#define MAC16_16(c, a, b) (MAC16_16_armv5e(c, a, b))
+
+/** 16x16 multiplication where the result fits in 32 bits.
+    Returns a*b using a single SMULBB (bottom halfword times bottom halfword).
+    The helper carries the _armv5e suffix like the other helpers in this file,
+     so it cannot be confused with the MULT16_16 macro that wraps it. */
+#undef MULT16_16
+static inline opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_16\n\t"
+      "smulbb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
+
+#endif
--- a/configure.ac
+++ b/configure.ac
@@ -18,7 +18,6 @@
dnl enable silent rules on automake 1.11 and later
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
# For libtool.
dnl Please update these for releases.
OPUS_LT_CURRENT=4
@@ -155,6 +154,36 @@
AC_DEFINE([FLOAT_APPROX], , [Float approximations])
fi
+dnl Assembly optimizations: enabled by default, probed only for arm* hosts.
+dnl asm_optimization ends up holding the text shown in the configure summary.
+cpu_arm=no
+AC_ARG_ENABLE(asm,
+  AS_HELP_STRING([--disable-asm], [Disable assembly optimizations]),
+  [ ac_enable_asm=$enableval ], [ ac_enable_asm=yes] )
+if test "x${ac_enable_asm}" = xyes ; then
+  asm_optimization="no asm for your platform, please send patches"
+  case $host_cpu in
+  arm*)
+    cpu_arm=yes
+    AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"],
+      [asm_optimization="disabled"])
+    if test "x${asm_optimization}" = "xARM" ; then
+      dnl Every feature macro is defined to 1 so that they can all be tested
+      dnl the same way by the preprocessor.
+      AC_DEFINE([ARMv4_ASM], [1], [Use generic ARMv4 asm optimizations])
+      AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0])
+      if test "x${ARMv5E_ASM}" = "x1" ; then
+        AC_DEFINE([ARMv5E_ASM], [1], [Use ARMv5E asm optimizations])
+        asm_optimization="${asm_optimization} (EDSP)"
+      fi
+      AS_ASM_ARM_MEDIA([ARMv6_ASM=1],[ARMv6_ASM=0])
+      if test "x${ARMv6_ASM}" = "x1" ; then
+        AC_DEFINE([ARMv6_ASM], [1], [Use ARMv6 asm optimizations])
+        asm_optimization="${asm_optimization} (Media)"
+      fi
+    fi
+    ;;
+  esac
+else
+  asm_optimization="disabled"
+fi
+
ac_enable_assertions="no"
AC_ARG_ENABLE(assertions, [ --enable-assertions enable additional software error checking],
[if test "$enableval" = yes; then
@@ -281,6 +310,7 @@
Floating point support: ........ ${ac_enable_float}
Fast float approximations: ..... ${float_approx}
Fixed point debugging: ......... ${ac_enable_fixed_debug}
+ Assembly optimization: ......... ${asm_optimization}
Custom modes: .................. ${ac_enable_custom_modes}
Assertion checking: ............ ${ac_enable_assertions}
Fuzzing: ....................... ${ac_enable_fuzzing}
--- /dev/null
+++ b/m4/as-gcc-inline-assembly.m4
@@ -1,0 +1,106 @@
+dnl as-gcc-inline-assembly.m4 0.1.0
+
+dnl autostars m4 macro for detection of gcc inline assembly
+
+dnl David Schleef <[email protected]>
+
+dnl $Id$
+
+dnl AS_GCC_INLINE_ASSEMBLY(ACTION-IF-ACCEPTED, [ACTION-IF-NOT-ACCEPTED])
+dnl Tries to compile a program containing an empty gcc-style __asm__ statement.
+dnl When __GNUC_MINOR__ is defined, versions before 3.4 are rejected with #error.
+dnl Runs ACTION-IF-ACCEPTED if the program compiles,
+dnl and ACTION-IF-NOT-ACCEPTED otherwise.
+dnl The "true" after each action keeps the shell branch non-empty when the
+dnl corresponding action argument is empty.
+
+AC_DEFUN([AS_GCC_INLINE_ASSEMBLY],
+[
+ AC_MSG_CHECKING([if compiler supports gcc-style inline assembly])
+
+ AC_TRY_COMPILE([], [
+#ifdef __GNUC_MINOR__
+#if (__GNUC__ * 1000 + __GNUC_MINOR__) < 3004
+#error GCC before 3.4 has critical bugs compiling inline assembly
+#endif
+#endif
+__asm__ (""::) ], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+dnl AC_TRY_ASSEMBLE(ASM-SOURCE, [ACTION-IF-ASSEMBLED], [ACTION-IF-NOT-ASSEMBLED])
+dnl Writes ASM-SOURCE (preceded by a .file directive) to conftest.$ac_ext with
+dnl ac_ext temporarily switched to ${ac_s_ext-s}, then runs $ac_compile on it.
+dnl ac_ext is restored on both paths before the chosen action runs.
+dnl On failure the test program is copied to the configure log.
+dnl All conftest* files are removed before returning.
+AC_DEFUN([AC_TRY_ASSEMBLE],
+[ac_c_ext=$ac_ext
+ ac_ext=${ac_s_ext-s}
+ cat > conftest.$ac_ext <<EOF
+ .file "configure"
+[$1]
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ ac_ext=$ac_c_ext
+ ifelse([$2], , :, [ $2
+ rm -rf conftest*])
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ ac_ext=$ac_c_ext
+ifelse([$3], , , [ rm -rf conftest*
+ $3
+])dnl
+fi
+rm -rf conftest*])
+
+
+dnl AS_ASM_ARM_NEON(ACTION-IF-SUPPORTED, [ACTION-IF-NOT-SUPPORTED])
+dnl Uses AC_TRY_ASSEMBLE to check whether "vorr d0,d0,d0" assembles, and runs
+dnl the matching action. The result (yes/no) is left in flag_ok and reported.
+AC_DEFUN([AS_ASM_ARM_NEON],
+[
+ AC_MSG_CHECKING([if assembler supports NEON instructions on ARM])
+
+ AC_TRY_ASSEMBLE([vorr d0,d0,d0], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+
+dnl AS_ASM_ARM_MEDIA(ACTION-IF-SUPPORTED, [ACTION-IF-NOT-SUPPORTED])
+dnl Uses AC_TRY_ASSEMBLE to check whether "shadd8 r3,r3,r3" assembles, and runs
+dnl the matching action. The result (yes/no) is left in flag_ok and reported.
+AC_DEFUN([AS_ASM_ARM_MEDIA],
+[
+ AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])
+
+ AC_TRY_ASSEMBLE([shadd8 r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+
+dnl AS_ASM_ARM_EDSP(ACTION-IF-SUPPORTED, [ACTION-IF-NOT-SUPPORTED])
+dnl Uses AC_TRY_ASSEMBLE to check whether "qadd r3,r3,r3" assembles, and runs
+dnl the matching action. The result (yes/no) is left in flag_ok and reported.
+AC_DEFUN([AS_ASM_ARM_EDSP],
+[
+ AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])
+
+ AC_TRY_ASSEMBLE([qadd r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -576,6 +576,14 @@
#include "MacroCount.h"
#include "MacroDebug.h"
+#ifdef ARMv4_ASM
+#include "SigProc_FIX_armv4.h"
+#endif
+
+#ifdef ARMv5E_ASM
+#include "SigProc_FIX_armv5e.h"
+#endif
+
#ifdef __cplusplus
}
#endif
--- /dev/null
+++ b/silk/SigProc_FIX_armv4.h
@@ -1,0 +1,47 @@
+/***********************************************************************
+Copyright (C) 2013 Xiph.Org Foundation and contributors
+Copyright (c) 2013 Parrot
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_ARMv4_H
+#define SILK_SIGPROC_FIX_ARMv4_H
+
+/* a + b*c, computed with a single MLA (32-bit result, wrapping). */
+#undef silk_MLA
+static inline opus_int32 silk_MLA_armv4(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  opus_int32 res;
+  /*Before ARMv6, MLA requires Rd and Rm to be distinct registers, so the
+     output is early-clobber to keep it away from the inputs.*/
+  __asm__(
+      "#silk_MLA\n\t"
+      "mla %0, %1, %2, %3\n\t"
+      : "=&r"(res)
+      : "r"(b), "r"(c), "r"(a)
+  );
+  return res;
+}
+#define silk_MLA(a, b, c) (silk_MLA_armv4(a, b, c))
+
+#endif
--- /dev/null
+++ b/silk/SigProc_FIX_armv5e.h
@@ -1,0 +1,61 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Copyright (c) 2013 Parrot
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_ARMv5E_H
+#define SILK_SIGPROC_FIX_ARMv5E_H
+
+/* Product of the top halfwords of a and b, computed with a single SMULTT. */
+#undef silk_SMULTT
+static inline opus_int32 silk_SMULTT_armv5e(opus_int32 a, opus_int32 b)
+{
+  opus_int32 prod;
+  __asm__(
+      "#silk_SMULTT\n\t"
+      "smultt %[rd], %[rm], %[rs]\n\t"
+      : [rd]"=r"(prod)
+      : [rm]"r"(a), [rs]"r"(b)
+  );
+  return prod;
+}
+#define silk_SMULTT(a, b) (silk_SMULTT_armv5e(a, b))
+
+/* a plus the product of the top halfwords of b and c, computed with a single
+   SMLATT. */
+#undef silk_SMLATT
+static inline opus_int32 silk_SMLATT_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  opus_int32 acc;
+  __asm__(
+      "#silk_SMLATT\n\t"
+      "smlatt %[rd], %[rm], %[rs], %[rn]\n\t"
+      : [rd]"=r"(acc)
+      : [rm]"r"(b), [rs]"r"(c), [rn]"r"(a)
+  );
+  return acc;
+}
+#define silk_SMLATT(a, b, c) (silk_SMLATT_armv5e(a, b, c))
+
+#endif
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -134,5 +134,13 @@
(*((Matrix_base_adr) + ((row)+(M)*(column))))
#endif
+#ifdef ARMv4_ASM
+#include "macros_armv4.h"
+#endif
+
+#ifdef ARMv5E_ASM
+#include "macros_armv5e.h"
+#endif
+
#endif /* SILK_MACROS_H */
--- /dev/null
+++ b/silk/macros_armv4.h
@@ -1,0 +1,103 @@
+/***********************************************************************
+Copyright (C) 2013 Xiph.Org Foundation and contributors.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_MACROS_ARMv4_H
+#define SILK_MACROS_ARMv4_H
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; the output has to be a 32-bit
+   int. Computed as the high word of the 64-bit product a*(b<<16). */
+#undef silk_SMULWB
+static inline opus_int32 silk_SMULWB_armv4(opus_int32 a, opus_int16 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  /*Before ARMv6, SMULL requires RdLo, RdHi, and Rm to be distinct registers,
+     so both outputs are early-clobber to keep them away from the inputs.
+    The shift is done on an unsigned value to avoid left-shifting a negative
+     number.*/
+  __asm__(
+      "#silk_SMULWB\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(a), "r"((unsigned)b<<16)
+  );
+  return rd_hi;
+}
+#define silk_SMULWB(a, b) (silk_SMULWB_armv4(a, b))
+
+/* a32 + ((b32 * (opus_int32)((opus_int16)(c32))) >> 16); the output has to be
+   a 32-bit int. Composed from silk_SMULWB plus an ordinary add. */
+#undef silk_SMLAWB
+#define silk_SMLAWB(a, b, c) ((a) + (silk_SMULWB((b), (c))))
+
+/* (a32 * (b32 >> 16)) >> 16.
+   Computed as the high word of the 64-bit product of a and b with its low
+   halfword cleared. */
+#undef silk_SMULWT
+static inline opus_int32 silk_SMULWT_armv4(opus_int32 a, opus_int32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  /*Before ARMv6, SMULL requires RdLo, RdHi, and Rm to be distinct registers,
+     so both outputs are early-clobber to keep them away from the inputs.*/
+  __asm__(
+      "#silk_SMULWT\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(a), "r"(b&~0xFFFF)
+  );
+  return rd_hi;
+}
+#define silk_SMULWT(a, b) (silk_SMULWT_armv4(a, b))
+
+/* a32 + ((b32 * (c32 >> 16)) >> 16).
+   Composed from silk_SMULWT plus an ordinary add. */
+#undef silk_SMLAWT
+#define silk_SMLAWT(a, b, c) ((a) + (silk_SMULWT((b), (c))))
+
+/* (a32 * b32) >> 16.
+   The full 64-bit product is formed with SMULL and bits 16..47 are returned. */
+#undef silk_SMULWW
+static inline opus_int32 silk_SMULWW_armv4(opus_int32 a, opus_int32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  /*Before ARMv6, SMULL requires RdLo, RdHi, and Rm to be distinct registers,
+     so both outputs are early-clobber to keep them away from the inputs.*/
+  __asm__(
+      "#silk_SMULWW\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(a), "r"(b)
+  );
+  /*The high word is shifted as unsigned to avoid left-shifting a negative
+     number.*/
+  return (rd_lo>>16)|((unsigned)rd_hi<<16);
+}
+#define silk_SMULWW(a, b) (silk_SMULWW_armv4(a, b))
+
+/* a32 + ((b32 * c32) >> 16).
+   The full 64-bit product is formed with SMULL, bits 16..47 are extracted,
+   and a is added to them. */
+#undef silk_SMLAWW
+static inline opus_int32 silk_SMLAWW_armv4(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  /*Before ARMv6, SMULL requires RdLo, RdHi, and Rm to be distinct registers,
+     so both outputs are early-clobber to keep them away from the inputs.*/
+  __asm__(
+      "#silk_SMLAWW\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b), "r"(c)
+  );
+  /*The high word is shifted as unsigned to avoid left-shifting a negative
+     number.*/
+  return a+((rd_lo>>16)|((unsigned)rd_hi<<16));
+}
+#define silk_SMLAWW(a, b, c) (silk_SMLAWW_armv4(a, b, c))
+
+#endif /* SILK_MACROS_ARMv4_H */
--- /dev/null
+++ b/silk/macros_armv5e.h
@@ -1,0 +1,213 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Copyright (c) 2013 Parrot
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_MACROS_ARMv5E_H
+#define SILK_MACROS_ARMv5E_H
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; the output has to be a 32-bit
+   int. Maps directly onto one SMULWB. */
+#undef silk_SMULWB
+static inline opus_int32 silk_SMULWB_armv5e(opus_int32 a, opus_int16 b)
+{
+  int prod;
+  __asm__(
+      "#silk_SMULWB\n\t"
+      "smulwb %[rd], %[rm], %[rs]\n\t"
+      : [rd]"=r"(prod)
+      : [rm]"r"(a), [rs]"r"(b)
+  );
+  return prod;
+}
+#define silk_SMULWB(a, b) (silk_SMULWB_armv5e(a, b))
+
+/* a32 + ((b32 * (opus_int32)((opus_int16)(c32))) >> 16); the output has to be
+   a 32-bit int. Maps directly onto one SMLAWB. */
+#undef silk_SMLAWB
+static inline opus_int32 silk_SMLAWB_armv5e(opus_int32 a, opus_int32 b,
+ opus_int16 c)
+{
+  int acc;
+  __asm__(
+      "#silk_SMLAWB\n\t"
+      "smlawb %[rd], %[rm], %[rs], %[rn]\n\t"
+      : [rd]"=r"(acc)
+      : [rm]"r"(b), [rs]"r"(c), [rn]"r"(a)
+  );
+  return acc;
+}
+#define silk_SMLAWB(a, b, c) (silk_SMLAWB_armv5e(a, b, c))
+
+/* (a32 * (b32 >> 16)) >> 16. Maps directly onto one SMULWT. */
+#undef silk_SMULWT
+static inline opus_int32 silk_SMULWT_armv5e(opus_int32 a, opus_int32 b)
+{
+  int prod;
+  __asm__(
+      "#silk_SMULWT\n\t"
+      "smulwt %[rd], %[rm], %[rs]\n\t"
+      : [rd]"=r"(prod)
+      : [rm]"r"(a), [rs]"r"(b)
+  );
+  return prod;
+}
+#define silk_SMULWT(a, b) (silk_SMULWT_armv5e(a, b))
+
+/* a32 + ((b32 * (c32 >> 16)) >> 16). Maps directly onto one SMLAWT. */
+#undef silk_SMLAWT
+static inline opus_int32 silk_SMLAWT_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  int acc;
+  __asm__(
+      "#silk_SMLAWT\n\t"
+      "smlawt %[rd], %[rm], %[rs], %[rn]\n\t"
+      : [rd]"=r"(acc)
+      : [rm]"r"(b), [rs]"r"(c), [rn]"r"(a)
+  );
+  return acc;
+}
+#define silk_SMLAWT(a, b, c) (silk_SMLAWT_armv5e(a, b, c))
+
+/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)); the
+   output has to be a 32-bit int. Maps directly onto one SMULBB; both sources
+   are used as bottom halfwords, so they are marked as a commutative pair. */
+#undef silk_SMULBB
+static inline opus_int32 silk_SMULBB_armv5e(opus_int32 a, opus_int32 b)
+{
+  int prod;
+  __asm__(
+      "#silk_SMULBB\n\t"
+      "smulbb %[rd], %[rm], %[rs]\n\t"
+      : [rd]"=r"(prod)
+      : [rm]"%r"(a), [rs]"r"(b)
+  );
+  return prod;
+}
+#define silk_SMULBB(a, b) (silk_SMULBB_armv5e(a, b))
+
+/* a32 + (opus_int32)((opus_int16)(b32)) * (opus_int32)((opus_int16)(c32)); the
+   output has to be a 32-bit int. Maps directly onto one SMLABB; the two
+   multiplicands are both bottom halfwords, so they are marked as a
+   commutative pair. */
+#undef silk_SMLABB
+static inline opus_int32 silk_SMLABB_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  int acc;
+  __asm__(
+      "#silk_SMLABB\n\t"
+      "smlabb %[rd], %[rm], %[rs], %[rn]\n\t"
+      : [rd]"=r"(acc)
+      : [rm]"%r"(b), [rs]"r"(c), [rn]"r"(a)
+  );
+  return acc;
+}
+#define silk_SMLABB(a, b, c) (silk_SMLABB_armv5e(a, b, c))
+
+/* (opus_int32)((opus_int16)(a32)) * (b32 >> 16). Maps directly onto one
+   SMULBT (bottom halfword of a times top halfword of b). */
+#undef silk_SMULBT
+static inline opus_int32 silk_SMULBT_armv5e(opus_int32 a, opus_int32 b)
+{
+  int prod;
+  __asm__(
+      "#silk_SMULBT\n\t"
+      "smulbt %[rd], %[rm], %[rs]\n\t"
+      : [rd]"=r"(prod)
+      : [rm]"r"(a), [rs]"r"(b)
+  );
+  return prod;
+}
+#define silk_SMULBT(a, b) (silk_SMULBT_armv5e(a, b))
+
+/* a32 + (opus_int32)((opus_int16)(b32)) * (c32 >> 16). Maps directly onto one
+   SMLABT (bottom halfword of b times top halfword of c, plus a). */
+#undef silk_SMLABT
+static inline opus_int32 silk_SMLABT_armv5e(opus_int32 a, opus_int32 b,
+ opus_int32 c)
+{
+  int acc;
+  __asm__(
+      "#silk_SMLABT\n\t"
+      "smlabt %[rd], %[rm], %[rs], %[rn]\n\t"
+      : [rd]"=r"(acc)
+      : [rm]"r"(b), [rs]"r"(c), [rn]"r"(a)
+  );
+  return acc;
+}
+#define silk_SMLABT(a, b, c) (silk_SMLABT_armv5e(a, b, c))
+
+/* Saturating 32-bit add: a+b via one QADD. Addition is symmetric, so the
+   operands are marked as a commutative pair. */
+#undef silk_ADD_SAT32
+static inline opus_int32 silk_ADD_SAT32_armv5e(opus_int32 a, opus_int32 b)
+{
+  int sum;
+  __asm__(
+      "#silk_ADD_SAT32\n\t"
+      "qadd %[rd], %[rm], %[rn]\n\t"
+      : [rd]"=r"(sum)
+      : [rm]"%r"(a), [rn]"r"(b)
+  );
+  return sum;
+}
+#define silk_ADD_SAT32(a, b) (silk_ADD_SAT32_armv5e(a, b))
+
+/* Saturating 32-bit subtract: a-b via one QSUB. */
+#undef silk_SUB_SAT32
+static inline opus_int32 silk_SUB_SAT32_armv5e(opus_int32 a, opus_int32 b)
+{
+  int diff;
+  __asm__(
+      "#silk_SUB_SAT32\n\t"
+      "qsub %[rd], %[rm], %[rn]\n\t"
+      : [rd]"=r"(diff)
+      : [rm]"r"(a), [rn]"r"(b)
+  );
+  return diff;
+}
+#define silk_SUB_SAT32(a, b) (silk_SUB_SAT32_armv5e(a, b))
+
+/* Count leading zeros of a 16-bit value with one CLZ.
+   The value is moved to the top halfword and bit 15 is forced on, so the
+   count can never exceed 16 (which is what an input of 0 produces). */
+#undef silk_CLZ16
+static inline opus_int32 silk_CLZ16_armv5(opus_int16 in16)
+{
+  int res;
+  /*The shift is done on an unsigned value to avoid left-shifting a negative
+     number.*/
+  __asm__(
+      "#silk_CLZ16\n\t"
+      "clz %0, %1\n\t"
+      : "=r"(res)
+      : "r"((unsigned)in16<<16|0x8000)
+  );
+  return res;
+}
+#define silk_CLZ16(in16) (silk_CLZ16_armv5(in16))
+
+/* Count leading zeros of a 32-bit value with one CLZ. */
+#undef silk_CLZ32
+static inline opus_int32 silk_CLZ32_armv5(opus_int32 in32)
+{
+  int res;
+  /*The single instruction reads its input before writing its output, so the
+     output needs no early-clobber and may share a register with the input.*/
+  __asm__(
+      "#silk_CLZ32\n\t"
+      "clz %0, %1\n\t"
+      : "=r"(res)
+      : "r"(in32)
+  );
+  return res;
+}
+#define silk_CLZ32(in32) (silk_CLZ32_armv5(in32))
+
+#endif /* SILK_MACROS_ARMv5E_H */