shithub: opus

--- a/Makefile.am

+++ b/Makefile.am

@@ -29,7 +29,14 @@

 if CPU_ARM

 CELT_SOURCES += $(CELT_SOURCES_ARM)

+SILK_SOURCES += $(SILK_SOURCES_ARM)

+if OPUS_ARM_EXTERNAL_ASM

+CELT_SOURCES += $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)

+BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \

+ $(CELT_AM_SOURCES_ARM_ASM:.s.in=.s) \

+ $(CELT_AM_SOURCES_ARM_ASM:.s.in=-gnu.S)

 endif

+endif

 include celt_headers.mk

 include silk_headers.mk

@@ -106,11 +113,12 @@

 endif

 EXTRA_DIST = version.mk \

-	     opus.pc.in \

+             opus.pc.in \

              opus-uninstalled.pc.in \

              opus.m4 \

              Makefile.unix \

              tests/run_vectors.sh \

+             celt/arm/arm2gnu.pl \

              win32/VS2010/silk_float.vcxproj \

              win32/VS2010/celt.vcxproj.filters \

              win32/VS2010/opus.vcxproj \

@@ -206,3 +214,14 @@

 .PHONY: opus check-opus install-opus docs install-docs

+# automake doesn't do dependency tracking for asm files, that I can tell

+$(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): celt/arm/armopts-gnu.S

+$(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl

+# convert ARM asm to GNU as format

+%-gnu.S: $(top_srcdir)/%.s

+	$(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@

+# For autoconf-modified sources (e.g., armopts.s)

+%-gnu.S: %.s

+	$(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@

--- a/celt/_kiss_fft_guts.h

+++ b/celt/_kiss_fft_guts.h

@@ -94,11 +94,11 @@

     do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \

     }while(0)

-#if defined(ARMv4_ASM)

+#if defined(OPUS_ARM_INLINE_ASM)

 #include "arm/kiss_fft_armv4.h"

 #endif

-#if defined(ARMv5E_ASM)

+#if defined(OPUS_ARM_INLINE_EDSP)

 #include "arm/kiss_fft_armv5e.h"

 #endif

--- a/celt/arch.h

+++ b/celt/arch.h

@@ -114,9 +114,9 @@

 #include "fixed_generic.h"

-#ifdef ARMv5E_ASM

+#ifdef OPUS_ARM_INLINE_EDSP

 #include "arm/fixed_armv5e.h"

-#elif defined (ARMv4_ASM)

+#elif defined (OPUS_ARM_INLINE_ASM)

 #include "arm/fixed_armv4.h"

 #elif defined (BFIN_ASM)

 #include "fixed_bfin.h"

--- /dev/null

+++ b/celt/arm/arm2gnu.pl

@@ -1,0 +1,316 @@

+#!/usr/bin/perl

+my $bigend;  # little/big endian

+my $nxstack;

+$nxstack = 0;

+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'

+    if $running_under_some_shell;

+while ($ARGV[0] =~ /^-/) {          # consume leading command-line switches

+    $_ = shift;

+    last if /^--/;                  # "--" stops switch processing

+    if (/^-n/) {                    # -n: do not print each translated line

+        $nflag++;

+        next;

+    }

+    die "I don't recognize this switch: $_\n";   # real newline: clean message, no "at ... line N"

+}

+$printit++ unless $nflag;

+$\ = "\n";      # automatically add newline on print

+$n=0;

+$thumb = 0;     # ARM mode by default, not Thumb.

+@proc_stack = ();

+LINE:

+while (<>) {

+    # For ADRLs we need to add a new line after the substituted one.

+    $addPadding = 0;

+    # First, we do not dare to touch *anything* inside double quotes, do we?

+    # Second, if you want a dollar character in the string,

+    # insert two of them -- that's how ARM C and assembler treat strings.

+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };

+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };

+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };

+    # If there's nothing on a line but a comment, don't try to apply any further

+    #  substitutions (this is a cheap hack to avoid mucking up the license header)

+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };

+    # If substituted -- leave immediately !

+    s/@/,:/;

+    s/;/@/;

+    while ( /@.*'/ ) {

+      s/(@.*)'/$1/g;

+    }

+    s/\{FALSE\}/0/g;

+    s/\{TRUE\}/1/g;

+    s/\{(\w\w\w\w+)\}/$1/g;

+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;

+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;

+    s/\bIMPORT\b/.extern/;

+    s/\bEXPORT\b/.global/;

+    s/^(\s+)\[/$1IF/;

+    s/^(\s+)\|/$1ELSE/;

+    s/^(\s+)\]/$1ENDIF/;

+    s/IF *:DEF:/ .ifdef/;

+    s/IF *:LNOT: *:DEF:/ .ifndef/;

+    s/ELSE/ .else/;

+    s/ENDIF/ .endif/;

+    if( /\bIF\b/ ) {

+      s/\bIF\b/ .if/;

+      s/=/==/;

+    }

+    if ( $n == 2) {

+        s/\$/\\/g;

+    }

+    if ($n == 1) {

+        s/\$//g;

+        s/label//g;

+    $n = 2;

+      }

+    if ( /MACRO/ ) {

+      s/MACRO *\n/.macro/;

+      $n=1;

+    }

+    if ( /\bMEND\b/ ) {

+      s/\bMEND\b/.endm/;

+      $n=0;

+    }

+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.

+    #

+    if ( /\bAREA\b/ ) {

+        my $align;

+        $align = "2";

+        if ( /ALIGN=(\d+)/ ) {

+            $align = $1;

+        }

+        if ( /CODE/ ) {

+            $nxstack = 1;

+        }

+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;

+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata/;

+        s/^(.+)\|\|\.data\|\|(.+)/    .data/;

+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;

+        s/$/;   .p2align $align/;

+        # Enable NEON instructions but don't produce a binary that requires

+        # ARMv7. RVCT does not have equivalent directives, so we just do this

+        # for all CODE areas.

+        if ( /.text/ ) {

+            # Separating .arch, .fpu, etc., by semicolons does not work (gas

+            # thinks the semicolon is part of the arch name, even when there's

+            # whitespace separating them). Sadly this means our line numbers

+            # won't match the original source file (we could use the .line

+            # directive, which is documented to be obsolete, but then gdb will

+            # show the wrong line in the translated source file).

+            s/$/;   .arch armv7-a\n   .fpu neon\n   .object_arch armv4t/;

+        }

+    }

+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||

+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||

+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||

+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;

+    s/^(\s+)\%(\s)/    .space $1/;

+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123

+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};

+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};

+    if (/\bPROC\b/)

+    {

+        my $prefix;

+        my $proc;

+        /^([A-Za-z_\.]\w+)\b/;

+        $proc = $1;

+        $prefix = "";

+        if ($proc)

+        {

+            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);

+            push(@proc_stack, $proc);

+            s/^[A-Za-z_\.]\w+/$&:/;

+        }

+        $prefix = $prefix."\t.thumb_func; " if ($thumb);

+        s/\bPROC\b/@ $&/;

+        $_ = $prefix.$_;

+    }

+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;

+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;

+    if (/\bENDP\b/)

+    {

+        my $proc;

+        s/\bENDP\b/@ $&/;

+        $proc = pop(@proc_stack);

+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);

+    }

+    s/\bSUBT\b/@ $&/;

+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25

+    s/\bKEEP\b/@ $&/;

+    s/\bEXPORTAS\b/@ $&/;

+    s/\|\|(.)+\bEQU\b/@ $&/;

+    s/\|\|([\w\$]+)\|\|/$1/;

+    s/\bENTRY\b/@ $&/;

+    s/\bASSERT\b/@ $&/;

+    s/\bGBLL\b/@ $&/;

+    s/\bGBLA\b/@ $&/;

+    s/^\W+OPT\b/@ $&/;

+    s/:OR:/|/g;

+    s/:SHL:/<</g;

+    s/:SHR:/>>/g;

+    s/:AND:/&/g;

+    s/:LAND:/&&/g;

+    s/CPSR/cpsr/;

+    s/SPSR/spsr/;

+    s/ALIGN$/.balign 4/;

+    s/ALIGN\s+([0-9x]+)$/.balign $1/;

+    s/psr_cxsf/psr_all/;

+    s/LTORG/.ltorg/;

+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;

+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;

+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;

+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;

+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed

+    s/\{PC\} \+/ \. +/;

+    # Single hex constant on the line !

+    #

+    # >>> NOTE <<<

+    #   Double-precision floats in gcc are always mixed-endian, which means

+    #   bytes in two words are little-endian, but words are big-endian.

+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address

+    #   and 0xfeed0000 at high address.

+    #

+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;

+    # Only decimal constants on the line, no hex !

+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;

+    # Single hex constant on the line !

+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;

+    # Only decimal constants on the line, no hex !

+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;

+    s/\bDCFS[ \t]+0x/.word 0x/;

+    s/\bDCFS\b/.float/;

+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;

+    s/\bDCD\b/.word/;

+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;

+    s/\bDCW\b/.short/;

+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;

+    s/\bDCB\b/.byte/;

+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;

+    s/^[A-Za-z_\.]\w+/$&:/;

+    s/^(\d+)/$1:/;

+    s/\%(\d+)/$1b_or_f/;

+    s/\%[Bb](\d+)/$1b/;

+    s/\%[Ff](\d+)/$1f/;

+    s/\%[Ff][Tt](\d+)/$1f/;

+    s/&([\dA-Fa-f]+)/0x$1/;

+    if ( /\b2_[01]+\b/ ) {

+      s/\b2_([01]+)\b/conv$1&&&&/g;

+      while ( /[01][01][01][01]&&&&/ ) {

+        s/0000&&&&/&&&&0/g;

+        s/0001&&&&/&&&&1/g;

+        s/0010&&&&/&&&&2/g;

+        s/0011&&&&/&&&&3/g;

+        s/0100&&&&/&&&&4/g;

+        s/0101&&&&/&&&&5/g;

+        s/0110&&&&/&&&&6/g;

+        s/0111&&&&/&&&&7/g;

+        s/1000&&&&/&&&&8/g;

+        s/1001&&&&/&&&&9/g;

+        s/1010&&&&/&&&&A/g;

+        s/1011&&&&/&&&&B/g;

+        s/1100&&&&/&&&&C/g;

+        s/1101&&&&/&&&&D/g;

+        s/1110&&&&/&&&&E/g;

+        s/1111&&&&/&&&&F/g;

+      }

+      s/000&&&&/&&&&0/g;

+      s/001&&&&/&&&&1/g;

+      s/010&&&&/&&&&2/g;

+      s/011&&&&/&&&&3/g;

+      s/100&&&&/&&&&4/g;

+      s/101&&&&/&&&&5/g;

+      s/110&&&&/&&&&6/g;

+      s/111&&&&/&&&&7/g;

+      s/00&&&&/&&&&0/g;

+      s/01&&&&/&&&&1/g;

+      s/10&&&&/&&&&2/g;

+      s/11&&&&/&&&&3/g;

+      s/0&&&&/&&&&0/g;

+      s/1&&&&/&&&&1/g;

+      s/conv&&&&/0x/g;

+    }

+    if ( /commandline/)

+    {

+        if( /-bigend/)

+        {

+            $bigend=1;

+        }

+    }

+    if ( /\bDCDU\b/ )            # DCDU: emit its 32-bit hex constant as four .byte directives

+    {

+        my $cmd=$_;              # untouched copy of the line, used to extract the constant

+        my $value;

+        my $prefix;

+        my $w1;

+        my $w2;

+        my $w3;

+        my $w4;

+        s/\s+DCDU\b/@ $&/;       # comment out the original directive in the output

+        $cmd =~ /\bDCDU\b\s+0x([0-9A-Fa-f]+)/;   # capture all hex digits (A-F included) after 0x

+        $value = $1;

+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;    # split into 4 bytes; expects 8 hex digits

+        $w1 = $1;                # most significant byte

+        $w2 = $2;

+        $w3 = $3;

+        $w4 = $4;                # least significant byte

+        if( $bigend ne "")       # set when a "commandline" line containing -bigend was seen

+        {

+            # big endian

+            $prefix = "\t.byte\t0x".$w1.";".

+                      "\t.byte\t0x".$w2.";".

+                      "\t.byte\t0x".$w3.";".

+                      "\t.byte\t0x".$w4."; ";

+        }

+        else

+        {

+            # little endian

+            $prefix = "\t.byte\t0x".$w4.";".

+                      "\t.byte\t0x".$w3.";".

+                      "\t.byte\t0x".$w2.";".

+                      "\t.byte\t0x".$w1."; ";

+        }

+        $_=$prefix.$_;           # .byte directives go in front of the commented-out DCDU

+    }

+    if ( /\badrl\b/i )

+    {

+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;

+        $addPadding = 1;

+    }

+    s/\bEND\b/@ END/;

+} continue {

+    printf ("%s", $_) if $printit;

+    if ($addPadding != 0)

+    {

+        printf ("   mov r0,r0\n");

+        $addPadding = 0;

+    }

+}

+#If we had a code section, mark that this object doesn't need an executable

+# stack.

+if ($nxstack) {

+    printf ("    .section\t.note.GNU-stack,\"\",\%\%progbits\n");

+}

--- /dev/null

+++ b/celt/arm/arm_celt_map.c

@@ -1,0 +1,49 @@

+/* Copyright (c) 2010 Xiph.Org Foundation

+ * Copyright (c) 2013 Parrot */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#ifdef HAVE_CONFIG_H

+#include "config.h"

+#endif

+#include "pitch.h"

+#if defined(OPUS_HAVE_RTCD)

+# if defined(FIXED_POINT)

+opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,

+    const opus_val16 *, opus_val32 *, int , int) = {

+  celt_pitch_xcorr_c,               /* ARMv4 */

+  MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */

+  MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */

+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */

+};

+# else

+#  error "Floating-point implementation is not supported by ARM asm yet." \

+ "Reconfigure with --disable-rtcd or send patches."

+# endif

+#endif

--- a/celt/arm/armcpu.c

+++ b/celt/arm/armcpu.c

@@ -55,7 +55,7 @@

   /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit

    * instructions via their assembled hex code.

    * All of these instructions should be essentially nops. */

-# if defined(ARMv5E_ASM)

+# if defined(OPUS_ARM_MAY_HAVE_EDSP)

   __try{

     /*PLD [r13]*/

     __emit(0xF5DDF000);

@@ -64,7 +64,7 @@

   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){

     /*Ignore exception.*/

-#  if defined(ARMv6E_ASM)

+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)

   __try{

     /*SHADD8 r3,r3,r3*/

     __emit(0xE6333F93);

@@ -73,7 +73,7 @@

   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){

     /*Ignore exception.*/

-#   if defined(ARM_HAVE_NEON)

+#   if defined(OPUS_ARM_MAY_HAVE_NEON)

   __try{

     /*VORR q0,q0,q0*/

     __emit(0xF2200150);

@@ -107,19 +107,26 @@

     while(fgets(buf, 512, cpuinfo) != NULL)

+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)

       /* Search for edsp and neon flag */

       if(memcmp(buf, "Features", 8) == 0)

         char *p;

+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)

         p = strstr(buf, " edsp");

         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))

           flags |= OPUS_CPU_ARM_EDSP;

+#  endif

+#  if defined(OPUS_ARM_MAY_HAVE_NEON)

         p = strstr(buf, " neon");

         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))

           flags |= OPUS_CPU_ARM_NEON;

+#  endif

+# endif

+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)

       /* Search for media capabilities (>= ARMv6) */

       if(memcmp(buf, "CPU architecture:", 17) == 0)

@@ -129,6 +136,7 @@

         if(version >= 6)

           flags |= OPUS_CPU_ARM_MEDIA;

+# endif

     fclose(cpuinfo);

--- a/celt/arm/armcpu.h

+++ b/celt/arm/armcpu.h

@@ -25,11 +25,47 @@

    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

-/* Original code from libtheora modified to suit to Opus */

+#if !defined(ARMCPU_H)

+# define ARMCPU_H

-#ifndef ARMCPU_H

-#define ARMCPU_H

+# if defined(OPUS_ARM_MAY_HAVE_EDSP)

+#  define MAY_HAVE_EDSP(name) name ## _edsp

+# else

+#  define MAY_HAVE_EDSP(name) name ## _c

+# endif

+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)

+#  define MAY_HAVE_MEDIA(name) name ## _media

+# else

+#  define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)

+# endif

+# if defined(OPUS_ARM_MAY_HAVE_NEON)

+#  define MAY_HAVE_NEON(name) name ## _neon

+# else

+#  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)

+# endif

+# if defined(OPUS_ARM_PRESUME_EDSP)

+#  define PRESUME_EDSP(name) name ## _edsp

+# else

+#  define PRESUME_EDSP(name) name ## _c

+# endif

+# if defined(OPUS_ARM_PRESUME_MEDIA)

+#  define PRESUME_MEDIA(name) name ## _media

+# else

+#  define PRESUME_MEDIA(name) PRESUME_EDSP(name)

+# endif

+# if defined(OPUS_ARM_PRESUME_NEON)

+#  define PRESUME_NEON(name) name ## _neon

+# else

+#  define PRESUME_NEON(name) PRESUME_MEDIA(name)

+# endif

+# if defined(OPUS_HAVE_RTCD)

 int opus_select_arch(void);

+# endif

 #endif

--- /dev/null

+++ b/celt/arm/armopts.s.in

@@ -1,0 +1,37 @@

+/* Copyright (C) 2013 Mozilla Corporation */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+; Set the following to 1 if we have EDSP instructions

+;  (LDRD/STRD, etc., ARMv5E and later).

+OPUS_ARM_MAY_HAVE_EDSP  * @OPUS_ARM_MAY_HAVE_EDSP@

+; Set the following to 1 if we have ARMv6 media instructions.

+OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@

+; Set the following to 1 if we have NEON (some ARMv7)

+OPUS_ARM_MAY_HAVE_NEON  * @OPUS_ARM_MAY_HAVE_NEON@

+END

--- /dev/null

+++ b/celt/arm/celt_pitch_xcorr_arm.s

@@ -1,0 +1,598 @@

+; Copyright (c) 2007-2008 CSIRO

+; Copyright (c) 2007-2009 Xiph.Org Foundation

+; Copyright (c) 2013      Parrot

+; Written by Aurélien Zanelli

+;

+; Redistribution and use in source and binary forms, with or without

+; modification, are permitted provided that the following conditions

+; are met:

+;

+; - Redistributions of source code must retain the above copyright

+; notice, this list of conditions and the following disclaimer.

+;

+; - Redistributions in binary form must reproduce the above copyright

+; notice, this list of conditions and the following disclaimer in the

+; documentation and/or other materials provided with the distribution.

+;

+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+  AREA  |.text|, CODE, READONLY

+  GET    celt/arm/armopts.s

+IF OPUS_ARM_MAY_HAVE_EDSP

+  EXPORT celt_pitch_xcorr_edsp

+ENDIF

+IF OPUS_ARM_MAY_HAVE_NEON

+  EXPORT celt_pitch_xcorr_neon

+ENDIF

+IF OPUS_ARM_MAY_HAVE_NEON

+;; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3

+;xcorr_kernel_neon PROC

+;  ; input:

+;  ;   r3     = int         len

+;  ;   r4     = opus_val16 *x

+;  ;   r5     = opus_val16 *y

+;  ;   q0     = opus_val32  sum[4]

+;  ; output:

+;  ;   q0     = opus_val32  sum[4]

+;  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15

+;  ; internal usage:

+;  ;   r12 = int j

+;  ;   d3  = y_3|y_2|y_1|y_0

+;  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4

+;  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0

+;  ;   q8  = scratch

+;  ;

+;  ; Load y[0...3]

+;  ; This requires len>0 to always be valid (which we assert in the C code).

+;  VLD1.16      {d5}, [r5]!

+;  SUBS         r12, r3, #8

+;  BLE xcorr_kernel_neon_process4

+;; Process 8 samples at a time.

+;; This loop loads one y value more than we actually need. Therefore we have to

+;; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid

+;; reading past the end of the array.

+;xcorr_kernel_neon_process8

+;  ; This loop has 19 total instructions (10 cycles to issue, minimum), with

+;  ; - 2 cycles of ARM instructions,

+;  ; - 10 cycles of load/store/byte permute instructions, and

+;  ; - 9 cycles of data processing instructions.

+;  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the

+;  ; latter two categories, meaning the whole loop should run in 10 cycles per

+;  ; iteration, barring cache misses.

+;  ;

+;  ; Load x[0...7]

+;  VLD1.16      {d6, d7}, [r4]!

+;  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get

+;  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.

+;  VAND         d3, d5, d5

+;  SUBS         r12, r12, #8

+;  ; Load y[4...11]

+;  VLD1.16      {d4, d5}, [r5]!

+;  VMLAL.S16    q0, d3, d6[0]

+;  VEXT.16      d16, d3, d4, #1

+;  VMLAL.S16    q0, d4, d7[0]

+;  VEXT.16      d17, d4, d5, #1

+;  VMLAL.S16    q0, d16, d6[1]

+;  VEXT.16      d16, d3, d4, #2

+;  VMLAL.S16    q0, d17, d7[1]

+;  VEXT.16      d17, d4, d5, #2

+;  VMLAL.S16    q0, d16, d6[2]

+;  VEXT.16      d16, d3, d4, #3

+;  VMLAL.S16    q0, d17, d7[2]

+;  VEXT.16      d17, d4, d5, #3

+;  VMLAL.S16    q0, d16, d6[3]

+;  VMLAL.S16    q0, d17, d7[3]

+;  BGT xcorr_kernel_neon_process8

+;; Process 4 samples here if we have > 4 left (still reading one extra y value).

+;xcorr_kernel_neon_process4

+;  ADDS         r12, r12, #4

+;  BLE xcorr_kernel_neon_process2

+;  ; Load x[0...3]

+;  VLD1.16      d6, [r4]!

+;  ; Use VAND since it's a data processing instruction again.

+;  VAND         d4, d5, d5

+;  SUB          r12, r12, #4

+;  ; Load y[4...7]

+;  VLD1.16      d5, [r5]!

+;  VMLAL.S16    q0, d4, d6[0]

+;  VEXT.16      d16, d4, d5, #1

+;  VMLAL.S16    q0, d16, d6[1]

+;  VEXT.16      d16, d4, d5, #2

+;  VMLAL.S16    q0, d16, d6[2]

+;  VEXT.16      d16, d4, d5, #3

+;  VMLAL.S16    q0, d16, d6[3]

+;; Process 2 samples here if we have > 2 left (still reading one extra y value).

+;xcorr_kernel_neon_process2

+;  ADDS         r12, r12, #2

+;  BLE xcorr_kernel_neon_process1

+;  ; Load x[0...1]

+;  VLD2.16      {d6[],d7[]}, [r4]!

+;  ; Use VAND since it's a data processing instruction again.

+;  VAND         d4, d5, d5

+;  SUB          r12, r12, #2

+;  ; Load y[4...5]

+;  VLD1.32      {d5[]}, [r5]!

+;  VMLAL.S16    q0, d4, d6

+;  VEXT.16      d16, d4, d5, #1

+;  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI

+;  ; instead of VEXT, since it's a data-processing instruction.

+;  VSRI.64      d5, d4, #32

+;  VMLAL.S16    q0, d16, d7

+;; Process 1 sample using the extra y value we loaded above.

+;xcorr_kernel_neon_process1

+;  ; Load next *x

+;  VLD1.16      {d6[]}, [r4]!

+;  ADDS         r12, r12, #1

+;  ; y[0...3] are left in d5 from prior iteration(s) (if any)

+;  VMLAL.S16    q0, d5, d6

+;  MOVLE        pc, lr

+;; Now process 1 last sample, not reading ahead.

+;  ; Load last *y

+;  VLD1.16      {d4[]}, [r5]!

+;  VSRI.64      d4, d5, #16

+;  ; Load last *x

+;  VLD1.16      {d6[]}, [r4]!

+;  VMLAL.S16    q0, d4, d6

+;  MOV          pc, lr

+;  ENDP

+;; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,

+;;  opus_val32 *xcorr, int len, int max_pitch)

+;celt_pitch_xcorr_neon PROC

+;  ; input:

+;  ;   r0  = opus_val16 *_x

+;  ;   r1  = opus_val16 *_y

+;  ;   r2  = opus_val32 *xcorr

+;  ;   r3  = int         len

+;  ; output:

+;  ;   r0  = int         maxcorr

+;  ; internal usage:

+;  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())

+;  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())

+;  ;   r6  = int         max_pitch

+;  ;   r12 = int         j

+;  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())

+;  STMFD        sp!, {r4-r6, lr}

+;  LDR          r6, [sp, #16]

+;  VMOV.S32     q15, #1

+;  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done

+;  SUBS         r6, r6, #4

+;  BLT celt_pitch_xcorr_neon_process4_done

+;celt_pitch_xcorr_neon_process4

+;  ; xcorr_kernel_neon parameters:

+;  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}

+;  MOV          r4, r0

+;  MOV          r5, r1

+;  VEOR         q0, q0, q0

+;  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.

+;  ; So we don't save/restore any other registers.

+;  BL xcorr_kernel_neon

+;  SUBS         r6, r6, #4

+;  VST1.32      {q0}, [r2]!

+;  ; _y += 4

+;  ADD          r1, r1, #8

+;  VMAX.S32     q15, q15, q0

+;  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done

+;  BGE celt_pitch_xcorr_neon_process4

+;; We have less than 4 sums left to compute.

+;celt_pitch_xcorr_neon_process4_done

+;  ADDS         r6, r6, #4

+;  ; Reduce maxcorr to a single value

+;  VMAX.S32     d30, d30, d31

+;  VPMAX.S32    d30, d30, d30

+;  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done

+;  BLE celt_pitch_xcorr_neon_done

+;; Now compute each remaining sum one at a time.

+;celt_pitch_xcorr_neon_process_remaining

+;  MOV          r4, r0

+;  MOV          r5, r1

+;  VMOV.I32     q0, #0

+;  SUBS         r12, r3, #8

+;  BLT celt_pitch_xcorr_neon_process_remaining4

+;; Sum terms 8 at a time.

+;celt_pitch_xcorr_neon_process_remaining_loop8

+;  ; Load x[0...7]

+;  VLD1.16      {q1}, [r4]!

+;  ; Load y[0...7]

+;  VLD1.16      {q2}, [r5]!

+;  SUBS         r12, r12, #8

+;  VMLAL.S16    q0, d4, d2

+;  VMLAL.S16    q0, d5, d3

+;  BGE celt_pitch_xcorr_neon_process_remaining_loop8

+;; Sum terms 4 at a time.

+;celt_pitch_xcorr_neon_process_remaining4

+;  ADDS         r12, r12, #4

+;  BLT celt_pitch_xcorr_neon_process_remaining4_done

+;  ; Load x[0...3]

+;  VLD1.16      {d2}, [r4]!

+;  ; Load y[0...3]

+;  VLD1.16      {d3}, [r5]!

+;  SUB          r12, r12, #4

+;  VMLAL.S16    q0, d3, d2

+;  ; Reduce the sum to a single value.

+;  VADD.S32     d0, d0, d1

+;  VPADDL.S32   d0, d0

+;celt_pitch_xcorr_neon_process_remaining4_done

+;  ADDS         r12, r12, #4

+;  BLE celt_pitch_xcorr_neon_process_remaining_loop_done

+;; Sum terms 1 at a time.

+;celt_pitch_xcorr_neon_process_remaining_loop1

+;  VLD1.16      {d2[]}, [r4]!

+;  VLD1.16      {d3[]}, [r5]!

+;  SUBS         r12, r12, #1

+;  VMLAL.S16    q0, d2, d3

+;  BGT celt_pitch_xcorr_neon_process_remaining_loop1

+;celt_pitch_xcorr_neon_process_remaining_loop_done

+;  VST1.32      {d0[0]}, [r2]!

+;  VMAX.S32     d30, d30, d0

+;  SUBS         r6, r6, #1

+;  ; _y++

+;  ADD          r1, r1, #2

+;  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining

+;  BGT celt_pitch_xcorr_neon_process_remaining

+;celt_pitch_xcorr_neon_done

+;  VMOV.32      r0, d30[0]

+;  LDMFD        sp!, {r4-r6, pc}

+;  ENDP

+xcorr_kernel_neon PROC

+  ; input:

+  ; r0 = opus_val16 *x

+  ; r1 = opus_val16 *y

+  ; r2 = int        len

+  ; q0 = opus_val32 sum (sum[3] | sum[2] | sum[1] | sum[0])

+  ; output:

+  ; q0 = sum

+  ; internal usage:

+  ; r3 = j

+  ; d2 = x_3|x_2|x_1|x_0  d3 = y_3|y_2|y_1|y_0

+  ; d4 = y_7|y_6|y_5|y_4  d5 = y_4|y_3|y_2|y_1

+  ; d6 = y_5|y_4|y_3|y_2  d7 = y_6|y_5|y_4|y_3

+  ; We will build d5, d6 and d7 vector from d3 and d4

+  VLD1.16   {d3}, [r1]!      ; Load y[3] downto y[0] to d3 lane (yy0)

+  SUB       r3, r2, #1

+  MOVS      r3, r3, lsr #2   ; j=(len-1)>>2

+  BEQ       xcorr_kernel_neon_process4_done

+  ; Process 4 x samples at a time

+  ; For this, we will need 4 y vectors

+xcorr_kernel_neon_process4

+  SUBS      r3, r3, #1       ; j--

+  VLD1.16   d4, [r1]!        ; Load y[7] downto y[4] to d4 lane

+  VLD1.16   d2, [r0]!        ; Load x[3] downto x[0] to d2 lane

+  VEXT.16   d5, d3, d4, #1   ; Build y[4] downto y[1] vector (yy1)

+  VEXT.16   d6, d3, d4, #2   ; Build y[5] downto y[2] vector (yy2)

+  VEXT.16   d7, d3, d4, #3   ; Build y[6] downto y[3] vector (yy3)

+  VMLAL.S16 q0, d3, d2[0]    ; MAC16_16(sum, x[0], yy0)

+  VMLAL.S16 q0, d5, d2[1]    ; MAC16_16(sum, x[1], yy1)

+  VMLAL.S16 q0, d6, d2[2]    ; MAC16_16(sum, x[2], yy2)

+  VMLAL.S16 q0, d7, d2[3]    ; MAC16_16(sum, x[3], yy3)

+  VMOV.S16  d3, d4           ; Next y vector should be in d3 (yy0)

+  BNE xcorr_kernel_neon_process4

+xcorr_kernel_neon_process4_done

+  ;Process len-1 to len

+  VLD1.16   {d2[]}, [r0]!    ; Load *x and duplicate to d2 lane

+  SUB       r3, r2, #1

+  ANDS      r3, r3, #3       ; j=(len-1)&3

+  VMLAL.S16 q0, d3, d2       ; MAC16_16(sum, *x, yy0)

+  BEQ xcorr_kernel_neon_done

+xcorr_kernel_neon_process_remaining

+  SUBS      r3, r3, #1       ; j--

+  VLD1.16   {d4[]}, [r1]!    ; Load y value and duplicate to d4 lane

+  VLD1.16   {d2[]}, [r0]!    ; Load *x and duplicate to d2 lane

+  VEXT.16   d3, d3, d4, #1   ; Build y vector from previous and d4

+  VMLAL.S16 q0, d3, d2       ; MAC16_16(sum, *x, yy0)

+  BNE xcorr_kernel_neon_process_remaining

+xcorr_kernel_neon_done

+  MOV       pc, lr

+  ENDP

+celt_pitch_xcorr_neon PROC

+  ; input:

+  ; r0 = opus_val16 *_x

+  ; r1 = opus_val16 *_y

+  ; r2 = opus_val32 *xcorr

+  ; r3 = int        len

+  ; output:

+  ; r0 = maxcorr

+  STMFD     sp!, {r4-r9, lr}     ; Save r4-r9 and lr (7 registers, 28 bytes)

+  LDR       r4, [sp, #28]        ; r4 = int max_pitch (stack argument, just above the 28 bytes pushed)

+  MOV       r5, r0               ; r5 = _x

+  MOV       r6, r1               ; r6 = _y

+  MOV       r7, r2               ; r7 = xcorr

+  MOV       r2, r3               ; r2 = len

+  VMOV.S32  d16, #1              ; d16 = maxcorr = {1, 1}  (not used by xcorr_kernel_neon)

+  MOV       r8, #0               ; r8 = i = 0

+  CMP       r4, #3               ; max_pitch-3 <= 0  ---> celt_pitch_xcorr_neon_process4_done

+  BLE       celt_pitch_xcorr_neon_process4_done

+  SUB       r9, r4, #3           ; r9 = max_pitch-3

+celt_pitch_xcorr_neon_process4

+  MOV       r0, r5               ; r0 = _x

+  ADD       r1, r6 ,r8, LSL #1   ; r1 = _y + i

+  VMOV.I32  q0, #0               ; q0 = opus_val32 sum[4] = {0, 0, 0, 0}

+                                 ; xcorr_kernel_neon doesn't modify r2 (len),

+                                 ; so we don't need to save it

+  BL xcorr_kernel_neon           ; xcorr_kernel_neon(_x, _y+i, sum, len)

+  VST1.32   {q0}, [r7]!          ; Store sum to xcorr

+  VPMAX.S32 d0, d0, d1           ; d0 = max(sum[3], sum[2]) | max(sum[1], sum[0])

+  ADD       r8, r8, #4           ; i+=4

+  VPMAX.S32 d0, d0, d0           ; d0 = max(sum[3], sum[2], sum[1], sum[0])

+  CMP       r8, r9               ; i < max_pitch-3 ----> celt_pitch_xcorr_neon_process4

+  VMAX.S32  d16, d16, d0         ; d16 = maxcorr = max(maxcorr, sum)

+  BLT       celt_pitch_xcorr_neon_process4

+celt_pitch_xcorr_neon_process4_done

+  CMP       r8, r4; i >= max_pitch ----> celt_pitch_xcorr_neon_done

+  BGE       celt_pitch_xcorr_neon_done

+celt_pitch_xcorr_neon_process_remaining

+  MOV       r0, r5               ; r0 = _x

+  ADD       r1, r6, r8, LSL #1   ; r1 = _y + i

+  VMOV.I32  q0, #0               ; q0 = sum = {0, 0, 0, 0}

+  MOVS      r3, r2, LSR #2       ; r3 = j = len>>2 (number of 4-sample groups)

+  BEQ       inner_loop_neon_process4_done ; len < 4: skip the vector loop

+inner_loop_neon_process4

+  VLD1.16   {d2}, [r0]!          ; Load x

+  VLD1.16   {d3}, [r1]!          ; Load y

+  SUBS      r3, r3, #1           ; j--

+  VMLAL.S16 q0, d2, d3           ; Lane-wise multiply-accumulate of x and y into q0

+  BNE       inner_loop_neon_process4

+  VPADD.S32 d0, d0, d1          ; Reduce sum: d0 = sum[3]+sum[2] | sum[1]+sum[0]

+  VPADD.S32 d0, d0, d0           ; Both lanes of d0 = sum[3]+sum[2]+sum[1]+sum[0]

+inner_loop_neon_process4_done

+  ANDS      r3, r2, #3           ; r3 = j = len & 3 (leftover samples)

+  BEQ       inner_loop_neon_done

+inner_loop_neon_process_remaining

+  VLD1.16   {d2[]}, [r0]!        ; Load *x and duplicate to d2 lane

+  VLD1.16   {d3[]}, [r1]!        ; Load *y and duplicate to d3 lane

+  SUBS      r3, r3, #1           ; j--

+  VMLAL.S16 q0, d2, d3           ; Every lane of q0 += (*x)*(*y)

+  BNE       inner_loop_neon_process_remaining

+inner_loop_neon_done

+  VST1.32   {d0[0]}, [r7]!       ; Store the sum (lane 0 of d0) to xcorr

+  VMAX.S32  d16, d16, d0         ; d16 = maxcorr = max(maxcorr, sum)

+  ADD       r8, r8, #1           ; i++

+  CMP       r8, r4               ; i < max_pitch (unsigned compare, see BCC below)

+  BCC       celt_pitch_xcorr_neon_process_remaining

+celt_pitch_xcorr_neon_done

+  VMOV      d0, d16              ; d0 = maxcorr

+  VMOV.32   r0, d0[0]            ; r0 = maxcorr (lane 0)

+  LDMFD     sp!, {r4-r9, pc}     ; Restore r4-r9 and return

+  ENDP

+ENDIF

+IF OPUS_ARM_MAY_HAVE_EDSP

+; This will get used on ARMv7 devices without NEON, so it has been optimized

+; to take advantage of dual-issuing where possible.

+xcorr_kernel_edsp PROC

+  ; input:

+  ;   r3      = int         len

+  ;   r4      = opus_val16 *_x

+  ;   r5      = opus_val16 *_y

+  ;   r6...r9 = opus_val32  sum[4]

+  ; output:

+  ;   r6...r9 = opus_val32  sum[4]

+  ; preserved: r0-r5 (r10, r11, r12 and r14 are overwritten)

+  ; internal usage

+  ;   r2      = int         j

+  ;   r12,r14 = opus_val16  x[4]

+  ;   r10,r11 = opus_val16  y[4]

+  STMFD        sp!, {r2,r4,r5,lr} ; Saved so r2, r4, r5 can be restored on exit

+  SUBS         r2, r3, #4         ; j = len-4

+  LDRD         r10, r11, [r5], #8 ; Load y[0...3]

+  BLE xcorr_kernel_edsp_process4_done ; len <= 4 ---> no unrolled iterations

+  LDR          r12, [r4], #4      ; Load x[0...1]

+  ; Stall

+xcorr_kernel_edsp_process4

+  ; The multiplies must issue from pipeline 0, and can't dual-issue with each

+  ; other. Every other instruction here dual-issues with a multiply, and is

+  ; thus "free". There should be no stalls in the body of the loop.

+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)

+  LDR          r14, [r4], #4      ; Load x[2...3]

+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)

+  SUBS         r2, r2, #4         ; j-=4

+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)

+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)

+  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)

+  LDR          r10, [r5], #4      ; Load y[4...5]

+  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)

+  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)

+  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)

+  LDRGT        r12, [r4], #4      ; Load x[0...1] of the next iteration (only if j > 0)

+  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)

+  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)

+  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)

+  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)

+  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)

+  LDR          r11, [r5], #4      ; Load y[6...7]

+  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)

+  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)

+  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)

+  BGT xcorr_kernel_edsp_process4      ; Loop while j > 0

+xcorr_kernel_edsp_process4_done

+  ADDS         r2, r2, #4         ; j+=4 (number of samples left to process)

+  BLE xcorr_kernel_edsp_done          ; None left

+  LDRH         r12, [r4], #2      ; r12 = *x++

+  SUBS         r2, r2, #1         ; j--

+  ; Stall

+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)

+  LDRGTH       r14, [r4], #2      ; r14 = *x++

+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)

+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)

+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)

+  BLE xcorr_kernel_edsp_done

+  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)

+  SUBS         r2, r2, #1         ; j--

+  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)

+  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++

+  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)

+  LDRGTH       r12, [r4], #2      ; r12 = *x++

+  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)

+  BLE xcorr_kernel_edsp_done

+  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],x,y_2)

+  CMP          r2, #1             ; Test j > 1 (no j-- here, r2 is reused for y_5 below)

+  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_3)

+  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++

+  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],x,y_4)

+  LDRGTH       r14, [r4]          ; r14 = *x

+  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],x,y_5)

+  BLE xcorr_kernel_edsp_done

+  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x,y_3)

+  LDRH         r11, [r5]          ; r11 = y_6 = *y

+  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_4)

+  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],x,y_5)

+  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_6)

+xcorr_kernel_edsp_done

+  LDMFD        sp!, {r2,r4,r5,pc} ; Restore r2, r4, r5 and return

+  ENDP

+celt_pitch_xcorr_edsp PROC

+  ; input:

+  ;   r0  = opus_val16 *_x

+  ;   r1  = opus_val16 *_y

+  ;   r2  = opus_val32 *xcorr

+  ;   r3  = int         len  (int max_pitch is passed on the stack)

+  ; output:

+  ;   r0  = maxcorr

+  ; internal usage

+  ;   r4  = opus_val16 *x

+  ;   r5  = opus_val16 *y

+  ;   r6  = opus_val32  sum0

+  ;   r7  = opus_val32  sum1

+  ;   r8  = opus_val32  sum2

+  ;   r9  = opus_val32  sum3

+  ;   r1  = int         max_pitch

+  ;   r12 = int         j

+  STMFD        sp!, {r4-r11, lr}  ; Save 9 registers (36 bytes)

+  MOV          r5, r1             ; r5 = _y

+  LDR          r1, [sp, #36]      ; r1 = max_pitch (first stack argument)

+  MOV          r4, r0             ; r4 = _x

+  ; maxcorr = 1

+  MOV          r0, #1

+  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done

+  SUBS         r1, r1, #4         ; r1 = max_pitch-4

+  BLT celt_pitch_xcorr_edsp_process4_done

+celt_pitch_xcorr_edsp_process4

+  ; xcorr_kernel_edsp parameters:

+  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}

+  MOV          r6, #0

+  MOV          r7, #0

+  MOV          r8, #0

+  MOV          r9, #0

+  BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, sum, len), sums returned in r6...r9

+  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)

+  CMP          r0, r6

+  ; _y+=4

+  ADD          r5, r5, #8

+  MOVLT        r0, r6

+  CMP          r0, r7

+  STRD         r6, r7, [r2], #8   ; xcorr[i] = sum0, xcorr[i+1] = sum1

+  MOVLT        r0, r7

+  CMP          r0, r8

+  STRD         r8, r9, [r2], #8   ; xcorr[i+2] = sum2, xcorr[i+3] = sum3

+  MOVLT        r0, r8

+  CMP          r0, r9

+  MOVLT        r0, r9

+  SUBS         r1, r1, #4         ; max_pitch-=4

+  BGE celt_pitch_xcorr_edsp_process4

+celt_pitch_xcorr_edsp_process4_done

+  ADDS         r1, r1, #4         ; r1 = number of lags left

+  BLE celt_pitch_xcorr_edsp_done

+; Now compute each remaining sum one at a time.

+celt_pitch_xcorr_edsp_process_remaining

+  SUBS         r12, r3, #4        ; j = len-4

+  ; r14 = sum = 0

+  MOV          r14, #0

+  BLT celt_pitch_xcorr_edsp_process_remaining_loop_done

+  LDRD         r6, r7, [r4], #8   ; Load x[0...3]

+  LDRD         r8, r9, [r5], #8   ; Load y[0...3]

+  ; Stall

+celt_pitch_xcorr_edsp_process_remaining_loop4

+  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)

+  SUBS         r12, r12, #4         ; j-=4

+  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)

+  LDRGE        r6, [r4], #4         ; Load next x[0...1] (only if j >= 0)

+  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)

+  LDRGE        r8, [r5], #4         ; Load next y[0...1] (only if j >= 0)

+  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)

+  LDRGE        r7, [r4], #4         ; Load next x[2...3] (only if j >= 0)

+  LDRGE        r9, [r5], #4         ; Load next y[2...3] (only if j >= 0)

+  BGE celt_pitch_xcorr_edsp_process_remaining_loop4

+celt_pitch_xcorr_edsp_process_remaining_loop_done

+  ADDS         r12, r12, #2         ; j+=2, flags are GE when at least 2 samples are left

+  LDRGE        r6, [r4], #4         ; Load x[0...1]

+  LDRGE        r8, [r5], #4         ; Load y[0...1]

+  ; Stall

+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)

+  SUBGE        r12, r12, #2         ; j-=2

+  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)

+  ADDS         r12, r12, #1         ; j+=1, flags are GE when one last sample is left

+  LDRGEH       r6, [r4], #2         ; r6 = *x++

+  LDRGEH       r8, [r5], #2         ; r8 = *y++

+  ; Restore _x

+  SUB          r4, r4, r3, LSL #1

+  ; Stall

+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)

+  ; Restore and advance _y

+  SUB          r5, r5, r3, LSL #1

+  ; maxcorr = max(maxcorr, sum)

+  ; Stall

+  CMP          r0, r14

+  ADD          r5, r5, #2

+  MOVLT        r0, r14

+  SUBS         r1, r1, #1           ; One fewer lag left

+  ; xcorr[i] = sum

+  STR          r14, [r2], #4

+  BGT celt_pitch_xcorr_edsp_process_remaining

+celt_pitch_xcorr_edsp_done

+  LDMFD        sp!, {r4-r11, pc}  ; Restore registers and return

+  ENDP

+ENDIF

+END

--- /dev/null

+++ b/celt/arm/pitch_arm.h

@@ -1,0 +1,57 @@

+/* Copyright (c) 2010 Xiph.Org Foundation

+ * Copyright (c) 2013 Parrot */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#if !defined(PITCH_ARM_H)

+# define PITCH_ARM_H

+# include "armcpu.h"

+# if defined(FIXED_POINT) /* everything below is declared for fixed-point builds only */

+#  if defined(OPUS_ARM_MAY_HAVE_NEON)

+opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,

+    opus_val32 *xcorr, int len, int max_pitch);

+#  endif /* OPUS_ARM_MAY_HAVE_NEON */

+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA) /* no media-specific kernel is declared here; alias via MAY_HAVE_EDSP() */

+#   define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)

+#  endif /* OPUS_ARM_MAY_HAVE_MEDIA */

+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)

+opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,

+    opus_val32 *xcorr, int len, int max_pitch);

+#  endif /* OPUS_ARM_MAY_HAVE_EDSP */

+#  if !defined(OPUS_HAVE_RTCD) /* no run-time detection: bind celt_pitch_xcorr at compile time, arch is ignored */

+#   define OVERRIDE_PITCH_XCORR (1)

+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \

+  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))

+#  endif /* !OPUS_HAVE_RTCD */

+# endif /* FIXED_POINT */

+#endif /* PITCH_ARM_H */

--- a/celt/celt.h

+++ b/celt/celt.h

@@ -122,7 +122,8 @@

 int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);

-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels);

+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,

+                      int arch);

--- a/celt/celt_decoder.c

+++ b/celt/celt_decoder.c

@@ -447,10 +447,11 @@

          VARDECL( opus_val16, lp_pitch_buf );

          ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );

-         pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C);

+         pitch_downsample(decode_mem, lp_pitch_buf,

+               DECODE_BUFFER_SIZE, C, st->arch);

          pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,

                DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,

-               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index);

+               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch);

          pitch_index = PLC_PITCH_LAG_MAX-pitch_index;

          st->last_pitch_index = pitch_index;

       } else {

@@ -481,7 +482,8 @@

             opus_val32 ac[LPC_ORDER+1];

             /* Compute LPC coefficients for the last MAX_PERIOD samples before

                the first loss so we can work in the excitation-filter domain. */

-            _celt_autocorr(exc, ac, window, overlap, LPC_ORDER, MAX_PERIOD);

+            _celt_autocorr(exc, ac, window, overlap,

+                   LPC_ORDER, MAX_PERIOD, st->arch);

             /* Add a noise floor of -40 dB. */

 #ifdef FIXED_POINT

             ac[0] += SHR32(ac[0],13);

--- a/celt/celt_encoder.c

+++ b/celt/celt_encoder.c

@@ -161,18 +161,9 @@

 #endif /* CUSTOM_MODES */

-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels)

+static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode,

+                                         int channels, int arch)

-   int ret;

-   ret = opus_custom_encoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);

-   if (ret != OPUS_OK)

-      return ret;

-   st->upsample = resampling_factor(sampling_rate);

-   return OPUS_OK;

-}

-OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)

-{

    if (channels < 0 || channels > 2)

       return OPUS_BAD_ARG;

@@ -190,7 +181,7 @@

    st->end = st->mode->effEBands;

    st->signalling = 1;

-   st->arch = opus_select_arch();

+   st->arch = arch;

    st->constrained_vbr = 1;

    st->clip = 1;

@@ -206,6 +197,23 @@

    return OPUS_OK;

+OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)

+{ /* Wrapper that keeps the old signature: obtains arch from opus_select_arch() and delegates. */

+   return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch());

+}

+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,

+                      int arch)

+{ /* Initializes st with the caller-supplied arch; returns OPUS_OK or the init error code. */

+   int ret;

+   ret = opus_custom_encoder_init_arch(st,

+           opus_custom_mode_create(48000, 960, NULL), channels, arch);

+   if (ret != OPUS_OK) /* propagate the failure code unchanged */

+      return ret;

+   st->upsample = resampling_factor(sampling_rate); /* set only after a successful init */

+   return OPUS_OK;

+}

 #ifdef CUSTOM_MODES

 void opus_custom_encoder_destroy(CELTEncoder *st)

@@ -1023,11 +1031,12 @@

       VARDECL(opus_val16, pitch_buf);

       ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);

-      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC);

+      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch);

       /* Don't search for the fir last 1.5 octave of the range because

          there's too many false-positives due to short-term correlation */

       pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,

-            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index);

+            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index,

+            st->arch);

       pitch_index = COMBFILTER_MAXPERIOD-pitch_index;

       gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,

--- a/celt/celt_lpc.c

+++ b/celt/celt_lpc.c

@@ -226,7 +226,8 @@

                    const opus_val16       *window,

                    int          overlap,

                    int          lag,

-                   int          n

+                   int          n,

+                   int          arch

    opus_val32 d;

@@ -275,7 +276,7 @@

          shift = 0;

 #endif

-   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);

+   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch);

    for (k=0;k<=lag;k++)

       for (i = k+fastN, d = 0; i < n; i++)

--- a/celt/celt_lpc.h

+++ b/celt/celt_lpc.h

@@ -48,6 +48,7 @@

          int ord,

          opus_val16 *mem);

-int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n);

+int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,

+         const opus_val16 *window, int overlap, int lag, int n, int arch);

 #endif /* PLC_H */

--- a/celt/cpu_support.h

+++ b/celt/cpu_support.h

@@ -31,7 +31,7 @@

 #include "opus_types.h"

 #include "opus_defines.h"

-#if defined(OPUS_HAVE_RTCD) && defined(ARMv4_ASM)

+#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)

 #include "arm/armcpu.h"

 /* We currently support 4 ARM variants:

--- a/celt/pitch.c

+++ b/celt/pitch.c

@@ -145,7 +145,7 @@

 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,

-      int len, int C)

+      int len, int C, int arch)

    int i;

    opus_val32 ac[5];

@@ -180,7 +180,7 @@

    _celt_autocorr(x_lp, ac, NULL, 0,

-                  4, len>>1);

+                  4, len>>1, arch);

    /* Noise floor -40 dB */

 #ifdef FIXED_POINT

@@ -250,7 +250,7 @@

 #else

 void

 #endif

-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)

+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)

    int i,j;

 #ifdef FIXED_POINT

@@ -289,7 +289,7 @@

 #endif

 void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,

-                  int len, int max_pitch, int *pitch)

+                  int len, int max_pitch, int *pitch, int arch)

    int i, j;

    int lag;

@@ -342,7 +342,7 @@

 #ifdef FIXED_POINT

    maxcorr =

 #endif

-   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);

+   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch);

    find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch

 #ifdef FIXED_POINT

--- a/celt/pitch.h

+++ b/celt/pitch.h

@@ -35,16 +35,21 @@

 #define PITCH_H

 #include "modes.h"

+#include "cpu_support.h"

 #if defined(__SSE__) && !defined(FIXED_POINT)

 #include "x86/pitch_sse.h"

 #endif

+#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)

+# include "arm/pitch_arm.h"

+#endif

 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,

-      int len, int C);

+      int len, int C, int arch);

 void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,

-                  int len, int max_pitch, int *pitch);

+                  int len, int max_pitch, int *pitch, int arch);

 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,

       int N, int *T0, int prev_period, opus_val16 prev_gain);

@@ -140,6 +145,52 @@

 #else

 void

 #endif

-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch);

+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,

+      opus_val32 *xcorr, int len, int max_pitch);

+#if !defined(OVERRIDE_PITCH_XCORR)

+/*Is run-time CPU detection enabled on this platform?*/

+# if defined(OPUS_HAVE_RTCD)

+extern

+#  if defined(FIXED_POINT)

+opus_val32

+#  else

+void

+#  endif

+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,

+      const opus_val16 *, opus_val32 *, int, int);

+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \

+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \

+        xcorr, len, max_pitch))

+# else

+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \

+  ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))

+# endif

+#else

+/*static inline opus_val32 real_celt_pitch_xcorr(const opus_val16 *_x,

+ const opus_val16 *_y,opus_val32 *xcorr,int len,int max_pitch,int arch){

+  opus_val32 *xcorr_tmp;

+  opus_val32  ret_tmp;

+  opus_val32  ret;

+  int         i;

+  xcorr_tmp=(opus_val32 *)malloc(max_pitch*sizeof(*xcorr));

+  ret_tmp=celt_pitch_xcorr_c(_x,_y,xcorr_tmp,len,max_pitch);

+  ret=celt_pitch_xcorr(_x,_y,xcorr,len,max_pitch,arch);

+  for(i=0;i<max_pitch;i++)if(xcorr[i]!=xcorr_tmp[i]){

+    fprintf(stderr,"xcorr[%i] (0x%08X) != xcorr_tmp[%i] (0x%08X)\n",

+     i,xcorr[i],i,xcorr_tmp[i]);

+  }

+  if(ret!=ret_tmp){

+    fprintf(stderr,"ret (0x%08X) != ret_tmp (0x%08X)\n",ret,ret_tmp);

+  }

+  return ret_tmp;

+}

+#undef celt_pitch_xcorr

+#define celt_pitch_xcorr real_celt_pitch_xcorr*/

+#endif

 #endif

--- a/celt_headers.mk

+++ b/celt_headers.mk

@@ -35,4 +35,5 @@

 celt/arm/fixed_armv5e.h \

 celt/arm/kiss_fft_armv4.h \

 celt/arm/kiss_fft_armv5e.h \

+celt/arm/pitch_arm.h \

 celt/x86/pitch_sse.h

--- a/celt_sources.mk

+++ b/celt_sources.mk

@@ -18,4 +18,11 @@

 celt/vq.c

 CELT_SOURCES_ARM = \

-celt/arm/armcpu.c

+celt/arm/armcpu.c \

+celt/arm/arm_celt_map.c

+CELT_SOURCES_ARM_ASM = \

+celt/arm/celt_pitch_xcorr_arm.s

+CELT_AM_SOURCES_ARM_ASM = \

+celt/arm/armopts.s.in

--- a/configure.ac

+++ b/configure.ac

@@ -42,6 +42,8 @@

 AC_C_CONST

 AC_C_INLINE

+AM_PROG_AS

 AC_DEFINE([OPUS_BUILD], [], [This is a build of OPUS])

 #Use a hacked up version of autoconf's AC_C_RESTRICT because it's not

@@ -54,13 +56,13 @@

    for ac_kw in __restrict __restrict__ _Restrict restrict; do

      AC_COMPILE_IFELSE([AC_LANG_PROGRAM(

       [[typedef int * int_ptr;

-	int foo (int_ptr $ac_kw ip, int * $ac_kw baz[]) {

-	return ip[0];

+        int foo (int_ptr $ac_kw ip, int * $ac_kw baz[]) {

+        return ip[0];

        }]],

       [[int s[1];

-	int * $ac_kw t = s;

-	t[0] = 0;

-	return foo(t, (void *)0)]])],

+        int * $ac_kw t = s;

+        t[0] = 0;

+        return foo(t, (void *)0)]])],

       [ac_cv_c_restrict=$ac_kw])

      test "$ac_cv_c_restrict" != no && break

    done

@@ -165,7 +167,7 @@

 #i[[3456]]86 | x86_64 | powerpc64 | powerpc32 | ia64)

 #  has_float_approx=yes

 #  ;;

-#esac

+#esac

 AC_ARG_ENABLE([float-approx],

     [AS_HELP_STRING([--enable-float-approx], [enable fast approximations for floating point])],

@@ -183,55 +185,167 @@

     [AS_HELP_STRING([--disable-asm], [Disable assembly optimizations])],,

     [enable_asm=yes])

+AC_ARG_ENABLE([rtcd],

+    [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],,

+    [enable_rtcd=yes])

 rtcd_support=no

 cpu_arm=no

-AS_IF([test "$enable_asm" = "yes"],[

-    asm_optimization="no asm for your platform, please send patches"

+AS_IF([test x"${enable_asm}" = x"yes"],[

+    inline_optimization="No ASM for your platform, please send patches"

     case $host_cpu in

-    arm*)

-        cpu_arm=yes

-        AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"],

-            [asm_optimization="disabled"])

-        if test "$asm_optimization" = "ARM" ; then

-            rtcd_support=yes

-            AC_DEFINE([ARMv4_ASM], 1, [Use generic ARMv4 asm optimizations])

-            AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0])

-            if test "$ARMv5E_ASM" = "1" ; then

-                AC_DEFINE([ARMv5E_ASM], [1], [Use ARMv5E asm optimizations])

-                asm_optimization="$asm_optimization (EDSP)"

-            fi

-            AS_ASM_ARM_MEDIA([ARMv6_ASM=1],[ARMv6_ASM=0])

-            if test "$ARMv6_ASM" = "1" ; then

-                AC_DEFINE([ARMv6_ASM], [1], [Use ARMv6 asm optimizations])

-                asm_optimization="$asm_optimization (Media)"

-            fi

-            AS_ASM_ARM_NEON([ARM_HAVE_NEON=1],[ARM_HAVE_NEON=0])

-            if test "$ARM_HAVE_NEON" = "1" ; then

-              AC_DEFINE([ARM_HAVE_NEON], 1, [Use ARM NEON optimizations])

-              asm_optimization="$asm_optimization (NEON)"

-            fi

-        fi

+      arm*)

+        dnl Currently we only have asm for fixed-point

+        AS_IF([test "$enable_float" != "yes"],[

+            cpu_arm=yes

+            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm optimization])

+            AS_GCC_INLINE_ASSEMBLY(

+                [inline_optimization="ARM"],

+                [inline_optimization="disabled"]

+            )

+            AS_ASM_ARM_EDSP([OPUS_ARM_INLINE_EDSP=1],[OPUS_ARM_INLINE_EDSP=0])

+            AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],

+                [OPUS_ARM_INLINE_MEDIA=0])

+            AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])

+            AS_IF([test x"$inline_optimization" = x"ARM"],[

+                AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])

+                AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,

+                    [Use generic ARMv4 inline asm optimizations])

+                AS_IF([test x"$OPUS_ARM_INLINE_EDSP" = x"1"],[

+                    AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],

+                        [Use ARMv5E inline asm optimizations])

+                    inline_optimization="$inline_optimization (EDSP)"

+                ])

+                AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[

+                    AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],

+                        [Use ARMv6 inline asm optimizations])

+                    inline_optimization="$inline_optimization (Media)"

+                ])

+                AS_IF([test x"$OPUS_ARM_INLINE_NEON" = x"1"],[

+                    AC_DEFINE([OPUS_ARM_INLINE_NEON], 1,

+                        [Use ARM NEON inline asm optimizations])

+                    inline_optimization="$inline_optimization (NEON)"

+                ])

+            ])

+            dnl We need Perl to translate RVCT-syntax asm to gas syntax.

+            AC_CHECK_PROG([HAVE_PERL], perl, yes, no)

+            AS_IF([test x"$HAVE_PERL" = x"yes"],[

+                AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],[true])

+                asm_optimization="ARM"

+                AS_IF([test x"$OPUS_ARM_INLINE_EDSP" = x"1"], [

+                    OPUS_ARM_PRESUME_EDSP=1

+                    OPUS_ARM_MAY_HAVE_EDSP=1

+                ],

+                [

+                    OPUS_ARM_PRESUME_EDSP=0

+                    OPUS_ARM_MAY_HAVE_EDSP=0

+                ])

+                AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"], [

+                    OPUS_ARM_PRESUME_MEDIA=1

+                    OPUS_ARM_MAY_HAVE_MEDIA=1

+                ],

+                [

+                    OPUS_ARM_PRESUME_MEDIA=0

+                    OPUS_ARM_MAY_HAVE_MEDIA=0

+                ])

+                AS_IF([test x"$OPUS_ARM_INLINE_NEON" = x"1"], [

+                    OPUS_ARM_PRESUME_NEON=1

+                    OPUS_ARM_MAY_HAVE_NEON=1

+                ],

+                [

+                    OPUS_ARM_PRESUME_NEON=0

+                    OPUS_ARM_MAY_HAVE_NEON=0

+                ])

+                AS_IF([test x"$enable_rtcd" = x"yes"],[

+                    AS_IF([test x"$OPUS_ARM_MAY_HAVE_EDSP" != x"1"],[

+                        AC_MSG_NOTICE(

+                          [Trying to force-enable armv5e EDSP instructions...])

+                        AS_ASM_ARM_EDSP_FORCE([OPUS_ARM_MAY_HAVE_EDSP=1])

+                    ])

+                    AS_IF([test x"$OPUS_ARM_MAY_HAVE_MEDIA" != x"1"],[

+                        AC_MSG_NOTICE(

+                          [Trying to force-enable ARMv6 media instructions...])

+                        AS_ASM_ARM_MEDIA_FORCE([OPUS_ARM_MAY_HAVE_MEDIA=1])

+                    ])

+                    AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON" != x"1"],[

+                        AC_MSG_NOTICE(

+                          [Trying to force-enable NEON instructions...])

+                        AS_ASM_ARM_NEON_FORCE([OPUS_ARM_MAY_HAVE_NEON=1])

+                    ])

+                ])

+                rtcd_support=

+                AS_IF([test x"$OPUS_ARM_MAY_HAVE_EDSP" = x"1"],[

+                    AC_DEFINE(OPUS_ARM_MAY_HAVE_EDSP, 1,

+                        [Define if assembler supports EDSP instructions])

+                    AS_IF([test x"$OPUS_ARM_PRESUME_EDSP" = x"1"],[

+                        AC_DEFINE(OPUS_ARM_PRESUME_EDSP, 1,

+                          [Define if binary requires EDSP instruction support])

+                        asm_optimization="$asm_optimization (EDSP)"

+                    ],

+                        [rtcd_support="$rtcd_support (EDSP)"]

+                    )

+                ])

+                AC_SUBST(OPUS_ARM_MAY_HAVE_EDSP)

+                AS_IF([test x"$OPUS_ARM_MAY_HAVE_MEDIA" = x"1"],[

+                    AC_DEFINE(OPUS_ARM_MAY_HAVE_MEDIA, 1,

+                      [Define if assembler supports ARMv6 media instructions])

+                    AS_IF([test x"$OPUS_ARM_PRESUME_MEDIA" = x"1"],[

+                        AC_DEFINE(OPUS_ARM_PRESUME_MEDIA, 1,

+                          [Define if binary requires ARMv6 media instruction support])

+                        asm_optimization="$asm_optimization (Media)"

+                    ],

+                        [rtcd_support="$rtcd_support (Media)"]

+                    )

+                ])

+                AC_SUBST(OPUS_ARM_MAY_HAVE_MEDIA)

+                AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON" = x"1"],[

+                    AC_DEFINE(OPUS_ARM_MAY_HAVE_NEON, 1,

+                      [Define if compiler supports NEON instructions])

+                    AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [

+                        AC_DEFINE(OPUS_ARM_PRESUME_NEON, 1,

+                          [Define if binary requires NEON instruction support])

+                        asm_optimization="$asm_optimization (NEON)"

+                    ],

+                        [rtcd_support="$rtcd_support (NEON)"]

+                    )

+                ])

+                AC_SUBST(OPUS_ARM_MAY_HAVE_NEON)

+                dnl Make sure turning on RTCD gets us at least one

+                dnl instruction set.

+                AS_IF([test x"$rtcd_support" != x""],

+                    [rtcd_support=ARM"$rtcd_support"],

+                    [rtcd_support="no"]

+                )

+            ],

+            [

+                AC_MSG_WARN(

+                  [*** ARM assembly requires perl -- disabling optimizations])

+                asm_optimization="(missing perl dependency for ARM)"

+            ])

+        ])

;;

     esac

],[

-    asm_optimization="disabled"

+   inline_optimization="disabled"

+   asm_optimization="disabled"

])

 AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])

+AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], dnl Compare the first word only; the percent-percent form is POSIX sh, the colon-offset form was a bashism.

+    [test x"${inline_optimization%% *}" = x"ARM"])

+AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], dnl Same first-word test for the external asm summary string.

+    [test x"${asm_optimization%% *}" = x"ARM"])

-AC_ARG_ENABLE([rtcd],

-    [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],,

-    [enable_rtcd=yes])

-AS_IF([test "$enable_rtcd" = "yes"],[

-    AS_IF([test "$rtcd_support" = "yes"],[

-        AC_DEFINE([OPUS_HAVE_RTCD], [1], [Use run-time CPU capabilities detection])

-    ],[

-        rtcd_support="no rtcd for your platform, please send patches"

+AS_IF([test x"$enable_rtcd" = x"yes"],[

+    AS_IF([test x"$rtcd_support" != x"no"],[

+        AC_DEFINE([OPUS_HAVE_RTCD], [1],

+            [Use run-time CPU capabilities detection])

+        OPUS_HAVE_RTCD=1

+        AC_SUBST(OPUS_HAVE_RTCD)

])

],[

-    rtcd_support="no"

+    rtcd_support="disabled"

])

 AC_ARG_ENABLE([assertions],

@@ -300,9 +414,14 @@

 AC_SUBST([PC_BUILD])

-AC_CONFIG_FILES([Makefile opus.pc opus-uninstalled.pc

-                 doc/Makefile doc/Doxyfile])

+AC_CONFIG_FILES([

+    Makefile

+    opus.pc

+    opus-uninstalled.pc

+    celt/arm/armopts.s

+    doc/Makefile

+    doc/Doxyfile

+])

 AC_CONFIG_HEADERS([config.h])

 AC_OUTPUT

@@ -316,13 +435,14 @@

       C99 var arrays: ................ ${has_var_arrays}

       C99 lrintf: .................... ${ac_cv_func_lrintf}

       Use alloca: .................... ${use_alloca}

     General configuration:

       Floating point support: ........ ${enable_float}

       Fast float approximations: ..... ${enable_float_approx}

       Fixed point debugging: ......... ${enable_fixed_point_debug}

-      Assembly optimization: ......... ${asm_optimization}

+      Inline Assembly Optimizations: . ${inline_optimization}

+      External Assembly Optimizations: ${asm_optimization}

       Run-time CPU detection: ........ ${rtcd_support}

       Custom modes: .................. ${enable_custom_modes}

       Assertion checking: ............ ${enable_assertions}

--- a/m4/as-gcc-inline-assembly.m4

+++ b/m4/as-gcc-inline-assembly.m4

@@ -42,7 +42,17 @@

                      $2])

])

+AC_DEFUN([AS_ASM_ARM_NEON_FORCE],

+[

+  AC_MSG_CHECKING([if assembler supports NEON instructions on ARM])

+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv7-a\n.fpu neon\n.object_arch armv4t\nvorr d0,d0,d0")])],

+                    [AC_MSG_RESULT([yes])

+                     $1],

+                    [AC_MSG_RESULT([no])

+                     $2])

+])

 AC_DEFUN([AS_ASM_ARM_MEDIA],

   AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])

@@ -54,12 +64,33 @@

                      $2])

])

+AC_DEFUN([AS_ASM_ARM_MEDIA_FORCE],

+[

+  AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])

+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv6\n.object_arch armv4t\nshadd8 r3,r3,r3")])],

+                    [AC_MSG_RESULT([yes])

+                     $1],

+                    [AC_MSG_RESULT([no])

+                     $2])

+])

 AC_DEFUN([AS_ASM_ARM_EDSP],

   AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])

   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__("qadd r3,r3,r3")])],

+                    [AC_MSG_RESULT([yes])

+                     $1],

+                    [AC_MSG_RESULT([no])

+                     $2])

+])

+AC_DEFUN([AS_ASM_ARM_EDSP_FORCE],

+[

+  AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])

+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv5te\n.object_arch armv4t\nqadd r3,r3,r3")])],

                     [AC_MSG_RESULT([yes])

                      $1],

                     [AC_MSG_RESULT([no])

--- a/silk/API.h

+++ b/silk/API.h

@@ -64,6 +64,7 @@

 /*************************/

 opus_int silk_InitEncoder(                              /* O    Returns error code                              */

     void                            *encState,          /* I/O  State                                           */

+    int                              arch,              /* I    Run-time architecture                           */

     silk_EncControlStruct           *encStatus          /* O    Encoder Status                                  */

);

--- a/silk/SigProc_FIX.h

+++ b/silk/SigProc_FIX.h

@@ -227,7 +227,8 @@

     opus_int                    *scale,             /* O    Scaling of the correlation vector                           */

     const opus_int16            *inputData,         /* I    Input data to correlate                                     */

     const opus_int              inputDataSize,      /* I    Length of input                                             */

-    const opus_int              correlationCount    /* I    Number of correlation taps to compute                       */

+    const opus_int              correlationCount,   /* I    Number of correlation taps to compute                       */

+    int                         arch                /* I    Run-time architecture                                       */

);

 void silk_decode_pitch(

@@ -249,7 +250,8 @@

     const opus_int              search_thres2_Q13,  /* I    Final threshold for lag candidates 0 - 1                    */

     const opus_int              Fs_kHz,             /* I    Sample frequency (kHz)                                      */

     const opus_int              complexity,         /* I    Complexity setting, 0-2, where 2 is highest                 */

-    const opus_int              nb_subfr            /* I    number of 5 ms subframes                                    */

+    const opus_int              nb_subfr,           /* I    number of 5 ms subframes                                    */

+    int                         arch                /* I    Run-time architecture                                       */

);

 /* Compute Normalized Line Spectral Frequencies (NLSFs) from whitening filter coefficients      */

@@ -309,7 +311,8 @@

     const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */

     const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */

     const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */

-    const opus_int              D                   /* I    Order                                                       */

+    const opus_int              D,                  /* I    Order                                                       */

+    int                         arch                /* I    Run-time architecture                                       */

);

 /* Copy and multiply a vector by a constant */

@@ -576,11 +579,11 @@

 #include "MacroCount.h"

 #include "MacroDebug.h"

-#ifdef ARMv4_ASM

+#ifdef OPUS_ARM_INLINE_ASM

 #include "arm/SigProc_FIX_armv4.h"

 #endif

-#ifdef ARMv5E_ASM

+#ifdef OPUS_ARM_INLINE_EDSP

 #include "arm/SigProc_FIX_armv5e.h"

 #endif

--- a/silk/enc_API.c

+++ b/silk/enc_API.c

@@ -69,6 +69,7 @@

 /*************************/

 opus_int silk_InitEncoder(                              /* O    Returns error code                              */

     void                            *encState,          /* I/O  State                                           */

+    int                              arch,              /* I    Run-time architecture                           */

     silk_EncControlStruct           *encStatus          /* O    Encoder Status                                  */

@@ -80,7 +81,7 @@

     /* Reset encoder */

     silk_memset( psEnc, 0, sizeof( silk_encoder ) );

     for( n = 0; n < ENCODER_NUM_CHANNELS; n++ ) {

-        if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ] ) ) {

+        if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ], arch ) ) {

             silk_assert( 0 );

@@ -174,7 +175,7 @@

     if( encControl->nChannelsInternal > psEnc->nChannelsInternal ) {

         /* Mono -> Stereo transition: init state of second channel and stereo state */

-        ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ] );

+        ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ], psEnc->state_Fxx[ 0 ].sCmn.arch );

         silk_memset( psEnc->sStereo.pred_prev_Q13, 0, sizeof( psEnc->sStereo.pred_prev_Q13 ) );

         silk_memset( psEnc->sStereo.sSide, 0, sizeof( psEnc->sStereo.sSide ) );

         psEnc->sStereo.mid_side_amp_Q0[ 0 ] = 0;

@@ -206,9 +207,8 @@

         /* Reset Encoder */

         for( n = 0; n < encControl->nChannelsInternal; n++ ) {

-            if( (ret = silk_init_encoder( &psEnc->state_Fxx[ n ] ) ) != 0 ) {

-                silk_assert( 0 );

-            }

+            ret = silk_init_encoder( &psEnc->state_Fxx[ n ], psEnc->state_Fxx[ n ].sCmn.arch );

+            silk_assert( !ret );

         tmp_payloadSize_ms = encControl->payloadSize_ms;

         encControl->payloadSize_ms = 10;

--- a/silk/fixed/autocorr_FIX.c

+++ b/silk/fixed/autocorr_FIX.c

@@ -38,10 +38,11 @@

     opus_int                    *scale,             /* O    Scaling of the correlation vector                           */

     const opus_int16            *inputData,         /* I    Input data to correlate                                     */

     const opus_int              inputDataSize,      /* I    Length of input                                             */

-    const opus_int              correlationCount    /* I    Number of correlation taps to compute                       */

+    const opus_int              correlationCount,   /* I    Number of correlation taps to compute                       */

+    int                         arch                /* I    Run-time architecture                                       */

     opus_int   corrCount;

     corrCount = silk_min_int( inputDataSize, correlationCount );

-    *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize);

+    *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize, arch);

--- a/silk/fixed/burg_modified_FIX.c

+++ b/silk/fixed/burg_modified_FIX.c

@@ -50,7 +50,8 @@

     const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */

     const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */

     const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */

-    const opus_int              D                   /* I    Order                                                       */

+    const opus_int              D,                  /* I    Order                                                       */

+    int                         arch                /* I    Run-time architecture                                       */

     opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;

@@ -98,7 +99,7 @@

             int i;

             opus_int32 d;

             x_ptr = x + s * subfr_length;

-            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D );

+            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );

             for( n = 1; n < D + 1; n++ ) {

                for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )

                   d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );

--- a/silk/fixed/encode_frame_FIX.c

+++ b/silk/fixed/encode_frame_FIX.c

@@ -132,12 +132,12 @@

         /*****************************************/

         /* Find pitch lags, initial LPC analysis */

         /*****************************************/

-        silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame );

+        silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch );

         /************************/

         /* Noise shape analysis */

         /************************/

-        silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame );

+        silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame, psEnc->sCmn.arch );

         /***************************************************/

         /* Find linear prediction coefficients (LPC + LTP) */

--- a/silk/fixed/find_LPC_FIX.c

+++ b/silk/fixed/find_LPC_FIX.c

@@ -60,13 +60,13 @@

     psEncC->indices.NLSFInterpCoef_Q2 = 4;

     /* Burg AR analysis for the full frame */

-    silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );

+    silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, psEncC->arch );

     if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {

         VARDECL( opus_int16, LPC_res );

         /* Optimal solution for last 10 ms */

-        silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder );

+        silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder, psEncC->arch );

         /* subtract residual energy here, as that's easier than adding it to the    */

         /* residual energy of the first 10 ms in each iteration of the search below */

--- a/silk/fixed/find_pitch_lags_FIX.c

+++ b/silk/fixed/find_pitch_lags_FIX.c

@@ -38,7 +38,8 @@

     silk_encoder_state_FIX          *psEnc,                                 /* I/O  encoder state                                                               */

     silk_encoder_control_FIX        *psEncCtrl,                             /* I/O  encoder control                                                             */

     opus_int16                      res[],                                  /* O    residual                                                                    */

-    const opus_int16                x[]                                     /* I    Speech signal                                                               */

+    const opus_int16                x[],                                    /* I    Speech signal                                                               */

+    int                             arch                                    /* I    Run-time architecture                                                       */

     opus_int   buf_len, i, scale;

@@ -86,7 +87,7 @@

     silk_apply_sine_window( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );

     /* Calculate autocorrelation sequence */

-    silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );

+    silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );

     /* Add white noise, as fraction of energy */

     auto_corr[ 0 ] = silk_SMLAWB( auto_corr[ 0 ], auto_corr[ 0 ], SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) ) + 1;

@@ -127,7 +128,8 @@

         /*****************************************/

         if( silk_pitch_analysis_core( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex, &psEnc->sCmn.indices.contourIndex,

                 &psEnc->LTPCorr_Q15, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16,

-                (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr ) == 0 )

+                (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr,

+                psEnc->sCmn.arch) == 0 )

             psEnc->sCmn.indices.signalType = TYPE_VOICED;

         } else {

--- a/silk/fixed/main_FIX.h

+++ b/silk/fixed/main_FIX.h

@@ -73,7 +73,8 @@

 /* Initializes the Silk encoder state */

 opus_int silk_init_encoder(

-    silk_encoder_state_Fxx          *psEnc                                  /* I/O  Pointer to Silk FIX encoder state                                           */

+    silk_encoder_state_Fxx          *psEnc,                                 /* I/O  Pointer to Silk FIX encoder state                                           */

+    int                              arch                                   /* I    Run-time architecture                                                       */

);

 /* Control the Silk encoder */

@@ -104,7 +105,8 @@

     silk_encoder_state_FIX          *psEnc,                                 /* I/O  Encoder state FIX                                                           */

     silk_encoder_control_FIX        *psEncCtrl,                             /* I/O  Encoder control FIX                                                         */

     const opus_int16                *pitch_res,                             /* I    LPC residual from pitch analysis                                            */

-    const opus_int16                *x                                      /* I    Input signal [ frame_length + la_shape ]                                    */

+    const opus_int16                *x,                                     /* I    Input signal [ frame_length + la_shape ]                                    */

+    int                              arch                                   /* I    Run-time architecture                                                       */

);

 /* Autocorrelations for a warped frequency axis */

@@ -132,7 +134,8 @@

     silk_encoder_state_FIX          *psEnc,                                 /* I/O  encoder state                                                               */

     silk_encoder_control_FIX        *psEncCtrl,                             /* I/O  encoder control                                                             */

     opus_int16                      res[],                                  /* O    residual                                                                    */

-    const opus_int16                x[]                                     /* I    Speech signal                                                               */

+    const opus_int16                x[],                                    /* I    Speech signal                                                               */

+    int                             arch                                    /* I    Run-time architecture                                                       */

);

 /* Find LPC and LTP coefficients */

--- a/silk/fixed/noise_shape_analysis_FIX.c

+++ b/silk/fixed/noise_shape_analysis_FIX.c

@@ -145,7 +145,8 @@

     silk_encoder_state_FIX          *psEnc,                                 /* I/O  Encoder state FIX                                                           */

     silk_encoder_control_FIX        *psEncCtrl,                             /* I/O  Encoder control FIX                                                         */

     const opus_int16                *pitch_res,                             /* I    LPC residual from pitch analysis                                            */

-    const opus_int16                *x                                      /* I    Input signal [ frame_length + la_shape ]                                    */

+    const opus_int16                *x,                                     /* I    Input signal [ frame_length + la_shape ]                                    */

+    int                              arch                                   /* I    Run-time architecture                                                       */

     silk_shape_state_FIX *psShapeSt = &psEnc->sShape;

@@ -281,7 +282,7 @@

             silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );

         } else {

             /* Calculate regular auto correlation */

-            silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );

+            silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch );

         /* Add white noise, as a fraction of energy */

--- a/silk/fixed/pitch_analysis_core_FIX.c

+++ b/silk/fixed/pitch_analysis_core_FIX.c

@@ -62,7 +62,8 @@

     opus_int          start_lag,                       /* I lag offset to search around */

     opus_int          sf_length,                       /* I length of a 5 ms subframe   */

     opus_int          nb_subfr,                        /* I number of subframes         */

-    opus_int          complexity                       /* I Complexity setting          */

+    opus_int          complexity,                      /* I Complexity setting          */

+    int               arch                             /* I Run-time architecture       */

);

 static void silk_P_Ana_calc_energy_st3(

@@ -88,7 +89,8 @@

     const opus_int              search_thres2_Q13,  /* I    Final threshold for lag candidates 0 - 1                    */

     const opus_int              Fs_kHz,             /* I    Sample frequency (kHz)                                      */

     const opus_int              complexity,         /* I    Complexity setting, 0-2, where 2 is highest                 */

-    const opus_int              nb_subfr            /* I    number of 5 ms subframes                                    */

+    const opus_int              nb_subfr,           /* I    number of 5 ms subframes                                    */

+    int                         arch                /* I    Run-time architecture                                       */

     VARDECL( opus_int16, frame_8kHz );

@@ -189,7 +191,7 @@

         silk_assert( basis_ptr >= frame_4kHz );

         silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );

-        celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1 );

+        celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1, arch );

         /* Calculate first vector products before loop */

         cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];

@@ -516,7 +518,7 @@

         /* Calculate the correlations and energies needed in stage 3 */

         ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );

         ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );

-        silk_P_Ana_calc_corr_st3(  cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );

+        silk_P_Ana_calc_corr_st3(  cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );

         silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );

         lag_counter = 0;

@@ -597,7 +599,8 @@

     opus_int          start_lag,                       /* I lag offset to search around */

     opus_int          sf_length,                       /* I length of a 5 ms subframe   */

     opus_int          nb_subfr,                        /* I number of subframes         */

-    opus_int          complexity                       /* I Complexity setting          */

+    opus_int          complexity,                      /* I Complexity setting          */

+    int               arch                             /* I Run-time architecture       */

     const opus_int16 *target_ptr;

@@ -634,7 +637,7 @@

         lag_low  = matrix_ptr( Lag_range_ptr, k, 0, 2 );

         lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );

         silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);

-        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1 );

+        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1, arch );

         for( j = lag_low; j <= lag_high; j++ ) {

             silk_assert( lag_counter < SCRATCH_SIZE );

             scratch_mem[ lag_counter ] = xcorr32[ lag_high - j ];

--- a/silk/float/SigProc_FLP.h

+++ b/silk/float/SigProc_FLP.h

@@ -94,7 +94,8 @@

     const silk_float    search_thres2,      /* I    Final threshold for lag candidates 0 - 1                    */

     const opus_int      Fs_kHz,             /* I    sample frequency (kHz)                                      */

     const opus_int      complexity,         /* I    Complexity setting, 0-2, where 2 is highest                 */

-    const opus_int      nb_subfr            /* I    Number of 5 ms subframes                                    */

+    const opus_int      nb_subfr,           /* I    Number of 5 ms subframes                                    */

+    int                 arch                /* I    Run-time architecture                                       */

);

 void silk_insertion_sort_decreasing_FLP(

--- a/silk/float/encode_frame_FLP.c

+++ b/silk/float/encode_frame_FLP.c

@@ -129,7 +129,7 @@

         /*****************************************/

         /* Find pitch lags, initial LPC analysis */

         /*****************************************/

-        silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame );

+        silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch );

         /************************/

         /* Noise shape analysis */

--- a/silk/float/find_pitch_lags_FLP.c

+++ b/silk/float/find_pitch_lags_FLP.c

@@ -37,7 +37,8 @@

     silk_encoder_state_FLP          *psEnc,                             /* I/O  Encoder state FLP                           */

     silk_encoder_control_FLP        *psEncCtrl,                         /* I/O  Encoder control FLP                         */

     silk_float                      res[],                              /* O    Residual                                    */

-    const silk_float                x[]                                 /* I    Speech signal                               */

+    const silk_float                x[],                                /* I    Speech signal                               */

+    int                             arch                                /* I    Run-time architecture                       */

     opus_int   buf_len;

@@ -116,7 +117,7 @@

         /*****************************************/

         if( silk_pitch_analysis_core_FLP( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex,

             &psEnc->sCmn.indices.contourIndex, &psEnc->LTPCorr, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16 / 65536.0f,

-            thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr ) == 0 )

+            thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr, arch ) == 0 )

             psEnc->sCmn.indices.signalType = TYPE_VOICED;

         } else {

--- a/silk/float/main_FLP.h

+++ b/silk/float/main_FLP.h

@@ -71,7 +71,8 @@

 /* Initializes the Silk encoder state */

 opus_int silk_init_encoder(

-    silk_encoder_state_FLP          *psEnc                              /* I/O  Encoder state FLP                           */

+    silk_encoder_state_FLP          *psEnc,                             /* I/O  Encoder state FLP                           */

+    int                              arch                               /* I    Run-time architecture                       */

);

 /* Control the Silk encoder */

@@ -129,7 +130,8 @@

     silk_encoder_state_FLP          *psEnc,                             /* I/O  Encoder state FLP                           */

     silk_encoder_control_FLP        *psEncCtrl,                         /* I/O  Encoder control FLP                         */

     silk_float                      res[],                              /* O    Residual                                    */

-    const silk_float                x[]                                 /* I    Speech signal                               */

+    const silk_float                x[],                                /* I    Speech signal                               */

+    int                             arch                                /* I    Run-time architecture                       */

);

 /* Find LPC and LTP coefficients */

--- a/silk/float/pitch_analysis_core_FLP.c

+++ b/silk/float/pitch_analysis_core_FLP.c

@@ -48,7 +48,8 @@

     opus_int            start_lag,          /* I start lag                                                      */

     opus_int            sf_length,          /* I sub frame length                                               */

     opus_int            nb_subfr,           /* I number of subframes                                            */

-    opus_int            complexity          /* I Complexity setting                                             */

+    opus_int            complexity,         /* I Complexity setting                                             */

+    int                 arch                /* I Run-time architecture                                          */

);

 static void silk_P_Ana_calc_energy_st3(

@@ -74,7 +75,8 @@

     const silk_float    search_thres2,      /* I    Final threshold for lag candidates 0 - 1                    */

     const opus_int      Fs_kHz,             /* I    sample frequency (kHz)                                      */

     const opus_int      complexity,         /* I    Complexity setting, 0-2, where 2 is highest                 */

-    const opus_int      nb_subfr            /* I    Number of 5 ms subframes                                    */

+    const opus_int      nb_subfr,           /* I    Number of 5 ms subframes                                    */

+    int                 arch                /* I    Run-time architecture                                       */

     opus_int   i, k, d, j;

@@ -176,7 +178,7 @@

         silk_assert( basis_ptr >= frame_4kHz );

         silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );

-        celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1 );

+        celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1, arch );

         /* Calculate first vector products before loop */

         cross_corr = xcorr[ max_lag_4kHz - min_lag_4kHz ];

@@ -409,7 +411,7 @@

         CCmax = -1000.0f;

         /* Calculate the correlations and energies needed in stage 3 */

-        silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity );

+        silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity, arch );

         silk_P_Ana_calc_energy_st3( energies_st3, frame, start_lag, sf_length, nb_subfr, complexity );

         lag_counter = 0;

@@ -493,7 +495,8 @@

     opus_int            start_lag,          /* I start lag                                                      */

     opus_int            sf_length,          /* I sub frame length                                               */

     opus_int            nb_subfr,           /* I number of subframes                                            */

-    opus_int            complexity          /* I Complexity setting                                             */

+    opus_int            complexity,         /* I Complexity setting                                             */

+    int                 arch                /* I Run-time architecture                                          */

     const silk_float *target_ptr;

@@ -527,7 +530,7 @@

         lag_low  = matrix_ptr( Lag_range_ptr, k, 0, 2 );

         lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );

         silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);

-        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1 );

+        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1, arch );

         for( j = lag_low; j <= lag_high; j++ ) {

             silk_assert( lag_counter < SCRATCH_SIZE );

             scratch_mem[ lag_counter ] = xcorr[ lag_high - j ];

--- a/silk/init_encoder.c

+++ b/silk/init_encoder.c

@@ -34,12 +34,14 @@

 #include "main_FLP.h"

 #endif

 #include "tuning_parameters.h"

+#include "cpu_support.h"

 /*********************************/

 /* Initialize Silk Encoder state */

 /*********************************/

 opus_int silk_init_encoder(

-    silk_encoder_state_Fxx          *psEnc                                  /* I/O  Pointer to Silk FIX encoder state                                           */

+    silk_encoder_state_Fxx          *psEnc,                                 /* I/O  Pointer to Silk FIX encoder state                                           */

+    int                              arch                                   /* I    Run-time architecture                                                       */

     opus_int ret = 0;

@@ -46,6 +48,8 @@

     /* Clear the entire encoder state */

     silk_memset( psEnc, 0, sizeof( silk_encoder_state_Fxx ) );

+    psEnc->sCmn.arch = arch;

     psEnc->sCmn.variable_HP_smth1_Q15 = silk_LSHIFT( silk_lin2log( SILK_FIX_CONST( VARIABLE_HP_MIN_CUTOFF_HZ, 16 ) ) - ( 16 << 7 ), 8 );

     psEnc->sCmn.variable_HP_smth2_Q15 = psEnc->sCmn.variable_HP_smth1_Q15;

--- a/silk/macros.h

+++ b/silk/macros.h

@@ -103,11 +103,11 @@

     (*((Matrix_base_adr) + ((row)+(M)*(column))))

 #endif

-#ifdef ARMv4_ASM

+#ifdef OPUS_ARM_INLINE_ASM

 #include "arm/macros_armv4.h"

 #endif

-#ifdef ARMv5E_ASM

+#ifdef OPUS_ARM_INLINE_EDSP

 #include "arm/macros_armv5e.h"

 #endif

--- a/silk/structs.h

+++ b/silk/structs.h

@@ -191,6 +191,8 @@

     SideInfoIndices              indices;

     opus_int8                    pulses[ MAX_FRAME_LENGTH ];

+    int                          arch;

     /* Input/output buffering */

     opus_int16                   inputBuf[ MAX_FRAME_LENGTH + 2 ];  /* Buffer containing input signal                                   */

     opus_int                     inputBufIx;

--- a/src/opus_decoder.c

+++ b/src/opus_decoder.c

@@ -75,7 +75,6 @@

 #endif

    opus_uint32  rangeFinal;

-   int arch;

 };

 #ifdef FIXED_POINT

@@ -125,7 +124,6 @@

    st->Fs = Fs;

    st->DecControl.API_sampleRate = st->Fs;

    st->DecControl.nChannelsAPI      = st->channels;

-   st->arch = opus_select_arch();

    /* Reset decoder */

    ret = silk_InitDecoder( silk_dec );

--- a/src/opus_encoder.c

+++ b/src/opus_encoder.c

@@ -104,7 +104,7 @@

     int          analysis_offset;

 #endif

     opus_uint32  rangeFinal;

-    int arch;

+    int          arch;

 };

 /* Transition tables for the voice and music. First column is the

@@ -188,7 +188,7 @@

     st->arch = opus_select_arch();

-    ret = silk_InitEncoder( silk_enc, &st->silk_mode );

+    ret = silk_InitEncoder( silk_enc, st->arch, &st->silk_mode );

     if(ret)return OPUS_INTERNAL_ERROR;

     /* default SILK parameters */

@@ -209,7 +209,7 @@

     /* Create CELT encoder */

     /* Initialize CELT encoder */

-    err = celt_encoder_init(celt_enc, Fs, channels);

+    err = celt_encoder_init(celt_enc, Fs, channels, st->arch);

     if(err!=OPUS_OK)return OPUS_INTERNAL_ERROR;

     celt_encoder_ctl(celt_enc, CELT_SET_SIGNALLING(0));

@@ -1219,7 +1219,7 @@

     if (st->mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY)

         silk_EncControlStruct dummy;

-        silk_InitEncoder( silk_enc, &dummy);

+        silk_InitEncoder( silk_enc, st->arch, &dummy);

         prefill=1;

@@ -2418,7 +2418,7 @@

                  ((char*)&st->OPUS_ENCODER_RESET_START - (char*)st));

            celt_encoder_ctl(celt_enc, OPUS_RESET_STATE);

-           silk_InitEncoder( silk_enc, &dummy );

+           silk_InitEncoder( silk_enc, st->arch, &dummy );

            st->stream_channels = st->channels;

            st->hybrid_stereo_width_Q14 = 1 << 14;

            st->prev_HB_gain = Q15ONE;