ref: 39386e0b85ec0f978aa104d312604badb9047d58
parent: 530198f955e49571b3f890b4da4d933a4cd5df4e
author: Timothy B. Terriberry <[email protected]>
date: Mon Nov 18 08:30:13 EST 2013
Adds Neon assembly for correlation/convolution Optimizing celt_pitch_xcorr()/xcorr_kernel() which also speeds up FIRs, IIRs and auto-correlations Signed-off-by: Jean-Marc Valin <[email protected]>
--- a/Makefile.am
+++ b/Makefile.am
@@ -29,7 +29,14 @@
if CPU_ARM
CELT_SOURCES += $(CELT_SOURCES_ARM)
+SILK_SOURCES += $(SILK_SOURCES_ARM)
+if OPUS_ARM_EXTERNAL_ASM
+# The stand-alone ARM assembly is converted to GNU as syntax at build time
+# (see the %-gnu.S rules in this file), so every generated file is listed in
+# BUILT_SOURCES to make sure it exists before anything is compiled.
+CELT_SOURCES += $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
+BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \
+ $(CELT_AM_SOURCES_ARM_ASM:.s.in=.s) \
+ $(CELT_AM_SOURCES_ARM_ASM:.s.in=-gnu.S)
endif
+endif
include celt_headers.mk
include silk_headers.mk
@@ -106,11 +113,12 @@
endif
EXTRA_DIST = version.mk \
- opus.pc.in \
+ opus.pc.in \
opus-uninstalled.pc.in \
opus.m4 \
Makefile.unix \
tests/run_vectors.sh \
+ celt/arm/arm2gnu.pl \
win32/VS2010/silk_float.vcxproj \
win32/VS2010/celt.vcxproj.filters \
win32/VS2010/opus.vcxproj \
@@ -206,3 +214,14 @@
.PHONY: opus check-opus install-opus docs install-docs
+
+# automake doesn't do dependency tracking for asm files, as far as I can tell,
+# so the prerequisites of every converted *-gnu.S file are listed by hand:
+# the converted options file and the conversion script itself.
+$(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): celt/arm/armopts-gnu.S
+$(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl
+
+# Convert ARM asm to GNU as format (sources found under $(top_srcdir)).
+%-gnu.S: $(top_srcdir)/%.s
+ $(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@
+# For autoconf-modified sources (e.g., armopts.s); the prerequisite is looked
+# up without the $(top_srcdir) prefix.
+%-gnu.S: %.s
+ $(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -94,11 +94,11 @@
do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \
}while(0)
-#if defined(ARMv4_ASM)
+#if defined(OPUS_ARM_INLINE_ASM)
#include "arm/kiss_fft_armv4.h"
#endif
-#if defined(ARMv5E_ASM)
+#if defined(OPUS_ARM_INLINE_EDSP)
#include "arm/kiss_fft_armv5e.h"
#endif
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -114,9 +114,9 @@
#include "fixed_generic.h"
-#ifdef ARMv5E_ASM
+#ifdef OPUS_ARM_INLINE_EDSP
#include "arm/fixed_armv5e.h"
-#elif defined (ARMv4_ASM)
+#elif defined (OPUS_ARM_INLINE_ASM)
#include "arm/fixed_armv4.h"
#elif defined (BFIN_ASM)
#include "fixed_bfin.h"
--- /dev/null
+++ b/celt/arm/arm2gnu.pl
@@ -1,0 +1,316 @@
+#!/usr/bin/perl
+
+my $bigend; # little/big endian
+my $nxstack;
+
+$nxstack = 0;
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+ if $running_under_some_shell;
+
+# Consume leading command-line switches.
+#   -n   do not print the translated lines ($printit is only set when -n is
+#        absent)
+#   --   end of switches
+# Any other switch is a fatal error. The @ARGV test keeps the loop from
+# matching against an undefined value when no arguments are given.
+while (@ARGV && $ARGV[0] =~ /^-/) {
+    $_ = shift;
+    last if /^--/;
+    if (/^-n/) {
+        $nflag++;
+        next;
+    }
+    # A real newline (not a literal backslash-n) ends the message and keeps
+    # perl from appending "at ... line N".
+    die "I don't recognize this switch: $_\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n"; # automatically add newline on print
+$n=0;
+
+$thumb = 0; # ARM mode by default, not Thumb.
+@proc_stack = ();
+
+LINE:
+while (<>) {
+
+ # For ADRLs we need to add a new line after the substituted one.
+ $addPadding = 0;
+
+ # First, we do not dare to touch *anything* inside double quotes, do we?
+ # Second, if you want a dollar character in the string,
+ # insert two of them -- that's how ARM C and assembler treat strings.
+ s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
+ s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
+ s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
+ # If there's nothing on a line but a comment, don't try to apply any further
+ # substitutions (this is a cheap hack to avoid mucking up the license header)
+ s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
+ # If substituted -- leave immediately !
+
+ s/@/,:/;
+ s/;/@/;
+ while ( /@.*'/ ) {
+ s/(@.*)'/$1/g;
+ }
+ s/\{FALSE\}/0/g;
+ s/\{TRUE\}/1/g;
+ s/\{(\w\w\w\w+)\}/$1/g;
+ s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+ s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+ s/\bIMPORT\b/.extern/;
+ s/\bEXPORT\b/.global/;
+ s/^(\s+)\[/$1IF/;
+ s/^(\s+)\|/$1ELSE/;
+ s/^(\s+)\]/$1ENDIF/;
+ s/IF *:DEF:/ .ifdef/;
+ s/IF *:LNOT: *:DEF:/ .ifndef/;
+ s/ELSE/ .else/;
+ s/ENDIF/ .endif/;
+
+ if( /\bIF\b/ ) {
+ s/\bIF\b/ .if/;
+ s/=/==/;
+ }
+ # Macro translation state, kept in $n across input lines:
+ #   0 = outside a macro
+ #   1 = the previous line was MACRO, so this line is the macro prototype
+ #   2 = inside the macro body
+ # The $n == 2 test must come before the $n == 1 test: the prototype line
+ # switches $n to 2 and must not also get the body treatment in this pass.
+ if ( $n == 2) {
+ # Body line: parameter references "$name" become "\name".
+ s/\$/\\/g;
+ }
+ if ($n == 1) {
+ # Prototype line: drop the "$" sigils and every occurrence of "label".
+ s/\$//g;
+ s/label//g;
+ $n = 2;
+ }
+ if ( /MACRO/ ) {
+ # The trailing newline is removed together with MACRO, so the prototype
+ # on the next input line is printed on the same output line as ".macro".
+ s/MACRO *\n/.macro/;
+ $n=1;
+ }
+ if ( /\bMEND\b/ ) {
+ s/\bMEND\b/.endm/;
+ $n=0;
+ }
+
+ # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+ #
+ if ( /\bAREA\b/ ) {
+ my $align;
+ $align = "2";
+ if ( /ALIGN=(\d+)/ ) {
+ $align = $1;
+ }
+ if ( /CODE/ ) {
+ $nxstack = 1;
+ }
+ s/^(.+)CODE(.+)READONLY(.*)/ .text/;
+ s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata/;
+ s/^(.+)\|\|\.data\|\|(.+)/ .data/;
+ s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
+ s/$/; .p2align $align/;
+ # Enable NEON instructions but don't produce a binary that requires
+ # ARMv7. RVCT does not have equivalent directives, so we just do this
+ # for all CODE areas.
+ if ( /.text/ ) {
+ # Separating .arch, .fpu, etc., by semicolons does not work (gas
+ # thinks the semicolon is part of the arch name, even when there's
+ # whitespace separating them). Sadly this means our line numbers
+ # won't match the original source file (we could use the .line
+ # directive, which is documented to be obsolete, but then gdb will
+ # show the wrong line in the translated source file).
+ s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/;
+ }
+ }
+
+ s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3||
+ s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2||
+ s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2||
+ s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+ s/^(\s+)\%(\s)/ .space $1/;
+
+ s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123
+ s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+ s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+ if (/\bPROC\b/)
+ {
+ my $prefix;
+ my $proc;
+ /^([A-Za-z_\.]\w+)\b/;
+ $proc = $1;
+ $prefix = "";
+ if ($proc)
+ {
+ $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);
+ push(@proc_stack, $proc);
+ s/^[A-Za-z_\.]\w+/$&:/;
+ }
+ $prefix = $prefix."\t.thumb_func; " if ($thumb);
+ s/\bPROC\b/@ $&/;
+ $_ = $prefix.$_;
+ }
+ s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+ s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+ if (/\bENDP\b/)
+ {
+ my $proc;
+ s/\bENDP\b/@ $&/;
+ $proc = pop(@proc_stack);
+ $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+ }
+ s/\bSUBT\b/@ $&/;
+ s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
+ s/\bKEEP\b/@ $&/;
+ s/\bEXPORTAS\b/@ $&/;
+ s/\|\|(.)+\bEQU\b/@ $&/;
+ s/\|\|([\w\$]+)\|\|/$1/;
+ s/\bENTRY\b/@ $&/;
+ s/\bASSERT\b/@ $&/;
+ s/\bGBLL\b/@ $&/;
+ s/\bGBLA\b/@ $&/;
+ s/^\W+OPT\b/@ $&/;
+ s/:OR:/|/g;
+ s/:SHL:/<</g;
+ s/:SHR:/>>/g;
+ s/:AND:/&/g;
+ s/:LAND:/&&/g;
+ s/CPSR/cpsr/;
+ s/SPSR/spsr/;
+ s/ALIGN$/.balign 4/;
+ s/ALIGN\s+([0-9x]+)$/.balign $1/;
+ s/psr_cxsf/psr_all/;
+ s/LTORG/.ltorg/;
+ s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+ # {PC} + 0xdeadfeed --> . + 0xdeadfeed
+ s/\{PC\} \+/ \. +/;
+
+ # Single hex constant on the line !
+ #
+ # >>> NOTE <<<
+ # Double-precision floats in gcc are always mixed-endian, which means
+ # bytes in two words are little-endian, but words are big-endian.
+ # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+ # and 0xfeed0000 at high address.
+ #
+ s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+ # Only decimal constants on the line, no hex !
+ s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+ # Single hex constant on the line !
+# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+ # Only decimal constants on the line, no hex !
+# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+ s/\bDCFS[ \t]+0x/.word 0x/;
+ s/\bDCFS\b/.float/;
+
+ s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+ s/\bDCD\b/.word/;
+ s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+ s/\bDCW\b/.short/;
+ s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+ s/\bDCB\b/.byte/;
+ s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+ s/^[A-Za-z_\.]\w+/$&:/;
+ s/^(\d+)/$1:/;
+ s/\%(\d+)/$1b_or_f/;
+ s/\%[Bb](\d+)/$1b/;
+ s/\%[Ff](\d+)/$1f/;
+ s/\%[Ff][Tt](\d+)/$1f/;
+ s/&([\dA-Fa-f]+)/0x$1/;
+ # Binary literals: "2_<bits>" becomes "0x<HEX>". The bit string is
+ # left-padded with zeros to a whole number of nibbles and every nibble is
+ # turned into one upper-case hex digit, so leading zero nibbles are kept
+ # (2_00010000 -> 0x10, 2_101 -> 0x5, 2_00101 -> 0x05).
+ s{\b2_([01]+)\b}{
+     my $bits = $1;
+     $bits = ('0' x ((4 - length($bits) % 4) % 4)) . $bits;
+     '0x' . join('', map(sprintf('%X', oct("0b$_")), $bits =~ /(....)/g));
+ }ge;
+
+ if ( /commandline/)
+ {
+ if( /-bigend/)
+ {
+ $bigend=1;
+ }
+ }
+
+ if ( /\bDCDU\b/ )
+ {
+     # DCDU with a hexadecimal constant is rewritten as four .byte
+     # directives placed in front of the line, and the DCDU itself is
+     # commented out. Bytes are emitted most-significant first when
+     # $bigend is set (a line containing "commandline" and "-bigend" was
+     # seen earlier), least-significant first otherwise.
+     my $cmd = $_;
+
+     # Accept all hex digits (not only 0-9) and at most 32 bits.
+     if ($cmd =~ /\bDCDU\b\s+0x([0-9A-Fa-f]{1,8})\b/)
+     {
+         # Zero-pad to 8 digits so short constants still split into 4 bytes.
+         my $hex = ('0' x (8 - length($1))) . $1;
+         my @bytes = $hex =~ /(..)/g;
+         @bytes = reverse @bytes unless $bigend;
+
+         s/\s+DCDU\b/@ $&/;
+         $_ = join(";", map("\t.byte\t0x$_", @bytes)) . "; " . $_;
+     }
+     else
+     {
+         # Leave the line untouched so the assembler rejects it, instead of
+         # silently emitting bytes built from stale or empty captures.
+         warn "arm2gnu.pl: cannot translate DCDU operand (only a 32-bit hex constant is supported): $cmd";
+     }
+ }
+
+ if ( /\badrl\b/i )
+ {
+ s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+ $addPadding = 1;
+ }
+ s/\bEND\b/@ END/;
+} continue {
+ printf ("%s", $_) if $printit;
+ if ($addPadding != 0)
+ {
+ printf (" mov r0,r0\n");
+ $addPadding = 0;
+ }
+}
+#If we had a code section, mark that this object doesn't need an executable
+# stack.
+if ($nxstack) {
+ printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n");
+}
--- /dev/null
+++ b/celt/arm/arm_celt_map.c
@@ -1,0 +1,49 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pitch.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+/* Table of celt_pitch_xcorr() implementations, one entry per ARM capability
+ * level in the order ARMv4, EDSP, Media, NEON (OPUS_ARCHMASK+1 entries).
+ * Each MAY_HAVE_*() entry resolves to the specialised function when that
+ * level is enabled in the build and otherwise falls back to the next lower
+ * level, ending at celt_pitch_xcorr_c.
+ * NOTE(review): presumably indexed by the value returned by
+ * opus_select_arch() masked with OPUS_ARCHMASK -- confirm against the
+ * caller. */
+opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+ const opus_val16 *, opus_val32 *, int , int) = {
+ celt_pitch_xcorr_c, /* ARMv4 */
+ MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
+ MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
+ MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
+};
+# else
+# error "Floating-point implementation is not supported by ARM asm yet." \
+ "Reconfigure with --disable-rtcd or send patches."
+# endif
+
+#endif
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -55,7 +55,7 @@
/* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
* instructions via their assembled hex code.
* All of these instructions should be essentially nops. */
-# if defined(ARMv5E_ASM)
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
__try{
/*PLD [r13]*/
__emit(0xF5DDF000);
@@ -64,7 +64,7 @@
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
-# if defined(ARMv6E_ASM)
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
__try{
/*SHADD8 r3,r3,r3*/
__emit(0xE6333F93);
@@ -73,7 +73,7 @@
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
-# if defined(ARM_HAVE_NEON)
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
__try{
/*VORR q0,q0,q0*/
__emit(0xF2200150);
@@ -107,19 +107,26 @@
while(fgets(buf, 512, cpuinfo) != NULL)
{
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
/* Search for edsp and neon flag */
if(memcmp(buf, "Features", 8) == 0)
{
char *p;
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
p = strstr(buf, " edsp");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_EDSP;
+# endif
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
p = strstr(buf, " neon");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_NEON;
+# endif
}
+# endif
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
/* Search for media capabilities (>= ARMv6) */
if(memcmp(buf, "CPU architecture:", 17) == 0)
{
@@ -129,6 +136,7 @@
if(version >= 6)
flags |= OPUS_CPU_ARM_MEDIA;
}
+# endif
}
fclose(cpuinfo);
--- a/celt/arm/armcpu.h
+++ b/celt/arm/armcpu.h
@@ -25,11 +25,47 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-/* Original code from libtheora modified to suit to Opus */
+#if !defined(ARMCPU_H)
+# define ARMCPU_H
-#ifndef ARMCPU_H
-#define ARMCPU_H
+/* MAY_HAVE_<level>(name) expands to the function name with the suffix of the
+ * best implementation enabled at or below <level>: name_<level> when
+ * OPUS_ARM_MAY_HAVE_<LEVEL> is defined, otherwise whatever the next lower
+ * level selects, ending at the generic name_c. */
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
+# define MAY_HAVE_EDSP(name) name ## _edsp
+# else
+# define MAY_HAVE_EDSP(name) name ## _c
+# endif
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+# define MAY_HAVE_MEDIA(name) name ## _media
+# else
+# define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+# define MAY_HAVE_NEON(name) name ## _neon
+# else
+# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
+# endif
+
+/* PRESUME_<level>(name) follows the same fallback chain, keyed on the
+ * OPUS_ARM_PRESUME_<LEVEL> macros instead.
+ * NOTE(review): presumably PRESUME means the instruction set is assumed
+ * present at compile time (no run-time detection needed) -- confirm against
+ * the configure script. */
+# if defined(OPUS_ARM_PRESUME_EDSP)
+# define PRESUME_EDSP(name) name ## _edsp
+# else
+# define PRESUME_EDSP(name) name ## _c
+# endif
+
+# if defined(OPUS_ARM_PRESUME_MEDIA)
+# define PRESUME_MEDIA(name) name ## _media
+# else
+# define PRESUME_MEDIA(name) PRESUME_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_NEON)
+# define PRESUME_NEON(name) name ## _neon
+# else
+# define PRESUME_NEON(name) PRESUME_MEDIA(name)
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);
+# endif
#endif
--- /dev/null
+++ b/celt/arm/armopts.s.in
@@ -1,0 +1,37 @@
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+; Set the following to 1 if we have EDSP instructions
+; (LDRD/STRD, etc., ARMv5E and later).
+OPUS_ARM_MAY_HAVE_EDSP * @OPUS_ARM_MAY_HAVE_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OPUS_ARM_MAY_HAVE_NEON * @OPUS_ARM_MAY_HAVE_NEON@
+
+END
--- /dev/null
+++ b/celt/arm/celt_pitch_xcorr_arm.s
@@ -1,0 +1,598 @@
+; Copyright (c) 2007-2008 CSIRO
+; Copyright (c) 2007-2009 Xiph.Org Foundation
+; Copyright (c) 2013 Parrot
+; Written by Aurélien Zanelli
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ AREA |.text|, CODE, READONLY
+
+ GET celt/arm/armopts.s
+
+IF OPUS_ARM_MAY_HAVE_EDSP
+ EXPORT celt_pitch_xcorr_edsp
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_NEON
+ EXPORT celt_pitch_xcorr_neon
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_NEON
+
+;; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
+;xcorr_kernel_neon PROC
+; ; input:
+; ; r3 = int len
+; ; r4 = opus_val16 *x
+; ; r5 = opus_val16 *y
+; ; q0 = opus_val32 sum[4]
+; ; output:
+; ; q0 = opus_val32 sum[4]
+; ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+; ; internal usage:
+; ; r12 = int j
+; ; d3 = y_3|y_2|y_1|y_0
+; ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+; ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+; ; q8 = scratch
+; ;
+; ; Load y[0...3]
+; ; This requires len>0 to always be valid (which we assert in the C code).
+; VLD1.16 {d5}, [r5]!
+; SUBS r12, r3, #8
+; BLE xcorr_kernel_neon_process4
+;; Process 8 samples at a time.
+;; This loop loads one y value more than we actually need. Therefore we have to
+;; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+;; reading past the end of the array.
+;xcorr_kernel_neon_process8
+; ; This loop has 19 total instructions (10 cycles to issue, minimum), with
+; ; - 2 cycles of ARM instructions,
+; ; - 10 cycles of load/store/byte permute instructions, and
+; ; - 9 cycles of data processing instructions.
+; ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+; ; latter two categories, meaning the whole loop should run in 10 cycles per
+; ; iteration, barring cache misses.
+; ;
+; ; Load x[0...7]
+; VLD1.16 {d6, d7}, [r4]!
+; ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
+; ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+; VAND d3, d5, d5
+; SUBS r12, r12, #8
+; ; Load y[4...11]
+; VLD1.16 {d4, d5}, [r5]!
+; VMLAL.S16 q0, d3, d6[0]
+; VEXT.16 d16, d3, d4, #1
+; VMLAL.S16 q0, d4, d7[0]
+; VEXT.16 d17, d4, d5, #1
+; VMLAL.S16 q0, d16, d6[1]
+; VEXT.16 d16, d3, d4, #2
+; VMLAL.S16 q0, d17, d7[1]
+; VEXT.16 d17, d4, d5, #2
+; VMLAL.S16 q0, d16, d6[2]
+; VEXT.16 d16, d3, d4, #3
+; VMLAL.S16 q0, d17, d7[2]
+; VEXT.16 d17, d4, d5, #3
+; VMLAL.S16 q0, d16, d6[3]
+; VMLAL.S16 q0, d17, d7[3]
+; BGT xcorr_kernel_neon_process8
+;; Process 4 samples here if we have > 4 left (still reading one extra y value).
+;xcorr_kernel_neon_process4
+; ADDS r12, r12, #4
+; BLE xcorr_kernel_neon_process2
+; ; Load x[0...3]
+; VLD1.16 d6, [r4]!
+; ; Use VAND since it's a data processing instruction again.
+; VAND d4, d5, d5
+; SUB r12, r12, #4
+; ; Load y[4...7]
+; VLD1.16 d5, [r5]!
+; VMLAL.S16 q0, d4, d6[0]
+; VEXT.16 d16, d4, d5, #1
+; VMLAL.S16 q0, d16, d6[1]
+; VEXT.16 d16, d4, d5, #2
+; VMLAL.S16 q0, d16, d6[2]
+; VEXT.16 d16, d4, d5, #3
+; VMLAL.S16 q0, d16, d6[3]
+;; Process 2 samples here if we have > 2 left (still reading one extra y value).
+;xcorr_kernel_neon_process2
+; ADDS r12, r12, #2
+; BLE xcorr_kernel_neon_process1
+; ; Load x[0...1]
+; VLD2.16 {d6[],d7[]}, [r4]!
+; ; Use VAND since it's a data processing instruction again.
+; VAND d4, d5, d5
+; SUB r12, r12, #2
+; ; Load y[4...5]
+; VLD1.32 {d5[]}, [r5]!
+; VMLAL.S16 q0, d4, d6
+; VEXT.16 d16, d4, d5, #1
+; ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+; ; instead of VEXT, since it's a data-processing instruction.
+; VSRI.64 d5, d4, #32
+; VMLAL.S16 q0, d16, d7
+;; Process 1 sample using the extra y value we loaded above.
+;xcorr_kernel_neon_process1
+; ; Load next *x
+; VLD1.16 {d6[]}, [r4]!
+; ADDS r12, r12, #1
+; ; y[0...3] are left in d5 from prior iteration(s) (if any)
+; VMLAL.S16 q0, d5, d6
+; MOVLE pc, lr
+;; Now process 1 last sample, not reading ahead.
+; ; Load last *y
+; VLD1.16 {d4[]}, [r5]!
+; VSRI.64 d4, d5, #16
+; ; Load last *x
+; VLD1.16 {d6[]}, [r4]!
+; VMLAL.S16 q0, d4, d6
+; MOV pc, lr
+; ENDP
+
+;; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+;; opus_val32 *xcorr, int len, int max_pitch)
+;celt_pitch_xcorr_neon PROC
+; ; input:
+; ; r0 = opus_val16 *_x
+; ; r1 = opus_val16 *_y
+; ; r2 = opus_val32 *xcorr
+; ; r3 = int len
+; ; output:
+; ; r0 = int maxcorr
+; ; internal usage:
+; ; r4 = opus_val16 *x (for xcorr_kernel_neon())
+; ; r5 = opus_val16 *y (for xcorr_kernel_neon())
+; ; r6 = int max_pitch
+; ; r12 = int j
+; ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+; STMFD sp!, {r4-r6, lr}
+; LDR r6, [sp, #16]
+; VMOV.S32 q15, #1
+; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+; SUBS r6, r6, #4
+; BLT celt_pitch_xcorr_neon_process4_done
+;celt_pitch_xcorr_neon_process4
+; ; xcorr_kernel_neon parameters:
+; ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+; MOV r4, r0
+; MOV r5, r1
+; VEOR q0, q0, q0
+; ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
+; ; So we don't save/restore any other registers.
+; BL xcorr_kernel_neon
+; SUBS r6, r6, #4
+; VST1.32 {q0}, [r2]!
+; ; _y += 4
+; ADD r1, r1, #8
+; VMAX.S32 q15, q15, q0
+; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+; BGE celt_pitch_xcorr_neon_process4
+;; We have less than 4 sums left to compute.
+;celt_pitch_xcorr_neon_process4_done
+; ADDS r6, r6, #4
+; ; Reduce maxcorr to a single value
+; VMAX.S32 d30, d30, d31
+; VPMAX.S32 d30, d30, d30
+; ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+; BLE celt_pitch_xcorr_neon_done
+;; Now compute each remaining sum one at a time.
+;celt_pitch_xcorr_neon_process_remaining
+; MOV r4, r0
+; MOV r5, r1
+; VMOV.I32 q0, #0
+; SUBS r12, r3, #8
+; BLT celt_pitch_xcorr_neon_process_remaining4
+;; Sum terms 8 at a time.
+;celt_pitch_xcorr_neon_process_remaining_loop8
+; ; Load x[0...7]
+; VLD1.16 {q1}, [r4]!
+; ; Load y[0...7]
+; VLD1.16 {q2}, [r5]!
+; SUBS r12, r12, #8
+; VMLAL.S16 q0, d4, d2
+; VMLAL.S16 q0, d5, d3
+; BGE celt_pitch_xcorr_neon_process_remaining_loop8
+;; Sum terms 4 at a time.
+;celt_pitch_xcorr_neon_process_remaining4
+; ADDS r12, r12, #4
+; BLT celt_pitch_xcorr_neon_process_remaining4_done
+; ; Load x[0...3]
+; VLD1.16 {d2}, [r4]!
+; ; Load y[0...3]
+; VLD1.16 {d3}, [r5]!
+; SUB r12, r12, #4
+; VMLAL.S16 q0, d3, d2
+; ; Reduce the sum to a single value.
+; VADD.S32 d0, d0, d1
+; VPADDL.S32 d0, d0
+;celt_pitch_xcorr_neon_process_remaining4_done
+; ADDS r12, r12, #4
+; BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+;; Sum terms 1 at a time.
+;celt_pitch_xcorr_neon_process_remaining_loop1
+; VLD1.16 {d2[]}, [r4]!
+; VLD1.16 {d3[]}, [r5]!
+; SUBS r12, r12, #1
+; VMLAL.S16 q0, d2, d3
+; BGT celt_pitch_xcorr_neon_process_remaining_loop1
+;celt_pitch_xcorr_neon_process_remaining_loop_done
+; VST1.32 {d0[0]}, [r2]!
+; VMAX.S32 d30, d30, d0
+; SUBS r6, r6, #1
+; ; _y++
+; ADD r1, r1, #2
+; ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+; BGT celt_pitch_xcorr_neon_process_remaining
+;celt_pitch_xcorr_neon_done
+; VMOV.32 r0, d30[0]
+; LDMFD sp!, {r4-r6, pc}
+; ENDP
+
+xcorr_kernel_neon PROC
+ ; Accumulates four correlation sums at once:
+ ; sum[k] += x[j]*y[j+k] for j=0...len-1, k=0...3
+ ; len must be > 0: (len-1) is shifted and masked as an unsigned value.
+ ; Reads len values of x and len+3 values of y.
+ ; input:
+ ; r0 = opus_val16 *x
+ ; r1 = opus_val16 *y
+ ; r2 = int len
+ ; q0 = opus_val32 sum (sum[3] | sum[2] | sum[1] | sum[0])
+
+ ; output:
+ ; q0 = sum
+ ; r0, r1 are advanced past the data that was read; r2 is not modified
+
+ ; internal usage:
+ ; r3 = j
+ ; d2 = x_3|x_2|x_1|x_0 d3 = y_3|y_2|y_1|y_0
+ ; d4 = y_7|y_6|y_5|y_4 d5 = y_4|y_3|y_2|y_1
+ ; d6 = y_5|y_4|y_3|y_2 d7 = y_6|y_5|y_4|y_3
+ ; We will build d5, d6 and d7 vector from d3 and d4
+
+
+ VLD1.16 {d3}, [r1]! ; Load y[3] downto y[0] to d3 lane (yy0)
+ SUB r3, r2, #1
+ MOVS r3, r3, lsr #2 ; j=(len-1)>>2
+ BEQ xcorr_kernel_neon_process4_done
+
+ ; Process 4 x samples at a time
+ ; For this, we will need 4 y vectors
+ ; The flags set by SUBS at the top of the loop are still valid at the BNE
+ ; at the bottom: none of the NEON instructions in between change them.
+xcorr_kernel_neon_process4
+ SUBS r3, r3, #1 ; j--
+ VLD1.16 d4, [r1]! ; Load y[7] downto y[4] to d4 lane
+ VLD1.16 d2, [r0]! ; Load x[3] downto x[0] to d2 lane
+ VEXT.16 d5, d3, d4, #1 ; Build y[4] downto y[1] vector (yy1)
+ VEXT.16 d6, d3, d4, #2 ; Build y[5] downto y[2] vector (yy2)
+ VEXT.16 d7, d3, d4, #3 ; Build y[6] downto y[3] vector (yy3)
+
+ VMLAL.S16 q0, d3, d2[0] ; MAC16_16(sum, x[0], yy0)
+ VMLAL.S16 q0, d5, d2[1] ; MAC16_16(sum, x[1], yy1)
+ VMLAL.S16 q0, d6, d2[2] ; MAC16_16(sum, x[2], yy2)
+ VMLAL.S16 q0, d7, d2[3] ; MAC16_16(sum, x[3], yy3)
+
+ VMOV.S16 d3, d4 ; Next y vector should be in d3 (yy0)
+
+ BNE xcorr_kernel_neon_process4
+
+xcorr_kernel_neon_process4_done
+ ; Process the remaining 1 to 4 x samples, one at a time.
+ ; The first one needs no new y value: yy0 is already in d3.
+ VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane
+
+ SUB r3, r2, #1
+ ANDS r3, r3, #3 ; j=(len-1)&3
+ VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0)
+ BEQ xcorr_kernel_neon_done
+
+xcorr_kernel_neon_process_remaining
+ SUBS r3, r3, #1 ; j--
+ VLD1.16 {d4[]}, [r1]! ; Load y value and duplicate to d4 lane
+ VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane
+ VEXT.16 d3, d3, d4, #1 ; Shift the y vector by one sample, new y on top
+ VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0)
+ BNE xcorr_kernel_neon_process_remaining
+
+xcorr_kernel_neon_done
+ MOV pc, lr
+ ENDP
+
+celt_pitch_xcorr_neon PROC
+ ; Computes xcorr[i] = sum(_x[j]*_y[i+j], j=0...len-1) for i=0...max_pitch-1
+ ; and returns the largest of these sums, but never less than 1.
+ ; input:
+ ; r0 = opus_val16 *_x
+ ; r1 = opus_val16 *_y
+ ; r2 = opus_val32 *xcorr
+ ; r3 = int len
+ ; [sp] = int max_pitch (fifth argument, read from the stack)
+
+ ; output:
+ ; r0 = maxcorr
+
+ ; internal usage:
+ ; r4 = max_pitch r5 = _x r6 = _y r7 = xcorr (write pointer)
+ ; r8 = i r9 = max_pitch-3 d16 = running maxcorr
+
+ STMFD sp!, {r4-r9, lr}
+
+ ; 7 registers (28 bytes) were just pushed, so the first stack argument
+ ; is now at sp+28.
+ LDR r4, [sp, #28] ; r4 = int max_pitch
+ MOV r5, r0 ; r5 = _x
+ MOV r6, r1 ; r6 = _y
+ MOV r7, r2 ; r7 = xcorr
+ MOV r2, r3 ; r2 = len
+
+ VMOV.S32 d16, #1 ; d16 = {1, 1} (not used by xcorr_kernel_neon)
+ MOV r8, #0 ; r8 = i = 0
+ CMP r4, #3 ; max_pitch-3 <= 0 ---> pitch_xcorr_neon_process4_done
+ BLE celt_pitch_xcorr_neon_process4_done
+
+ SUB r9, r4, #3 ; r9 = max_pitch-3
+
+ ; Compute four xcorr values per iteration with xcorr_kernel_neon.
+celt_pitch_xcorr_neon_process4
+ MOV r0, r5 ; r0 = _x
+ ADD r1, r6 ,r8, LSL #1 ; r1 = _y + i
+ VMOV.I32 q0, #0 ; q0 = opus_val32 sum[4] = {0, 0, 0, 0}
+
+ ; xcorr_kernel_neon does not modify r2 (len),
+ ; so there is no need to save and restore it.
+ BL xcorr_kernel_neon ; xcorr_kernel_neon(_x, _y+i, sum, len)
+
+ VST1.32 {q0}, [r7]! ; Store sum to xcorr
+ VPMAX.S32 d0, d0, d1 ; d0 = max(sum[3], sum[2]) | max(sum[1], sum[0])
+ ADD r8, r8, #4 ; i+=4
+ VPMAX.S32 d0, d0, d0 ; d0 = max(sum[3], sum[2], sum[1], sum[0])
+ CMP r8, r9 ; i < max_pitch-3 ----> pitch_xcorr_neon_process4
+ VMAX.S32 d16, d16, d0 ; d16 = maxcorr = max(maxcorr, sum)
+
+ BLT celt_pitch_xcorr_neon_process4
+
+celt_pitch_xcorr_neon_process4_done
+ CMP r8, r4; i >= max_pitch ---> nothing left to compute
+ BGE celt_pitch_xcorr_neon_done
+
+ ; Compute the remaining xcorr values one at a time.
+celt_pitch_xcorr_neon_process_remaining
+ MOV r0, r5 ; r0 = _x
+ ADD r1, r6, r8, LSL #1 ; r1 = _y + i
+ VMOV.I32 q0, #0
+ MOVS r3, r2, LSR #2 ; r3 = j = len>>2 (number of 4-sample groups)
+ BEQ inner_loop_neon_process4_done
+
+inner_loop_neon_process4
+ VLD1.16 {d2}, [r0]! ; Load x
+ VLD1.16 {d3}, [r1]! ; Load y
+ SUBS r3, r3, #1
+ VMLAL.S16 q0, d2, d3
+ BNE inner_loop_neon_process4
+
+ VPADD.S32 d0, d0, d1 ; Reduce sum
+ VPADD.S32 d0, d0, d0 ; d0 = total | total
+
+inner_loop_neon_process4_done
+ ANDS r3, r2, #3 ; r3 = j = len&3 (leftover samples)
+ BEQ inner_loop_neon_done
+
+ ; x and y are duplicated to every lane, so each lane of q0 gets the same
+ ; product added and lane 0 of d0 keeps holding the complete sum.
+inner_loop_neon_process_remaining
+ VLD1.16 {d2[]}, [r0]!
+ VLD1.16 {d3[]}, [r1]!
+ SUBS r3, r3, #1
+ VMLAL.S16 q0, d2, d3
+ BNE inner_loop_neon_process_remaining
+
+inner_loop_neon_done
+ VST1.32 {d0[0]}, [r7]! ; xcorr[i] = sum
+ VMAX.S32 d16, d16, d0 ; maxcorr = max(maxcorr, sum)
+
+ ADD r8, r8, #1 ; i++
+ CMP r8, r4
+ BCC celt_pitch_xcorr_neon_process_remaining ; loop while i < max_pitch (unsigned compare)
+
+celt_pitch_xcorr_neon_done
+ VMOV d0, d16
+ VMOV.32 r0, d0[0] ; return lane 0 of maxcorr
+ LDMFD sp!, {r4-r9, pc}
+ ENDP
+
+
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_EDSP
+
+; This will get used on ARMv7 devices without NEON, so it has been optimized
+; to take advantage of dual-issuing where possible.
+xcorr_kernel_edsp PROC
+ ; Accumulates four correlation sums at once:
+ ; sum[k] += _x[j]*_y[j+k] for j=0...len-1, k=0...3
+ ; input:
+ ; r3 = int len
+ ; r4 = opus_val16 *_x
+ ; r5 = opus_val16 *_y
+ ; r6...r9 = opus_val32 sum[4]
+ ; output:
+ ; r6...r9 = opus_val32 sum[4]
+ ; preserved: r0-r5
+ ; internal usage
+ ; r2 = int j
+ ; r12,r14 = opus_val16 x[4]
+ ; r10,r11 = opus_val16 y[4]
+ ; r2, r4 and r5 are changed below, so they are saved here and restored on
+ ; exit; r10, r11 and r12 are overwritten and not restored.
+ ; NOTE(review): LDRD/LDR below load pairs of 16-bit samples with word and
+ ; doubleword accesses; presumably _x and _y must be suitably aligned on
+ ; cores without unaligned access support -- confirm against the callers.
+ STMFD sp!, {r2,r4,r5,lr}
+ SUBS r2, r3, #4 ; j = len-4
+ LDRD r10, r11, [r5], #8 ; Load y[0...3]
+ BLE xcorr_kernel_edsp_process4_done
+ LDR r12, [r4], #4 ; Load x[0...1]
+ ; Stall
+xcorr_kernel_edsp_process4
+ ; The multiplies must issue from pipeline 0, and can't dual-issue with each
+ ; other. Every other instruction here dual-issues with a multiply, and is
+ ; thus "free". There should be no stalls in the body of the loop.
+ SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
+ LDR r14, [r4], #4 ; Load x[2...3]
+ SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
+ SUBS r2, r2, #4 ; j-=4
+ SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
+ SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
+ SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
+ LDR r10, [r5], #4 ; Load y[4...5]
+ SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
+ SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
+ SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
+ LDRGT r12, [r4], #4 ; Load x[0...1]
+ SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
+ SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
+ SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
+ SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
+ SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
+ LDR r11, [r5], #4 ; Load y[6...7]
+ SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
+ SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
+ SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
+ BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done
+ ; Up to 4 samples are left; they are handled one at a time below, with
+ ; r10/r11 holding the current y_0...y_3.
+ ADDS r2, r2, #4
+ BLE xcorr_kernel_edsp_done
+ LDRH r12, [r4], #2 ; r12 = *x++
+ SUBS r2, r2, #1 ; j--
+ ; Stall
+ SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
+ LDRGTH r14, [r4], #2 ; r14 = *x++
+ SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
+ SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
+ SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
+ BLE xcorr_kernel_edsp_done
+ SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
+ SUBS r2, r2, #1 ; j--
+ SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
+ LDRH r10, [r5], #2 ; r10 = y_4 = *y++
+ SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
+ LDRGTH r12, [r4], #2 ; r12 = *x++
+ SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
+ BLE xcorr_kernel_edsp_done
+ SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
+ CMP r2, #1 ; flags = (j > 1); j itself is not needed again and r2 is reused for y_5 below
+ SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
+ LDRH r2, [r5], #2 ; r2 = y_5 = *y++
+ SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
+ LDRGTH r14, [r4] ; r14 = *x
+ SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
+ BLE xcorr_kernel_edsp_done
+ SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
+ LDRH r11, [r5] ; r11 = y_6 = *y
+ SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
+ SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
+ SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done
+ LDMFD sp!, {r2,r4,r5,pc}
+ ENDP
+
+celt_pitch_xcorr_edsp PROC
+ ; Computes xcorr[i] = sum(_x[j]*_y[i+j], j=0...len-1) for i=0...max_pitch-1
+ ; and returns the largest of these sums, but never less than 1.
+ ; input:
+ ; r0 = opus_val16 *_x
+ ; r1 = opus_val16 *_y
+ ; r2 = opus_val32 *xcorr
+ ; r3 = int len
+ ; [sp] = int max_pitch (fifth argument, read from the stack)
+ ; output:
+ ; r0 = maxcorr
+ ; internal usage
+ ; r4 = opus_val16 *x
+ ; r5 = opus_val16 *y
+ ; r6 = opus_val32 sum0
+ ; r7 = opus_val32 sum1
+ ; r8 = opus_val32 sum2
+ ; r9 = opus_val32 sum3
+ ; r1 = int max_pitch
+ ; r12 = int j
+ ; NOTE(review): LDRD/STRD below access _x, _y and xcorr as doublewords;
+ ; presumably these must be suitably aligned on cores without unaligned
+ ; access support -- confirm against the callers.
+ STMFD sp!, {r4-r11, lr}
+ MOV r5, r1
+ ; 9 registers (36 bytes) were just pushed, so the first stack argument
+ ; is now at sp+36.
+ LDR r1, [sp, #36]
+ MOV r4, r0
+ ; maxcorr = 1
+ MOV r0, #1
+ ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done
+ SUBS r1, r1, #4
+ BLT celt_pitch_xcorr_edsp_process4_done
+celt_pitch_xcorr_edsp_process4
+ ; xcorr_kernel_edsp parameters:
+ ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+ MOV r6, #0
+ MOV r7, #0
+ MOV r8, #0
+ MOV r9, #0
+ BL xcorr_kernel_edsp ; r6...r9 = four sums for the current _y position
+ ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+ CMP r0, r6
+ ; _y+=4
+ ADD r5, r5, #8
+ MOVLT r0, r6
+ CMP r0, r7
+ STRD r6, r7, [r2], #8 ; xcorr[i], xcorr[i+1] = sum0, sum1
+ MOVLT r0, r7
+ CMP r0, r8
+ STRD r8, r9, [r2], #8 ; xcorr[i+2], xcorr[i+3] = sum2, sum3
+ MOVLT r0, r8
+ CMP r0, r9
+ MOVLT r0, r9
+ SUBS r1, r1, #4
+ BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process4_done
+ ADDS r1, r1, #4
+ BLE celt_pitch_xcorr_edsp_done
+; Now compute each remaining sum one at a time.
+celt_pitch_xcorr_edsp_process_remaining
+ SUBS r12, r3, #4 ; j = len-4
+ ; r14 = sum = 0
+ MOV r14, #0
+ BLT celt_pitch_xcorr_edsp_process_remaining_loop_done
+ LDRD r6, r7, [r4], #8
+ LDRD r8, r9, [r5], #8
+ ; Stall
+celt_pitch_xcorr_edsp_process_remaining_loop4
+ SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
+ SUBS r12, r12, #4 ; j-=4
+ SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
+ LDRGE r6, [r4], #4
+ SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
+ LDRGE r8, [r5], #4
+ SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
+ LDRGE r7, [r4], #4
+ LDRGE r9, [r5], #4
+ BGE celt_pitch_xcorr_edsp_process_remaining_loop4
+celt_pitch_xcorr_edsp_process_remaining_loop_done
+ ; Here r12 = (number of samples left) - 4, i.e. -4...-1.
+ ; Handle a remaining pair, then a remaining single sample.
+ ADDS r12, r12, #2
+ LDRGE r6, [r4], #4
+ LDRGE r8, [r5], #4
+ ; Stall
+ SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
+ SUBGE r12, r12, #2
+ SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
+ ADDS r12, r12, #1
+ LDRGEH r6, [r4], #2
+ LDRGEH r8, [r5], #2
+ ; Restore _x
+ SUB r4, r4, r3, LSL #1
+ ; Stall
+ SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
+ ; Restore and advance _y
+ SUB r5, r5, r3, LSL #1
+ ; maxcorr = max(maxcorr, sum)
+ ; Stall
+ CMP r0, r14
+ ADD r5, r5, #2
+ MOVLT r0, r14
+ SUBS r1, r1, #1
+ ; xcorr[i] = sum
+ STR r14, [r2], #4
+ BGT celt_pitch_xcorr_edsp_process_remaining
+celt_pitch_xcorr_edsp_done
+ LDMFD sp!, {r4-r11, pc}
+ ENDP
+
+ENDIF
+
+END
--- /dev/null
+++ b/celt/arm/pitch_arm.h
@@ -1,0 +1,57 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(PITCH_ARM_H)
+# define PITCH_ARM_H
+/* ARM overrides for celt_pitch_xcorr(); everything below is fixed-point only. */
+# include "armcpu.h"
+
+# if defined(FIXED_POINT)
+/* NEON routine, declared only when OPUS_ARM_MAY_HAVE_NEON is defined. */
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch);
+# endif /* OPUS_ARM_MAY_HAVE_NEON */
+/* No separate media routine is declared; the _media name expands through MAY_HAVE_EDSP() (defined outside this header, presumably in armcpu.h). */
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
+# endif /* OPUS_ARM_MAY_HAVE_MEDIA */
+/* EDSP routine, declared only when OPUS_ARM_MAY_HAVE_EDSP is defined; same signature as the NEON one. */
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
+opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch);
+# endif /* OPUS_ARM_MAY_HAVE_EDSP */
+/* Without run-time CPU detection the implementation is picked at compile time via PRESUME_NEON() and arch is ignored. */
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_PITCH_XCORR (1) /* pitch.h skips its own celt_pitch_xcorr definition when this is set */
+# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+# endif /* !OPUS_HAVE_RTCD */
+
+# endif /* FIXED_POINT */
+
+#endif /* PITCH_ARM_H */
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -122,7 +122,8 @@
int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);
-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels);
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+ int arch);
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -447,10 +447,11 @@
{
VARDECL( opus_val16, lp_pitch_buf );
ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
- pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C);
+ pitch_downsample(decode_mem, lp_pitch_buf,
+ DECODE_BUFFER_SIZE, C, st->arch);
pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
- PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index);
+ PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch);
pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
st->last_pitch_index = pitch_index;
} else {
@@ -481,7 +482,8 @@
opus_val32 ac[LPC_ORDER+1];
/* Compute LPC coefficients for the last MAX_PERIOD samples before
the first loss so we can work in the excitation-filter domain. */
- _celt_autocorr(exc, ac, window, overlap, LPC_ORDER, MAX_PERIOD);
+ _celt_autocorr(exc, ac, window, overlap,
+ LPC_ORDER, MAX_PERIOD, st->arch);
/* Add a noise floor of -40 dB. */
#ifdef FIXED_POINT
ac[0] += SHR32(ac[0],13);
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -161,18 +161,9 @@
}
#endif /* CUSTOM_MODES */
-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels)
+static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode,
+ int channels, int arch)
{
- int ret;
- ret = opus_custom_encoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
- if (ret != OPUS_OK)
- return ret;
- st->upsample = resampling_factor(sampling_rate);
- return OPUS_OK;
-}
-
-OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
-{
if (channels < 0 || channels > 2)
return OPUS_BAD_ARG;
@@ -190,7 +181,7 @@
st->end = st->mode->effEBands;
st->signalling = 1;
- st->arch = opus_select_arch();
+ st->arch = arch;
st->constrained_vbr = 1;
st->clip = 1;
@@ -206,6 +197,23 @@
return OPUS_OK;
}
+OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
+{ /* Same as opus_custom_encoder_init_arch(), with arch taken from opus_select_arch(); returns its result unchanged. */
+ return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch());
+}
+/* Initializes st with the mode from opus_custom_mode_create(48000, 960, NULL) and the caller-supplied arch. */
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+ int arch)
+{
+ int ret;
+ ret = opus_custom_encoder_init_arch(st,
+ opus_custom_mode_create(48000, 960, NULL), channels, arch);
+ if (ret != OPUS_OK)
+ return ret; /* propagate the initializer's error code; st->upsample is not written on this path */
+ st->upsample = resampling_factor(sampling_rate); /* derived from the requested sampling rate */
+ return OPUS_OK;
+}
+
#ifdef CUSTOM_MODES
void opus_custom_encoder_destroy(CELTEncoder *st)
{
@@ -1023,11 +1031,12 @@
VARDECL(opus_val16, pitch_buf);
ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);
- pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC);
+ pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch);
/* Don't search for the fir last 1.5 octave of the range because
there's too many false-positives due to short-term correlation */
pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,
- COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index);
+ COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index,
+ st->arch);
pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -226,7 +226,8 @@
const opus_val16 *window,
int overlap,
int lag,
- int n
+ int n,
+ int arch
)
{
opus_val32 d;
@@ -275,7 +276,7 @@
shift = 0;
}
#endif
- celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
+ celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch);
for (k=0;k<=lag;k++)
{
for (i = k+fastN, d = 0; i < n; i++)
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -48,6 +48,7 @@
int ord,
opus_val16 *mem);
-int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n);
+int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
+ const opus_val16 *window, int overlap, int lag, int n, int arch);
#endif /* PLC_H */
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -31,7 +31,7 @@
#include "opus_types.h"
#include "opus_defines.h"
-#if defined(OPUS_HAVE_RTCD) && defined(ARMv4_ASM)
+#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
#include "arm/armcpu.h"
/* We currently support 4 ARM variants:
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -145,7 +145,7 @@
void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
- int len, int C)
+ int len, int C, int arch)
{
int i;
opus_val32 ac[5];
@@ -180,7 +180,7 @@
}
_celt_autocorr(x_lp, ac, NULL, 0,
- 4, len>>1);
+ 4, len>>1, arch);
/* Noise floor -40 dB */
#ifdef FIXED_POINT
@@ -250,7 +250,7 @@
#else
void
#endif
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
{
int i,j;
#ifdef FIXED_POINT
@@ -289,7 +289,7 @@
#endif
void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
- int len, int max_pitch, int *pitch)
+ int len, int max_pitch, int *pitch, int arch)
{
int i, j;
int lag;
@@ -342,7 +342,7 @@
#ifdef FIXED_POINT
maxcorr =
#endif
- celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
+ celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch);
find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch
#ifdef FIXED_POINT
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -35,16 +35,21 @@
#define PITCH_H
#include "modes.h"
+#include "cpu_support.h"
#if defined(__SSE__) && !defined(FIXED_POINT)
#include "x86/pitch_sse.h"
#endif
+#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+# include "arm/pitch_arm.h"
+#endif
+
void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
- int len, int C);
+ int len, int C, int arch);
void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
- int len, int max_pitch, int *pitch);
+ int len, int max_pitch, int *pitch, int arch);
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0, int prev_period, opus_val16 prev_gain);
@@ -140,6 +145,52 @@
#else
void
#endif
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch);
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch);
+
+#if !defined(OVERRIDE_PITCH_XCORR) /*Set by a platform header such as arm/pitch_arm.h.*/
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD)
+extern /*Table of implementations; return type matches celt_pitch_xcorr_c.*/
+# if defined(FIXED_POINT)
+opus_val32
+# else
+void
+# endif
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+ const opus_val16 *, opus_val32 *, int, int);
+/*Dispatch through the table entry selected by arch&OPUS_ARCHMASK.*/
+# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+ xcorr, len, max_pitch))
+# else /*No RTCD: always call the C version; arch is evaluated and discarded.*/
+# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
+# endif /*OPUS_HAVE_RTCD*/
+#else /*OVERRIDE_PITCH_XCORR: a platform header already defined celt_pitch_xcorr.*/
+/*Disabled debugging aid: runs both the override and the C version and reports any mismatch on stderr.*/
+/*static inline opus_val32 real_celt_pitch_xcorr(const opus_val16 *_x,
+ const opus_val16 *_y,opus_val32 *xcorr,int len,int max_pitch,int arch){
+ opus_val32 *xcorr_tmp;
+ opus_val32 ret_tmp;
+ opus_val32 ret;
+ int i;
+ xcorr_tmp=(opus_val32 *)malloc(max_pitch*sizeof(*xcorr));
+ ret_tmp=celt_pitch_xcorr_c(_x,_y,xcorr_tmp,len,max_pitch);
+ ret=celt_pitch_xcorr(_x,_y,xcorr,len,max_pitch,arch);
+ for(i=0;i<max_pitch;i++)if(xcorr[i]!=xcorr_tmp[i]){
+ fprintf(stderr,"xcorr[%i] (0x%08X) != xcorr_tmp[%i] (0x%08X)\n",
+ i,xcorr[i],i,xcorr_tmp[i]);
+ }
+ if(ret!=ret_tmp){
+ fprintf(stderr,"ret (0x%08X) != ret_tmp (0x%08X)\n",ret,ret_tmp);
+ }
+ return ret_tmp;
+}
+
+#undef celt_pitch_xcorr
+#define celt_pitch_xcorr real_celt_pitch_xcorr*/
+/*NOTE(review): the disabled harness never frees xcorr_tmp; add a free() before enabling it.*/
+#endif /*OVERRIDE_PITCH_XCORR*/
#endif
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -35,4 +35,5 @@
celt/arm/fixed_armv5e.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
+celt/arm/pitch_arm.h \
celt/x86/pitch_sse.h
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -18,4 +18,11 @@
celt/vq.c
CELT_SOURCES_ARM = \
-celt/arm/armcpu.c
+celt/arm/armcpu.c \
+celt/arm/arm_celt_map.c
+# ARM assembly sources; Makefile.am turns each .s into a -gnu.S file with celt/arm/arm2gnu.pl.
+CELT_SOURCES_ARM_ASM = \
+celt/arm/celt_pitch_xcorr_arm.s
+# Assembly templates that configure instantiates first (armopts.s.in becomes armopts.s).
+CELT_AM_SOURCES_ARM_ASM = \
+celt/arm/armopts.s.in
--- a/configure.ac
+++ b/configure.ac
@@ -42,6 +42,8 @@
AC_C_CONST
AC_C_INLINE
+AM_PROG_AS
+
AC_DEFINE([OPUS_BUILD], [], [This is a build of OPUS])
#Use a hacked up version of autoconf's AC_C_RESTRICT because it's not
@@ -54,13 +56,13 @@
for ac_kw in __restrict __restrict__ _Restrict restrict; do
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
[[typedef int * int_ptr;
- int foo (int_ptr $ac_kw ip, int * $ac_kw baz[]) {
- return ip[0];
+ int foo (int_ptr $ac_kw ip, int * $ac_kw baz[]) {
+ return ip[0];
}]],
[[int s[1];
- int * $ac_kw t = s;
- t[0] = 0;
- return foo(t, (void *)0)]])],
+ int * $ac_kw t = s;
+ t[0] = 0;
+ return foo(t, (void *)0)]])],
[ac_cv_c_restrict=$ac_kw])
test "$ac_cv_c_restrict" != no && break
done
@@ -165,7 +167,7 @@
#i[[3456]]86 | x86_64 | powerpc64 | powerpc32 | ia64)
# has_float_approx=yes
# ;;
-#esac
+#esac
AC_ARG_ENABLE([float-approx],
[AS_HELP_STRING([--enable-float-approx], [enable fast approximations for floating point])],
@@ -183,55 +185,167 @@
[AS_HELP_STRING([--disable-asm], [Disable assembly optimizations])],,
[enable_asm=yes])
+AC_ARG_ENABLE([rtcd],
+ [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],,
+ [enable_rtcd=yes])
+
rtcd_support=no
cpu_arm=no
-AS_IF([test "$enable_asm" = "yes"],[
- asm_optimization="no asm for your platform, please send patches"
+AS_IF([test x"${enable_asm}" = x"yes"],[
+ inline_optimization="No ASM for your platform, please send patches"
case $host_cpu in
- arm*)
- cpu_arm=yes
- AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"],
- [asm_optimization="disabled"])
- if test "$asm_optimization" = "ARM" ; then
- rtcd_support=yes
- AC_DEFINE([ARMv4_ASM], 1, [Use generic ARMv4 asm optimizations])
- AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0])
- if test "$ARMv5E_ASM" = "1" ; then
- AC_DEFINE([ARMv5E_ASM], [1], [Use ARMv5E asm optimizations])
- asm_optimization="$asm_optimization (EDSP)"
- fi
- AS_ASM_ARM_MEDIA([ARMv6_ASM=1],[ARMv6_ASM=0])
- if test "$ARMv6_ASM" = "1" ; then
- AC_DEFINE([ARMv6_ASM], [1], [Use ARMv6 asm optimizations])
- asm_optimization="$asm_optimization (Media)"
- fi
- AS_ASM_ARM_NEON([ARM_HAVE_NEON=1],[ARM_HAVE_NEON=0])
- if test "$ARM_HAVE_NEON" = "1" ; then
- AC_DEFINE([ARM_HAVE_NEON], 1, [Use ARM NEON optimizations])
- asm_optimization="$asm_optimization (NEON)"
- fi
- fi
+ arm*)
+ dnl Currently we only have asm for fixed-point
+ AS_IF([test "$enable_float" != "yes"],[
+ cpu_arm=yes
+ AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm optimization])
+ AS_GCC_INLINE_ASSEMBLY(
+ [inline_optimization="ARM"],
+ [inline_optimization="disabled"]
+ )
+ AS_ASM_ARM_EDSP([OPUS_ARM_INLINE_EDSP=1],[OPUS_ARM_INLINE_EDSP=0])
+ AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],
+ [OPUS_ARM_INLINE_MEDIA=0])
+ AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])
+ AS_IF([test x"$inline_optimization" = x"ARM"],[
+ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])
+ AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,
+ [Use generic ARMv4 inline asm optimizations])
+ AS_IF([test x"$OPUS_ARM_INLINE_EDSP" = x"1"],[
+ AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],
+ [Use ARMv5E inline asm optimizations])
+ inline_optimization="$inline_optimization (EDSP)"
+ ])
+ AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[
+ AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],
+ [Use ARMv6 inline asm optimizations])
+ inline_optimization="$inline_optimization (Media)"
+ ])
+ AS_IF([test x"$OPUS_ARM_INLINE_NEON" = x"1"],[
+ AC_DEFINE([OPUS_ARM_INLINE_NEON], 1,
+ [Use ARM NEON inline asm optimizations])
+ inline_optimization="$inline_optimization (NEON)"
+ ])
+ ])
+ dnl We need Perl to translate RVCT-syntax asm to gas syntax.
+ AC_CHECK_PROG([HAVE_PERL], perl, yes, no)
+ AS_IF([test x"$HAVE_PERL" = x"yes"],[
+ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],[true])
+ asm_optimization="ARM"
+ AS_IF([test x"$OPUS_ARM_INLINE_EDSP" = x"1"], [
+ OPUS_ARM_PRESUME_EDSP=1
+ OPUS_ARM_MAY_HAVE_EDSP=1
+ ],
+ [
+ OPUS_ARM_PRESUME_EDSP=0
+ OPUS_ARM_MAY_HAVE_EDSP=0
+ ])
+ AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"], [
+ OPUS_ARM_PRESUME_MEDIA=1
+ OPUS_ARM_MAY_HAVE_MEDIA=1
+ ],
+ [
+ OPUS_ARM_PRESUME_MEDIA=0
+ OPUS_ARM_MAY_HAVE_MEDIA=0
+ ])
+ AS_IF([test x"$OPUS_ARM_INLINE_NEON" = x"1"], [
+ OPUS_ARM_PRESUME_NEON=1
+ OPUS_ARM_MAY_HAVE_NEON=1
+ ],
+ [
+ OPUS_ARM_PRESUME_NEON=0
+ OPUS_ARM_MAY_HAVE_NEON=0
+ ])
+ AS_IF([test x"$enable_rtcd" = x"yes"],[
+ AS_IF([test x"$OPUS_ARM_MAY_HAVE_EDSP" != x"1"],[
+ AC_MSG_NOTICE(
+ [Trying to force-enable armv5e EDSP instructions...])
+ AS_ASM_ARM_EDSP_FORCE([OPUS_ARM_MAY_HAVE_EDSP=1])
+ ])
+ AS_IF([test x"$OPUS_ARM_MAY_HAVE_MEDIA" != x"1"],[
+ AC_MSG_NOTICE(
+ [Trying to force-enable ARMv6 media instructions...])
+ AS_ASM_ARM_MEDIA_FORCE([OPUS_ARM_MAY_HAVE_MEDIA=1])
+ ])
+ AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON" != x"1"],[
+ AC_MSG_NOTICE(
+ [Trying to force-enable NEON instructions...])
+ AS_ASM_ARM_NEON_FORCE([OPUS_ARM_MAY_HAVE_NEON=1])
+ ])
+ ])
+ rtcd_support=
+ AS_IF([test x"$OPUS_ARM_MAY_HAVE_EDSP" = x"1"],[
+ AC_DEFINE(OPUS_ARM_MAY_HAVE_EDSP, 1,
+ [Define if assembler supports EDSP instructions])
+ AS_IF([test x"$OPUS_ARM_PRESUME_EDSP" = x"1"],[
+ AC_DEFINE(OPUS_ARM_PRESUME_EDSP, 1,
+ [Define if binary requires EDSP instruction support])
+ asm_optimization="$asm_optimization (EDSP)"
+ ],
+ [rtcd_support="$rtcd_support (EDSP)"]
+ )
+ ])
+ AC_SUBST(OPUS_ARM_MAY_HAVE_EDSP)
+ AS_IF([test x"$OPUS_ARM_MAY_HAVE_MEDIA" = x"1"],[
+ AC_DEFINE(OPUS_ARM_MAY_HAVE_MEDIA, 1,
+ [Define if assembler supports ARMv6 media instructions])
+ AS_IF([test x"$OPUS_ARM_PRESUME_MEDIA" = x"1"],[
+ AC_DEFINE(OPUS_ARM_PRESUME_MEDIA, 1,
+ [Define if binary requires ARMv6 media instruction support])
+ asm_optimization="$asm_optimization (Media)"
+ ],
+ [rtcd_support="$rtcd_support (Media)"]
+ )
+ ])
+ AC_SUBST(OPUS_ARM_MAY_HAVE_MEDIA)
+ AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON" = x"1"],[
+ AC_DEFINE(OPUS_ARM_MAY_HAVE_NEON, 1,
+ [Define if compiler supports NEON instructions])
+ AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [
+ AC_DEFINE(OPUS_ARM_PRESUME_NEON, 1,
+ [Define if binary requires NEON instruction support])
+ asm_optimization="$asm_optimization (NEON)"
+ ],
+ [rtcd_support="$rtcd_support (NEON)"]
+ )
+ ])
+ AC_SUBST(OPUS_ARM_MAY_HAVE_NEON)
+ dnl Make sure turning on RTCD gets us at least one
+ dnl instruction set.
+ AS_IF([test x"$rtcd_support" != x""],
+ [rtcd_support=ARM"$rtcd_support"],
+ [rtcd_support="no"]
+ )
+ ],
+ [
+ AC_MSG_WARN(
+ [*** ARM assembly requires perl -- disabling optimizations])
+ asm_optimization="(missing perl dependency for ARM)"
+ ])
+ ])
;;
esac
],[
- asm_optimization="disabled"
+ inline_optimization="disabled"
+ asm_optimization="disabled"
])
AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
+dnl The summary strings look like "ARM" optionally followed by a space and
+dnl a list of extensions. Compare only the first word, using POSIX suffix
+dnl stripping: offset/length substring expansion is a bash extension that
+dnl other /bin/sh implementations such as dash reject as a bad substitution.
+AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
+ [test x"${inline_optimization%% *}" = x"ARM"])
+AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
+ [test x"${asm_optimization%% *}" = x"ARM"])
-AC_ARG_ENABLE([rtcd],
- [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],,
- [enable_rtcd=yes])
-
-AS_IF([test "$enable_rtcd" = "yes"],[
- AS_IF([test "$rtcd_support" = "yes"],[
- AC_DEFINE([OPUS_HAVE_RTCD], [1], [Use run-time CPU capabilities detection])
- ],[
- rtcd_support="no rtcd for your platform, please send patches"
+AS_IF([test x"$enable_rtcd" = x"yes"],[
+ AS_IF([test x"$rtcd_support" != x"no"],[
+ AC_DEFINE([OPUS_HAVE_RTCD], [1],
+ [Use run-time CPU capabilities detection])
+ OPUS_HAVE_RTCD=1
+ AC_SUBST(OPUS_HAVE_RTCD)
])
],[
- rtcd_support="no"
+ rtcd_support="disabled"
])
AC_ARG_ENABLE([assertions],
@@ -300,9 +414,14 @@
AC_SUBST([PC_BUILD])
-
-AC_CONFIG_FILES([Makefile opus.pc opus-uninstalled.pc
- doc/Makefile doc/Doxyfile])
+AC_CONFIG_FILES([
+ Makefile
+ opus.pc
+ opus-uninstalled.pc
+ celt/arm/armopts.s
+ doc/Makefile
+ doc/Doxyfile
+])
AC_CONFIG_HEADERS([config.h])
AC_OUTPUT
@@ -316,13 +435,14 @@
C99 var arrays: ................ ${has_var_arrays}
C99 lrintf: .................... ${ac_cv_func_lrintf}
Use alloca: .................... ${use_alloca}
-
+
General configuration:
-
+
Floating point support: ........ ${enable_float}
Fast float approximations: ..... ${enable_float_approx}
Fixed point debugging: ......... ${enable_fixed_point_debug}
- Assembly optimization: ......... ${asm_optimization}
+ Inline Assembly Optimizations: . ${inline_optimization}
+ External Assembly Optimizations: ${asm_optimization}
Run-time CPU detection: ........ ${rtcd_support}
Custom modes: .................. ${enable_custom_modes}
Assertion checking: ............ ${enable_assertions}
--- a/m4/as-gcc-inline-assembly.m4
+++ b/m4/as-gcc-inline-assembly.m4
@@ -42,7 +42,17 @@
$2])
])
+AC_DEFUN([AS_ASM_ARM_NEON_FORCE],
+[dnl Test-compiles the NEON instruction vorr with .arch armv7-a and .fpu neon set inside the asm string itself; the first argument runs on success and the second on failure.
+ AC_MSG_CHECKING([if assembler supports NEON instructions on ARM])
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv7-a\n.fpu neon\n.object_arch armv4t\nvorr d0,d0,d0")])],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+])
+
AC_DEFUN([AS_ASM_ARM_MEDIA],
[
AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])
@@ -54,12 +64,33 @@
$2])
])
+AC_DEFUN([AS_ASM_ARM_MEDIA_FORCE],
+[
+ AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv6\n.object_arch armv4t\nshadd8 r3,r3,r3")])],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+])
+
AC_DEFUN([AS_ASM_ARM_EDSP],
[
AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__("qadd r3,r3,r3")])],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+])
+
+AC_DEFUN([AS_ASM_ARM_EDSP_FORCE],
+[
+ AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])
+
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv5te\n.object_arch armv4t\nqadd r3,r3,r3")])],
[AC_MSG_RESULT([yes])
$1],
[AC_MSG_RESULT([no])
--- a/silk/API.h
+++ b/silk/API.h
@@ -64,6 +64,7 @@
/*************************/
opus_int silk_InitEncoder( /* O Returns error code */
void *encState, /* I/O State */
+ int arch, /* I Run-time architecture */
silk_EncControlStruct *encStatus /* O Encoder Status */
);
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -227,7 +227,8 @@
opus_int *scale, /* O Scaling of the correlation vector */
const opus_int16 *inputData, /* I Input data to correlate */
const opus_int inputDataSize, /* I Length of input */
- const opus_int correlationCount /* I Number of correlation taps to compute */
+ const opus_int correlationCount, /* I Number of correlation taps to compute */
+ int arch /* I Run-time architecture */
);
void silk_decode_pitch(
@@ -249,7 +250,8 @@
const opus_int search_thres2_Q13, /* I Final threshold for lag candidates 0 - 1 */
const opus_int Fs_kHz, /* I Sample frequency (kHz) */
const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */
- const opus_int nb_subfr /* I number of 5 ms subframes */
+ const opus_int nb_subfr, /* I number of 5 ms subframes */
+ int arch /* I Run-time architecture */
);
/* Compute Normalized Line Spectral Frequencies (NLSFs) from whitening filter coefficients */
@@ -309,7 +311,8 @@
const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
const opus_int nb_subfr, /* I Number of subframes stacked in x */
- const opus_int D /* I Order */
+ const opus_int D, /* I Order */
+ int arch /* I Run-time architecture */
);
/* Copy and multiply a vector by a constant */
@@ -576,11 +579,11 @@
#include "MacroCount.h"
#include "MacroDebug.h"
-#ifdef ARMv4_ASM
+#ifdef OPUS_ARM_INLINE_ASM
#include "arm/SigProc_FIX_armv4.h"
#endif
-#ifdef ARMv5E_ASM
+#ifdef OPUS_ARM_INLINE_EDSP
#include "arm/SigProc_FIX_armv5e.h"
#endif
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -69,6 +69,7 @@
/*************************/
opus_int silk_InitEncoder( /* O Returns error code */
void *encState, /* I/O State */
+ int arch, /* I Run-time architecture */
silk_EncControlStruct *encStatus /* O Encoder Status */
)
{
@@ -80,7 +81,7 @@
/* Reset encoder */
silk_memset( psEnc, 0, sizeof( silk_encoder ) );
for( n = 0; n < ENCODER_NUM_CHANNELS; n++ ) {
- if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ] ) ) {
+ if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ], arch ) ) {
silk_assert( 0 );
}
}
@@ -174,7 +175,7 @@
if( encControl->nChannelsInternal > psEnc->nChannelsInternal ) {
/* Mono -> Stereo transition: init state of second channel and stereo state */
- ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ] );
+ ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ], psEnc->state_Fxx[ 0 ].sCmn.arch );
silk_memset( psEnc->sStereo.pred_prev_Q13, 0, sizeof( psEnc->sStereo.pred_prev_Q13 ) );
silk_memset( psEnc->sStereo.sSide, 0, sizeof( psEnc->sStereo.sSide ) );
psEnc->sStereo.mid_side_amp_Q0[ 0 ] = 0;
@@ -206,9 +207,8 @@
}
/* Reset Encoder */
for( n = 0; n < encControl->nChannelsInternal; n++ ) {
- if( (ret = silk_init_encoder( &psEnc->state_Fxx[ n ] ) ) != 0 ) {
- silk_assert( 0 );
- }
+ ret = silk_init_encoder( &psEnc->state_Fxx[ n ], psEnc->state_Fxx[ n ].sCmn.arch );
+ silk_assert( !ret );
}
tmp_payloadSize_ms = encControl->payloadSize_ms;
encControl->payloadSize_ms = 10;
--- a/silk/fixed/autocorr_FIX.c
+++ b/silk/fixed/autocorr_FIX.c
@@ -38,10 +38,11 @@
opus_int *scale, /* O Scaling of the correlation vector */
const opus_int16 *inputData, /* I Input data to correlate */
const opus_int inputDataSize, /* I Length of input */
- const opus_int correlationCount /* I Number of correlation taps to compute */
+ const opus_int correlationCount, /* I Number of correlation taps to compute */
+ int arch /* I Run-time architecture */
)
{
opus_int corrCount;
corrCount = silk_min_int( inputDataSize, correlationCount );
- *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize);
+ *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize, arch);
}
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -50,7 +50,8 @@
const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
const opus_int nb_subfr, /* I Number of subframes stacked in x */
- const opus_int D /* I Order */
+ const opus_int D, /* I Order */
+ int arch /* I Run-time architecture */
)
{
opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
@@ -98,7 +99,7 @@
int i;
opus_int32 d;
x_ptr = x + s * subfr_length;
- celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D );
+ celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
for( n = 1; n < D + 1; n++ ) {
for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
--- a/silk/fixed/encode_frame_FIX.c
+++ b/silk/fixed/encode_frame_FIX.c
@@ -132,12 +132,12 @@
/*****************************************/
/* Find pitch lags, initial LPC analysis */
/*****************************************/
- silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame );
+ silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch );
/************************/
/* Noise shape analysis */
/************************/
- silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame );
+ silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame, psEnc->sCmn.arch );
/***************************************************/
/* Find linear prediction coefficients (LPC + LTP) */
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -60,13 +60,13 @@
psEncC->indices.NLSFInterpCoef_Q2 = 4;
/* Burg AR analysis for the full frame */
- silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );
+ silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, psEncC->arch );
if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {
VARDECL( opus_int16, LPC_res );
/* Optimal solution for last 10 ms */
- silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder );
+ silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder, psEncC->arch );
/* subtract residual energy here, as that's easier than adding it to the */
/* residual energy of the first 10 ms in each iteration of the search below */
--- a/silk/fixed/find_pitch_lags_FIX.c
+++ b/silk/fixed/find_pitch_lags_FIX.c
@@ -38,7 +38,8 @@
silk_encoder_state_FIX *psEnc, /* I/O encoder state */
silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */
opus_int16 res[], /* O residual */
- const opus_int16 x[] /* I Speech signal */
+ const opus_int16 x[], /* I Speech signal */
+ int arch /* I Run-time architecture */
)
{
opus_int buf_len, i, scale;
@@ -86,7 +87,7 @@
silk_apply_sine_window( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );
/* Calculate autocorrelation sequence */
- silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );
+ silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );
/* Add white noise, as fraction of energy */
auto_corr[ 0 ] = silk_SMLAWB( auto_corr[ 0 ], auto_corr[ 0 ], SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) ) + 1;
@@ -127,7 +128,8 @@
/*****************************************/
if( silk_pitch_analysis_core( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex, &psEnc->sCmn.indices.contourIndex,
&psEnc->LTPCorr_Q15, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16,
- (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr ) == 0 )
+ (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr,
+ psEnc->sCmn.arch) == 0 )
{
psEnc->sCmn.indices.signalType = TYPE_VOICED;
} else {
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -73,7 +73,8 @@
/* Initializes the Silk encoder state */
opus_int silk_init_encoder(
- silk_encoder_state_Fxx *psEnc /* I/O Pointer to Silk FIX encoder state */
+ silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk FIX encoder state */
+ int arch /* I Run-time architecture */
);
/* Control the Silk encoder */
@@ -104,7 +105,8 @@
silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */
silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */
const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */
- const opus_int16 *x /* I Input signal [ frame_length + la_shape ] */
+ const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */
+ int arch /* I Run-time architecture */
);
/* Autocorrelations for a warped frequency axis */
@@ -132,7 +134,8 @@
silk_encoder_state_FIX *psEnc, /* I/O encoder state */
silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */
opus_int16 res[], /* O residual */
- const opus_int16 x[] /* I Speech signal */
+ const opus_int16 x[], /* I Speech signal */
+ int arch /* I Run-time architecture */
);
/* Find LPC and LTP coefficients */
--- a/silk/fixed/noise_shape_analysis_FIX.c
+++ b/silk/fixed/noise_shape_analysis_FIX.c
@@ -145,7 +145,8 @@
silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */
silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */
const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */
- const opus_int16 *x /* I Input signal [ frame_length + la_shape ] */
+ const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */
+ int arch /* I Run-time architecture */
)
{
silk_shape_state_FIX *psShapeSt = &psEnc->sShape;
@@ -281,7 +282,7 @@
silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
} else {
/* Calculate regular auto correlation */
- silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );
+ silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch );
}
/* Add white noise, as a fraction of energy */
--- a/silk/fixed/pitch_analysis_core_FIX.c
+++ b/silk/fixed/pitch_analysis_core_FIX.c
@@ -62,7 +62,8 @@
opus_int start_lag, /* I lag offset to search around */
opus_int sf_length, /* I length of a 5 ms subframe */
opus_int nb_subfr, /* I number of subframes */
- opus_int complexity /* I Complexity setting */
+ opus_int complexity, /* I Complexity setting */
+ int arch /* I Run-time architecture */
);
static void silk_P_Ana_calc_energy_st3(
@@ -88,7 +89,8 @@
const opus_int search_thres2_Q13, /* I Final threshold for lag candidates 0 - 1 */
const opus_int Fs_kHz, /* I Sample frequency (kHz) */
const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */
- const opus_int nb_subfr /* I number of 5 ms subframes */
+ const opus_int nb_subfr, /* I number of 5 ms subframes */
+ int arch /* I Run-time architecture */
)
{
VARDECL( opus_int16, frame_8kHz );
@@ -189,7 +191,7 @@
silk_assert( basis_ptr >= frame_4kHz );
silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
- celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1 );
+ celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1, arch );
/* Calculate first vector products before loop */
cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];
@@ -516,7 +518,7 @@
/* Calculate the correlations and energies needed in stage 3 */
ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
- silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
+ silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
lag_counter = 0;
@@ -597,7 +599,8 @@
opus_int start_lag, /* I lag offset to search around */
opus_int sf_length, /* I length of a 5 ms subframe */
opus_int nb_subfr, /* I number of subframes */
- opus_int complexity /* I Complexity setting */
+ opus_int complexity, /* I Complexity setting */
+ int arch /* I Run-time architecture */
)
{
const opus_int16 *target_ptr;
@@ -634,7 +637,7 @@
lag_low = matrix_ptr( Lag_range_ptr, k, 0, 2 );
lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );
silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
- celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1 );
+ celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1, arch );
for( j = lag_low; j <= lag_high; j++ ) {
silk_assert( lag_counter < SCRATCH_SIZE );
scratch_mem[ lag_counter ] = xcorr32[ lag_high - j ];
--- a/silk/float/SigProc_FLP.h
+++ b/silk/float/SigProc_FLP.h
@@ -94,7 +94,8 @@
const silk_float search_thres2, /* I Final threshold for lag candidates 0 - 1 */
const opus_int Fs_kHz, /* I sample frequency (kHz) */
const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */
- const opus_int nb_subfr /* I Number of 5 ms subframes */
+ const opus_int nb_subfr, /* I Number of 5 ms subframes */
+ int arch /* I Run-time architecture */
);
void silk_insertion_sort_decreasing_FLP(
--- a/silk/float/encode_frame_FLP.c
+++ b/silk/float/encode_frame_FLP.c
@@ -129,7 +129,7 @@
/*****************************************/
/* Find pitch lags, initial LPC analysis */
/*****************************************/
- silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame );
+ silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch );
/************************/
/* Noise shape analysis */
--- a/silk/float/find_pitch_lags_FLP.c
+++ b/silk/float/find_pitch_lags_FLP.c
@@ -37,7 +37,8 @@
silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */
silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */
silk_float res[], /* O Residual */
- const silk_float x[] /* I Speech signal */
+ const silk_float x[], /* I Speech signal */
+ int arch /* I Run-time architecture */
)
{
opus_int buf_len;
@@ -116,7 +117,7 @@
/*****************************************/
if( silk_pitch_analysis_core_FLP( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex,
&psEnc->sCmn.indices.contourIndex, &psEnc->LTPCorr, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16 / 65536.0f,
- thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr ) == 0 )
+ thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr, arch ) == 0 )
{
psEnc->sCmn.indices.signalType = TYPE_VOICED;
} else {
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -71,7 +71,8 @@
/* Initializes the Silk encoder state */
opus_int silk_init_encoder(
- silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */
+ silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */
+ int arch /* I Run-time architecture */
);
/* Control the Silk encoder */
@@ -129,7 +130,8 @@
silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */
silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */
silk_float res[], /* O Residual */
- const silk_float x[] /* I Speech signal */
+ const silk_float x[], /* I Speech signal */
+ int arch /* I Run-time architecture */
);
/* Find LPC and LTP coefficients */
--- a/silk/float/pitch_analysis_core_FLP.c
+++ b/silk/float/pitch_analysis_core_FLP.c
@@ -48,7 +48,8 @@
opus_int start_lag, /* I start lag */
opus_int sf_length, /* I sub frame length */
opus_int nb_subfr, /* I number of subframes */
- opus_int complexity /* I Complexity setting */
+ opus_int complexity, /* I Complexity setting */
+ int arch /* I Run-time architecture */
);
static void silk_P_Ana_calc_energy_st3(
@@ -74,7 +75,8 @@
const silk_float search_thres2, /* I Final threshold for lag candidates 0 - 1 */
const opus_int Fs_kHz, /* I sample frequency (kHz) */
const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */
- const opus_int nb_subfr /* I Number of 5 ms subframes */
+ const opus_int nb_subfr, /* I Number of 5 ms subframes */
+ int arch /* I Run-time architecture */
)
{
opus_int i, k, d, j;
@@ -176,7 +178,7 @@
silk_assert( basis_ptr >= frame_4kHz );
silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
- celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1 );
+ celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1, arch );
/* Calculate first vector products before loop */
cross_corr = xcorr[ max_lag_4kHz - min_lag_4kHz ];
@@ -409,7 +411,7 @@
CCmax = -1000.0f;
/* Calculate the correlations and energies needed in stage 3 */
- silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity );
+ silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity, arch );
silk_P_Ana_calc_energy_st3( energies_st3, frame, start_lag, sf_length, nb_subfr, complexity );
lag_counter = 0;
@@ -493,7 +495,8 @@
opus_int start_lag, /* I start lag */
opus_int sf_length, /* I sub frame length */
opus_int nb_subfr, /* I number of subframes */
- opus_int complexity /* I Complexity setting */
+ opus_int complexity, /* I Complexity setting */
+ int arch /* I Run-time architecture */
)
{
const silk_float *target_ptr;
@@ -527,7 +530,7 @@
lag_low = matrix_ptr( Lag_range_ptr, k, 0, 2 );
lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );
silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
- celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1 );
+ celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1, arch );
for( j = lag_low; j <= lag_high; j++ ) {
silk_assert( lag_counter < SCRATCH_SIZE );
scratch_mem[ lag_counter ] = xcorr[ lag_high - j ];
--- a/silk/init_encoder.c
+++ b/silk/init_encoder.c
@@ -34,12 +34,14 @@
#include "main_FLP.h"
#endif
#include "tuning_parameters.h"
+#include "cpu_support.h"
/*********************************/
/* Initialize Silk Encoder state */
/*********************************/
opus_int silk_init_encoder(
- silk_encoder_state_Fxx *psEnc /* I/O Pointer to Silk FIX encoder state */
+ silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk FIX encoder state */
+ int arch /* I Run-time architecture */
)
{
opus_int ret = 0;
@@ -46,6 +48,8 @@
/* Clear the entire encoder state */
silk_memset( psEnc, 0, sizeof( silk_encoder_state_Fxx ) );
+
+ psEnc->sCmn.arch = arch;
psEnc->sCmn.variable_HP_smth1_Q15 = silk_LSHIFT( silk_lin2log( SILK_FIX_CONST( VARIABLE_HP_MIN_CUTOFF_HZ, 16 ) ) - ( 16 << 7 ), 8 );
psEnc->sCmn.variable_HP_smth2_Q15 = psEnc->sCmn.variable_HP_smth1_Q15;
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -103,11 +103,11 @@
(*((Matrix_base_adr) + ((row)+(M)*(column))))
#endif
-#ifdef ARMv4_ASM
+#ifdef OPUS_ARM_INLINE_ASM
#include "arm/macros_armv4.h"
#endif
-#ifdef ARMv5E_ASM
+#ifdef OPUS_ARM_INLINE_EDSP
#include "arm/macros_armv5e.h"
#endif
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -191,6 +191,8 @@
SideInfoIndices indices;
opus_int8 pulses[ MAX_FRAME_LENGTH ];
+ int arch;
+
/* Input/output buffering */
opus_int16 inputBuf[ MAX_FRAME_LENGTH + 2 ]; /* Buffer containing input signal */
opus_int inputBufIx;
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -75,7 +75,6 @@
#endif
opus_uint32 rangeFinal;
- int arch;
};
#ifdef FIXED_POINT
@@ -125,7 +124,6 @@
st->Fs = Fs;
st->DecControl.API_sampleRate = st->Fs;
st->DecControl.nChannelsAPI = st->channels;
- st->arch = opus_select_arch();
/* Reset decoder */
ret = silk_InitDecoder( silk_dec );
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -104,7 +104,7 @@
int analysis_offset;
#endif
opus_uint32 rangeFinal;
- int arch;
+ int arch;
};
/* Transition tables for the voice and music. First column is the
@@ -188,7 +188,7 @@
st->arch = opus_select_arch();
- ret = silk_InitEncoder( silk_enc, &st->silk_mode );
+ ret = silk_InitEncoder( silk_enc, st->arch, &st->silk_mode );
if(ret)return OPUS_INTERNAL_ERROR;
/* default SILK parameters */
@@ -209,7 +209,7 @@
/* Create CELT encoder */
/* Initialize CELT encoder */
- err = celt_encoder_init(celt_enc, Fs, channels);
+ err = celt_encoder_init(celt_enc, Fs, channels, st->arch);
if(err!=OPUS_OK)return OPUS_INTERNAL_ERROR;
celt_encoder_ctl(celt_enc, CELT_SET_SIGNALLING(0));
@@ -1219,7 +1219,7 @@
if (st->mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY)
{
silk_EncControlStruct dummy;
- silk_InitEncoder( silk_enc, &dummy);
+ silk_InitEncoder( silk_enc, st->arch, &dummy);
prefill=1;
}
@@ -2418,7 +2418,7 @@
((char*)&st->OPUS_ENCODER_RESET_START - (char*)st));
celt_encoder_ctl(celt_enc, OPUS_RESET_STATE);
- silk_InitEncoder( silk_enc, &dummy );
+ silk_InitEncoder( silk_enc, st->arch, &dummy );
st->stream_channels = st->channels;
st->hybrid_stereo_width_Q14 = 1 << 14;
st->prev_HB_gain = Q15ONE;