ref: 4b0dbd7bd1a3a95364fad19c95ee7f2b118f33fb
dir: /codec/encoder/core/arm64/pixel_neon_aarch64.S/
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"

// ---------------------------------------------------------------------------
// Sum-of-absolute-differences (SAD) kernels for AArch64 NEON.
//
// Register convention shared by every function in this file:
//   x0      pointer to the first block  (row stride in w1)
//   x2      pointer to the second block (row stride in w3)
//   x4      (SadFour variants only) destination of four 32-bit results
//   w0      (single-SAD variants only) result
// The strides arrive as 32-bit values and are sign-extended to 64 bits before
// they are used as post-increment amounts.
//
// Only x0-x7, v0-v7 and v16-v31 are written.
//
// WELS_ASM_ARCH64_FUNC_BEGIN / WELS_ASM_ARCH64_FUNC_END come from the included
// arm_arch64_common_macro.S and are not visible here; the function label and
// the return are presumably emitted by them -- confirm in that file.
// ---------------------------------------------------------------------------

// Sign-extend both 32-bit stride arguments (w1, w3) into x1 and x3.
.macro SAD_WIDEN_STRIDES
    sxtw    x1, w1
    sxtw    x3, w3
.endm

// Reduce the eight 16-bit lanes of v2 to a single 32-bit sum and place it in w0.
// saddlv is a signed reduction; in this file a lane accumulates at most
// 32 byte differences (32 * 255 = 8160), so the sign bit is never set.
.macro CALC_AND_STORE_SAD
    saddlv  s2, v2.8h
    fmov    w0, s2
.endm

// Reduce each of v28, v29, v30, v31 (eight 16-bit lanes) to a 32-bit sum and
// store the four sums, in that register order, as consecutive words at [x4].
.macro CALC_AND_STORE_SAD_FOUR
    saddlv  s28, v28.8h
    saddlv  s29, v29.8h
    saddlv  s30, v30.8h
    saddlv  s31, v31.8h
    st4     {v28.s, v29.s, v30.s, v31.s}[0], [x4]
.endm

// Load eight rows of 8 bytes from [x0] into v0..v7; x0 advances by x1 per row.
.macro LOAD_8X8_1
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x0], x1
    ld1     {v2.8b}, [x0], x1
    ld1     {v3.8b}, [x0], x1
    ld1     {v4.8b}, [x0], x1
    ld1     {v5.8b}, [x0], x1
    ld1     {v6.8b}, [x0], x1
    ld1     {v7.8b}, [x0], x1
.endm

// Load eight rows of 16 bytes from [x0] into v0..v7; x0 advances by x1 per row.
.macro LOAD_16X8_1
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x0], x1
    ld1     {v2.16b}, [x0], x1
    ld1     {v3.16b}, [x0], x1
    ld1     {v4.16b}, [x0], x1
    ld1     {v5.16b}, [x0], x1
    ld1     {v6.16b}, [x0], x1
    ld1     {v7.16b}, [x0], x1
.endm

// ---------------------------------------------------------------------------
// Parameterised helpers. They exist twice because the Apple assembler uses
// positional macro arguments while the GNU assembler uses named ones; the
// instruction sequences of the two variants are the same.
//
//   LOAD_8X8_2  ptr        eight 8-byte rows from [ptr] into v16..v23,
//   LOAD_16X8_2 ptr        (16-byte rows for the 16X8 form); ptr advances by
//                          x3 after every row.
//
//   CALC_ABS_8X8_1  acc, op    accumulate |v0..v7 - v16..v23| into acc (.8h).
//   CALC_ABS_16X8_1 acc, op    op is spliced into the first mnemonic:
//                              d -> uabdl (start a fresh sum)
//                              a -> uabal (add to the sum already in acc)
//                              Every later row always accumulates.
//
//   CALC_ABS_8X8_2  op         same as above but fixed to v29.8h and comparing
//   CALC_ABS_16X8_2 op         v0..v7 against v18..v25, i.e. against the rows
//                              two positions further along the loaded
//                              v16..v25 sequence.
//
// The 16X8 forms handle the low 8 bytes with uabdl/uabal and the high 8 bytes
// with uabal2.
// ---------------------------------------------------------------------------
#ifdef __APPLE__

.macro LOAD_8X8_2
    ld1     {v16.8b}, [$0], x3
    ld1     {v17.8b}, [$0], x3
    ld1     {v18.8b}, [$0], x3
    ld1     {v19.8b}, [$0], x3
    ld1     {v20.8b}, [$0], x3
    ld1     {v21.8b}, [$0], x3
    ld1     {v22.8b}, [$0], x3
    ld1     {v23.8b}, [$0], x3
.endm

.macro CALC_ABS_8X8_1
    uab$1l  $0, v0.8b, v16.8b
    uabal   $0, v1.8b, v17.8b
    uabal   $0, v2.8b, v18.8b
    uabal   $0, v3.8b, v19.8b
    uabal   $0, v4.8b, v20.8b
    uabal   $0, v5.8b, v21.8b
    uabal   $0, v6.8b, v22.8b
    uabal   $0, v7.8b, v23.8b
.endm

.macro CALC_ABS_8X8_2
    uab$0l  v29.8h, v0.8b, v18.8b
    uabal   v29.8h, v1.8b, v19.8b
    uabal   v29.8h, v2.8b, v20.8b
    uabal   v29.8h, v3.8b, v21.8b
    uabal   v29.8h, v4.8b, v22.8b
    uabal   v29.8h, v5.8b, v23.8b
    uabal   v29.8h, v6.8b, v24.8b
    uabal   v29.8h, v7.8b, v25.8b
.endm

.macro LOAD_16X8_2
    ld1     {v16.16b}, [$0], x3
    ld1     {v17.16b}, [$0], x3
    ld1     {v18.16b}, [$0], x3
    ld1     {v19.16b}, [$0], x3
    ld1     {v20.16b}, [$0], x3
    ld1     {v21.16b}, [$0], x3
    ld1     {v22.16b}, [$0], x3
    ld1     {v23.16b}, [$0], x3
.endm

.macro CALC_ABS_16X8_1
    uab$1l  $0, v0.8b, v16.8b
    uabal2  $0, v0.16b,v16.16b
    uabal   $0, v1.8b, v17.8b
    uabal2  $0, v1.16b,v17.16b
    uabal   $0, v2.8b, v18.8b
    uabal2  $0, v2.16b,v18.16b
    uabal   $0, v3.8b, v19.8b
    uabal2  $0, v3.16b,v19.16b
    uabal   $0, v4.8b, v20.8b
    uabal2  $0, v4.16b,v20.16b
    uabal   $0, v5.8b, v21.8b
    uabal2  $0, v5.16b,v21.16b
    uabal   $0, v6.8b, v22.8b
    uabal2  $0, v6.16b,v22.16b
    uabal   $0, v7.8b, v23.8b
    uabal2  $0, v7.16b,v23.16b
.endm

.macro CALC_ABS_16X8_2
    uab$0l  v29.8h, v0.8b, v18.8b
    uabal2  v29.8h, v0.16b,v18.16b
    uabal   v29.8h, v1.8b, v19.8b
    uabal2  v29.8h, v1.16b,v19.16b
    uabal   v29.8h, v2.8b, v20.8b
    uabal2  v29.8h, v2.16b,v20.16b
    uabal   v29.8h, v3.8b, v21.8b
    uabal2  v29.8h, v3.16b,v21.16b
    uabal   v29.8h, v4.8b, v22.8b
    uabal2  v29.8h, v4.16b,v22.16b
    uabal   v29.8h, v5.8b, v23.8b
    uabal2  v29.8h, v5.16b,v23.16b
    uabal   v29.8h, v6.8b, v24.8b
    uabal2  v29.8h, v6.16b,v24.16b
    uabal   v29.8h, v7.8b, v25.8b
    uabal2  v29.8h, v7.16b,v25.16b
.endm

#else

.macro LOAD_8X8_2 arg0
    ld1     {v16.8b}, [\arg0], x3
    ld1     {v17.8b}, [\arg0], x3
    ld1     {v18.8b}, [\arg0], x3
    ld1     {v19.8b}, [\arg0], x3
    ld1     {v20.8b}, [\arg0], x3
    ld1     {v21.8b}, [\arg0], x3
    ld1     {v22.8b}, [\arg0], x3
    ld1     {v23.8b}, [\arg0], x3
.endm

.macro CALC_ABS_8X8_1 arg0, arg1
    uab\arg1\()l    \arg0, v0.8b, v16.8b
    uabal   \arg0, v1.8b, v17.8b
    uabal   \arg0, v2.8b, v18.8b
    uabal   \arg0, v3.8b, v19.8b
    uabal   \arg0, v4.8b, v20.8b
    uabal   \arg0, v5.8b, v21.8b
    uabal   \arg0, v6.8b, v22.8b
    uabal   \arg0, v7.8b, v23.8b
.endm

.macro CALC_ABS_8X8_2 arg0
    uab\arg0\()l    v29.8h, v0.8b, v18.8b
    uabal   v29.8h, v1.8b, v19.8b
    uabal   v29.8h, v2.8b, v20.8b
    uabal   v29.8h, v3.8b, v21.8b
    uabal   v29.8h, v4.8b, v22.8b
    uabal   v29.8h, v5.8b, v23.8b
    uabal   v29.8h, v6.8b, v24.8b
    uabal   v29.8h, v7.8b, v25.8b
.endm

.macro LOAD_16X8_2 arg0
    ld1     {v16.16b}, [\arg0], x3
    ld1     {v17.16b}, [\arg0], x3
    ld1     {v18.16b}, [\arg0], x3
    ld1     {v19.16b}, [\arg0], x3
    ld1     {v20.16b}, [\arg0], x3
    ld1     {v21.16b}, [\arg0], x3
    ld1     {v22.16b}, [\arg0], x3
    ld1     {v23.16b}, [\arg0], x3
.endm

.macro CALC_ABS_16X8_1 arg0, arg1
    uab\arg1\()l    \arg0, v0.8b, v16.8b
    uabal2  \arg0, v0.16b,v16.16b
    uabal   \arg0, v1.8b, v17.8b
    uabal2  \arg0, v1.16b,v17.16b
    uabal   \arg0, v2.8b, v18.8b
    uabal2  \arg0, v2.16b,v18.16b
    uabal   \arg0, v3.8b, v19.8b
    uabal2  \arg0, v3.16b,v19.16b
    uabal   \arg0, v4.8b, v20.8b
    uabal2  \arg0, v4.16b,v20.16b
    uabal   \arg0, v5.8b, v21.8b
    uabal2  \arg0, v5.16b,v21.16b
    uabal   \arg0, v6.8b, v22.8b
    uabal2  \arg0, v6.16b,v22.16b
    uabal   \arg0, v7.8b, v23.8b
    uabal2  \arg0, v7.16b,v23.16b
.endm

.macro CALC_ABS_16X8_2 arg0
    uab\arg0\()l    v29.8h, v0.8b, v18.8b
    uabal2  v29.8h, v0.16b,v18.16b
    uabal   v29.8h, v1.8b, v19.8b
    uabal2  v29.8h, v1.16b,v19.16b
    uabal   v29.8h, v2.8b, v20.8b
    uabal2  v29.8h, v2.16b,v20.16b
    uabal   v29.8h, v3.8b, v21.8b
    uabal2  v29.8h, v3.16b,v21.16b
    uabal   v29.8h, v4.8b, v22.8b
    uabal2  v29.8h, v4.16b,v22.16b
    uabal   v29.8h, v5.8b, v23.8b
    uabal2  v29.8h, v5.16b,v23.16b
    uabal   v29.8h, v6.8b, v24.8b
    uabal2  v29.8h, v6.16b,v24.16b
    uabal   v29.8h, v7.8b, v25.8b
    uabal2  v29.8h, v7.16b,v25.16b
.endm

#endif

// ---------------------------------------------------------------------------
// Single-SAD functions
//   In:   x0 / w1 = first block and stride, x2 / w3 = second block and stride
//   Out:  w0 = sum of |a - b| over the block
//   Uses: v0, v1 (current rows), v2 (eight 16-bit partial sums)
// The first row initialises v2 with uabdl; the remaining rows accumulate.
// ---------------------------------------------------------------------------

// 4 rows of 4 bytes. Each ld1 writes only 32-bit lane 0, so bytes 4..7 of v0
// and v1 are not initialised here; that is why the final reduction covers only
// the low four 16-bit lanes (v2.4h) instead of using CALC_AND_STORE_SAD.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
    SAD_WIDEN_STRIDES
    ld1     {v0.s}[0], [x0], x1
    ld1     {v1.s}[0], [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
.rept 3
    ld1     {v0.s}[0], [x0], x1
    ld1     {v1.s}[0], [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
.endr
    saddlv  s2, v2.4h
    fmov    w0, s2
WELS_ASM_ARCH64_FUNC_END

// 8 rows of 8 bytes.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon
    SAD_WIDEN_STRIDES
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
.rept 7
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_ARCH64_FUNC_END

// 16 rows of 8 bytes.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon
    SAD_WIDEN_STRIDES
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
.rept 15
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_ARCH64_FUNC_END

// 8 rows of 16 bytes; low and high 8 bytes of each row share accumulator v2.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon
    SAD_WIDEN_STRIDES
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.rept 7
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_ARCH64_FUNC_END

// 16 rows of 16 bytes.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon
    SAD_WIDEN_STRIDES
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.rept 15
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_ARCH64_FUNC_END

// ---------------------------------------------------------------------------
// Four-way SAD functions
//   In:   x0 / w1 = first block and stride, x2 / w3 = second block and stride,
//         x4 = destination for four 32-bit sums
//   Out:  [x4 + 0]  SAD against the block starting at x2 - x3  (one row up)
//         [x4 + 4]  SAD against the block starting at x2 + x3  (one row down)
//         [x4 + 8]  SAD against the block starting at x2 - 1   (one byte left)
//         [x4 + 12] SAD against the block starting at x2 + 1   (one byte right)
//         w0 is not set by the code below.
// The up and down candidates share one run of loads: rows -1 .. N are read
// once, the up SAD uses rows -1 .. N-2 and the down SAD uses rows 1 .. N.
// Accumulators: v28 = up, v29 = down, v30 = left, v31 = right.
// ---------------------------------------------------------------------------

// 4x4: two 4-byte rows are packed per 64-bit register (lanes s[0] and s[1]).
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon
    SAD_WIDEN_STRIDES

    // First block: v0 = rows 0,1   v1 = rows 2,3
    ld1     {v0.s}[0], [x0], x1
    ld1     {v0.s}[1], [x0], x1
    ld1     {v1.s}[0], [x0], x1
    ld1     {v1.s}[1], [x0]

    // Second block, rows -1 .. 4: v2 = rows -1,0   v3 = rows 1,2   v4 = rows 3,4
    sub     x0, x2, x3
    ld1     {v2.s}[0], [x0], x3
    ld1     {v2.s}[1], [x0], x3
    ld1     {v3.s}[0], [x0], x3
    ld1     {v3.s}[1], [x0], x3
    ld1     {v4.s}[0], [x0], x3
    ld1     {v4.s}[1], [x0], x3

    // Up candidate (rows -1 .. 2)
    uabdl   v28.8h, v0.8b, v2.8b
    uabal   v28.8h, v1.8b, v3.8b

    // Down candidate (rows 1 .. 4)
    uabdl   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v1.8b, v4.8b

    // Left candidate
    sub     x0, x2, #1
    ld1     {v2.s}[0], [x0], x3
    ld1     {v2.s}[1], [x0], x3
    ld1     {v3.s}[0], [x0], x3
    ld1     {v3.s}[1], [x0]
    uabdl   v30.8h, v0.8b, v2.8b
    uabal   v30.8h, v1.8b, v3.8b

    // Right candidate
    add     x0, x2, #1
    ld1     {v2.s}[0], [x0], x3
    ld1     {v2.s}[1], [x0], x3
    ld1     {v3.s}[0], [x0], x3
    ld1     {v3.s}[1], [x0]
    uabdl   v31.8h, v0.8b, v2.8b
    uabal   v31.8h, v1.8b, v3.8b

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_ARCH64_FUNC_END

// 8x8: x0 is reused as the scratch pointer once the first block is loaded.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon
    SAD_WIDEN_STRIDES
    LOAD_8X8_1

    // Rows -1 .. 8 of the second block in v16 .. v25
    sub     x0, x2, x3
    LOAD_8X8_2 x0
    ld1     {v24.8b}, [x0], x3
    ld1     {v25.8b}, [x0]

    CALC_ABS_8X8_1 v28.8h, d
    CALC_ABS_8X8_2 d

    // Left candidate
    sub     x0, x2, #1
    LOAD_8X8_2 x0
    CALC_ABS_8X8_1 v30.8h, d

    // Right candidate
    add     x0, x2, #1
    LOAD_8X8_2 x0
    CALC_ABS_8X8_1 v31.8h, d

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_ARCH64_FUNC_END

// 8x16: processed as two 8-row halves. x5 / x6 / x7 track the vertical, left
// and right candidates so that the second half can continue where the first
// half stopped.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon
    SAD_WIDEN_STRIDES

    // ---- upper half: first-block rows 0 .. 7 ----
    LOAD_8X8_1

    sub     x5, x2, x3
    LOAD_8X8_2 x5
    ld1     {v24.8b}, [x5], x3
    ld1     {v25.8b}, [x5], x3          // x5 now points at row 9

    CALC_ABS_8X8_1 v28.8h, d
    CALC_ABS_8X8_2 d

    sub     x6, x2, #1
    LOAD_8X8_2 x6                       // leaves x6 at row 8 of the left candidate
    CALC_ABS_8X8_1 v30.8h, d

    add     x7, x2, #1
    LOAD_8X8_2 x7                       // leaves x7 at row 8 of the right candidate
    CALC_ABS_8X8_1 v31.8h, d

    // ---- lower half: first-block rows 8 .. 15 ----
    LOAD_8X8_1

    sub     x5, x5, x3, lsl #1          // back two rows: row 9 -> row 7
    LOAD_8X8_2 x5
    ld1     {v24.8b}, [x5], x3
    ld1     {v25.8b}, [x5]

    CALC_ABS_8X8_1 v28.8h, a
    CALC_ABS_8X8_2 a

    LOAD_8X8_2 x6
    CALC_ABS_8X8_1 v30.8h, a

    LOAD_8X8_2 x7
    CALC_ABS_8X8_1 v31.8h, a

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_ARCH64_FUNC_END

// 16x8: same flow as the 8x8 variant with 16-byte rows.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon
    SAD_WIDEN_STRIDES
    LOAD_16X8_1

    // Rows -1 .. 8 of the second block in v16 .. v25
    sub     x0, x2, x3
    LOAD_16X8_2 x0
    ld1     {v24.16b}, [x0], x3
    ld1     {v25.16b}, [x0]

    CALC_ABS_16X8_1 v28.8h, d
    CALC_ABS_16X8_2 d

    // Left candidate
    sub     x0, x2, #1
    LOAD_16X8_2 x0
    CALC_ABS_16X8_1 v30.8h, d

    // Right candidate
    add     x0, x2, #1
    LOAD_16X8_2 x0
    CALC_ABS_16X8_1 v31.8h, d

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_ARCH64_FUNC_END

// 16x16: same flow as the 8x16 variant with 16-byte rows.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon
    SAD_WIDEN_STRIDES

    // ---- upper half: first-block rows 0 .. 7 ----
    LOAD_16X8_1

    sub     x5, x2, x3
    LOAD_16X8_2 x5
    ld1     {v24.16b}, [x5], x3
    ld1     {v25.16b}, [x5], x3         // x5 now points at row 9

    CALC_ABS_16X8_1 v28.8h, d
    CALC_ABS_16X8_2 d

    sub     x6, x2, #1
    LOAD_16X8_2 x6                      // leaves x6 at row 8 of the left candidate
    CALC_ABS_16X8_1 v30.8h, d

    add     x7, x2, #1
    LOAD_16X8_2 x7                      // leaves x7 at row 8 of the right candidate
    CALC_ABS_16X8_1 v31.8h, d

    // ---- lower half: first-block rows 8 .. 15 ----
    LOAD_16X8_1

    sub     x5, x5, x3, lsl #1          // back two rows: row 9 -> row 7
    LOAD_16X8_2 x5
    ld1     {v24.16b}, [x5], x3
    ld1     {v25.16b}, [x5]

    CALC_ABS_16X8_1 v28.8h, a
    CALC_ABS_16X8_2 a

    LOAD_16X8_2 x6
    CALC_ABS_16X8_1 v30.8h, a

    LOAD_16X8_2 x7
    CALC_ABS_16X8_1 v31.8h, a

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_ARCH64_FUNC_END

#endif