shithub: openh264

Download patch

ref: aaa5bcc1579d6c7177afb6cf09e178df2cb6f9b8
parent: 127749c454c1fb4b23a9e391000e255bd0521dd8
parent: 720f8dcc525c2fef52518080e6e26b353a535abf
author: Ethan Hugg <[email protected]>
date: Tue Jun 17 04:33:46 EDT 2014

Merge pull request #975 from mstorsjo/cleanup-asm

Clean up the aarch64 deblocking assembly

--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -1,35 +1,35 @@
 /*!
-* \copy
-*     Copyright (c)  2013, Cisco Systems
-*     All rights reserved.
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
 
-*     Redistribution and use in source and binary forms, with or without
-*     modification, are permitted provided that the following conditions
-*     are met:
-
-*        * Redistributions of source code must retain the above copyright
-*          notice, this list of conditions and the following disclaimer.
-
-*        * Redistributions in binary form must reproduce the above copyright
-*          notice, this list of conditions and the following disclaimer in
-*          the documentation and/or other materials provided with the
-*          distribution.
-
-*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-*     POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
 #ifdef HAVE_NEON_AARCH64
 .text
 
@@ -36,1080 +36,1080 @@
 #include "arm_arch64_common_macro.S"
 #ifdef __APPLE__
 
-.macro	MASK_MATRIX
-  uabd    $6.16b, $1.16b, $2.16b
-  cmhi    $6.16b, $4.16b, $6.16b
+.macro MASK_MATRIX
+    uabd    $6.16b, $1.16b, $2.16b
+    cmhi    $6.16b, $4.16b, $6.16b
 
-  uabd    $4.16b, $0.16b, $1.16b
-  cmhi    $4.16b, $5.16b, $4.16b
-  and     $6.16b, $6.16b, $4.16b
+    uabd    $4.16b, $0.16b, $1.16b
+    cmhi    $4.16b, $5.16b, $4.16b
+    and     $6.16b, $6.16b, $4.16b
 
-  uabd    $4.16b, $3.16b, $2.16b
-  cmhi    $4.16b, $5.16b, $4.16b
-  and     $6.16b, $6.16b, $4.16b
+    uabd    $4.16b, $3.16b, $2.16b
+    cmhi    $4.16b, $5.16b, $4.16b
+    and     $6.16b, $6.16b, $4.16b
 .endm
 
-.macro	DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
-  //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
-  urhadd	$8.16b, $2.16b, $3.16b
-  uhadd   $8.16b, $0.16b, $8.16b
-  usubl   $9.8h, $8.8b, $1.8b
-  sqxtn   $9.8b, $9.8h
-  usubl2  $8.8h, $8.16b, $1.16b
-  sqxtn2  $9.16b, $8.8h
-  smax    $8.16b, $9.16b, $5.16b
+.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
+    //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
+    urhadd    $8.16b, $2.16b, $3.16b
+    uhadd   $8.16b, $0.16b, $8.16b
+    usubl   $9.8h, $8.8b, $1.8b
+    sqxtn   $9.8b, $9.8h
+    usubl2  $8.8h, $8.16b, $1.16b
+    sqxtn2  $9.16b, $8.8h
+    smax    $8.16b, $9.16b, $5.16b
 //
-  smin	$8.16b, $8.16b, $6.16b
-  uabd	$9.16b, $0.16b, $2.16b
-  cmhi	$9.16b, $4.16b, $9.16b
-  and     $8.16b, $8.16b, $9.16b
-  and     $8.16b, $8.16b, $7.16b
-  add     $8.16b, $1.16b, $8.16b
-  abs     $9.16b, $9.16b
+    smin  $8.16b, $8.16b, $6.16b
+    uabd  $9.16b, $0.16b, $2.16b
+    cmhi  $9.16b, $4.16b, $9.16b
+    and     $8.16b, $8.16b, $9.16b
+    and     $8.16b, $8.16b, $7.16b
+    add     $8.16b, $1.16b, $8.16b
+    abs     $9.16b, $9.16b
 .endm
 
-.macro	DIFF_LUMA_LT4_P0_Q0_1
-  usubl	$5.8h, $0.8b, $3.8b
-  usubl	$6.8h, $2.8b, $1.8b
-  shl     $6.8h, $6.8h, #2
-  add     $5.8h, $5.8h, $6.8h
-  sqrshrn  $4.8b, $5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_1
+    usubl $5.8h, $0.8b, $3.8b
+    usubl $6.8h, $2.8b, $1.8b
+    shl     $6.8h, $6.8h, #2
+    add     $5.8h, $5.8h, $6.8h
+    sqrshrn  $4.8b, $5.8h, #3
 .endm
 
-.macro	DIFF_LUMA_LT4_P0_Q0_2
-  usubl2	$5.8h, $0.16b, $3.16b
-  usubl2	$6.8h, $2.16b, $1.16b
-  shl     $6.8h, $6.8h, #2
-  add     $5.8h, $5.8h, $6.8h
-  sqrshrn2  $4.16b, $5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_2
+    usubl2    $5.8h, $0.16b, $3.16b
+    usubl2    $6.8h, $2.16b, $1.16b
+    shl     $6.8h, $6.8h, #2
+    add     $5.8h, $5.8h, $6.8h
+    sqrshrn2  $4.16b, $5.8h, #3
 .endm
 
-.macro	EXTRACT_DELTA_INTO_TWO_PART
-  cmge	$1.16b, $0.16b, #0
-  and     $1.16b, $0.16b, $1.16b
-  sub     $0.16b, $1.16b, $0.16b
+.macro EXTRACT_DELTA_INTO_TWO_PART
+    cmge  $1.16b, $0.16b, #0
+    and     $1.16b, $0.16b, $1.16b
+    sub     $0.16b, $1.16b, $0.16b
 .endm
 
-.macro	DIFF_LUMA_EQ4_P2P1P0_1
-  uaddl $8.8h, $1.8b, $2.8b
-  uaddl $9.8h, $3.8b, $4.8b
-  add   $9.8h, $9.8h, $8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_1
+    uaddl $8.8h, $1.8b, $2.8b
+    uaddl $9.8h, $3.8b, $4.8b
+    add   $9.8h, $9.8h, $8.8h
 
-  uaddl $8.8h, $0.8b, $1.8b
-  shl   $8.8h, $8.8h, #1
-  add   $8.8h, $9.8h, $8.8h
+    uaddl $8.8h, $0.8b, $1.8b
+    shl   $8.8h, $8.8h, #1
+    add   $8.8h, $9.8h, $8.8h
 
-  rshrn	$0.8b, $9.8h, #2
-  rshrn	$7.8b, $8.8h, #3
-  shl     $9.8h, $9.8h, #1
-  usubl   $8.8h, $5.8b, $1.8b
-  add     $9.8h, $8.8h, $9.8h
+    rshrn $0.8b, $9.8h, #2
+    rshrn $7.8b, $8.8h, #3
+    shl     $9.8h, $9.8h, #1
+    usubl   $8.8h, $5.8b, $1.8b
+    add     $9.8h, $8.8h, $9.8h
 
-  uaddl	$8.8h, $2.8b, $5.8b
-  uaddw	$8.8h, $8.8h, $2.8b
-  uaddw	$8.8h, $8.8h, $3.8b
+    uaddl $8.8h, $2.8b, $5.8b
+    uaddw $8.8h, $8.8h, $2.8b
+    uaddw $8.8h, $8.8h, $3.8b
 
-  rshrn	$9.8b, $9.8h, #3
-  rshrn	$8.8b, $8.8h, #2
-  bsl		$6.8b, $9.8b, $8.8b
+    rshrn $9.8b, $9.8h, #3
+    rshrn $8.8b, $8.8h, #2
+    bsl       $6.8b, $9.8b, $8.8b
 .endm
 
-.macro	DIFF_LUMA_EQ4_P2P1P0_2
-  uaddl2 $8.8h, $1.16b, $2.16b
-  uaddl2 $9.8h, $3.16b, $4.16b
-  add   $9.8h, $9.8h, $8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_2
+    uaddl2 $8.8h, $1.16b, $2.16b
+    uaddl2 $9.8h, $3.16b, $4.16b
+    add   $9.8h, $9.8h, $8.8h
 
-  uaddl2 $8.8h, $0.16b, $1.16b
-  shl   $8.8h, $8.8h, #1
-  add   $8.8h, $9.8h, $8.8h
+    uaddl2 $8.8h, $0.16b, $1.16b
+    shl   $8.8h, $8.8h, #1
+    add   $8.8h, $9.8h, $8.8h
 
-  rshrn2	$0.16b, $9.8h, #2
-  rshrn2	$7.16b, $8.8h, #3
-  shl     $9.8h, $9.8h, #1
-  usubl2   $8.8h, $5.16b, $1.16b
-  add     $9.8h, $8.8h, $9.8h
+    rshrn2    $0.16b, $9.8h, #2
+    rshrn2    $7.16b, $8.8h, #3
+    shl     $9.8h, $9.8h, #1
+    usubl2   $8.8h, $5.16b, $1.16b
+    add     $9.8h, $8.8h, $9.8h
 
-  uaddl2	$8.8h, $2.16b, $5.16b
-  uaddw2	$8.8h, $8.8h, $2.16b
-  uaddw2	$8.8h, $8.8h, $3.16b
+    uaddl2    $8.8h, $2.16b, $5.16b
+    uaddw2    $8.8h, $8.8h, $2.16b
+    uaddw2    $8.8h, $8.8h, $3.16b
 
-  rshrn2	$9.16b, $9.8h, #3
-  rshrn2	$8.16b, $8.8h, #2
-  bsl		$6.16b, $9.16b, $8.16b
+    rshrn2    $9.16b, $9.8h, #3
+    rshrn2    $8.16b, $8.8h, #2
+    bsl       $6.16b, $9.16b, $8.16b
 .endm
 
 
-.macro	DIFF_CHROMA_EQ4_P0Q0_1
-  uaddl $4.8h, $0.8b, $3.8b
-  shl   $4.8h, $4.8h, #1
-  usubl $5.8h, $1.8b, $3.8b
-  add   $5.8h, $5.8h, $4.8h
-  rshrn $6.8b, $5.8h, #2
-  usubl $5.8h, $2.8b, $0.8b
-  add   $5.8h, $5.8h, $4.8h
-  rshrn $7.8b, $5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_1
+    uaddl $4.8h, $0.8b, $3.8b
+    shl   $4.8h, $4.8h, #1
+    usubl $5.8h, $1.8b, $3.8b
+    add   $5.8h, $5.8h, $4.8h
+    rshrn $6.8b, $5.8h, #2
+    usubl $5.8h, $2.8b, $0.8b
+    add   $5.8h, $5.8h, $4.8h
+    rshrn $7.8b, $5.8h, #2
 .endm
 
-.macro	DIFF_CHROMA_EQ4_P0Q0_2
-  uaddl2 $4.8h, $0.16b, $3.16b
-  shl   $4.8h, $4.8h, #1
-  usubl2 $5.8h, $1.16b, $3.16b
-  add   $5.8h, $5.8h, $4.8h
-  rshrn2 $6.16b, $5.8h, #2
-  usubl2 $5.8h, $2.16b, $0.16b
-  add   $5.8h, $5.8h, $4.8h
-  rshrn2 $7.16b, $5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_2
+    uaddl2 $4.8h, $0.16b, $3.16b
+    shl   $4.8h, $4.8h, #1
+    usubl2 $5.8h, $1.16b, $3.16b
+    add   $5.8h, $5.8h, $4.8h
+    rshrn2 $6.16b, $5.8h, #2
+    usubl2 $5.8h, $2.16b, $0.16b
+    add   $5.8h, $5.8h, $4.8h
+    rshrn2 $7.16b, $5.8h, #2
 .endm
 
-.macro	DIFF_LUMA_EQ4_MASK
-  mov.16b	$3, $2
-  bsl	$3.16b, $0.16b, $1.16b
+.macro DIFF_LUMA_EQ4_MASK
+    mov.16b   $3, $2
+    bsl   $3.16b, $0.16b, $1.16b
 .endm
 
-.macro	LOAD_LUMA_DATA_3
-  ld3	{$0.b, $1.b, $2.b} [$6], [x2], x1
-  ld3	{$3.b, $4.b, $5.b} [$6], [x0], x1
+.macro LOAD_LUMA_DATA_3
+    ld3   {$0.b, $1.b, $2.b} [$6], [x2], x1
+    ld3   {$3.b, $4.b, $5.b} [$6], [x0], x1
 .endm
 
-.macro	LOAD_LUMA_DATA_4
-  ld4	{$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
-  ld4	{$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
+.macro LOAD_LUMA_DATA_4
+    ld4   {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
+    ld4   {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
 .endm
 
-.macro	STORE_LUMA_DATA_4
-  st4	{$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
-  st4	{$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
+.macro STORE_LUMA_DATA_4
+    st4   {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
+    st4   {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
 .endm
 
-.macro	STORE_LUMA_DATA_3
-  st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
-  st3	{$3.b, $4.b, $5.b} [$6], [x0], x1
+.macro STORE_LUMA_DATA_3
+    st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
+    st3   {$3.b, $4.b, $5.b} [$6], [x0], x1
 .endm
 
-.macro	LOAD_CHROMA_DATA_4
-  ld4	{$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
+.macro LOAD_CHROMA_DATA_4
+    ld4   {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
 .endm
 
-.macro	STORE_CHROMA_DATA_2
-  st2	{$0.b, $1.b} [$3], [$2], x2
+.macro STORE_CHROMA_DATA_2
+    st2   {$0.b, $1.b} [$3], [$2], x2
 .endm
 
-.macro	ZERO_JUMP_END
-  mov $1, $0.d[0]
-  mov $2, $0.d[1]
-  orr $1, $1, $2
-  cbz $1, $3
+.macro ZERO_JUMP_END
+    mov $1, $0.d[0]
+    mov $2, $0.d[1]
+    orr $1, $1, $2
+    cbz $1, $3
 .endm
 
 .macro BS_NZC_CHECK
-  ld1 {v0.16b}, [$0]
-  //Arrange the input data --- TOP
-  ands     x6, $1, #2
-  cbz      x6, bs_nzc_check_jump0
-  sub      x6, $0, $2, lsl #4
-  sub      x6, x6, $2, lsl #3
-  add      x6, x6, #12
-  ld1      {v1.s} [3], [x6]
+    ld1 {v0.16b}, [$0]
+    //Arrange the input data --- TOP
+    ands     x6, $1, #2
+    cbz      x6, bs_nzc_check_jump0
+    sub      x6, $0, $2, lsl #4
+    sub      x6, x6, $2, lsl #3
+    add      x6, x6, #12
+    ld1      {v1.s} [3], [x6]
 
-  bs_nzc_check_jump0:
-  ext.16b  v1, v1, v0, #12
-  add      $3.16b, v0.16b, v1.16b
+    bs_nzc_check_jump0:
+    ext.16b  v1, v1, v0, #12
+    add      $3.16b, v0.16b, v1.16b
 
-  // Arrange the input data --- LEFT
-  ands     x6, $1, #1
-  cbz      x6, bs_nzc_check_jump1
+    // Arrange the input data --- LEFT
+    ands     x6, $1, #1
+    cbz      x6, bs_nzc_check_jump1
 
-  sub      x6, $0, #21
-  add      x7, x6, #4
-  ld1      {v1.b} [12], [x6]
-  add      x6, x7, #4
-  ld1      {v1.b} [13], [x7]
-  add      x7, x6, #4
-  ld1      {v1.b} [14], [x6]
-  ld1      {v1.b} [15], [x7]
+    sub      x6, $0, #21
+    add      x7, x6, #4
+    ld1      {v1.b} [12], [x6]
+    add      x6, x7, #4
+    ld1      {v1.b} [13], [x7]
+    add      x7, x6, #4
+    ld1      {v1.b} [14], [x6]
+    ld1      {v1.b} [15], [x7]
 
 bs_nzc_check_jump1:
-  ins      v2.d[0], v0.d[1]
-  zip1     v0.16b, v0.16b, v2.16b
-  ins      v2.d[0], v0.d[1]
-  zip1     v0.16b, v0.16b, v2.16b
-  ext.16b  v1, v1, v0, #12
-  add      $4.16b, v0.16b, v1.16b
+    ins      v2.d[0], v0.d[1]
+    zip1     v0.16b, v0.16b, v2.16b
+    ins      v2.d[0], v0.d[1]
+    zip1     v0.16b, v0.16b, v2.16b
+    ext.16b  v1, v1, v0, #12
+    add      $4.16b, v0.16b, v1.16b
 .endm
 
 .macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
-  mov   w6, #4
-  sabd  v20.8h, $0.8h, $1.8h
-  sabd  v21.8h, $1.8h, $2.8h
-  dup   $0.8h, w6
-  sabd  v22.8h, $2.8h, $3.8h
-  sabd  v23.8h, $3.8h, $4.8h
+    mov   w6, #4
+    sabd  v20.8h, $0.8h, $1.8h
+    sabd  v21.8h, $1.8h, $2.8h
+    dup   $0.8h, w6
+    sabd  v22.8h, $2.8h, $3.8h
+    sabd  v23.8h, $3.8h, $4.8h
 
-  cmge  v20.8h, v20.8h, $0.8h
-  cmge  v21.8h, v21.8h, $0.8h
-  cmge  v22.8h, v22.8h, $0.8h
-  cmge  v23.8h, v23.8h, $0.8h
+    cmge  v20.8h, v20.8h, $0.8h
+    cmge  v21.8h, v21.8h, $0.8h
+    cmge  v22.8h, v22.8h, $0.8h
+    cmge  v23.8h, v23.8h, $0.8h
 
-  addp v20.8h, v20.8h, v21.8h
-  addp v21.8h, v22.8h, v23.8h
+    addp v20.8h, v20.8h, v21.8h
+    addp v21.8h, v22.8h, v23.8h
 
-  addhn  $5.8b, v20.8h, v20.8h
-  addhn2  $5.16b, v21.8h, v21.8h
+    addhn  $5.8b, v20.8h, v20.8h
+    addhn2  $5.16b, v21.8h, v21.8h
 .endm
 
 .macro BS_MV_CHECK
-  ldp q0, q1, [$0], #32
-  ldp q2, q3, [$0]
-  sub $0, $0, #32
-  // Arrenge the input data --- TOP
-  ands     x6, $1, #2
-  cbz     x6, bs_mv_check_jump0
-  sub      x6, $0, $2, lsl #6
-  add      x6, x6, #48
-  ld1      {v4.16b}, [x6]
+    ldp q0, q1, [$0], #32
+    ldp q2, q3, [$0]
+    sub $0, $0, #32
+    // Arrange the input data --- TOP
+    ands     x6, $1, #2
+    cbz     x6, bs_mv_check_jump0
+    sub      x6, $0, $2, lsl #6
+    add      x6, x6, #48
+    ld1      {v4.16b}, [x6]
 bs_mv_check_jump0:
-  BS_COMPARE_MV  v4, v0, v1, v2, v3, $3
-  // Arrange the input data --- LEFT
-  ands     x6, $1, #1
-  cbz      x6, bs_mv_check_jump1
-  sub      x6, $0, #52
-  add      x7, x6, #16
-  ld1      {v4.s} [0], [x6]
-  add      x6, x7, #16
-  ld1      {v4.s} [1], [x7]
-  add      x7, x6, #16
-  ld1      {v4.s} [2], [x6]
-  ld1      {v4.s} [3], [x7]
+    BS_COMPARE_MV  v4, v0, v1, v2, v3, $3
+    // Arrange the input data --- LEFT
+    ands     x6, $1, #1
+    cbz      x6, bs_mv_check_jump1
+    sub      x6, $0, #52
+    add      x7, x6, #16
+    ld1      {v4.s} [0], [x6]
+    add      x6, x7, #16
+    ld1      {v4.s} [1], [x7]
+    add      x7, x6, #16
+    ld1      {v4.s} [2], [x6]
+    ld1      {v4.s} [3], [x7]
 bs_mv_check_jump1:
-  zip1  $5.4s, v0.4s, v2.4s
-  zip2  $6.4s, v0.4s, v2.4s
-  zip1  v0.4s, v1.4s, v3.4s
-  zip2  v2.4s, v1.4s, v3.4s
-  zip2  v1.4s, $5.4s, v0.4s
-  zip1  v0.4s, $5.4s, v0.4s
-  zip2  v3.4s, $6.4s, v2.4s
-  zip1  v2.4s, $6.4s, v2.4s
-  BS_COMPARE_MV  v4, v0, v1, v2, v3, $4
+    zip1  $5.4s, v0.4s, v2.4s
+    zip2  $6.4s, v0.4s, v2.4s
+    zip1  v0.4s, v1.4s, v3.4s
+    zip2  v2.4s, v1.4s, v3.4s
+    zip2  v1.4s, $5.4s, v0.4s
+    zip1  v0.4s, $5.4s, v0.4s
+    zip2  v3.4s, $6.4s, v2.4s
+    zip1  v2.4s, $6.4s, v2.4s
+    BS_COMPARE_MV  v4, v0, v1, v2, v3, $4
 .endm
 
 #else
 
-.macro	MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
-  uabd    \arg6.16b, \arg1.16b, \arg2.16b
-  cmhi    \arg6.16b, \arg4.16b, \arg6.16b
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    uabd    \arg6\().16b, \arg1\().16b, \arg2\().16b
+    cmhi    \arg6\().16b, \arg4\().16b, \arg6\().16b
 
-  uabd    \arg4.16b, \arg0.16b, \arg1.16b
-  cmhi    \arg4.16b, \arg5.16b, \arg4.16b
-  and     \arg6.16b, \arg6.16b, \arg4.16b
+    uabd    \arg4\().16b, \arg0\().16b, \arg1\().16b
+    cmhi    \arg4\().16b, \arg5\().16b, \arg4\().16b
+    and     \arg6\().16b, \arg6\().16b, \arg4\().16b
 
-  uabd    \arg4.16b, \arg3.16b, \arg2.16b
-  cmhi    \arg4.16b, \arg5.16b, \arg4.16b
-  and     \arg6.16b, \arg6.16b, \arg4.16b
+    uabd    \arg4\().16b, \arg3\().16b, \arg2\().16b
+    cmhi    \arg4\().16b, \arg5\().16b, \arg4\().16b
+    and     \arg6\().16b, \arg6\().16b, \arg4\().16b
 .endm
 
-.macro	DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-  //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
-  urhadd	\arg8.16b, \arg2.16b, \arg3.16b
-  uhadd   \arg8.16b, \arg0.16b, \arg8.16b
-  usubl   \arg9.8h, \arg8.8b, \arg1.8b
-  sqxtn   \arg9.8b, \arg9.8h
-  usubl2  \arg8.8h, \arg8.16b, \arg1.16b
-  sqxtn2  \arg9.16b, \arg8.8h
-  smax    \arg8.16b, \arg9.16b, \arg5.16b
-  //
-  smin	\arg8.16b, \arg8.16b, \arg6.16b
-  uabd	\arg9.16b, \arg0.16b, \arg2.16b
-  cmhi	\arg9.16b, \arg4.16b, \arg9.16b
-  and     \arg8.16b, \arg8.16b, \arg9.16b
-  and     \arg8.16b, \arg8.16b, \arg7.16b
-  add     \arg8.16b, \arg1.16b, \arg8.16b
-  abs     \arg9.16b, \arg9.16b
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+    //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
+    urhadd    \arg8\().16b, \arg2\().16b, \arg3\().16b
+    uhadd   \arg8\().16b, \arg0\().16b, \arg8\().16b
+    usubl   \arg9\().8h, \arg8\().8b, \arg1\().8b
+    sqxtn   \arg9\().8b, \arg9\().8h
+    usubl2  \arg8\().8h, \arg8\().16b, \arg1\().16b
+    sqxtn2  \arg9\().16b, \arg8\().8h
+    smax    \arg8\().16b, \arg9\().16b, \arg5\().16b
+    //
+    smin  \arg8\().16b, \arg8\().16b, \arg6\().16b
+    uabd  \arg9\().16b, \arg0\().16b, \arg2\().16b
+    cmhi  \arg9\().16b, \arg4\().16b, \arg9\().16b
+    and     \arg8\().16b, \arg8\().16b, \arg9\().16b
+    and     \arg8\().16b, \arg8\().16b, \arg7\().16b
+    add     \arg8\().16b, \arg1\().16b, \arg8\().16b
+    abs     \arg9\().16b, \arg9\().16b
 .endm
 
-.macro	DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-  usubl	\arg5.8h, \arg0.8b, \arg3.8b
-  usubl	\arg6.8h, \arg2.8b, \arg1.8b
-  shl     \arg6.8h, \arg6.8h, #2
-  add     \arg5.8h, \arg5.8h, \arg6.8h
-  sqrshrn  \arg4.8b, \arg5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    usubl \arg5\().8h, \arg0\().8b, \arg3\().8b
+    usubl \arg6\().8h, \arg2\().8b, \arg1\().8b
+    shl     \arg6\().8h, \arg6\().8h, #2
+    add     \arg5\().8h, \arg5\().8h, \arg6\().8h
+    sqrshrn  \arg4\().8b, \arg5\().8h, #3
 .endm
 
-.macro	DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-  usubl2	\arg5.8h, \arg0.16b, \arg3.16b
-  usubl2	\arg6.8h, \arg2.16b, \arg1.16b
-  shl     \arg6.8h, \arg6.8h, #2
-  add     \arg5.8h, \arg5.8h, \arg6.8h
-  sqrshrn2  \arg4.16b, \arg5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    usubl2    \arg5\().8h, \arg0\().16b, \arg3\().16b
+    usubl2    \arg6\().8h, \arg2\().16b, \arg1\().16b
+    shl     \arg6\().8h, \arg6\().8h, #2
+    add     \arg5\().8h, \arg5\().8h, \arg6\().8h
+    sqrshrn2  \arg4\().16b, \arg5\().8h, #3
 .endm
 
-.macro	EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
-  cmge	\arg1.16b, \arg0.16b, #0
-  and     \arg1.16b, \arg0.16b, \arg1.16b
-  sub     \arg0.16b, \arg1.16b, \arg0.16b
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+    cmge  \arg1\().16b, \arg0\().16b, #0
+    and     \arg1\().16b, \arg0\().16b, \arg1\().16b
+    sub     \arg0\().16b, \arg1\().16b, \arg0\().16b
 .endm
 
-.macro	DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-  uaddl \arg8.8h, \arg1.8b, \arg2.8b
-  uaddl \arg9.8h, \arg3.8b, \arg4.8b
-  add   \arg9.8h, \arg9.8h, \arg8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+    uaddl \arg8\().8h, \arg1\().8b, \arg2\().8b
+    uaddl \arg9\().8h, \arg3\().8b, \arg4\().8b
+    add   \arg9\().8h, \arg9\().8h, \arg8\().8h
 
-  uaddl \arg8.8h, \arg0.8b, \arg1.8b
-  shl   \arg8.8h, \arg8.8h, #1
-  add   \arg8.8h, \arg9.8h, \arg8.8h
+    uaddl \arg8\().8h, \arg0\().8b, \arg1\().8b
+    shl   \arg8\().8h, \arg8\().8h, #1
+    add   \arg8\().8h, \arg9\().8h, \arg8\().8h
 
-  rshrn	\arg0.8b, \arg9.8h, #2
-  rshrn	\arg7.8b, \arg8.8h, #3
-  shl     \arg9.8h, \arg9.8h, #1
-  usubl   \arg8.8h, \arg5.8b, \arg1.8b
-  add     \arg9.8h, \arg8.8h, \arg9.8h
+    rshrn \arg0\().8b, \arg9\().8h, #2
+    rshrn \arg7\().8b, \arg8\().8h, #3
+    shl     \arg9\().8h, \arg9\().8h, #1
+    usubl   \arg8\().8h, \arg5\().8b, \arg1\().8b
+    add     \arg9\().8h, \arg8\().8h, \arg9\().8h
 
-  uaddl	\arg8.8h, \arg2.8b, \arg5.8b
-  uaddw	\arg8.8h, \arg8.8h, \arg2.8b
-  uaddw	\arg8.8h, \arg8.8h, \arg3.8b
+    uaddl \arg8\().8h, \arg2\().8b, \arg5\().8b
+    uaddw \arg8\().8h, \arg8\().8h, \arg2\().8b
+    uaddw \arg8\().8h, \arg8\().8h, \arg3\().8b
 
-  rshrn	\arg9.8b, \arg9.8h, #3
-  rshrn	\arg8.8b, \arg8.8h, #2
-  bsl		\arg6.8b, \arg9.8b, \arg8.8b
+    rshrn \arg9\().8b, \arg9\().8h, #3
+    rshrn \arg8\().8b, \arg8\().8h, #2
+    bsl       \arg6\().8b, \arg9\().8b, \arg8\().8b
 .endm
 
-.macro	DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-  uaddl2 \arg8.8h, \arg1.16b, \arg2.16b
-  uaddl2 \arg9.8h, \arg3.16b, \arg4.16b
-  add   \arg9.8h, \arg9.8h, \arg8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+    uaddl2 \arg8\().8h, \arg1\().16b, \arg2\().16b
+    uaddl2 \arg9\().8h, \arg3\().16b, \arg4\().16b
+    add   \arg9\().8h, \arg9\().8h, \arg8\().8h
 
-  uaddl2 \arg8.8h, \arg0.16b, \arg1.16b
-  shl   \arg8.8h, \arg8.8h, #1
-  add   \arg8.8h, \arg9.8h, \arg8.8h
+    uaddl2 \arg8\().8h, \arg0\().16b, \arg1\().16b
+    shl   \arg8\().8h, \arg8\().8h, #1
+    add   \arg8\().8h, \arg9\().8h, \arg8\().8h
 
-  rshrn2	\arg0.16b, \arg9.8h, #2
-  rshrn2	\arg7.16b, \arg8.8h, #3
-  shl     \arg9.8h, \arg9.8h, #1
-  usubl2   \arg8.8h, \arg5.16b, \arg1.16b
-  add     \arg9.8h, \arg8.8h, \arg9.8h
+    rshrn2    \arg0\().16b, \arg9\().8h, #2
+    rshrn2    \arg7\().16b, \arg8\().8h, #3
+    shl     \arg9\().8h, \arg9\().8h, #1
+    usubl2   \arg8\().8h, \arg5\().16b, \arg1\().16b
+    add     \arg9\().8h, \arg8\().8h, \arg9\().8h
 
-  uaddl2	\arg8.8h, \arg2.16b, \arg5.16b
-  uaddw2	\arg8.8h, \arg8.8h, \arg2.16b
-  uaddw2	\arg8.8h, \arg8.8h, \arg3.16b
+    uaddl2    \arg8\().8h, \arg2\().16b, \arg5\().16b
+    uaddw2    \arg8\().8h, \arg8\().8h, \arg2\().16b
+    uaddw2    \arg8\().8h, \arg8\().8h, \arg3\().16b
 
-  rshrn2	\arg9.16b, \arg9.8h, #3
-  rshrn2	\arg8.16b, \arg8.8h, #2
-  bsl		\arg6.16b, \arg9.16b, \arg8.16b
+    rshrn2    \arg9\().16b, \arg9\().8h, #3
+    rshrn2    \arg8\().16b, \arg8\().8h, #2
+    bsl       \arg6\().16b, \arg9\().16b, \arg8\().16b
 .endm
 
 
-.macro	DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-  uaddl \arg4.8h, \arg0.8b, \arg3.8b
-  shl   \arg4.8h, \arg4.8h, #1
-  usubl \arg5.8h, \arg1.8b, \arg3.8b
-  add   \arg5.8h, \arg5.8h, \arg4.8h
-  rshrn \arg6.8b, \arg5.8h, #2
-  usubl \arg5.8h, \arg2.8b, \arg0.8b
-  add   \arg5.8h, \arg5.8h, \arg4.8h
-  rshrn \arg7.8b, \arg5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+    uaddl \arg4\().8h, \arg0\().8b, \arg3\().8b
+    shl   \arg4\().8h, \arg4\().8h, #1
+    usubl \arg5\().8h, \arg1\().8b, \arg3\().8b
+    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
+    rshrn \arg6\().8b, \arg5\().8h, #2
+    usubl \arg5\().8h, \arg2\().8b, \arg0\().8b
+    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
+    rshrn \arg7\().8b, \arg5\().8h, #2
 .endm
 
-.macro	DIFF_CHROMA_EQ4_P0Q0_2  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-  uaddl2 \arg4.8h, \arg0.16b, \arg3.16b
-  shl   \arg4.8h, \arg4.8h, #1
-  usubl2 \arg5.8h, \arg1.16b, \arg3.16b
-  add   \arg5.8h, \arg5.8h, \arg4.8h
-  rshrn2 \arg6.16b, \arg5.8h, #2
-  usubl2 \arg5.8h, \arg2.16b, \arg0.16b
-  add   \arg5.8h, \arg5.8h, \arg4.8h
-  rshrn2 \arg7.16b, \arg5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_2  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+    uaddl2 \arg4\().8h, \arg0\().16b, \arg3\().16b
+    shl   \arg4\().8h, \arg4\().8h, #1
+    usubl2 \arg5\().8h, \arg1\().16b, \arg3\().16b
+    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
+    rshrn2 \arg6\().16b, \arg5\().8h, #2
+    usubl2 \arg5\().8h, \arg2\().16b, \arg0\().16b
+    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
+    rshrn2 \arg7\().16b, \arg5\().8h, #2
 .endm
 
-.macro	DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
-  mov.16b	\arg3, \arg2
-  bsl	\arg3.16b, \arg0.16b, \arg1.16b
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+    mov.16b   \arg3, \arg2
+    bsl   \arg3\().16b, \arg0\().16b, \arg1\().16b
 .endm
 
-.macro	LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-  ld3	{\arg0.b, \arg1.b, \arg2.b} [\arg6], [x2], x1
-  ld3	{\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    ld3   {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x2], x1
+    ld3   {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
 .endm
 
-.macro	LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-  ld4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg8], [x3], x1
-  ld4	{\arg4.b, \arg5.b, \arg6.b, \arg7.b} [\arg8], [x0], x1
+.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    ld4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg8], [x3], x1
+    ld4   {\arg4\().b, \arg5\().b, \arg6\().b, \arg7\().b} [\arg8], [x0], x1
 .endm
 
-.macro	STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
-  st4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg4], [x0], x1
-  st4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [x2], x1
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+    st4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg4], [x0], x1
+    st4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [x2], x1
 .endm
 
-.macro	STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-  st3   {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x3], x1
-  st3	{\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    st3   {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x3], x1
+    st3   {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
 .endm
 
-.macro	LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
-  ld4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [\arg4], x2
+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+    ld4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [\arg4], x2
 .endm
 
-.macro	STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
-  st2	{\arg0.b, \arg1.b} [\arg3], [\arg2], x2
+.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
+    st2   {\arg0\().b, \arg1\().b} [\arg3], [\arg2], x2
 .endm
 
-.macro	ZERO_JUMP_END arg0, arg1, arg2, arg3
-  mov \arg1, \arg0.d[0]
-  mov \arg2, \arg0.d[1]
-  orr \arg1, \arg1, \arg2
-  cbz \arg1, \arg3
+.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
+    mov \arg1, \arg0\().d[0]
+    mov \arg2, \arg0\().d[1]
+    orr \arg1, \arg1, \arg2
+    cbz \arg1, \arg3
 .endm
 
 .macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
-  ld1 {v0.16b}, [\arg0]
-  //Arrange the input data --- TOP
-  ands     x6, \arg1, #2
-  cbz      x6, bs_nzc_check_jump0
-  sub      x6, \arg0, \arg2, lsl #4
-  sub      x6, x6, \arg2, lsl #3
-  add      x6, x6, #12
-  ld1      {v1.s} [3], [x6]
+    ld1 {v0.16b}, [\arg0]
+    //Arrange the input data --- TOP
+    ands     x6, \arg1, #2
+    cbz      x6, bs_nzc_check_jump0
+    sub      x6, \arg0, \arg2, lsl #4
+    sub      x6, x6, \arg2, lsl #3
+    add      x6, x6, #12
+    ld1      {v1.s} [3], [x6]
 
 bs_nzc_check_jump0:
-  ext.16b  v1, v1, v0, #12
-  add      \arg3.16b, v0.16b, v1.16b
+    ext.16b  v1, v1, v0, #12
+    add      \arg3\().16b, v0.16b, v1.16b
 
-  // Arrange the input data --- LEFT
-  ands     x6, \arg1, #1
-  cbz      x6, bs_nzc_check_jump1
+    // Arrange the input data --- LEFT
+    ands     x6, \arg1, #1
+    cbz      x6, bs_nzc_check_jump1
 
-  sub      x6, \arg0, #21
-  add      x7, x6, #4
-  ld1      {v1.b} [12], [x6]
-  add      x6, x7, #4
-  ld1      {v1.b} [13], [x7]
-  add      x7, x6, #4
-  ld1      {v1.b} [14], [x6]
-  ld1      {v1.b} [15], [x7]
+    sub      x6, \arg0, #21
+    add      x7, x6, #4
+    ld1      {v1.b} [12], [x6]
+    add      x6, x7, #4
+    ld1      {v1.b} [13], [x7]
+    add      x7, x6, #4
+    ld1      {v1.b} [14], [x6]
+    ld1      {v1.b} [15], [x7]
 
 bs_nzc_check_jump1:
-  ins      v2.d[0], v0.d[1]
-  zip1     v0.16b, v0.16b, v2.16b
-  ins      v2.d[0], v0.d[1]
-  zip1     v0.16b, v0.16b, v2.16b
-  ext.16b  v1, v1, v0, #12
-  add      \arg4.16b, v0.16b, v1.16b
+    ins      v2.d[0], v0.d[1]
+    zip1     v0.16b, v0.16b, v2.16b
+    ins      v2.d[0], v0.d[1]
+    zip1     v0.16b, v0.16b, v2.16b
+    ext.16b  v1, v1, v0, #12
+    add      \arg4\().16b, v0.16b, v1.16b
 .endm
 
 .macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
-  //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
-  mov   w6, #4
-  sabd  v20.8h, \arg0.8h, \arg1.8h
-  sabd  v21.8h, \arg1.8h, \arg2.8h
-  dup   \arg0.8h, w6
-  sabd  v22.8h, \arg2.8h, \arg3.8h
-  sabd  v23.8h, \arg3.8h, \arg4.8h
+    //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
+    mov   w6, #4
+    sabd  v20.8h, \arg0\().8h, \arg1\().8h
+    sabd  v21.8h, \arg1\().8h, \arg2\().8h
+    dup   \arg0\().8h, w6
+    sabd  v22.8h, \arg2\().8h, \arg3\().8h
+    sabd  v23.8h, \arg3\().8h, \arg4\().8h
 
-  cmge  v20.8h, v20.8h, \arg0.8h
-  cmge  v21.8h, v21.8h, \arg0.8h
-  cmge  v22.8h, v22.8h, \arg0.8h
-  cmge  v23.8h, v23.8h, \arg0.8h
+    cmge  v20.8h, v20.8h, \arg0\().8h
+    cmge  v21.8h, v21.8h, \arg0\().8h
+    cmge  v22.8h, v22.8h, \arg0\().8h
+    cmge  v23.8h, v23.8h, \arg0\().8h
 
-  addp v20.8h, v20.8h, v21.8h
-  addp v21.8h, v22.8h, v23.8h
+    addp v20.8h, v20.8h, v21.8h
+    addp v21.8h, v22.8h, v23.8h
 
-  addhn  \arg5.8b, v20.8h, v20.8h
-  addhn2  \arg5.16b, v21.8h, v21.8h
+    addhn  \arg5\().8b, v20.8h, v20.8h
+    addhn2  \arg5\().16b, v21.8h, v21.8h
 .endm
 
 .macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
-  ldp q0, q1, [\arg0], #32
-  ldp q2, q3, [\arg0]
-  sub \arg0, \arg0, #32
-  // Arrenge the input data --- TOP
-  ands     x6, \arg1, #2
-  cbz     x6, bs_mv_check_jump0
-  sub      x6, \arg0, \arg2, lsl #6
-  add      x6, x6, #48
-  ld1      {v4.16b}, [x6]
+    ldp q0, q1, [\arg0], #32
+    ldp q2, q3, [\arg0]
+    sub \arg0, \arg0, #32
+    // Arrange the input data --- TOP
+    ands     x6, \arg1, #2
+    cbz     x6, bs_mv_check_jump0
+    sub      x6, \arg0, \arg2, lsl #6
+    add      x6, x6, #48
+    ld1      {v4.16b}, [x6]
 bs_mv_check_jump0:
-  BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg3
-  // Arrange the input data --- LEFT
-  ands     x6, \arg1, #1
-  cbz      x6, bs_mv_check_jump1
-  sub      x6, \arg0, #52
-  add      x7, x6, #16
-  ld1      {v4.s} [0], [x6]
-  add      x6, x7, #16
-  ld1      {v4.s} [1], [x7]
-  add      x7, x6, #16
-  ld1      {v4.s} [2], [x6]
-  ld1      {v4.s} [3], [x7]
+    BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg3
+    // Arrange the input data --- LEFT
+    ands     x6, \arg1, #1
+    cbz      x6, bs_mv_check_jump1
+    sub      x6, \arg0, #52
+    add      x7, x6, #16
+    ld1      {v4.s} [0], [x6]
+    add      x6, x7, #16
+    ld1      {v4.s} [1], [x7]
+    add      x7, x6, #16
+    ld1      {v4.s} [2], [x6]
+    ld1      {v4.s} [3], [x7]
 bs_mv_check_jump1:
-  zip1  \arg5.4s, v0.4s, v2.4s
-  zip2  \arg6.4s, v0.4s, v2.4s
-  zip1  v0.4s, v1.4s, v3.4s
-  zip2  v2.4s, v1.4s, v3.4s
-  zip2  v1.4s, \arg5.4s, v0.4s
-  zip1  v0.4s, \arg5.4s, v0.4s
-  zip2  v3.4s, \arg6.4s, v2.4s
-  zip1  v2.4s, \arg6.4s, v2.4s
-  BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg4
+    zip1  \arg5\().4s, v0.4s, v2.4s
+    zip2  \arg6\().4s, v0.4s, v2.4s
+    zip1  v0.4s, v1.4s, v3.4s
+    zip2  v2.4s, v1.4s, v3.4s
+    zip2  v1.4s, \arg5\().4s, v0.4s
+    zip1  v0.4s, \arg5\().4s, v0.4s
+    zip2  v3.4s, \arg6\().4s, v2.4s
+    zip1  v2.4s, \arg6\().4s, v2.4s
+    BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg4
 .endm
 #endif
 
 WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
-  ld1 {v0.8b, v1.8b, v2.8b}, [x0]
-  ins v0.d[1], v1.d[0]
-  uzp1 v0.2d, v0.2d, v1.2d
-  cmeq v0.16b, v0.16b, #0
-  cmeq v2.8b, v2.8b, #0
-  mvn v0.16b, v0.16b
-  mvn v2.8b, v2.8b
-  abs v0.16b, v0.16b
-  abs v2.8b, v2.8b
-  ins v1.d[0], v0.d[1]
-  st1 {v0.8b, v1.8b, v2.8b}, [x0]
+    ld1 {v0.8b, v1.8b, v2.8b}, [x0] // load the 24 bytes at x0, 8 per register
+    ins v0.d[1], v1.d[0] // v0 = bytes 0..15
+    uzp1 v0.2d, v0.2d, v1.2d // yields {v0.d[0], v1.d[0]} again, so v0 is unchanged by this
+    cmeq v0.16b, v0.16b, #0 // 0xff where the byte is zero
+    cmeq v2.8b, v2.8b, #0
+    mvn v0.16b, v0.16b // invert: 0xff where the byte is non-zero
+    mvn v2.8b, v2.8b
+    abs v0.16b, v0.16b // 0xff is -1 as a signed byte, so abs maps it to 1
+    abs v2.8b, v2.8b
+    ins v1.d[0], v0.d[1] // split bytes 8..15 back out into v1 for the store
+    st1 {v0.8b, v1.8b, v2.8b}, [x0] // write back in place: each byte is now 0 or 1
 WELS_ASM_ARCH64_FUNC_END
 
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
-  dup v16.16b, w2 //alpha
-  dup v17.16b, w3 //beta
-  add x2, x1, x1, lsl #1
-  sub x2, x0, x2
-  movi v23.16b, #128
-  ld1 {v0.16b}, [x2], x1
-  ld1 {v1.16b}, [x2], x1
-  ld1 {v2.16b}, [x2]
-  ld1 {v3.16b}, [x0], x1
-  ld1 {v4.16b}, [x0], x1
-  ld1 {v5.16b}, [x0]
-  sub	x2, x2, x1
-  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
-  trn1 v18.2s, v18.2s, v19.2s
-  trn1 v20.2s, v20.2s, v21.2s
-  trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
-  cmge v7.16b, v6.16b, #0 // iTc0 Flag
+    dup v16.16b, w2 //alpha
+    dup v17.16b, w3 //beta
+    add x2, x1, x1, lsl #1 // x2 = 3 * iStride
+    sub x2, x0, x2 // x2 = pPix - 3 * iStride
+    movi v23.16b, #128
+    ld1 {v0.16b}, [x2], x1 // v0 = row at pPix - 3 * iStride
+    ld1 {v1.16b}, [x2], x1 // v1 = row at pPix - 2 * iStride
+    ld1 {v2.16b}, [x2] // v2 = row at pPix - 1 * iStride
+    ld1 {v3.16b}, [x0], x1 // v3 = row at pPix
+    ld1 {v4.16b}, [x0], x1 // v4 = row at pPix + 1 * iStride
+    ld1 {v5.16b}, [x0] // v5 = row at pPix + 2 * iStride
+    sub   x2, x2, x1 // x2 = pPix - 2 * iStride, the first row that gets written back
+    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4] // four tc bytes, each replicated across one register
+    trn1 v18.2s, v18.2s, v19.2s
+    trn1 v20.2s, v20.2s, v21.2s
+    trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
+    cmge v7.16b, v6.16b, #0 // iTc0 Flag
 
-  MASK_MATRIX	v1, v2, v3, v4, v16, v17, v18
-  and	v7.16b, v7.16b, v18.16b // need filter flag
+    MASK_MATRIX   v1, v2, v3, v4, v16, v17, v18
+    and   v7.16b, v7.16b, v18.16b // need filter flag
 
-  ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end
+    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end // presumably skips to the end label when no lane of v7 is set (macro body not visible here)
 
-  eor	v18.16b, v18.16b, v18.16b
-  sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
+    eor   v18.16b, v18.16b, v18.16b
+    sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
 
-  DIFF_LUMA_LT4_P1_Q1	v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
-  st1	{v19.16b}, [x2], x1
+    DIFF_LUMA_LT4_P1_Q1   v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
+    st1   {v19.16b}, [x2], x1 // write v19 to the row at pPix - 2 * iStride
 
-  DIFF_LUMA_LT4_P1_Q1	v5, v4, v3, v2, v17, v18, v6, v7, v21, v22
+    DIFF_LUMA_LT4_P1_Q1   v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 // mirrored operands for the other side of the edge, v21 is stored last
 
-  abs	v20.16b, v20.16b
-  abs	v22.16b, v22.16b
-  add	v6.16b, v6.16b, v20.16b
-  add	v6.16b, v6.16b, v22.16b
-  eor	v18.16b, v18.16b, v18.16b
-  sub	v18.16b, v18.16b, v6.16b
+    abs   v20.16b, v20.16b
+    abs   v22.16b, v22.16b
+    add   v6.16b, v6.16b, v20.16b // v6 = iTc0 + abs(v20) + abs(v22)
+    add   v6.16b, v6.16b, v22.16b
+    eor   v18.16b, v18.16b, v18.16b
+    sub   v18.16b, v18.16b, v6.16b // v18 = -v6, the lower clamp bound
 
-  DIFF_LUMA_LT4_P0_Q0_1	v1, v2, v3, v4, v19, v20, v22
-  DIFF_LUMA_LT4_P0_Q0_2	v1, v2, v3, v4, v19, v20, v22
+    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
+    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
 
-  smax	v19.16b, v19.16b, v18.16b
-  smin	v19.16b, v19.16b, v6.16b
-  and     v19.16b, v19.16b, v7.16b
+    smax  v19.16b, v19.16b, v18.16b // clamp v19 to the signed range [v18, v6]
+    smin  v19.16b, v19.16b, v6.16b
+    and     v19.16b, v19.16b, v7.16b // zero the delta in lanes that must not be filtered
 
-  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
-  uqadd	v2.16b, v2.16b, v20.16b
-  uqsub	v2.16b, v2.16b, v19.16b
-  st1     {v2.16b}, [x2], x1
-  uqsub	v3.16b, v3.16b, v20.16b
-  uqadd	v3.16b, v3.16b, v19.16b
-  st1     {v3.16b}, [x2], x1
-  st1     {v21.16b}, [x2]
+    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
+    uqadd v2.16b, v2.16b, v20.16b // v2 = v2 + v20 - v19 with unsigned saturation
+    uqsub v2.16b, v2.16b, v19.16b
+    st1     {v2.16b}, [x2], x1 // row at pPix - 1 * iStride
+    uqsub v3.16b, v3.16b, v20.16b // v3 = v3 - v20 + v19 with unsigned saturation
+    uqadd v3.16b, v3.16b, v19.16b
+    st1     {v3.16b}, [x2], x1 // row at pPix
+    st1     {v21.16b}, [x2] // row at pPix + 1 * iStride
 DeblockLumaLt4V_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon
-  dup     v16.16b, w2 //alpha
-  dup     v17.16b, w3 //beta
-  sub     x3, x0, x1, lsl #2
+    dup     v16.16b, w2 //alpha
+    dup     v17.16b, w3 //beta
+    sub     x3, x0, x1, lsl #2 // x3 = x0 - 4 * x1, the fourth row before the one at x0
 
-  ld1     {v0.16b}, [x3], x1
-  ld1     {v4.16b}, [x0], x1
-  ld1     {v1.16b}, [x3], x1
-  ld1     {v5.16b}, [x0], x1
-  ld1     {v2.16b}, [x3], x1
-  ld1     {v6.16b}, [x0], x1
-  ld1     {v3.16b}, [x3]
-  ld1     {v7.16b}, [x0]
+    ld1     {v0.16b}, [x3], x1 // v0..v3 = the four rows before the original x0
+    ld1     {v4.16b}, [x0], x1 // v4..v7 = the four rows starting at the original x0
+    ld1     {v1.16b}, [x3], x1
+    ld1     {v5.16b}, [x0], x1
+    ld1     {v2.16b}, [x3], x1
+    ld1     {v6.16b}, [x0], x1
+    ld1     {v3.16b}, [x3]
+    ld1     {v7.16b}, [x0]
 
-  sub     x3, x3, x1, lsl #1
-  MASK_MATRIX	v2, v3, v4, v5, v16, v17, v18
-  lsr		w2, w2, #2
-  add		w2, w2, #2
-  dup     v16.16b, w2 //((alpha >> 2) + 2)
-  uabd	v19.16b, v3.16b, v4.16b
-  cmhi	v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
+    sub     x3, x3, x1, lsl #1 // x3 = row of v1, the first of the six rows written back
+    MASK_MATRIX   v2, v3, v4, v5, v16, v17, v18
+    lsr       w2, w2, #2
+    add       w2, w2, #2
+    dup     v16.16b, w2 //((alpha >> 2) + 2)
+    uabd  v19.16b, v3.16b, v4.16b
+    cmhi  v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
 
-  uabd 	v21.16b, v1.16b, v3.16b
-  cmhi	v21.16b, v17.16b, v21.16b //bDetaP2P0
-  and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
+    uabd  v21.16b, v1.16b, v3.16b
+    cmhi  v21.16b, v17.16b, v21.16b //bDetaP2P0
+    and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
 
-  uabd	v22.16b, v6.16b, v4.16b
-  cmhi	v22.16b, v17.16b, v22.16b //bDetaQ2Q0
-  and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
-  and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
+    uabd  v22.16b, v6.16b, v4.16b
+    cmhi  v22.16b, v17.16b, v22.16b //bDetaQ2Q0
+    and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
+    and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
 
-  mov.16b v23, v21
-  mov.16b v24, v21
+    mov.16b v23, v21 // two copies of the mask, one per macro call below
+    mov.16b v24, v21
 
-  mov.16b v25, v0
-  DIFF_LUMA_EQ4_P2P1P0_1		v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
-  DIFF_LUMA_EQ4_P2P1P0_2		v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
-  ins v0.d[1], v25.d[1]
-  ins v23.d[1], v24.d[1]
-  and	v21.16b, v20.16b, v21.16b
-  DIFF_LUMA_EQ4_MASK	v19, v1, v21, v17
-  st1	{v17.16b}, [x3], x1
-  DIFF_LUMA_EQ4_MASK	v0, v2, v21, v17
-  st1	{v17.16b}, [x3], x1
-  DIFF_LUMA_EQ4_MASK	v23, v3, v18, v17
-  st1	{v17.16b}, [x3], x1
+    mov.16b v25, v0 // second copy of v0 for the _2 call
+    DIFF_LUMA_EQ4_P2P1P0_1        v0, v1, v2, v3, v4, v5, v23, v19, v17, v16 // presumably handles the low 8 lanes (macro body not visible here)
+    DIFF_LUMA_EQ4_P2P1P0_2        v25, v1, v2, v3, v4, v5, v24, v19, v17, v16 // presumably handles the high 8 lanes (macro body not visible here)
+    ins v0.d[1], v25.d[1] // merge: upper halves come from the _2 results
+    ins v23.d[1], v24.d[1]
+    and   v21.16b, v20.16b, v21.16b
+    DIFF_LUMA_EQ4_MASK    v19, v1, v21, v17
+    st1   {v17.16b}, [x3], x1 // write back the row that was loaded into v1
+    DIFF_LUMA_EQ4_MASK    v0, v2, v21, v17
+    st1   {v17.16b}, [x3], x1 // write back the row that was loaded into v2
+    DIFF_LUMA_EQ4_MASK    v23, v3, v18, v17
+    st1   {v17.16b}, [x3], x1 // write back the row that was loaded into v3
 
 
-  mov.16b v23, v22
-  mov.16b v24, v22
-  mov.16b v25, v7
-  DIFF_LUMA_EQ4_P2P1P0_1		v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
-  DIFF_LUMA_EQ4_P2P1P0_2		v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
-  ins v7.d[1], v25.d[1]
-  ins v23.d[1], v24.d[1]
-  and	v22.16b, v20.16b, v22.16b
-  DIFF_LUMA_EQ4_MASK	v23, v4, v18, v17
-  st1	{v17.16b}, [x3], x1
-  DIFF_LUMA_EQ4_MASK	v7, v5, v22, v17
-  st1	{v17.16b}, [x3], x1
-  DIFF_LUMA_EQ4_MASK	v19, v6, v22, v17
-  st1	{v17.16b}, [x3], x1
+    mov.16b v23, v22 // same sequence again with the operands mirrored across the edge
+    mov.16b v24, v22
+    mov.16b v25, v7
+    DIFF_LUMA_EQ4_P2P1P0_1        v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
+    DIFF_LUMA_EQ4_P2P1P0_2        v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
+    ins v7.d[1], v25.d[1]
+    ins v23.d[1], v24.d[1]
+    and   v22.16b, v20.16b, v22.16b
+    DIFF_LUMA_EQ4_MASK    v23, v4, v18, v17
+    st1   {v17.16b}, [x3], x1 // write back the row that was loaded into v4
+    DIFF_LUMA_EQ4_MASK    v7, v5, v22, v17
+    st1   {v17.16b}, [x3], x1 // write back the row that was loaded into v5
+    DIFF_LUMA_EQ4_MASK    v19, v6, v22, v17
+    st1   {v17.16b}, [x3], x1 // write back the row that was loaded into v6
 DeblockLumaEq4V_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
-  dup v16.16b, w2 //alpha
-  dup v17.16b, w3 //beta
-  sub x2, x0, #3
-  movi v23.16b, #128
+    dup v16.16b, w2 //alpha
+    dup v17.16b, w3 //beta
+    sub x2, x0, #3 // x2 = pPix - 3
+    movi v23.16b, #128
 
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 0
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 1
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 2
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 3
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 4
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 5
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 6
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 7
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 0 // one call per lane index 0..15, presumably one pixel row each (macro body not visible here)
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 1
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 2
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 3
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 4
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 5
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 6
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 7
 
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 8
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 9
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 10
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 11
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 12
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 13
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 14
-  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 15
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 8
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 9
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 10
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 11
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 12
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 13
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 14
+    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 15
 
-  sub x0, x0, x1, lsl #4
+    sub x0, x0, x1, lsl #4 // x0 -= 16 * iStride, presumably rewinding what the 16 loads advanced (TODO confirm against the macro)
 
-  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
-  trn1 v18.2s, v18.2s, v19.2s
-  trn1 v20.2s, v20.2s, v21.2s
-  trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
-  cmge v7.16b, v6.16b, #0 // iTc0 Flag
+    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4] // four tc bytes, each replicated across one register
+    trn1 v18.2s, v18.2s, v19.2s
+    trn1 v20.2s, v20.2s, v21.2s
+    trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
+    cmge v7.16b, v6.16b, #0 // iTc0 Flag
 
-  MASK_MATRIX	v1, v2, v3, v4, v16, v17, v18
-  and	v7.16b, v7.16b, v18.16b // need filter flag
+    MASK_MATRIX   v1, v2, v3, v4, v16, v17, v18
+    and   v7.16b, v7.16b, v18.16b // need filter flag
 
-  ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end
+    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end // presumably skips to the end label when no lane of v7 is set (macro body not visible here)
 
-  eor	v18.16b, v18.16b, v18.16b
-  sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
+    eor   v18.16b, v18.16b, v18.16b
+    sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
 
-  DIFF_LUMA_LT4_P1_Q1	v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
-  mov.16b v25, v19
+    DIFF_LUMA_LT4_P1_Q1   v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
+    mov.16b v25, v19 // keep this result in v25, v19 is reused below
 
-  DIFF_LUMA_LT4_P1_Q1	v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
+    DIFF_LUMA_LT4_P1_Q1   v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
 
-  abs	v20.16b, v20.16b
-  abs	v22.16b, v22.16b
-  add	v6.16b, v6.16b, v20.16b
-  add	v6.16b, v6.16b, v22.16b
-  eor	v18.16b, v18.16b, v18.16b
-  sub	v18.16b, v18.16b, v6.16b
+    abs   v20.16b, v20.16b
+    abs   v22.16b, v22.16b
+    add   v6.16b, v6.16b, v20.16b // v6 = iTc0 + abs(v20) + abs(v22)
+    add   v6.16b, v6.16b, v22.16b
+    eor   v18.16b, v18.16b, v18.16b
+    sub   v18.16b, v18.16b, v6.16b // v18 = -v6, the lower clamp bound
 
-  DIFF_LUMA_LT4_P0_Q0_1	v1, v2, v3, v4, v19, v20, v22
-  DIFF_LUMA_LT4_P0_Q0_2	v1, v2, v3, v4, v19, v20, v22
+    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
+    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
 
-  smax	v19.16b, v19.16b, v18.16b
-  smin	v19.16b, v19.16b, v6.16b
-  and     v19.16b, v19.16b, v7.16b
+    smax  v19.16b, v19.16b, v18.16b // clamp v19 to the signed range [v18, v6]
+    smin  v19.16b, v19.16b, v6.16b
+    and     v19.16b, v19.16b, v7.16b // zero the delta in lanes that must not be filtered
 
-  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
-  uqadd	v2.16b, v2.16b, v20.16b
-  uqsub	v2.16b, v2.16b, v19.16b
-  mov.16b v26, v2
-  uqsub	v3.16b, v3.16b, v20.16b
-  uqadd	v3.16b, v3.16b, v19.16b
-  mov.16b v27, v3
-  mov.16b v28, v21
+    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
+    uqadd v2.16b, v2.16b, v20.16b // v2 = v2 + v20 - v19 with unsigned saturation
+    uqsub v2.16b, v2.16b, v19.16b
+    mov.16b v26, v2 // v25..v28 collect the four values handed to the store macro
+    uqsub v3.16b, v3.16b, v20.16b // v3 = v3 - v20 + v19 with unsigned saturation
+    uqadd v3.16b, v3.16b, v19.16b
+    mov.16b v27, v3
+    mov.16b v28, v21
 
-  sub	x0, x0, #2
-  add	x2, x0, x1
-  lsl	x1, x1, #1
+    sub   x0, x0, #2 // x0 -= 2
+    add   x2, x0, x1 // x2 = x0 + iStride, one row below x0
+    lsl   x1, x1, #1 // x1 = 2 * iStride
 
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 0, 1
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 2, 3
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 4, 5
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 6, 7
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 0, 1 // each call takes a pair of lane indices, presumably two rows per call (macro body not visible here)
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 2, 3
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 4, 5
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 6, 7
 
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 8, 9
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 10, 11
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 12, 13
-  STORE_LUMA_DATA_4		v25, v26, v27, v28, 14, 15
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 8, 9
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 10, 11
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 12, 13
+    STORE_LUMA_DATA_4     v25, v26, v27, v28, 14, 15
 DeblockLumaLt4H_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon
-  dup     v16.16b, w2 //alpha
-  dup     v17.16b, w3 //beta
-  sub     x3, x0, #4
+    dup     v16.16b, w2 //alpha
+    dup     v17.16b, w3 //beta
+    sub     x3, x0, #4 // x3 = x0 - 4
 
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 0
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 1
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 2
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 3
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 4
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 5
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 6
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 7
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 0 // one call per lane index 0..15, presumably one pixel row each (macro body not visible here)
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 1
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 2
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 3
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 4
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 5
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 6
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 7
 
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 8
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 9
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 10
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 11
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 12
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 13
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 14
-  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 15
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 8
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 9
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 10
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 11
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 12
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 13
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 14
+    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 15
 
-  sub x0, x0, x1, lsl #4
-  sub x3, x0, #3
-  MASK_MATRIX	v2, v3, v4, v5, v16, v17, v18
+    sub x0, x0, x1, lsl #4 // x0 -= 16 * x1, presumably rewinding what the 16 loads advanced (TODO confirm against the macro)
+    sub x3, x0, #3 // x3 = x0 - 3
+    MASK_MATRIX   v2, v3, v4, v5, v16, v17, v18
 
-  ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end
+    ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end // presumably skips to the end label when no lane of v18 is set (macro body not visible here)
 
-  lsr		w2, w2, #2
-  add		w2, w2, #2
-  dup     v16.16b, w2 //((alpha >> 2) + 2)
-  uabd	v19.16b, v3.16b, v4.16b
-  cmhi	v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
+    lsr       w2, w2, #2
+    add       w2, w2, #2
+    dup     v16.16b, w2 //((alpha >> 2) + 2)
+    uabd  v19.16b, v3.16b, v4.16b
+    cmhi  v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
 
-  uabd	v21.16b, v1.16b, v3.16b
-  cmhi	v21.16b, v17.16b, v21.16b //bDetaP2P0
-  and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
+    uabd  v21.16b, v1.16b, v3.16b
+    cmhi  v21.16b, v17.16b, v21.16b //bDetaP2P0
+    and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
 
-  uabd	v22.16b, v6.16b, v4.16b
-  cmhi	v22.16b, v17.16b, v22.16b //bDetaQ2Q0
-  and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
-  and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
+    uabd  v22.16b, v6.16b, v4.16b
+    cmhi  v22.16b, v17.16b, v22.16b //bDetaQ2Q0
+    and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
+    and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
 
-  mov.16b v23, v21
-  mov.16b v24, v21
+    mov.16b v23, v21 // two copies of the mask, one per macro call below
+    mov.16b v24, v21
 
-  mov.16b v25, v0
-  DIFF_LUMA_EQ4_P2P1P0_1		v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
-  DIFF_LUMA_EQ4_P2P1P0_2		v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
-  ins v0.d[1], v25.d[1]
-  ins v23.d[1], v24.d[1]
-  and	v21.16b, v20.16b, v21.16b
-  DIFF_LUMA_EQ4_MASK	v19, v1, v21, v17
-  mov.16b v26, v17
-  DIFF_LUMA_EQ4_MASK	v0, v2, v21, v17
-  mov.16b v27, v17
-  DIFF_LUMA_EQ4_MASK	v23, v3, v18, v17
-  mov.16b v28, v17
+    mov.16b v25, v0 // second copy of v0 for the _2 call
+    DIFF_LUMA_EQ4_P2P1P0_1        v0, v1, v2, v3, v4, v5, v23, v19, v17, v16 // presumably handles the low 8 lanes (macro body not visible here)
+    DIFF_LUMA_EQ4_P2P1P0_2        v25, v1, v2, v3, v4, v5, v24, v19, v17, v16 // presumably handles the high 8 lanes (macro body not visible here)
+    ins v0.d[1], v25.d[1] // merge: upper halves come from the _2 results
+    ins v23.d[1], v24.d[1]
+    and   v21.16b, v20.16b, v21.16b
+    DIFF_LUMA_EQ4_MASK    v19, v1, v21, v17
+    mov.16b v26, v17 // v26..v28 hold the three results of this half for the store macro
+    DIFF_LUMA_EQ4_MASK    v0, v2, v21, v17
+    mov.16b v27, v17
+    DIFF_LUMA_EQ4_MASK    v23, v3, v18, v17
+    mov.16b v28, v17
 
 
-  mov.16b v23, v22
-  mov.16b v24, v22
-  mov.16b v25, v7
-  DIFF_LUMA_EQ4_P2P1P0_1		v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
-  DIFF_LUMA_EQ4_P2P1P0_2		v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
-  ins v7.d[1], v25.d[1]
-  ins v23.d[1], v24.d[1]
-  and	v22.16b, v20.16b, v22.16b
-  DIFF_LUMA_EQ4_MASK	v23, v4, v18, v17
-  mov.16b v29, v17
-  DIFF_LUMA_EQ4_MASK	v7, v5, v22, v17
-  mov.16b v30, v17
-  DIFF_LUMA_EQ4_MASK	v19, v6, v22, v17
-  mov.16b v31, v17
+    mov.16b v23, v22 // same sequence again with the operands mirrored across the edge
+    mov.16b v24, v22
+    mov.16b v25, v7
+    DIFF_LUMA_EQ4_P2P1P0_1        v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
+    DIFF_LUMA_EQ4_P2P1P0_2        v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
+    ins v7.d[1], v25.d[1]
+    ins v23.d[1], v24.d[1]
+    and   v22.16b, v20.16b, v22.16b
+    DIFF_LUMA_EQ4_MASK    v23, v4, v18, v17
+    mov.16b v29, v17 // v29..v31 hold the three results of the mirrored half
+    DIFF_LUMA_EQ4_MASK    v7, v5, v22, v17
+    mov.16b v30, v17
+    DIFF_LUMA_EQ4_MASK    v19, v6, v22, v17
+    mov.16b v31, v17
 
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 0
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 1
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 2
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 3
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 4
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 5
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 6
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 7
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 8
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 9
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 10
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 11
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 12
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 13
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 14
-  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 15
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 0 // one call per lane index 0..15, presumably one pixel row each (macro body not visible here)
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 1
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 2
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 3
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 4
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 5
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 6
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 7
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 8
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 9
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 10
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 11
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 12
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 13
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 14
+    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 15
 DeblockLumaEq4H_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
-  dup v16.16b, w3 //alpha
-  dup v17.16b, w4 //beta
-  lsl x3, x2, #1
-  sub x6, x0, x3 //pPixCb-2*Stride
-  sub x7, x1, x3 //pPixCr-2*Stride
+    dup v16.16b, w3 //alpha
+    dup v17.16b, w4 //beta
+    lsl x3, x2, #1 // x3 = 2 * iStrideX
+    sub x6, x0, x3 //pPixCb-2*Stride
+    sub x7, x1, x3 //pPixCr-2*Stride
 
-  ld1 {v0.d} [0], [x6], x2
-  ld1 {v1.d} [0], [x6]
-  ld1 {v2.d} [0], [x0], x2
-  ld1 {v3.d} [0], [x0]
-  ld1 {v0.d} [1], [x7], x2
-  ld1 {v1.d} [1], [x7]
-  ld1 {v2.d} [1], [x1], x2
-  ld1 {v3.d} [1], [x1]
+    ld1 {v0.d} [0], [x6], x2 // low 8 bytes of v0..v3 = Cb rows -2, -1, 0, +1
+    ld1 {v1.d} [0], [x6] // x6 is left at pPixCb - iStrideX, the first Cb row written back
+    ld1 {v2.d} [0], [x0], x2
+    ld1 {v3.d} [0], [x0]
+    ld1 {v0.d} [1], [x7], x2 // high 8 bytes of v0..v3 = Cr rows -2, -1, 0, +1
+    ld1 {v1.d} [1], [x7] // x7 is left at pPixCr - iStrideX, the first Cr row written back
+    ld1 {v2.d} [1], [x1], x2
+    ld1 {v3.d} [1], [x1]
 
-  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
-  trn1 v18.4h, v18.4h, v19.4h //0011,0011,
-  trn1 v20.4h, v20.4h, v21.4h //2233,2233
-  zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
-  cmgt v7.16b, v6.16b, #0 // iTc0 Flag
+    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5] // four pTc bytes, each replicated across one register
+    trn1 v18.4h, v18.4h, v19.4h //0011,0011,
+    trn1 v20.4h, v20.4h, v21.4h //2233,2233
+    zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
+    cmgt v7.16b, v6.16b, #0 // iTc0 Flag
 
-  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v18
-  and	v7.16b, v7.16b, v18.16b // need filter flag
+    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v18
+    and   v7.16b, v7.16b, v18.16b // need filter flag
 
-  ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end
+    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end // presumably skips to the end label when no lane of v7 is set (macro body not visible here)
 
-  eor	v18.16b, v18.16b, v18.16b
-  sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
+    eor   v18.16b, v18.16b, v18.16b
+    sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
 
-  DIFF_LUMA_LT4_P0_Q0_1	v0, v1, v2, v3, v19, v20, v22
-  DIFF_LUMA_LT4_P0_Q0_2	v0, v1, v2, v3, v19, v20, v22
+    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22 // the luma delta macros are reused here on chroma rows
+    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
 
-  smax	v19.16b, v19.16b, v18.16b
-  smin	v19.16b, v19.16b, v6.16b
-  and     v19.16b, v19.16b, v7.16b
+    smax  v19.16b, v19.16b, v18.16b // clamp v19 to the signed range [-iTc0, iTc0]
+    smin  v19.16b, v19.16b, v6.16b
+    and     v19.16b, v19.16b, v7.16b // zero the delta in lanes that must not be filtered
 
-  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
-  uqadd	v1.16b, v1.16b, v20.16b
-  uqsub	v1.16b, v1.16b, v19.16b
-  st1     {v1.d} [0], [x6], x2
-  st1     {v1.d} [1], [x7], x2
-  uqsub	v2.16b, v2.16b, v20.16b
-  uqadd	v2.16b, v2.16b, v19.16b
-  st1     {v2.d} [0], [x6]
-  st1     {v2.d} [1], [x7]
+    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
+    uqadd v1.16b, v1.16b, v20.16b // v1 = v1 + v20 - v19 with unsigned saturation
+    uqsub v1.16b, v1.16b, v19.16b
+    st1     {v1.d} [0], [x6], x2 // Cb row -1
+    st1     {v1.d} [1], [x7], x2 // Cr row -1
+    uqsub v2.16b, v2.16b, v20.16b // v2 = v2 - v20 + v19 with unsigned saturation
+    uqadd v2.16b, v2.16b, v19.16b
+    st1     {v2.d} [0], [x6] // Cb row 0
+    st1     {v2.d} [1], [x7] // Cr row 0
 DeblockChromaLt4V_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
-  dup v16.16b, w3 //alpha
-  dup v17.16b, w4 //beta
-  sub x6, x0, #2 //pPixCb-2
-  sub x7, x1, #2 //pPixCr-2
+    dup v16.16b, w3 //alpha
+    dup v17.16b, w4 //beta
+    sub x6, x0, #2 //pPixCb-2
+    sub x7, x1, #2 //pPixCr-2
 
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 0
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 1
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 2
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 3
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 4
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 5
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 6
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 7
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 0 // lane indices 0..7 are loaded through x6, the Cb pointer
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 1
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 2
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 3
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 4
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 5
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 6
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 7
 
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 8
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 9
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 10
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 11
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 12
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 13
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 14
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 15
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 8 // lane indices 8..15 are loaded through x7, the Cr pointer
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 9
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 10
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 11
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 12
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 13
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 14
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 15
 
-  sub x0, x0, #1
-  sub x1, x1, #1
+    sub x0, x0, #1 // x0 = pPixCb - 1, the pointer used for the Cb stores below
+    sub x1, x1, #1 // x1 = pPixCr - 1, the pointer used for the Cr stores below
 
-  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
-  trn1 v18.4h, v18.4h, v19.4h //0011,0011,
-  trn1 v20.4h, v20.4h, v21.4h //2233,2233
-  zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
-  cmgt v7.16b, v6.16b, #0 // iTc0 Flag
+    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5] // four pTc bytes, each replicated across one register
+    trn1 v18.4h, v18.4h, v19.4h //0011,0011,
+    trn1 v20.4h, v20.4h, v21.4h //2233,2233
+    zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
+    cmgt v7.16b, v6.16b, #0 // iTc0 Flag
 
-  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v18
-  and	v7.16b, v7.16b, v18.16b // need filter flag
+    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v18
+    and   v7.16b, v7.16b, v18.16b // need filter flag
 
-  ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
-  eor	v18.16b, v18.16b, v18.16b
-  sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
+    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end // presumably skips to the end label when no lane of v7 is set (macro body not visible here)
+    eor   v18.16b, v18.16b, v18.16b
+    sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
 
-  DIFF_LUMA_LT4_P0_Q0_1	v0, v1, v2, v3, v19, v20, v22
-  DIFF_LUMA_LT4_P0_Q0_2	v0, v1, v2, v3, v19, v20, v22
+    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22 // the luma delta macros are reused here on chroma data
+    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
 
-  smax	v19.16b, v19.16b, v18.16b
-  smin	v19.16b, v19.16b, v6.16b
-  and     v19.16b, v19.16b, v7.16b
+    smax  v19.16b, v19.16b, v18.16b // clamp v19 to the signed range [-iTc0, iTc0]
+    smin  v19.16b, v19.16b, v6.16b
+    and     v19.16b, v19.16b, v7.16b // zero the delta in lanes that must not be filtered
 
-  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
-  uqadd	v1.16b, v1.16b, v20.16b
-  uqsub	v1.16b, v1.16b, v19.16b
-  uqsub	v2.16b, v2.16b, v20.16b
-  uqadd	v2.16b, v2.16b, v19.16b
+    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
+    uqadd v1.16b, v1.16b, v20.16b // v1 = v1 + v20 - v19 with unsigned saturation
+    uqsub v1.16b, v1.16b, v19.16b
+    uqsub v2.16b, v2.16b, v20.16b // v2 = v2 - v20 + v19 with unsigned saturation
+    uqadd v2.16b, v2.16b, v19.16b
 
-  STORE_CHROMA_DATA_2 v1, v2, x0, 0
-  STORE_CHROMA_DATA_2 v1, v2, x0, 1
-  STORE_CHROMA_DATA_2 v1, v2, x0, 2
-  STORE_CHROMA_DATA_2 v1, v2, x0, 3
-  STORE_CHROMA_DATA_2 v1, v2, x0, 4
-  STORE_CHROMA_DATA_2 v1, v2, x0, 5
-  STORE_CHROMA_DATA_2 v1, v2, x0, 6
-  STORE_CHROMA_DATA_2 v1, v2, x0, 7
+    STORE_CHROMA_DATA_2 v1, v2, x0, 0 // lane indices 0..7 are stored through x0, the Cb pointer
+    STORE_CHROMA_DATA_2 v1, v2, x0, 1
+    STORE_CHROMA_DATA_2 v1, v2, x0, 2
+    STORE_CHROMA_DATA_2 v1, v2, x0, 3
+    STORE_CHROMA_DATA_2 v1, v2, x0, 4
+    STORE_CHROMA_DATA_2 v1, v2, x0, 5
+    STORE_CHROMA_DATA_2 v1, v2, x0, 6
+    STORE_CHROMA_DATA_2 v1, v2, x0, 7
 
-  STORE_CHROMA_DATA_2 v1, v2, x1, 8
-  STORE_CHROMA_DATA_2 v1, v2, x1, 9
-  STORE_CHROMA_DATA_2 v1, v2, x1, 10
-  STORE_CHROMA_DATA_2 v1, v2, x1, 11
-  STORE_CHROMA_DATA_2 v1, v2, x1, 12
-  STORE_CHROMA_DATA_2 v1, v2, x1, 13
-  STORE_CHROMA_DATA_2 v1, v2, x1, 14
-  STORE_CHROMA_DATA_2 v1, v2, x1, 15
+    STORE_CHROMA_DATA_2 v1, v2, x1, 8 // lane indices 8..15 are stored through x1, the Cr pointer
+    STORE_CHROMA_DATA_2 v1, v2, x1, 9
+    STORE_CHROMA_DATA_2 v1, v2, x1, 10
+    STORE_CHROMA_DATA_2 v1, v2, x1, 11
+    STORE_CHROMA_DATA_2 v1, v2, x1, 12
+    STORE_CHROMA_DATA_2 v1, v2, x1, 13
+    STORE_CHROMA_DATA_2 v1, v2, x1, 14
+    STORE_CHROMA_DATA_2 v1, v2, x1, 15
 DeblockChromaLt4H_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
-  dup v16.16b, w3 //alpha
-  dup v17.16b, w4 //beta
-  lsl x3, x2, #1
-  sub x6, x0, x3 //pPixCb-2*Stride
-  sub x7, x1, x3 //pPixCr-2*Stride
+    dup v16.16b, w3 //alpha
+    dup v17.16b, w4 //beta
+    lsl x3, x2, #1 // x3 = 2 * iStrideX
+    sub x6, x0, x3 //pPixCb-2*Stride
+    sub x7, x1, x3 //pPixCr-2*Stride
 
-  ld1 {v0.d} [0], [x6], x2
-  ld1 {v1.d} [0], [x6]
-  ld1 {v2.d} [0], [x0], x2
-  ld1 {v3.d} [0], [x0]
-  ld1 {v0.d} [1], [x7], x2
-  ld1 {v1.d} [1], [x7]
-  ld1 {v2.d} [1], [x1], x2
-  ld1 {v3.d} [1], [x1]
+    ld1 {v0.d} [0], [x6], x2 // low 8 bytes of v0..v3 = Cb rows -2, -1, 0, +1
+    ld1 {v1.d} [0], [x6] // x6 is left at pPixCb - iStrideX, the first Cb row written back
+    ld1 {v2.d} [0], [x0], x2
+    ld1 {v3.d} [0], [x0]
+    ld1 {v0.d} [1], [x7], x2 // high 8 bytes of v0..v3 = Cr rows -2, -1, 0, +1
+    ld1 {v1.d} [1], [x7] // x7 is left at pPixCr - iStrideX, the first Cr row written back
+    ld1 {v2.d} [1], [x1], x2
+    ld1 {v3.d} [1], [x1]
 
-  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v7
+    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v7 // v7 = per-lane filter mask
 
-  ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end
+    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end // presumably skips to the end label when no lane of v7 is set (macro body not visible here)
 
-  DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
-  DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
+    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21 // filtered values are taken from v20 and v21 below
+    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
 
-  mov.16b v6, v7
-  bsl v6.16b, v20.16b, v1.16b
-  bsl v7.16b, v21.16b, v2.16b
+    mov.16b v6, v7 // bsl overwrites its mask operand, so copy the mask first
+    bsl v6.16b, v20.16b, v1.16b // v6 = mask ? v20 : v1
+    bsl v7.16b, v21.16b, v2.16b // v7 = mask ? v21 : v2
 
-  st1     {v6.d} [0], [x6], x2
-  st1     {v6.d} [1], [x7], x2
+    st1     {v6.d} [0], [x6], x2 // Cb row -1
+    st1     {v6.d} [1], [x7], x2 // Cr row -1
 
-  st1     {v7.d} [0], [x6]
-  st1     {v7.d} [1], [x7]
+    st1     {v7.d} [0], [x6] // Cb row 0
+    st1     {v7.d} [1], [x7] // Cr row 0
 DeblockChromaEq4V_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
-  dup v16.16b, w3 //alpha
-  dup v17.16b, w4 //beta
+    dup v16.16b, w3 //alpha: low byte of iAlpha replicated into all 16 lanes
+    dup v17.16b, w4 //beta: low byte of iBeta replicated into all 16 lanes
 
-  sub x6, x0, #2 //pPixCb-2
-  sub x7, x1, #2 //pPixCr-2
+    sub x6, x0, #2 //pPixCb-2
+    sub x7, x1, #2 //pPixCr-2
 
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 0
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 1
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 2
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 3
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 4
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 5
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 6
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 7
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 0 //Cb side: x6 is the source pointer, last argument 0-7 is presumably the destination lane (macro defined outside this hunk -- confirm)
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 1
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 2
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 3
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 4
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 5
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 6
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 7
 
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 8
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 9
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 10
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 11
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 12
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 13
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 14
-  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 15
-  sub x0, x0, #1
-  sub x1, x1, #1
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 8 //Cr side: same macro with x7 as source and last argument 8-15
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 9
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 10
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 11
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 12
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 13
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 14
+    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 15
+    sub x0, x0, #1 //x0 = pPixCb-1, the pointer handed to the Cb stores below
+    sub x1, x1, #1 //x1 = pPixCr-1, the pointer handed to the Cr stores below
 
-  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v7
+    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v7 //presumably leaves the per-lane filter mask in v7 (macro defined outside this hunk -- confirm)
 
-  ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end
+    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end //presumably branches to the end label when v7 is all zero, with x3/x4 as scratch -- confirm in the macro
 
-  DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
-  DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
+    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21 //presumably the filter for the low (Cb) half -- confirm in the macro
+    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21 //presumably the high (Cr) half; v20/v21 are consumed below as the filtered values
 
-  mov.16b v6, v7
-  bsl v6.16b, v20.16b, v1.16b
-  bsl v7.16b, v21.16b, v2.16b
+    mov.16b v6, v7 //second copy of the mask: bsl overwrites its destination register
+    bsl v6.16b, v20.16b, v1.16b //v6 = v20 where mask bits are set, original v1 elsewhere
+    bsl v7.16b, v21.16b, v2.16b //v7 = v21 where mask bits are set, original v2 elsewhere
 
-  STORE_CHROMA_DATA_2 v6, v7, x0, 0
-  STORE_CHROMA_DATA_2 v6, v7, x0, 1
-  STORE_CHROMA_DATA_2 v6, v7, x0, 2
-  STORE_CHROMA_DATA_2 v6, v7, x0, 3
-  STORE_CHROMA_DATA_2 v6, v7, x0, 4
-  STORE_CHROMA_DATA_2 v6, v7, x0, 5
-  STORE_CHROMA_DATA_2 v6, v7, x0, 6
-  STORE_CHROMA_DATA_2 v6, v7, x0, 7
+    STORE_CHROMA_DATA_2 v6, v7, x0, 0 //Cb side: lanes 0-7 of v6/v7 written back through x0 (presumably one byte from each register per row -- confirm in the macro)
+    STORE_CHROMA_DATA_2 v6, v7, x0, 1
+    STORE_CHROMA_DATA_2 v6, v7, x0, 2
+    STORE_CHROMA_DATA_2 v6, v7, x0, 3
+    STORE_CHROMA_DATA_2 v6, v7, x0, 4
+    STORE_CHROMA_DATA_2 v6, v7, x0, 5
+    STORE_CHROMA_DATA_2 v6, v7, x0, 6
+    STORE_CHROMA_DATA_2 v6, v7, x0, 7
 
-  STORE_CHROMA_DATA_2 v6, v7, x1, 8
-  STORE_CHROMA_DATA_2 v6, v7, x1, 9
-  STORE_CHROMA_DATA_2 v6, v7, x1, 10
-  STORE_CHROMA_DATA_2 v6, v7, x1, 11
-  STORE_CHROMA_DATA_2 v6, v7, x1, 12
-  STORE_CHROMA_DATA_2 v6, v7, x1, 13
-  STORE_CHROMA_DATA_2 v6, v7, x1, 14
-  STORE_CHROMA_DATA_2 v6, v7, x1, 15
-  DeblockChromaEq4H_AArch64_neon_end:
+    STORE_CHROMA_DATA_2 v6, v7, x1, 8 //Cr side: lanes 8-15 of v6/v7 written back through x1
+    STORE_CHROMA_DATA_2 v6, v7, x1, 9
+    STORE_CHROMA_DATA_2 v6, v7, x1, 10
+    STORE_CHROMA_DATA_2 v6, v7, x1, 11
+    STORE_CHROMA_DATA_2 v6, v7, x1, 12
+    STORE_CHROMA_DATA_2 v6, v7, x1, 13
+    STORE_CHROMA_DATA_2 v6, v7, x1, 14
+    STORE_CHROMA_DATA_2 v6, v7, x1, 15
+    DeblockChromaEq4H_AArch64_neon_end:
 WELS_ASM_ARCH64_FUNC_END
 
 
 WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon
-  // Checking the nzc status
-  BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
-  // For checking bS[I] = 2
-  movi     v0.16b, #0
-  cmgt     v16.16b, v16.16b, v0.16b
-  cmgt     v17.16b, v17.16b, v0.16b
-  movi     v0.16b, #2
+    // Checking the nzc status
+    BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
+    // For checking bS[I] = 2
+    movi     v0.16b, #0
+    cmgt     v16.16b, v16.16b, v0.16b //lanes whose nzc status is > 0 (signed compare) become 0xFF, others 0
+    cmgt     v17.16b, v17.16b, v0.16b
+    movi     v0.16b, #2
 
-  and  v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top
-  and  v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left
+    and  v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top (each lane is now 0 or 2)
+    and  v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left (each lane is now 0 or 2)
 
-  // Checking the mv status
-  BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status
-  // For checking bS[I] = 1
-  movi   v0.16b, #1
-  and  v18.16b, v18.16b, v0.16b //v18 save the nzc check result all the time --- for dir is top
-  and  v19.16b, v19.16b, v0.16b //v19 save the nzc check result all the time --- for dir is left
-  // Check bS[I] is '1' or '2'
-  umax v1.16b, v18.16b, v16.16b
-  umax v0.16b, v19.16b, v17.16b
-  st1 {v0.16b, v1.16b}, [x4]
+    // Checking the mv status
+    BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status
+    // For checking bS[I] = 1
+    movi   v0.16b, #1
+    and  v18.16b, v18.16b, v0.16b //v18 save the mv check result all the time --- for dir is top (each lane is now 0 or 1)
+    and  v19.16b, v19.16b, v0.16b //v19 save the mv check result all the time --- for dir is left (each lane is now 0 or 1)
+    // Check bS[I] is '1' or '2'
+    umax v1.16b, v18.16b, v16.16b //top: 2 where the nzc check fired, otherwise the mv bit (0 or 1)
+    umax v0.16b, v19.16b, v17.16b //left: 2 where the nzc check fired, otherwise the mv bit (0 or 1)
+    st1 {v0.16b, v1.16b}, [x4] //32 bytes to [x4]: the 16 left-direction values (v0) first, then the 16 top-direction values (v1)
 WELS_ASM_ARCH64_FUNC_END