shithub: tlsclient

ref: 94f2907dc40a6415a10c252cb9ba3971f1f7e838
dir: /third_party/boringssl/src/gen/bcm/aesv8-gcm-armv8-win.S/

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__ >= 8

.arch	armv8-a+crypto
.text
.globl	aes_gcm_enc_kernel

.def aes_gcm_enc_kernel
   .type 32
.endef
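// Argument layout, as inferred from the register usage in this listing
// (an assumption based on how x0..x6 are consumed below, not taken from
// the generator):
//   x0 - input pointer (plaintext)
//   x1 - input length in bits (converted to bytes with lsr #3 below)
//   x2 - output pointer (ciphertext)
//   x3 - current GHASH tag Xi, updated in place before returning
//   x4 - 16-byte counter block; the 32-bit counter word at offset 12 is
//        written back at the end
//   x5 - expanded AES key schedule, with the round count at offset 240
//   x6 - precomputed GHASH key powers H^1..H^4 (at the offsets loaded below)
// On return x0 holds the number of bytes processed (saved into x15 early on).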
.align	4
aes_gcm_enc_kernel:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp, #-128]!
	mov	x29, sp
	stp	x19, x20, [sp, #16]
	mov	x16, x4
	mov	x8, x5
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]
	stp	d8, d9, [sp, #64]
	stp	d10, d11, [sp, #80]
	stp	d12, d13, [sp, #96]
	stp	d14, d15, [sp, #112]
	ldr	w17, [x8, #240]
	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
	ldp	x13, x14, [x19]                       // load round N keys
	ldr	q31, [x19, #-16]                        // load round N-1 keys
	add	x4, x0, x1, lsr #3   // end_input_ptr
	lsr	x5, x1, #3              // byte_len
	mov	x15, x5
	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
	sub	x5, x5, #1      // byte_len - 1
	ldr	q18, [x8, #0]                                  // load rk0
	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	ldr	q25, [x8, #112]                                // load rk7
	add	x5, x5, x0
	lsr	x12, x11, #32
	fmov	d2, x10                               // CTR block 2
	orr	w11, w11, w11
	rev	w12, w12                                // rev_ctr32
	fmov	d1, x10                               // CTR block 1
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
	add	w12, w12, #1                            // increment rev_ctr32
	rev	w9, w12                                 // CTR block 1
	fmov	d3, x10                               // CTR block 3
	orr	x9, x11, x9, lsl #32            // CTR block 1
	add	w12, w12, #1                            // CTR block 1
	ldr	q19, [x8, #16]                                 // load rk1
	fmov	v1.d[1], x9                               // CTR block 1
	rev	w9, w12                                 // CTR block 2
	add	w12, w12, #1                            // CTR block 2
	orr	x9, x11, x9, lsl #32            // CTR block 2
	ldr	q20, [x8, #32]                                 // load rk2
	fmov	v2.d[1], x9                               // CTR block 2
	rev	w9, w12                                 // CTR block 3
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
	orr	x9, x11, x9, lsl #32            // CTR block 3
	fmov	v3.d[1], x9                               // CTR block 3
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
	ldr	q21, [x8, #48]                                 // load rk3
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
	ldr	q24, [x8, #96]                                 // load rk6
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
	ldr	q23, [x8, #80]                                 // load rk5
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
	ldr	q14, [x6, #48]                              // load h3l | h3h
	ext	v14.16b, v14.16b, v14.16b, #8
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
	ldr	q22, [x8, #64]                                 // load rk4
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
	ldr	q13, [x6, #32]                              // load h2l | h2h
	ext	v13.16b, v13.16b, v13.16b, #8
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
	ldr	q30, [x8, #192]                               // load rk12
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
	ldr	q15, [x6, #80]                              // load h4l | h4h
	ext	v15.16b, v15.16b, v15.16b, #8
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
	ldr	q29, [x8, #176]                               // load rk11
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
	ldr	q26, [x8, #128]                                // load rk8
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
	add	w12, w12, #1                            // CTR block 3
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
	ld1	{ v11.16b}, [x3]
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
	ldr	q27, [x8, #144]                                // load rk9
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
	ldr	q12, [x6]                                   // load h1l | h1h
	ext	v12.16b, v12.16b, v12.16b, #8
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
	ldr	q28, [x8, #160]                               // load rk10
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
	b.lt	Lenc_finish_first_blocks                         // branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
	b.eq	Lenc_finish_first_blocks                         // branch if AES-192

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 12

Lenc_finish_first_blocks:
	cmp	x0, x5                   // check if we have <= 4 blocks
	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
	b.ge	Lenc_tail                                        // handle tail

	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
	rev	w9, w12                                 // CTR block 4
	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
	add	x0, x0, #64                       // AES input_ptr update
	eor	x19, x19, x13                      // AES block 1 - round N low
	eor	x20, x20, x14                      // AES block 1 - round N high
	fmov	d5, x19                               // AES block 1 - mov low
	eor	x6, x6, x13                      // AES block 0 - round N low
	eor	x7, x7, x14                      // AES block 0 - round N high
	eor	x24, x24, x14                      // AES block 3 - round N high
	fmov	d4, x6                               // AES block 0 - mov low
	cmp	x0, x5                   // check if we have <= 8 blocks
	fmov	v4.d[1], x7                           // AES block 0 - mov high
	eor	x23, x23, x13                      // AES block 3 - round N low
	eor	x21, x21, x13                      // AES block 2 - round N low
	fmov	v5.d[1], x20                           // AES block 1 - mov high
	fmov	d6, x21                               // AES block 2 - mov low
	add	w12, w12, #1                            // CTR block 4
	orr	x9, x11, x9, lsl #32            // CTR block 4
	fmov	d7, x23                               // AES block 3 - mov low
	eor	x22, x22, x14                      // AES block 2 - round N high
	fmov	v6.d[1], x22                           // AES block 2 - mov high
	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
	fmov	d0, x10                               // CTR block 4
	fmov	v0.d[1], x9                               // CTR block 4
	rev	w9, w12                                 // CTR block 5
	add	w12, w12, #1                            // CTR block 5
	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
	fmov	d1, x10                               // CTR block 5
	orr	x9, x11, x9, lsl #32            // CTR block 5
	fmov	v1.d[1], x9                               // CTR block 5
	rev	w9, w12                                 // CTR block 6
	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
	fmov	v7.d[1], x24                           // AES block 3 - mov high
	orr	x9, x11, x9, lsl #32            // CTR block 6
	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
	add	w12, w12, #1                            // CTR block 6
	fmov	d2, x10                               // CTR block 6
	fmov	v2.d[1], x9                               // CTR block 6
	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
	rev	w9, w12                                 // CTR block 7
	orr	x9, x11, x9, lsl #32            // CTR block 7
	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
	b.ge	Lenc_prepretail                                  // do prepretail

Lenc_main_loop:	//	main loop start
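	// The loop appears to process four 16-byte blocks per iteration: the AES
	// rounds for counter blocks 4k+4..4k+7 are interleaved with the GHASH of
	// the previous four ciphertext results (still held in v4..v7), using the
	// key powers H^1..H^4 and a Karatsuba-style split into low/high/mid
	// products, followed by a reduction with the 0xc2 GCM polynomial constant.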
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
	fmov	d3, x10                               // CTR block 4k+3
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
	fmov	v3.d[1], x9                               // CTR block 4k+3
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
	eor	x23, x23, x13                      // AES block 4k+7 - round N low
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
	mov	d10, v17.d[1]                               // GHASH block 4k - mid
	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
	eor	x22, x22, x14                      // AES block 4k+6 - round N high
	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
	eor	x19, x19, x13                      // AES block 4k+5 - round N low
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
	eor	x21, x21, x13                      // AES block 4k+6 - round N low
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
	movi	v8.8b, #0xc2
	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
	fmov	d5, x19                               // AES block 4k+5 - mov low
	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
	b.lt	Lenc_main_loop_continue                          // branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
	b.eq	Lenc_main_loop_continue                          // branch if AES-192

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12

Lenc_main_loop_continue:
	shl	d8, d8, #56               // mod_constant
	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
	add	w12, w12, #1                            // CTR block 4k+3
	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
	add	x0, x0, #64                       // AES input_ptr update
	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
	rev	w9, w12                                 // CTR block 4k+8
	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
	eor	x6, x6, x13                      // AES block 4k+4 - round N low
	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
	eor	x7, x7, x14                      // AES block 4k+4 - round N high
	fmov	d4, x6                               // AES block 4k+4 - mov low
	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
	eor	x20, x20, x14                      // AES block 4k+5 - round N high
	eor	x24, x24, x14                      // AES block 4k+7 - round N high
	add	w12, w12, #1                            // CTR block 4k+8
	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
	fmov	d7, x23                               // AES block 4k+7 - mov low
	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
	fmov	d6, x21                               // AES block 4k+6 - mov low
	cmp	x0, x5                   // LOOP CONTROL
	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
	fmov	d0, x10                               // CTR block 4k+8
	fmov	v0.d[1], x9                               // CTR block 4k+8
	rev	w9, w12                                 // CTR block 4k+9
	add	w12, w12, #1                            // CTR block 4k+9
	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
	fmov	d1, x10                               // CTR block 4k+9
	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
	fmov	v1.d[1], x9                               // CTR block 4k+9
	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
	rev	w9, w12                                 // CTR block 4k+10
	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
	add	w12, w12, #1                            // CTR block 4k+10
	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
	fmov	d2, x10                               // CTR block 4k+10
	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
	fmov	v2.d[1], x9                               // CTR block 4k+10
	rev	w9, w12                                 // CTR block 4k+11
	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
	b.lt	Lenc_main_loop

Lenc_prepretail:	//	PREPRETAIL
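	// Prepretail: the next four keystream blocks keep being computed in
	// v0..v3 (for use by the tail) while the GHASH of the previous four
	// ciphertext results in v4..v7 is folded into the tag, leaving only the
	// final, possibly partial, blocks for the tail code below.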
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
	fmov	d3, x10                               // CTR block 4k+3
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
	fmov	v3.d[1], x9                               // CTR block 4k+3
	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
	mov	d10, v17.d[1]                               // GHASH block 4k - mid
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
	add	w12, w12, #1                            // CTR block 4k+3
	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
	movi	v8.8b, #0xc2
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
	shl	d8, d8, #56               // mod_constant
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
	pmull	v4.1q, v9.1d, v8.1d
	ext	v9.16b, v9.16b, v9.16b, #8
	eor	v10.16b, v10.16b, v11.16b
	b.lt	Lenc_finish_prepretail                           // branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
	b.eq	Lenc_finish_prepretail                           // branch if AES-192

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12

Lenc_finish_prepretail:
	eor	v10.16b, v10.16b, v4.16b
	eor	v10.16b, v10.16b, v9.16b
	pmull	v4.1q, v10.1d, v8.1d
	ext	v10.16b, v10.16b, v10.16b, #8
	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
	eor	v11.16b, v11.16b, v4.16b
	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
	eor	v11.16b, v11.16b, v10.16b

Lenc_tail:	//	TAIL
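	// Tail: between one and four blocks remain. Each full block is encrypted,
	// stored and fed into GHASH; for the final block, which may be partial, a
	// mask is built from the remaining bit count so only the valid bytes are
	// written (existing destination bytes are preserved via bif), then the
	// updated counter is stored back to [x16, #12] and the tag to [x3].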
	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
	eor	x6, x6, x13                      // AES block 4k+4 - round N low
	eor	x7, x7, x14                      // AES block 4k+4 - round N high
	cmp	x5, #48
	fmov	d4, x6                               // AES block 4k+4 - mov low
	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
	b.gt	Lenc_blocks_more_than_3
	cmp	x5, #32
	mov	v3.16b, v2.16b
	movi	v11.8b, #0
	movi	v9.8b, #0
	sub	w12, w12, #1
	mov	v2.16b, v1.16b
	movi	v10.8b, #0
	b.gt	Lenc_blocks_more_than_2
	mov	v3.16b, v1.16b
	sub	w12, w12, #1
	cmp	x5, #16
	b.gt	Lenc_blocks_more_than_1
	sub	w12, w12, #1
	b	Lenc_blocks_less_than_1
Lenc_blocks_more_than_3:	//	blocks left >  3
	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
	eor	x6, x6, x13                     // AES final-2 block - round N low
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	eor	x7, x7, x14                     // AES final-2 block - round N high
	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
	fmov	d5, x6                                // AES final-2 block - mov low
	fmov	v5.d[1], x7                            // AES final-2 block - mov high
	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
	movi	v8.8b, #0                                       // suppress further partial tag feed in
	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
Lenc_blocks_more_than_2:	//	blocks left >  2
	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
	eor	x6, x6, x13                     // AES final-1 block - round N low
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	fmov	d5, x6                                // AES final-1 block - mov low
	eor	x7, x7, x14                     // AES final-1 block - round N high
	fmov	v5.d[1], x7                            // AES final-1 block - mov high
	movi	v8.8b, #0                                       // suppress further partial tag feed in
	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
Lenc_blocks_more_than_1:	//	blocks left >  1
	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	movi	v8.8b, #0                                       // suppress further partial tag feed in
	eor	x6, x6, x13                     // AES final block - round N low
	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
	eor	x7, x7, x14                     // AES final block - round N high
	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
	fmov	d5, x6                                // AES final block - mov low
	fmov	v5.d[1], x7                            // AES final block - mov high
	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
Lenc_blocks_less_than_1:	//	blocks left <= 1
	and	x1, x1, #127                   // bit_length %= 128
	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
	sub	x1, x1, #128                   // bit_length -= 128
	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
	and	x1, x1, #127                   // bit_length %= 128
	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
	cmp	x1, #64
	csel	x6, x13, x14, lt
	csel	x7, x14, xzr, lt
	fmov	d0, x6                                // ctr0b is mask for last block
	fmov	v0.d[1], x7
	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
	rev64	v4.16b, v5.16b                                   // GHASH final block
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
	mov	d8, v4.d[1]                                 // GHASH final block - mid
	rev	w9, w12
	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
	movi	v8.8b, #0xc2
	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
	shl	d8, d8, #56              // mod_constant
	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
	str	w9, [x16, #12]                         // store the updated counter
	st1	{ v5.16b}, [x2]                         // store all 16B
	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	mov	x0, x15
	st1	{ v11.16b }, [x3]
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	d8, d9, [sp, #64]
	ldp	d10, d11, [sp, #80]
	ldp	d12, d13, [sp, #96]
	ldp	d14, d15, [sp, #112]
	ldp	x29, x30, [sp], #128
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl	aes_gcm_dec_kernel

.def aes_gcm_dec_kernel
   .type 32
.endef
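// The decryption kernel appears to mirror aes_gcm_enc_kernel, with the same
// register usage for its arguments (judging from the shared prologue below);
// the difference is that GHASH is computed over the loaded ciphertext blocks
// rather than over the freshly produced output.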
.align	4
aes_gcm_dec_kernel:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp, #-128]!
	mov	x29, sp
	stp	x19, x20, [sp, #16]
	mov	x16, x4
	mov	x8, x5
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]
	stp	d8, d9, [sp, #64]
	stp	d10, d11, [sp, #80]
	stp	d12, d13, [sp, #96]
	stp	d14, d15, [sp, #112]
	ldr	w17, [x8, #240]
	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
	ldp	x13, x14, [x19]                       // load round N keys
	ldr	q31, [x19, #-16]                        // load round N-1 keys
	lsr	x5, x1, #3              // byte_len
	mov	x15, x5
	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
	ldr	q26, [x8, #128]                                // load rk8
	sub	x5, x5, #1      // byte_len - 1
	ldr	q25, [x8, #112]                                // load rk7
	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	add	x4, x0, x1, lsr #3   // end_input_ptr
	ldr	q24, [x8, #96]                                 // load rk6
	lsr	x12, x11, #32
	ldr	q23, [x8, #80]                                 // load rk5
	orr	w11, w11, w11
	ldr	q21, [x8, #48]                                 // load rk3
	add	x5, x5, x0
	rev	w12, w12                                // rev_ctr32
	add	w12, w12, #1                            // increment rev_ctr32
	fmov	d3, x10                               // CTR block 3
	rev	w9, w12                                 // CTR block 1
	add	w12, w12, #1                            // CTR block 1
	fmov	d1, x10                               // CTR block 1
	orr	x9, x11, x9, lsl #32            // CTR block 1
	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
	fmov	v1.d[1], x9                               // CTR block 1
	rev	w9, w12                                 // CTR block 2
	add	w12, w12, #1                            // CTR block 2
	fmov	d2, x10                               // CTR block 2
	orr	x9, x11, x9, lsl #32            // CTR block 2
	fmov	v2.d[1], x9                               // CTR block 2
	rev	w9, w12                                 // CTR block 3
	orr	x9, x11, x9, lsl #32            // CTR block 3
	ldr	q18, [x8, #0]                                  // load rk0
	fmov	v3.d[1], x9                               // CTR block 3
	add	w12, w12, #1                            // CTR block 3
	ldr	q22, [x8, #64]                                 // load rk4
	ldr	q19, [x8, #16]                                 // load rk1
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
	ldr	q14, [x6, #48]                              // load h3l | h3h
	ext	v14.16b, v14.16b, v14.16b, #8
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
	ldr	q15, [x6, #80]                              // load h4l | h4h
	ext	v15.16b, v15.16b, v15.16b, #8
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
	ldr	q13, [x6, #32]                              // load h2l | h2h
	ext	v13.16b, v13.16b, v13.16b, #8
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
	ldr	q20, [x8, #32]                                 // load rk2
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
	ld1	{ v11.16b}, [x3]
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
	ldr	q27, [x8, #144]                                // load rk9
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
	ldr	q30, [x8, #192]                               // load rk12
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
	ldr	q12, [x6]                                   // load h1l | h1h
	ext	v12.16b, v12.16b, v12.16b, #8
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
	ldr	q28, [x8, #160]                               // load rk10
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
	ldr	q29, [x8, #176]                               // load rk11
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
	b.lt	Ldec_finish_first_blocks                         // branch if AES-128

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
	b.eq	Ldec_finish_first_blocks                         // branch if AES-192

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b          // AES block 3 - round 12

Ldec_finish_first_blocks:
	cmp	x0, x5                   // check if we have <= 4 blocks
	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
	b.ge	Ldec_tail                                        // handle tail

	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
	rev	w9, w12                                 // CTR block 4
	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
	rev64	v5.16b, v5.16b                                    // GHASH block 1
	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
	mov	x7, v0.d[1]                            // AES block 0 - mov high
	mov	x6, v0.d[0]                            // AES block 0 - mov low
	rev64	v4.16b, v4.16b                                    // GHASH block 0
	add	w12, w12, #1                            // CTR block 4
	fmov	d0, x10                               // CTR block 4
	orr	x9, x11, x9, lsl #32            // CTR block 4
	fmov	v0.d[1], x9                               // CTR block 4
	rev	w9, w12                                 // CTR block 5
	add	w12, w12, #1                            // CTR block 5
	mov	x19, v1.d[0]                            // AES block 1 - mov low
	orr	x9, x11, x9, lsl #32            // CTR block 5
	mov	x20, v1.d[1]                            // AES block 1 - mov high
	eor	x7, x7, x14                    // AES block 0 - round N high
	eor	x6, x6, x13                    // AES block 0 - round N low
	stp	x6, x7, [x2], #16        // AES block 0 - store result
	fmov	d1, x10                               // CTR block 5
	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
	add	x0, x0, #64                       // AES input_ptr update
	fmov	v1.d[1], x9                               // CTR block 5
	rev	w9, w12                                 // CTR block 6
	add	w12, w12, #1                            // CTR block 6
	eor	x19, x19, x13                    // AES block 1 - round N low
	orr	x9, x11, x9, lsl #32            // CTR block 6
	eor	x20, x20, x14                    // AES block 1 - round N high
	stp	x19, x20, [x2], #16        // AES block 1 - store result
	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
	cmp	x0, x5                   // check if we have <= 8 blocks
	b.ge	Ldec_prepretail                                  // do prepretail

Ldec_main_loop:	//	main loop start
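	// As in the encryption main loop, four blocks are handled per iteration,
	// but here the rev64'd ciphertext (v4..v7, loaded from the input) feeds
	// GHASH while the counter blocks are encrypted to produce the keystream.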
	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
	fmov	d2, x10                               // CTR block 4k+6
	fmov	v2.d[1], x9                               // CTR block 4k+6
	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
	rev	w9, w12                                 // CTR block 4k+7
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
	fmov	d3, x10                               // CTR block 4k+7
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
	fmov	v3.d[1], x9                               // CTR block 4k+7
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
	eor	x22, x22, x14                    // AES block 4k+2 - round N high
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
	mov	d10, v17.d[1]                               // GHASH block 4k - mid
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
	eor	x21, x21, x13                    // AES block 4k+2 - round N low
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
	eor	x23, x23, x13                    // AES block 4k+3 - round N low
	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
	eor	x24, x24, x14                    // AES block 4k+3 - round N high
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
	add	w12, w12, #1                            // CTR block 4k+7
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
	rev	w9, w12                                 // CTR block 4k+8
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
	add	w12, w12, #1                            // CTR block 4k+8
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
	movi	v8.8b, #0xc2
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
	shl	d8, d8, #56               // mod_constant
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
	b.lt	Ldec_main_loop_continue                          // branch if AES-128

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
	b.eq	Ldec_main_loop_continue                          // branch if AES-192

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12

Ldec_main_loop_continue:	//	MAIN LOOP - common tail for AES-128/192/256
	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
	add	x0, x0, #64                       // AES input_ptr update
	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
	fmov	d0, x10                               // CTR block 4k+8
	fmov	v0.d[1], x9                               // CTR block 4k+8
	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
	rev	w9, w12                                 // CTR block 4k+9
	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
	cmp	x0, x5                   // LOOP CONTROL
	add	w12, w12, #1                            // CTR block 4k+9
	eor	x6, x6, x13                    // AES block 4k+4 - round N low
	eor	x7, x7, x14                    // AES block 4k+4 - round N high
	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
	fmov	d1, x10                               // CTR block 4k+9
	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
	fmov	v1.d[1], x9                               // CTR block 4k+9
	rev	w9, w12                                 // CTR block 4k+10
	add	w12, w12, #1                            // CTR block 4k+10
	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
	eor	x20, x20, x14                    // AES block 4k+5 - round N high
	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
	eor	x19, x19, x13                    // AES block 4k+5 - round N low
	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
	b.lt	Ldec_main_loop

Ldec_prepretail:	//	PREPRETAIL
	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
	fmov	d2, x10                               // CTR block 4k+6
	fmov	v2.d[1], x9                               // CTR block 4k+6
	rev	w9, w12                                 // CTR block 4k+7
	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
	fmov	d3, x10                               // CTR block 4k+7
	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
	fmov	v3.d[1], x9                               // CTR block 4k+7
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
	mov	d10, v17.d[1]                               // GHASH block 4k - mid
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
	movi	v8.8b, #0xc2
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
	shl	d8, d8, #56               // mod_constant
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
	b.lt	Ldec_finish_prepretail                           // branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
	b.eq	Ldec_finish_prepretail                           // branch if AES-192

	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12

Ldec_finish_prepretail:	//	PREPRETAIL - common tail for AES-128/192/256
	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
	eor	x22, x22, x14                    // AES block 4k+2 - round N high
	eor	x23, x23, x13                    // AES block 4k+3 - round N low
	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
	add	w12, w12, #1                            // CTR block 4k+7
	eor	x21, x21, x13                    // AES block 4k+2 - round N low
	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
	eor	x24, x24, x14                    // AES block 4k+3 - round N high
	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result

	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low

Ldec_tail:	//	TAIL
	sub	x5, x4, x0              // x5 (main_end_input_ptr) = number of bytes left to process
	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
	cmp	x5, #48
	eor	x6, x6, x13                    // AES block 4k+4 - round N low
	eor	x7, x7, x14                    // AES block 4k+4 - round N high
	b.gt	Ldec_blocks_more_than_3
	sub	w12, w12, #1
	mov	v3.16b, v2.16b
	movi	v10.8b, #0
	movi	v11.8b, #0
	cmp	x5, #32
	movi	v9.8b, #0
	mov	v2.16b, v1.16b
	b.gt	Ldec_blocks_more_than_2
	sub	w12, w12, #1
	mov	v3.16b, v1.16b
	cmp	x5, #16
	b.gt	Ldec_blocks_more_than_1
	sub	w12, w12, #1
	b	Ldec_blocks_less_than_1
Ldec_blocks_more_than_3:	//	blocks left >  3
	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
	mov	x6, v0.d[0]                           // AES final-2 block - mov low
	mov	x7, v0.d[1]                           // AES final-2 block - mov high
	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
	movi	v8.8b, #0                                       // suppress further partial tag feed in
	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
	eor	x6, x6, x13                   // AES final-2 block - round N low
	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
	eor	x7, x7, x14                   // AES final-2 block - round N high
Ldec_blocks_more_than_2:	//	blocks left >  2
	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
	mov	x6, v0.d[0]                           // AES final-1 block - mov low
	mov	x7, v0.d[1]                           // AES final-1 block - mov high
	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
	movi	v8.8b, #0                                       // suppress further partial tag feed in
	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
	eor	x6, x6, x13                   // AES final-1 block - round N low
	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
	eor	x7, x7, x14                   // AES final-1 block - round N high
Ldec_blocks_more_than_1:	//	blocks left >  1
	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
	movi	v8.8b, #0                                       // suppress further partial tag feed in
	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
	mov	x6, v0.d[0]                           // AES final block - mov low
	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
	mov	x7, v0.d[1]                           // AES final block - mov high
	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
	eor	x6, x6, x13                   // AES final block - round N low
	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
	eor	x7, x7, x14                   // AES final block - round N high
Ldec_blocks_less_than_1:	//	blocks left <= 1
	and	x1, x1, #127                   // bit_length %= 128
	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
	sub	x1, x1, #128                   // bit_length -= 128
	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
	ldp	x4, x5, [x2]                          // load existing output bytes that must not be overwritten
	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
	and	x1, x1, #127                   // bit_length %= 128
	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
	cmp	x1, #64
	csel	x9, x13, x14, lt
	csel	x10, x14, xzr, lt
	fmov	d0, x9                                  // ctr0b is mask for last block
	and	x6, x6, x9
	mov	v0.d[1], x10
	bic	x4, x4, x9          // mask out low existing bytes
	rev	w9, w12
	bic	x5, x5, x10      // mask out high existing bytes
	orr	x6, x6, x4
	and	x7, x7, x10
	orr	x7, x7, x5
	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
	rev64	v4.16b, v5.16b                                    // GHASH final block
	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
	mov	d8, v4.d[1]                                  // GHASH final block - mid
	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
	movi	v8.8b, #0xc2
	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
	shl	d8, d8, #56               // mod_constant
	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
	stp	x6, x7, [x2]
	str	w9, [x16, #12]                          // store the updated counter
	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	mov	x0, x15
	st1	{ v11.16b }, [x3]
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	d8, d9, [sp, #64]
	ldp	d10, d11, [sp, #80]
	ldp	d12, d13, [sp, #96]
	ldp	d14, d15, [sp, #112]
	ldp	x29, x30, [sp], #128
	AARCH64_VALIDATE_LINK_REGISTER
	ret

#endif
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)