; This file is generated from a similarly-named Perl script in the BoringSSL
; source tree. Do not edit by hand.
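;
; AES-128/256-GCM-SIV (RFC 8452) primitives for x86-64, built on AES-NI,
; PCLMULQDQ and AVX: POLYVAL helpers, AES key expansion, the nonce-based
; KDF, and counter-mode bulk encryption/decryption. Parameter names in
; the comments below are descriptive annotations, not from the source.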

%ifidn __OUTPUT_FORMAT__, win64
default	rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
%define _CET_ENDBR

%ifdef BORINGSSL_PREFIX
%include "boringssl_prefix_symbols_nasm.inc"
%endif
section	.rdata rdata align=8
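; one..eight: little-endian 128-bit counter increments. OR_MASK sets the
; top bit of the counter block; poly is the POLYVAL reduction constant;
; mask/con1/con2/con3 drive the AES key schedule; and_mask clears the
; 32-bit counter field in the KDF.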

ALIGN	16
one:
	DQ	1,0
two:
	DQ	2,0
three:
	DQ	3,0
four:
	DQ	4,0
five:
	DQ	5,0
six:
	DQ	6,0
seven:
	DQ	7,0
eight:
	DQ	8,0

OR_MASK:
	DD	0x00000000,0x00000000,0x00000000,0x80000000
poly:
	DQ	0x1,0xc200000000000000
mask:
	DD	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
	DD	1,1,1,1
con2:
	DD	0x1b,0x1b,0x1b,0x1b
con3:
	DB	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
	DD	0,0xffffffff,0xffffffff,0xffffffff
section	.text code align=64


ALIGN	16
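; GFMUL: multiply xmm0 by xmm1 in POLYVAL's field GF(2^128), reducing
; with the poly constant; result in xmm0, clobbers xmm2-xmm5.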
GFMUL:

	vpclmulqdq	xmm2,xmm0,xmm1,0x00
	vpclmulqdq	xmm5,xmm0,xmm1,0x11
	vpclmulqdq	xmm3,xmm0,xmm1,0x10
	vpclmulqdq	xmm4,xmm0,xmm1,0x01
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm3,8
	vpsrldq	xmm3,xmm3,8
	vpxor	xmm2,xmm2,xmm4
	vpxor	xmm5,xmm5,xmm3

	vpclmulqdq	xmm3,xmm2,XMMWORD[poly],0x10
	vpshufd	xmm4,xmm2,78
	vpxor	xmm2,xmm3,xmm4

	vpclmulqdq	xmm3,xmm2,XMMWORD[poly],0x10
	vpshufd	xmm4,xmm2,78
	vpxor	xmm2,xmm3,xmm4

	vpxor	xmm0,xmm2,xmm5
	ret


global	aesgcmsiv_htable_init
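; aesgcmsiv_htable_init(out_htable[16*8], H[16]): writes H^1..H^8 to
; out_htable for use by aesgcmsiv_htable_polyval.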

ALIGN	16
aesgcmsiv_htable_init:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesgcmsiv_htable_init:
	mov	rdi,rcx
	mov	rsi,rdx



_CET_ENDBR
	vmovdqa	xmm0,XMMWORD[rsi]
	vmovdqa	xmm1,xmm0
	vmovdqa	XMMWORD[rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[16+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[32+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[48+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[64+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[80+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[96+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[112+rdi],xmm0
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aesgcmsiv_htable_init:
global	aesgcmsiv_htable6_init
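; aesgcmsiv_htable6_init(out_htable[16*6], H[16]): writes H^1..H^6.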

ALIGN	16
aesgcmsiv_htable6_init:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesgcmsiv_htable6_init:
	mov	rdi,rcx
	mov	rsi,rdx



_CET_ENDBR
	vmovdqa	xmm0,XMMWORD[rsi]
	vmovdqa	xmm1,xmm0
	vmovdqa	XMMWORD[rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[16+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[32+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[48+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[64+rdi],xmm0
	call	GFMUL
	vmovdqa	XMMWORD[80+rdi],xmm0
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aesgcmsiv_htable6_init:
global	aesgcmsiv_htable_polyval
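; aesgcmsiv_htable_polyval(htable, in, in_len, in_out_poly[16]): folds
; in_len bytes into the POLYVAL accumulator using the precomputed powers
; of H: any sub-128-byte prefix first, then 128 bytes (8 blocks) per
; main-loop iteration with a single deferred reduction.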

ALIGN	16
aesgcmsiv_htable_polyval:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesgcmsiv_htable_polyval:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9



_CET_ENDBR
	test	rdx,rdx
	jnz	NEAR $L$htable_polyval_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$htable_polyval_start:
	vzeroall



	mov	r11,rdx
	and	r11,127

	jz	NEAR $L$htable_polyval_no_prefix

	vpxor	xmm9,xmm9,xmm9
	vmovdqa	xmm1,XMMWORD[rcx]
	sub	rdx,r11

	sub	r11,16


	vmovdqu	xmm0,XMMWORD[rsi]
	vpxor	xmm0,xmm0,xmm1

	vpclmulqdq	xmm5,xmm0,XMMWORD[r11*1+rdi],0x01
	vpclmulqdq	xmm3,xmm0,XMMWORD[r11*1+rdi],0x00
	vpclmulqdq	xmm4,xmm0,XMMWORD[r11*1+rdi],0x11
	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
	vpxor	xmm5,xmm5,xmm6

	lea	rsi,[16+rsi]
	test	r11,r11
	jnz	NEAR $L$htable_polyval_prefix_loop
	jmp	NEAR $L$htable_polyval_prefix_complete


ALIGN	64
$L$htable_polyval_prefix_loop:
	sub	r11,16

	vmovdqu	xmm0,XMMWORD[rsi]

	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
	vpxor	xmm5,xmm5,xmm6

	test	r11,r11

	lea	rsi,[16+rsi]

	jnz	NEAR $L$htable_polyval_prefix_loop

$L$htable_polyval_prefix_complete:
	vpsrldq	xmm6,xmm5,8
	vpslldq	xmm5,xmm5,8

	vpxor	xmm9,xmm4,xmm6
	vpxor	xmm1,xmm3,xmm5

	jmp	NEAR $L$htable_polyval_main_loop

$L$htable_polyval_no_prefix:




	vpxor	xmm1,xmm1,xmm1
	vmovdqa	xmm9,XMMWORD[rcx]

ALIGN	64
$L$htable_polyval_main_loop:
	sub	rdx,0x80
	jb	NEAR $L$htable_polyval_out

	vmovdqu	xmm0,XMMWORD[112+rsi]

	vpclmulqdq	xmm5,xmm0,XMMWORD[rdi],0x01
	vpclmulqdq	xmm3,xmm0,XMMWORD[rdi],0x00
	vpclmulqdq	xmm4,xmm0,XMMWORD[rdi],0x11
	vpclmulqdq	xmm6,xmm0,XMMWORD[rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vmovdqu	xmm0,XMMWORD[96+rsi]
	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x10
	vpxor	xmm5,xmm5,xmm6



	vmovdqu	xmm0,XMMWORD[80+rsi]

	vpclmulqdq	xmm7,xmm1,XMMWORD[poly],0x10
	vpalignr	xmm1,xmm1,xmm1,8

	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vpxor	xmm1,xmm1,xmm7

	vmovdqu	xmm0,XMMWORD[64+rsi]

	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vmovdqu	xmm0,XMMWORD[48+rsi]

	vpclmulqdq	xmm7,xmm1,XMMWORD[poly],0x10
	vpalignr	xmm1,xmm1,xmm1,8

	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vpxor	xmm1,xmm1,xmm7

	vmovdqu	xmm0,XMMWORD[32+rsi]

	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vpxor	xmm1,xmm1,xmm9

	vmovdqu	xmm0,XMMWORD[16+rsi]

	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vmovdqu	xmm0,XMMWORD[rsi]
	vpxor	xmm0,xmm0,xmm1

	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x01
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x00
	vpxor	xmm3,xmm3,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x11
	vpxor	xmm4,xmm4,xmm6
	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x10
	vpxor	xmm5,xmm5,xmm6


	vpsrldq	xmm6,xmm5,8
	vpslldq	xmm5,xmm5,8

	vpxor	xmm9,xmm4,xmm6
	vpxor	xmm1,xmm3,xmm5

	lea	rsi,[128+rsi]
	jmp	NEAR $L$htable_polyval_main_loop



$L$htable_polyval_out:
	vpclmulqdq	xmm6,xmm1,XMMWORD[poly],0x10
	vpalignr	xmm1,xmm1,xmm1,8
	vpxor	xmm1,xmm1,xmm6

	vpclmulqdq	xmm6,xmm1,XMMWORD[poly],0x10
	vpalignr	xmm1,xmm1,xmm1,8
	vpxor	xmm1,xmm1,xmm6
	vpxor	xmm1,xmm1,xmm9

	vmovdqu	XMMWORD[rcx],xmm1
	vzeroupper
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aesgcmsiv_htable_polyval:
global	aesgcmsiv_polyval_horner
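; aesgcmsiv_polyval_horner(in_out_poly[16], H[16], in, in_n_blocks):
; POLYVAL by Horner's rule, one 16-byte block per GFMUL call.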

ALIGN	16
aesgcmsiv_polyval_horner:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesgcmsiv_polyval_horner:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9



_CET_ENDBR
	test	rcx,rcx
	jnz	NEAR $L$polyval_horner_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$polyval_horner_start:



	xor	r10,r10
	shl	rcx,4

	vmovdqa	xmm1,XMMWORD[rsi]
	vmovdqa	xmm0,XMMWORD[rdi]

$L$polyval_horner_loop:
	vpxor	xmm0,xmm0,XMMWORD[r10*1+rdx]
	call	GFMUL

	add	r10,16
	cmp	rcx,r10
	jne	NEAR $L$polyval_horner_loop


	vmovdqa	XMMWORD[rdi],xmm0
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aesgcmsiv_polyval_horner:
global	aes128gcmsiv_aes_ks
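; aes128gcmsiv_aes_ks(key[16], out_expanded_key[16*11]): expands an
; AES-128 key into its 11 round keys.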

ALIGN	16
aes128gcmsiv_aes_ks:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_aes_ks:
	mov	rdi,rcx
	mov	rsi,rdx



_CET_ENDBR
	vmovdqu	xmm1,XMMWORD[rdi]
	vmovdqa	XMMWORD[rsi],xmm1

	vmovdqa	xmm0,XMMWORD[con1]
	vmovdqa	xmm15,XMMWORD[mask]

	mov	rax,8

$L$ks128_loop:
	add	rsi,16
	sub	rax,1
	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm3,xmm1,4
	vpxor	xmm1,xmm1,xmm3
	vpslldq	xmm3,xmm3,4
	vpxor	xmm1,xmm1,xmm3
	vpslldq	xmm3,xmm3,4
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2
	vmovdqa	XMMWORD[rsi],xmm1
	jne	NEAR $L$ks128_loop

	vmovdqa	xmm0,XMMWORD[con2]
	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm3,xmm1,4
	vpxor	xmm1,xmm1,xmm3
	vpslldq	xmm3,xmm3,4
	vpxor	xmm1,xmm1,xmm3
	vpslldq	xmm3,xmm3,4
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2
	vmovdqa	XMMWORD[16+rsi],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslldq	xmm3,xmm1,4
	vpxor	xmm1,xmm1,xmm3
	vpslldq	xmm3,xmm3,4
	vpxor	xmm1,xmm1,xmm3
	vpslldq	xmm3,xmm3,4
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2
	vmovdqa	XMMWORD[32+rsi],xmm1
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_aes_ks:
global	aes256gcmsiv_aes_ks
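; aes256gcmsiv_aes_ks(key[32], out_expanded_key[16*15]): expands an
; AES-256 key into its 15 round keys.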

ALIGN	16
aes256gcmsiv_aes_ks:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_aes_ks:
	mov	rdi,rcx
	mov	rsi,rdx



_CET_ENDBR
	vmovdqu	xmm1,XMMWORD[rdi]
	vmovdqu	xmm3,XMMWORD[16+rdi]
	vmovdqa	XMMWORD[rsi],xmm1
	vmovdqa	XMMWORD[16+rsi],xmm3
	vmovdqa	xmm0,XMMWORD[con1]
	vmovdqa	xmm15,XMMWORD[mask]
	vpxor	xmm14,xmm14,xmm14
	mov	rax,6

$L$ks256_loop:
	add	rsi,32
	sub	rax,1
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm4,xmm1,32
	vpxor	xmm1,xmm1,xmm4
	vpshufb	xmm4,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vmovdqa	XMMWORD[rsi],xmm1
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpsllq	xmm4,xmm3,32
	vpxor	xmm3,xmm3,xmm4
	vpshufb	xmm4,xmm3,XMMWORD[con3]
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vmovdqa	XMMWORD[16+rsi],xmm3
	jne	NEAR $L$ks256_loop

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpsllq	xmm4,xmm1,32
	vpxor	xmm1,xmm1,xmm4
	vpshufb	xmm4,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vmovdqa	XMMWORD[32+rsi],xmm1
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

global	aes128gcmsiv_aes_ks_enc_x1
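; aes128gcmsiv_aes_ks_enc_x1(in[16], out[16], out_expanded_key, key[16]):
; expands the AES-128 key and encrypts one block with it in a single pass.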

ALIGN	16
aes128gcmsiv_aes_ks_enc_x1:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9



_CET_ENDBR
	vmovdqa	xmm1,XMMWORD[rcx]
	vmovdqa	xmm4,XMMWORD[rdi]

	vmovdqa	XMMWORD[rdx],xmm1
	vpxor	xmm4,xmm4,xmm1

	vmovdqa	xmm0,XMMWORD[con1]
	vmovdqa	xmm15,XMMWORD[mask]

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[16+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[32+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[48+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[64+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[80+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[96+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[112+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[128+rdx],xmm1


	vmovdqa	xmm0,XMMWORD[con2]

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenc	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[144+rdx],xmm1

	vpshufb	xmm2,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpsllq	xmm3,xmm1,32
	vpxor	xmm1,xmm1,xmm3
	vpshufb	xmm3,xmm1,XMMWORD[con3]
	vpxor	xmm1,xmm1,xmm3
	vpxor	xmm1,xmm1,xmm2

	vaesenclast	xmm4,xmm4,xmm1
	vmovdqa	XMMWORD[160+rdx],xmm1


	vmovdqa	XMMWORD[rsi],xmm4
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1:
global	aes128gcmsiv_kdf
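; aes128gcmsiv_kdf(nonce_block[16], out[64], key_schedule): AES-GCM-SIV
; key derivation; encrypts the four counter blocks 0..3 built from the
; nonce and writes all 64 bytes, the caller extracting the key-material
; halves per RFC 8452.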

ALIGN	16
aes128gcmsiv_kdf:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_kdf:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_CET_ENDBR




	vmovdqa	xmm1,XMMWORD[rdx]
	vmovdqa	xmm9,XMMWORD[rdi]
	vmovdqa	xmm12,XMMWORD[and_mask]
	vmovdqa	xmm13,XMMWORD[one]
	vpshufd	xmm9,xmm9,0x90
	vpand	xmm9,xmm9,xmm12
	vpaddd	xmm10,xmm9,xmm13
	vpaddd	xmm11,xmm10,xmm13
	vpaddd	xmm12,xmm11,xmm13

	vpxor	xmm9,xmm9,xmm1
	vpxor	xmm10,xmm10,xmm1
	vpxor	xmm11,xmm11,xmm1
	vpxor	xmm12,xmm12,xmm1

	vmovdqa	xmm1,XMMWORD[16+rdx]
	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1

	vmovdqa	xmm2,XMMWORD[32+rdx]
	vaesenc	xmm9,xmm9,xmm2
	vaesenc	xmm10,xmm10,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2

	vmovdqa	xmm1,XMMWORD[48+rdx]
	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1

	vmovdqa	xmm2,XMMWORD[64+rdx]
	vaesenc	xmm9,xmm9,xmm2
	vaesenc	xmm10,xmm10,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2

	vmovdqa	xmm1,XMMWORD[80+rdx]
	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1

	vmovdqa	xmm2,XMMWORD[96+rdx]
	vaesenc	xmm9,xmm9,xmm2
	vaesenc	xmm10,xmm10,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2

	vmovdqa	xmm1,XMMWORD[112+rdx]
	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1

	vmovdqa	xmm2,XMMWORD[128+rdx]
	vaesenc	xmm9,xmm9,xmm2
	vaesenc	xmm10,xmm10,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2

	vmovdqa	xmm1,XMMWORD[144+rdx]
	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1

	vmovdqa	xmm2,XMMWORD[160+rdx]
	vaesenclast	xmm9,xmm9,xmm2
	vaesenclast	xmm10,xmm10,xmm2
	vaesenclast	xmm11,xmm11,xmm2
	vaesenclast	xmm12,xmm12,xmm2


	vmovdqa	XMMWORD[rsi],xmm9
	vmovdqa	XMMWORD[16+rsi],xmm10
	vmovdqa	XMMWORD[32+rsi],xmm11
	vmovdqa	XMMWORD[48+rsi],xmm12
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_kdf:
global	aes128gcmsiv_enc_msg_x4
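; aes128gcmsiv_enc_msg_x4(in, out, tag[16], key_schedule, in_len):
; CTR-mode encryption, four blocks per iteration; the initial counter is
; the tag with its top bit set (OR_MASK), leftover blocks done singly.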

ALIGN	16
aes128gcmsiv_enc_msg_x4:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_enc_msg_x4:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



_CET_ENDBR
	test	r8,r8
	jnz	NEAR $L$128_enc_msg_x4_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$128_enc_msg_x4_start:
	push	r12

	push	r13


	shr	r8,4
	mov	r10,r8
	shl	r10,62
	shr	r10,62


	vmovdqa	xmm15,XMMWORD[rdx]
	vpor	xmm15,xmm15,XMMWORD[OR_MASK]

	vmovdqu	xmm4,XMMWORD[four]
	vmovdqa	xmm0,xmm15
	vpaddd	xmm1,xmm15,XMMWORD[one]
	vpaddd	xmm2,xmm15,XMMWORD[two]
	vpaddd	xmm3,xmm15,XMMWORD[three]

	shr	r8,2
	je	NEAR $L$128_enc_msg_x4_check_remainder

	sub	rsi,64
	sub	rdi,64

$L$128_enc_msg_x4_loop1:
	add	rsi,64
	add	rdi,64

	vmovdqa	xmm5,xmm0
	vmovdqa	xmm6,xmm1
	vmovdqa	xmm7,xmm2
	vmovdqa	xmm8,xmm3

	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vpxor	xmm6,xmm6,XMMWORD[rcx]
	vpxor	xmm7,xmm7,XMMWORD[rcx]
	vpxor	xmm8,xmm8,XMMWORD[rcx]

	vmovdqu	xmm12,XMMWORD[16+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm0,xmm0,xmm4
	vmovdqu	xmm12,XMMWORD[32+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm1,xmm1,xmm4
	vmovdqu	xmm12,XMMWORD[48+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm2,xmm2,xmm4
	vmovdqu	xmm12,XMMWORD[64+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm3,xmm3,xmm4

	vmovdqu	xmm12,XMMWORD[80+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[96+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[112+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[128+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[144+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[160+rcx]
	vaesenclast	xmm5,xmm5,xmm12
	vaesenclast	xmm6,xmm6,xmm12
	vaesenclast	xmm7,xmm7,xmm12
	vaesenclast	xmm8,xmm8,xmm12



	vpxor	xmm5,xmm5,XMMWORD[rdi]
	vpxor	xmm6,xmm6,XMMWORD[16+rdi]
	vpxor	xmm7,xmm7,XMMWORD[32+rdi]
	vpxor	xmm8,xmm8,XMMWORD[48+rdi]

	sub	r8,1

	vmovdqu	XMMWORD[rsi],xmm5
	vmovdqu	XMMWORD[16+rsi],xmm6
	vmovdqu	XMMWORD[32+rsi],xmm7
	vmovdqu	XMMWORD[48+rsi],xmm8

	jne	NEAR $L$128_enc_msg_x4_loop1

	add	rsi,64
	add	rdi,64

$L$128_enc_msg_x4_check_remainder:
	cmp	r10,0
	je	NEAR $L$128_enc_msg_x4_out

$L$128_enc_msg_x4_loop2:


	vmovdqa	xmm5,xmm0
	vpaddd	xmm0,xmm0,XMMWORD[one]

	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vaesenc	xmm5,xmm5,XMMWORD[16+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[32+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[48+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[64+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[80+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[96+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[112+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[128+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[144+rcx]
	vaesenclast	xmm5,xmm5,XMMWORD[160+rcx]


	vpxor	xmm5,xmm5,XMMWORD[rdi]
	vmovdqu	XMMWORD[rsi],xmm5

	add	rdi,16
	add	rsi,16

	sub	r10,1
	jne	NEAR $L$128_enc_msg_x4_loop2

$L$128_enc_msg_x4_out:
	pop	r13

	pop	r12

	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_enc_msg_x4:
global	aes128gcmsiv_enc_msg_x8
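; aes128gcmsiv_enc_msg_x8(in, out, tag[16], key_schedule, in_len): as
; the x4 variant, but eight blocks per iteration (the eighth counter is
; kept in a stack slot).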

ALIGN	16
aes128gcmsiv_enc_msg_x8:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_enc_msg_x8:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



_CET_ENDBR
	test	r8,r8
	jnz	NEAR $L$128_enc_msg_x8_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$128_enc_msg_x8_start:
	push	r12

	push	r13

	push	rbp

	mov	rbp,rsp



	sub	rsp,128
	and	rsp,-64

	shr	r8,4
	mov	r10,r8
	shl	r10,61
	shr	r10,61


	vmovdqu	xmm1,XMMWORD[rdx]
	vpor	xmm1,xmm1,XMMWORD[OR_MASK]


	vpaddd	xmm0,xmm1,XMMWORD[seven]
	vmovdqu	XMMWORD[rsp],xmm0
	vpaddd	xmm9,xmm1,XMMWORD[one]
	vpaddd	xmm10,xmm1,XMMWORD[two]
	vpaddd	xmm11,xmm1,XMMWORD[three]
	vpaddd	xmm12,xmm1,XMMWORD[four]
	vpaddd	xmm13,xmm1,XMMWORD[five]
	vpaddd	xmm14,xmm1,XMMWORD[six]
	vmovdqa	xmm0,xmm1

	shr	r8,3
	je	NEAR $L$128_enc_msg_x8_check_remainder

	sub	rsi,128
	sub	rdi,128

$L$128_enc_msg_x8_loop1:
	add	rsi,128
	add	rdi,128

	vmovdqa	xmm1,xmm0
	vmovdqa	xmm2,xmm9
	vmovdqa	xmm3,xmm10
	vmovdqa	xmm4,xmm11
	vmovdqa	xmm5,xmm12
	vmovdqa	xmm6,xmm13
	vmovdqa	xmm7,xmm14

	vmovdqu	xmm8,XMMWORD[rsp]

	vpxor	xmm1,xmm1,XMMWORD[rcx]
	vpxor	xmm2,xmm2,XMMWORD[rcx]
	vpxor	xmm3,xmm3,XMMWORD[rcx]
	vpxor	xmm4,xmm4,XMMWORD[rcx]
	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vpxor	xmm6,xmm6,XMMWORD[rcx]
	vpxor	xmm7,xmm7,XMMWORD[rcx]
	vpxor	xmm8,xmm8,XMMWORD[rcx]

	vmovdqu	xmm15,XMMWORD[16+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm14,XMMWORD[rsp]
	vpaddd	xmm14,xmm14,XMMWORD[eight]
	vmovdqu	XMMWORD[rsp],xmm14
	vmovdqu	xmm15,XMMWORD[32+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpsubd	xmm14,xmm14,XMMWORD[one]
	vmovdqu	xmm15,XMMWORD[48+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm0,xmm0,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[64+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm9,xmm9,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[80+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm10,xmm10,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[96+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm11,xmm11,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[112+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm12,xmm12,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[128+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm13,xmm13,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[144+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[160+rcx]
	vaesenclast	xmm1,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm15
	vaesenclast	xmm3,xmm3,xmm15
	vaesenclast	xmm4,xmm4,xmm15
	vaesenclast	xmm5,xmm5,xmm15
	vaesenclast	xmm6,xmm6,xmm15
	vaesenclast	xmm7,xmm7,xmm15
	vaesenclast	xmm8,xmm8,xmm15



	vpxor	xmm1,xmm1,XMMWORD[rdi]
	vpxor	xmm2,xmm2,XMMWORD[16+rdi]
	vpxor	xmm3,xmm3,XMMWORD[32+rdi]
	vpxor	xmm4,xmm4,XMMWORD[48+rdi]
	vpxor	xmm5,xmm5,XMMWORD[64+rdi]
	vpxor	xmm6,xmm6,XMMWORD[80+rdi]
	vpxor	xmm7,xmm7,XMMWORD[96+rdi]
	vpxor	xmm8,xmm8,XMMWORD[112+rdi]

	dec	r8

	vmovdqu	XMMWORD[rsi],xmm1
	vmovdqu	XMMWORD[16+rsi],xmm2
	vmovdqu	XMMWORD[32+rsi],xmm3
	vmovdqu	XMMWORD[48+rsi],xmm4
	vmovdqu	XMMWORD[64+rsi],xmm5
	vmovdqu	XMMWORD[80+rsi],xmm6
	vmovdqu	XMMWORD[96+rsi],xmm7
	vmovdqu	XMMWORD[112+rsi],xmm8

	jne	NEAR $L$128_enc_msg_x8_loop1

	add	rsi,128
	add	rdi,128

$L$128_enc_msg_x8_check_remainder:
	cmp	r10,0
	je	NEAR $L$128_enc_msg_x8_out

$L$128_enc_msg_x8_loop2:


	vmovdqa	xmm1,xmm0
	vpaddd	xmm0,xmm0,XMMWORD[one]

	vpxor	xmm1,xmm1,XMMWORD[rcx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rcx]
	vaesenclast	xmm1,xmm1,XMMWORD[160+rcx]


	vpxor	xmm1,xmm1,XMMWORD[rdi]

	vmovdqu	XMMWORD[rsi],xmm1

	add	rdi,16
	add	rsi,16

	dec	r10
	jne	NEAR $L$128_enc_msg_x8_loop2

$L$128_enc_msg_x8_out:
	mov	rsp,rbp

	pop	rbp

	pop	r13

	pop	r12

	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_enc_msg_x8:
global	aes128gcmsiv_dec
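; aes128gcmsiv_dec(in, out, polyval_state, htable, key_schedule, in_len):
; decrypts in_len & ~15 bytes in CTR mode while folding the plaintext
; into the POLYVAL accumulator; polyval_state appears to hold the
; accumulator, then the initial counter block, then scratch space.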

ALIGN	16
aes128gcmsiv_dec:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_dec:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



_CET_ENDBR
	test	r9,~15
	jnz	NEAR $L$128_dec_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$128_dec_start:
	vzeroupper
	vmovdqa	xmm0,XMMWORD[rdx]


	vmovdqu	xmm15,XMMWORD[16+rdx]
	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
	mov	rax,rdx

	lea	rax,[32+rax]
	lea	rcx,[32+rcx]

	and	r9,~15


	cmp	r9,96
	jb	NEAR $L$128_dec_loop2


	sub	r9,96
	vmovdqa	xmm7,xmm15
	vpaddd	xmm8,xmm7,XMMWORD[one]
	vpaddd	xmm9,xmm7,XMMWORD[two]
	vpaddd	xmm10,xmm9,XMMWORD[one]
	vpaddd	xmm11,xmm9,XMMWORD[two]
	vpaddd	xmm12,xmm11,XMMWORD[one]
	vpaddd	xmm15,xmm11,XMMWORD[two]

	vpxor	xmm7,xmm7,XMMWORD[r8]
	vpxor	xmm8,xmm8,XMMWORD[r8]
	vpxor	xmm9,xmm9,XMMWORD[r8]
	vpxor	xmm10,xmm10,XMMWORD[r8]
	vpxor	xmm11,xmm11,XMMWORD[r8]
	vpxor	xmm12,xmm12,XMMWORD[r8]

	vmovdqu	xmm4,XMMWORD[16+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[32+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[48+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[64+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[80+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[96+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[112+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[128+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[144+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[160+r8]
	vaesenclast	xmm7,xmm7,xmm4
	vaesenclast	xmm8,xmm8,xmm4
	vaesenclast	xmm9,xmm9,xmm4
	vaesenclast	xmm10,xmm10,xmm4
	vaesenclast	xmm11,xmm11,xmm4
	vaesenclast	xmm12,xmm12,xmm4


	vpxor	xmm7,xmm7,XMMWORD[rdi]
	vpxor	xmm8,xmm8,XMMWORD[16+rdi]
	vpxor	xmm9,xmm9,XMMWORD[32+rdi]
	vpxor	xmm10,xmm10,XMMWORD[48+rdi]
	vpxor	xmm11,xmm11,XMMWORD[64+rdi]
	vpxor	xmm12,xmm12,XMMWORD[80+rdi]

	vmovdqu	XMMWORD[rsi],xmm7
	vmovdqu	XMMWORD[16+rsi],xmm8
	vmovdqu	XMMWORD[32+rsi],xmm9
	vmovdqu	XMMWORD[48+rsi],xmm10
	vmovdqu	XMMWORD[64+rsi],xmm11
	vmovdqu	XMMWORD[80+rsi],xmm12

	add	rdi,96
	add	rsi,96
	jmp	NEAR $L$128_dec_loop1


ALIGN	64
$L$128_dec_loop1:
	cmp	r9,96
	jb	NEAR $L$128_dec_finish_96
	sub	r9,96

	vmovdqa	xmm6,xmm12
	vmovdqa	XMMWORD[(16-32)+rax],xmm11
	vmovdqa	XMMWORD[(32-32)+rax],xmm10
	vmovdqa	XMMWORD[(48-32)+rax],xmm9
	vmovdqa	XMMWORD[(64-32)+rax],xmm8
	vmovdqa	XMMWORD[(80-32)+rax],xmm7

	vmovdqa	xmm7,xmm15
	vpaddd	xmm8,xmm7,XMMWORD[one]
	vpaddd	xmm9,xmm7,XMMWORD[two]
	vpaddd	xmm10,xmm9,XMMWORD[one]
	vpaddd	xmm11,xmm9,XMMWORD[two]
	vpaddd	xmm12,xmm11,XMMWORD[one]
	vpaddd	xmm15,xmm11,XMMWORD[two]

	vmovdqa	xmm4,XMMWORD[r8]
	vpxor	xmm7,xmm7,xmm4
	vpxor	xmm8,xmm8,xmm4
	vpxor	xmm9,xmm9,xmm4
	vpxor	xmm10,xmm10,xmm4
	vpxor	xmm11,xmm11,xmm4
	vpxor	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
	vpclmulqdq	xmm2,xmm6,xmm4,0x11
	vpclmulqdq	xmm3,xmm6,xmm4,0x00
	vpclmulqdq	xmm1,xmm6,xmm4,0x01
	vpclmulqdq	xmm4,xmm6,xmm4,0x10
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm4,XMMWORD[16+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[((-16))+rax]
	vmovdqu	xmm13,XMMWORD[((-16))+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[32+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[rax]
	vmovdqu	xmm13,XMMWORD[rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[48+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[16+rax]
	vmovdqu	xmm13,XMMWORD[16+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[64+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[32+rax]
	vmovdqu	xmm13,XMMWORD[32+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[80+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[96+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[112+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4


	vmovdqa	xmm6,XMMWORD[((80-32))+rax]
	vpxor	xmm6,xmm6,xmm0
	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]

	vpclmulqdq	xmm4,xmm6,xmm5,0x01
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x10
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm4,XMMWORD[128+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4


	vpsrldq	xmm4,xmm1,8
	vpxor	xmm5,xmm2,xmm4
	vpslldq	xmm4,xmm1,8
	vpxor	xmm0,xmm3,xmm4

	vmovdqa	xmm3,XMMWORD[poly]

	vmovdqu	xmm4,XMMWORD[144+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[160+r8]
	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vpxor	xmm4,xmm6,XMMWORD[rdi]
	vaesenclast	xmm7,xmm7,xmm4
	vpxor	xmm4,xmm6,XMMWORD[16+rdi]
	vaesenclast	xmm8,xmm8,xmm4
	vpxor	xmm4,xmm6,XMMWORD[32+rdi]
	vaesenclast	xmm9,xmm9,xmm4
	vpxor	xmm4,xmm6,XMMWORD[48+rdi]
	vaesenclast	xmm10,xmm10,xmm4
	vpxor	xmm4,xmm6,XMMWORD[64+rdi]
	vaesenclast	xmm11,xmm11,xmm4
	vpxor	xmm4,xmm6,XMMWORD[80+rdi]
	vaesenclast	xmm12,xmm12,xmm4

	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vmovdqu	XMMWORD[rsi],xmm7
	vmovdqu	XMMWORD[16+rsi],xmm8
	vmovdqu	XMMWORD[32+rsi],xmm9
	vmovdqu	XMMWORD[48+rsi],xmm10
	vmovdqu	XMMWORD[64+rsi],xmm11
	vmovdqu	XMMWORD[80+rsi],xmm12

	vpxor	xmm0,xmm0,xmm5

	lea	rdi,[96+rdi]
	lea	rsi,[96+rsi]
	jmp	NEAR $L$128_dec_loop1

$L$128_dec_finish_96:
	vmovdqa	xmm6,xmm12
	vmovdqa	XMMWORD[(16-32)+rax],xmm11
	vmovdqa	XMMWORD[(32-32)+rax],xmm10
	vmovdqa	XMMWORD[(48-32)+rax],xmm9
	vmovdqa	XMMWORD[(64-32)+rax],xmm8
	vmovdqa	XMMWORD[(80-32)+rax],xmm7

	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
	vpclmulqdq	xmm1,xmm6,xmm4,0x10
	vpclmulqdq	xmm2,xmm6,xmm4,0x11
	vpclmulqdq	xmm3,xmm6,xmm4,0x00
	vpclmulqdq	xmm4,xmm6,xmm4,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[((-16))+rax]
	vmovdqu	xmm13,XMMWORD[((-16))+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[rax]
	vmovdqu	xmm13,XMMWORD[rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[16+rax]
	vmovdqu	xmm13,XMMWORD[16+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[32+rax]
	vmovdqu	xmm13,XMMWORD[32+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm6,XMMWORD[((80-32))+rax]
	vpxor	xmm6,xmm6,xmm0
	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
	vpclmulqdq	xmm4,xmm6,xmm5,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x01
	vpxor	xmm1,xmm1,xmm4

	vpsrldq	xmm4,xmm1,8
	vpxor	xmm5,xmm2,xmm4
	vpslldq	xmm4,xmm1,8
	vpxor	xmm0,xmm3,xmm4

	vmovdqa	xmm3,XMMWORD[poly]

	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vpxor	xmm0,xmm0,xmm5

$L$128_dec_loop2:



	cmp	r9,16
	jb	NEAR $L$128_dec_out
	sub	r9,16

	vmovdqa	xmm2,xmm15
	vpaddd	xmm15,xmm15,XMMWORD[one]

	vpxor	xmm2,xmm2,XMMWORD[r8]
	vaesenc	xmm2,xmm2,XMMWORD[16+r8]
	vaesenc	xmm2,xmm2,XMMWORD[32+r8]
	vaesenc	xmm2,xmm2,XMMWORD[48+r8]
	vaesenc	xmm2,xmm2,XMMWORD[64+r8]
	vaesenc	xmm2,xmm2,XMMWORD[80+r8]
	vaesenc	xmm2,xmm2,XMMWORD[96+r8]
	vaesenc	xmm2,xmm2,XMMWORD[112+r8]
	vaesenc	xmm2,xmm2,XMMWORD[128+r8]
	vaesenc	xmm2,xmm2,XMMWORD[144+r8]
	vaesenclast	xmm2,xmm2,XMMWORD[160+r8]
	vpxor	xmm2,xmm2,XMMWORD[rdi]
	vmovdqu	XMMWORD[rsi],xmm2
	add	rdi,16
	add	rsi,16

	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm1,XMMWORD[((-32))+rcx]
	call	GFMUL

	jmp	NEAR $L$128_dec_loop2

$L$128_dec_out:
	vmovdqu	XMMWORD[rdx],xmm0
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_dec:
global	aes128gcmsiv_ecb_enc_block
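; aes128gcmsiv_ecb_enc_block(in[16], out[16], key_schedule): encrypts a
; single block with the 10-round AES-128 schedule.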

ALIGN	16
aes128gcmsiv_ecb_enc_block:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_ecb_enc_block:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_CET_ENDBR
	vmovdqa	xmm1,XMMWORD[rdi]

	vpxor	xmm1,xmm1,XMMWORD[rdx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rdx]
	vaesenclast	xmm1,xmm1,XMMWORD[160+rdx]

	vmovdqa	XMMWORD[rsi],xmm1

	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_ecb_enc_block:
global	aes256gcmsiv_aes_ks_enc_x1
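; aes256gcmsiv_aes_ks_enc_x1(in[16], out[16], out_expanded_key, key[32]):
; expands the AES-256 key and encrypts one block with it in a single pass.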

ALIGN	16
aes256gcmsiv_aes_ks_enc_x1:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9



_CET_ENDBR
	vmovdqa	xmm0,XMMWORD[con1]
	vmovdqa	xmm15,XMMWORD[mask]
	vmovdqa	xmm8,XMMWORD[rdi]
	vmovdqa	xmm1,XMMWORD[rcx]
	vmovdqa	xmm3,XMMWORD[16+rcx]
	vpxor	xmm8,xmm8,xmm1
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[rdx],xmm1
	vmovdqu	XMMWORD[16+rdx],xmm3
	vpxor	xmm14,xmm14,xmm14

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[32+rdx],xmm1

	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[48+rdx],xmm3

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[64+rdx],xmm1

	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[80+rdx],xmm3

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[96+rdx],xmm1

	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[112+rdx],xmm3

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[128+rdx],xmm1

	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[144+rdx],xmm3

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[160+rdx],xmm1

	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[176+rdx],xmm3

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[192+rdx],xmm1

	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[208+rdx],xmm3

	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenclast	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[224+rdx],xmm1

	vmovdqa	XMMWORD[rsi],xmm8
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1:
global	aes256gcmsiv_ecb_enc_block
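; aes256gcmsiv_ecb_enc_block(in[16], out[16], key_schedule): encrypts a
; single block with the 14-round AES-256 schedule.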

ALIGN	16
aes256gcmsiv_ecb_enc_block:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_ecb_enc_block:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_CET_ENDBR
	vmovdqa	xmm1,XMMWORD[rdi]
	vpxor	xmm1,xmm1,XMMWORD[rdx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[160+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[176+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[192+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[208+rdx]
	vaesenclast	xmm1,xmm1,XMMWORD[224+rdx]
	vmovdqa	XMMWORD[rsi],xmm1
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_ecb_enc_block:
global	aes256gcmsiv_enc_msg_x4
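; aes256gcmsiv_enc_msg_x4(in, out, tag[16], key_schedule, in_len): CTR
; encryption with AES-256, four blocks per iteration; the block count is
; rounded up, so a trailing partial block is encrypted in full.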

ALIGN	16
aes256gcmsiv_enc_msg_x4:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_enc_msg_x4:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



_CET_ENDBR
	test	r8,r8
	jnz	NEAR $L$256_enc_msg_x4_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$256_enc_msg_x4_start:
	mov	r10,r8
	shr	r8,4
	shl	r10,60
	jz	NEAR $L$256_enc_msg_x4_start2
	add	r8,1

$L$256_enc_msg_x4_start2:
	mov	r10,r8
	shl	r10,62
	shr	r10,62


	vmovdqa	xmm15,XMMWORD[rdx]
	vpor	xmm15,xmm15,XMMWORD[OR_MASK]

	vmovdqa	xmm4,XMMWORD[four]
	vmovdqa	xmm0,xmm15
	vpaddd	xmm1,xmm15,XMMWORD[one]
	vpaddd	xmm2,xmm15,XMMWORD[two]
	vpaddd	xmm3,xmm15,XMMWORD[three]

	shr	r8,2
	je	NEAR $L$256_enc_msg_x4_check_remainder

	sub	rsi,64
	sub	rdi,64

$L$256_enc_msg_x4_loop1:
	add	rsi,64
	add	rdi,64

	vmovdqa	xmm5,xmm0
	vmovdqa	xmm6,xmm1
	vmovdqa	xmm7,xmm2
	vmovdqa	xmm8,xmm3

	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vpxor	xmm6,xmm6,XMMWORD[rcx]
	vpxor	xmm7,xmm7,XMMWORD[rcx]
	vpxor	xmm8,xmm8,XMMWORD[rcx]

	vmovdqu	xmm12,XMMWORD[16+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm0,xmm0,xmm4
	vmovdqu	xmm12,XMMWORD[32+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm1,xmm1,xmm4
	vmovdqu	xmm12,XMMWORD[48+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm2,xmm2,xmm4
	vmovdqu	xmm12,XMMWORD[64+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm3,xmm3,xmm4

	vmovdqu	xmm12,XMMWORD[80+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[96+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[112+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[128+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[144+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[160+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[176+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[192+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[208+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[224+rcx]
	vaesenclast	xmm5,xmm5,xmm12
	vaesenclast	xmm6,xmm6,xmm12
	vaesenclast	xmm7,xmm7,xmm12
	vaesenclast	xmm8,xmm8,xmm12



	vpxor	xmm5,xmm5,XMMWORD[rdi]
	vpxor	xmm6,xmm6,XMMWORD[16+rdi]
	vpxor	xmm7,xmm7,XMMWORD[32+rdi]
	vpxor	xmm8,xmm8,XMMWORD[48+rdi]

	sub	r8,1

	vmovdqu	XMMWORD[rsi],xmm5
	vmovdqu	XMMWORD[16+rsi],xmm6
	vmovdqu	XMMWORD[32+rsi],xmm7
	vmovdqu	XMMWORD[48+rsi],xmm8

	jne	NEAR $L$256_enc_msg_x4_loop1

	add	rsi,64
	add	rdi,64

$L$256_enc_msg_x4_check_remainder:
	cmp	r10,0
	je	NEAR $L$256_enc_msg_x4_out

$L$256_enc_msg_x4_loop2:



	vmovdqa	xmm5,xmm0
	vpaddd	xmm0,xmm0,XMMWORD[one]
	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vaesenc	xmm5,xmm5,XMMWORD[16+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[32+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[48+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[64+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[80+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[96+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[112+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[128+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[144+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[160+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[176+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[192+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[208+rcx]
	vaesenclast	xmm5,xmm5,XMMWORD[224+rcx]


	vpxor	xmm5,xmm5,XMMWORD[rdi]

	vmovdqu	XMMWORD[rsi],xmm5

	add	rdi,16
	add	rsi,16

	sub	r10,1
	jne	NEAR $L$256_enc_msg_x4_loop2

$L$256_enc_msg_x4_out:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_enc_msg_x4:
global	aes256gcmsiv_enc_msg_x8
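; aes256gcmsiv_enc_msg_x8(in, out, tag[16], key_schedule, in_len): as
; the x4 variant, but eight blocks per iteration (the eighth counter is
; kept in a stack slot).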

ALIGN	16
aes256gcmsiv_enc_msg_x8:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_enc_msg_x8:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



_CET_ENDBR
	test	r8,r8
	jnz	NEAR $L$256_enc_msg_x8_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$256_enc_msg_x8_start:

	mov	r11,rsp
	sub	r11,16
	and	r11,-64

	mov	r10,r8
	shr	r8,4
	shl	r10,60
	jz	NEAR $L$256_enc_msg_x8_start2
	add	r8,1

$L$256_enc_msg_x8_start2:
	mov	r10,r8
	shl	r10,61
	shr	r10,61


	vmovdqa	xmm1,XMMWORD[rdx]
	vpor	xmm1,xmm1,XMMWORD[OR_MASK]


	vpaddd	xmm0,xmm1,XMMWORD[seven]
	vmovdqa	XMMWORD[r11],xmm0
	vpaddd	xmm9,xmm1,XMMWORD[one]
	vpaddd	xmm10,xmm1,XMMWORD[two]
	vpaddd	xmm11,xmm1,XMMWORD[three]
	vpaddd	xmm12,xmm1,XMMWORD[four]
	vpaddd	xmm13,xmm1,XMMWORD[five]
	vpaddd	xmm14,xmm1,XMMWORD[six]
	vmovdqa	xmm0,xmm1

	shr	r8,3
	jz	NEAR $L$256_enc_msg_x8_check_remainder

	sub	rsi,128
	sub	rdi,128

$L$256_enc_msg_x8_loop1:
	add	rsi,128
	add	rdi,128

	vmovdqa	xmm1,xmm0
	vmovdqa	xmm2,xmm9
	vmovdqa	xmm3,xmm10
	vmovdqa	xmm4,xmm11
	vmovdqa	xmm5,xmm12
	vmovdqa	xmm6,xmm13
	vmovdqa	xmm7,xmm14

	vmovdqa	xmm8,XMMWORD[r11]

	vpxor	xmm1,xmm1,XMMWORD[rcx]
	vpxor	xmm2,xmm2,XMMWORD[rcx]
	vpxor	xmm3,xmm3,XMMWORD[rcx]
	vpxor	xmm4,xmm4,XMMWORD[rcx]
	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vpxor	xmm6,xmm6,XMMWORD[rcx]
	vpxor	xmm7,xmm7,XMMWORD[rcx]
	vpxor	xmm8,xmm8,XMMWORD[rcx]

	vmovdqu	xmm15,XMMWORD[16+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqa	xmm14,XMMWORD[r11]
	vpaddd	xmm14,xmm14,XMMWORD[eight]
	vmovdqa	XMMWORD[r11],xmm14
	vmovdqu	xmm15,XMMWORD[32+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpsubd	xmm14,xmm14,XMMWORD[one]
	vmovdqu	xmm15,XMMWORD[48+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm0,xmm0,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[64+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm9,xmm9,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[80+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm10,xmm10,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[96+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm11,xmm11,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[112+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm12,xmm12,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[128+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm13,xmm13,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[144+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[160+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[176+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[192+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[208+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[224+rcx]
	vaesenclast	xmm1,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm15
	vaesenclast	xmm3,xmm3,xmm15
	vaesenclast	xmm4,xmm4,xmm15
	vaesenclast	xmm5,xmm5,xmm15
	vaesenclast	xmm6,xmm6,xmm15
	vaesenclast	xmm7,xmm7,xmm15
	vaesenclast	xmm8,xmm8,xmm15



	vpxor	xmm1,xmm1,XMMWORD[rdi]
	vpxor	xmm2,xmm2,XMMWORD[16+rdi]
	vpxor	xmm3,xmm3,XMMWORD[32+rdi]
	vpxor	xmm4,xmm4,XMMWORD[48+rdi]
	vpxor	xmm5,xmm5,XMMWORD[64+rdi]
	vpxor	xmm6,xmm6,XMMWORD[80+rdi]
	vpxor	xmm7,xmm7,XMMWORD[96+rdi]
	vpxor	xmm8,xmm8,XMMWORD[112+rdi]

	sub	r8,1

	vmovdqu	XMMWORD[rsi],xmm1
	vmovdqu	XMMWORD[16+rsi],xmm2
	vmovdqu	XMMWORD[32+rsi],xmm3
	vmovdqu	XMMWORD[48+rsi],xmm4
	vmovdqu	XMMWORD[64+rsi],xmm5
	vmovdqu	XMMWORD[80+rsi],xmm6
	vmovdqu	XMMWORD[96+rsi],xmm7
	vmovdqu	XMMWORD[112+rsi],xmm8

	jne	NEAR $L$256_enc_msg_x8_loop1

	add	rsi,128
	add	rdi,128

$L$256_enc_msg_x8_check_remainder:
	cmp	r10,0
	je	NEAR $L$256_enc_msg_x8_out

$L$256_enc_msg_x8_loop2:


	vmovdqa	xmm1,xmm0
	vpaddd	xmm0,xmm0,XMMWORD[one]

	vpxor	xmm1,xmm1,XMMWORD[rcx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[160+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[176+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[192+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[208+rcx]
	vaesenclast	xmm1,xmm1,XMMWORD[224+rcx]


	vpxor	xmm1,xmm1,XMMWORD[rdi]

	vmovdqu	XMMWORD[rsi],xmm1

	add	rdi,16
	add	rsi,16
	sub	r10,1
	jnz	NEAR $L$256_enc_msg_x8_loop2

$L$256_enc_msg_x8_out:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret


$L$SEH_end_aes256gcmsiv_enc_msg_x8:
global	aes256gcmsiv_dec
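; aes256gcmsiv_dec(in, out, polyval_state, htable, key_schedule, in_len):
; AES-256 counterpart of aes128gcmsiv_dec.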

ALIGN	16
aes256gcmsiv_dec:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_dec:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



_CET_ENDBR
	test	r9,~15
	jnz	NEAR $L$256_dec_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$256_dec_start:
	vzeroupper
	vmovdqa	xmm0,XMMWORD[rdx]


	vmovdqu	xmm15,XMMWORD[16+rdx]
	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
	mov	rax,rdx

	lea	rax,[32+rax]
	lea	rcx,[32+rcx]

	and	r9,~15


	cmp	r9,96
	jb	NEAR $L$256_dec_loop2


	sub	r9,96
	vmovdqa	xmm7,xmm15
	vpaddd	xmm8,xmm7,XMMWORD[one]
	vpaddd	xmm9,xmm7,XMMWORD[two]
	vpaddd	xmm10,xmm9,XMMWORD[one]
	vpaddd	xmm11,xmm9,XMMWORD[two]
	vpaddd	xmm12,xmm11,XMMWORD[one]
	vpaddd	xmm15,xmm11,XMMWORD[two]

	vpxor	xmm7,xmm7,XMMWORD[r8]
	vpxor	xmm8,xmm8,XMMWORD[r8]
	vpxor	xmm9,xmm9,XMMWORD[r8]
	vpxor	xmm10,xmm10,XMMWORD[r8]
	vpxor	xmm11,xmm11,XMMWORD[r8]
	vpxor	xmm12,xmm12,XMMWORD[r8]

	vmovdqu	xmm4,XMMWORD[16+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[32+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[48+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[64+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[80+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[96+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[112+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[128+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[144+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[160+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[176+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[192+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[208+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[224+r8]
	vaesenclast	xmm7,xmm7,xmm4
	vaesenclast	xmm8,xmm8,xmm4
	vaesenclast	xmm9,xmm9,xmm4
	vaesenclast	xmm10,xmm10,xmm4
	vaesenclast	xmm11,xmm11,xmm4
	vaesenclast	xmm12,xmm12,xmm4


	vpxor	xmm7,xmm7,XMMWORD[rdi]
	vpxor	xmm8,xmm8,XMMWORD[16+rdi]
	vpxor	xmm9,xmm9,XMMWORD[32+rdi]
	vpxor	xmm10,xmm10,XMMWORD[48+rdi]
	vpxor	xmm11,xmm11,XMMWORD[64+rdi]
	vpxor	xmm12,xmm12,XMMWORD[80+rdi]

	vmovdqu	XMMWORD[rsi],xmm7
	vmovdqu	XMMWORD[16+rsi],xmm8
	vmovdqu	XMMWORD[32+rsi],xmm9
	vmovdqu	XMMWORD[48+rsi],xmm10
	vmovdqu	XMMWORD[64+rsi],xmm11
	vmovdqu	XMMWORD[80+rsi],xmm12

	add	rdi,96
	add	rsi,96
	jmp	NEAR $L$256_dec_loop1


ALIGN	64
$L$256_dec_loop1:
	cmp	r9,96
	jb	NEAR $L$256_dec_finish_96
	sub	r9,96

	vmovdqa	xmm6,xmm12
	vmovdqa	XMMWORD[(16-32)+rax],xmm11
	vmovdqa	XMMWORD[(32-32)+rax],xmm10
	vmovdqa	XMMWORD[(48-32)+rax],xmm9
	vmovdqa	XMMWORD[(64-32)+rax],xmm8
	vmovdqa	XMMWORD[(80-32)+rax],xmm7

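; Form the next six counter blocks; xmm15 carries the running counter
; between passes.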
	vmovdqa	xmm7,xmm15
	vpaddd	xmm8,xmm7,XMMWORD[one]
	vpaddd	xmm9,xmm7,XMMWORD[two]
	vpaddd	xmm10,xmm9,XMMWORD[one]
	vpaddd	xmm11,xmm9,XMMWORD[two]
	vpaddd	xmm12,xmm11,XMMWORD[one]
	vpaddd	xmm15,xmm11,XMMWORD[two]

	vmovdqa	xmm4,XMMWORD[r8]
	vpxor	xmm7,xmm7,xmm4
	vpxor	xmm8,xmm8,xmm4
	vpxor	xmm9,xmm9,xmm4
	vpxor	xmm10,xmm10,xmm4
	vpxor	xmm11,xmm11,xmm4
	vpxor	xmm12,xmm12,xmm4

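; Start the POLYVAL accumulation: multiply the newest saved block by the
; first entry of the H-power table at rcx (schoolbook: high, low, and two
; cross products collected in xmm2, xmm3, xmm1).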
	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
	vpclmulqdq	xmm2,xmm6,xmm4,0x11
	vpclmulqdq	xmm3,xmm6,xmm4,0x00
	vpclmulqdq	xmm1,xmm6,xmm4,0x01
	vpclmulqdq	xmm4,xmm6,xmm4,0x10
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm4,XMMWORD[16+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

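; Fold each remaining saved block with the next power of H between AES
; rounds; this load/multiply/XOR pattern repeats for the following blocks.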
	vmovdqu	xmm6,XMMWORD[((-16))+rax]
	vmovdqu	xmm13,XMMWORD[((-16))+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[32+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[rax]
	vmovdqu	xmm13,XMMWORD[rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[48+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[16+rax]
	vmovdqu	xmm13,XMMWORD[16+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[64+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm6,XMMWORD[32+rax]
	vmovdqu	xmm13,XMMWORD[32+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


	vmovdqu	xmm4,XMMWORD[80+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[96+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[112+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4


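; The oldest saved block absorbs the running POLYVAL state (xmm0) before
; its multiply by the highest power of H.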
	vmovdqa	xmm6,XMMWORD[((80-32))+rax]
	vpxor	xmm6,xmm6,xmm0
	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]

	vpclmulqdq	xmm4,xmm6,xmm5,0x01
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x10
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm4,XMMWORD[128+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4


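; Recombine the partial products: xmm5 receives the high 128 bits and xmm0
; the low 128 bits of the 256-bit product; poly holds the reduction constant.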
	vpsrldq	xmm4,xmm1,8
	vpxor	xmm5,xmm2,xmm4
	vpslldq	xmm4,xmm1,8
	vpxor	xmm0,xmm3,xmm4

	vmovdqa	xmm3,XMMWORD[poly]

	vmovdqu	xmm4,XMMWORD[144+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[160+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[176+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[192+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

	vmovdqu	xmm4,XMMWORD[208+r8]
	vaesenc	xmm7,xmm7,xmm4
	vaesenc	xmm8,xmm8,xmm4
	vaesenc	xmm9,xmm9,xmm4
	vaesenc	xmm10,xmm10,xmm4
	vaesenc	xmm11,xmm11,xmm4
	vaesenc	xmm12,xmm12,xmm4

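; First of two reduction folds: rotate the halves and multiply by the poly
; constant, interleaved with loading the last round key.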
	vmovdqu	xmm6,XMMWORD[224+r8]
	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

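; The last round key is XORed with the ciphertext first, so vaesenclast
; decrypts straight into the output registers.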
	vpxor	xmm4,xmm6,XMMWORD[rdi]
	vaesenclast	xmm7,xmm7,xmm4
	vpxor	xmm4,xmm6,XMMWORD[16+rdi]
	vaesenclast	xmm8,xmm8,xmm4
	vpxor	xmm4,xmm6,XMMWORD[32+rdi]
	vaesenclast	xmm9,xmm9,xmm4
	vpxor	xmm4,xmm6,XMMWORD[48+rdi]
	vaesenclast	xmm10,xmm10,xmm4
	vpxor	xmm4,xmm6,XMMWORD[64+rdi]
	vaesenclast	xmm11,xmm11,xmm4
	vpxor	xmm4,xmm6,XMMWORD[80+rdi]
	vaesenclast	xmm12,xmm12,xmm4

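; Second reduction fold completes the modular reduction.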
	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vmovdqu	XMMWORD[rsi],xmm7
	vmovdqu	XMMWORD[16+rsi],xmm8
	vmovdqu	XMMWORD[32+rsi],xmm9
	vmovdqu	XMMWORD[48+rsi],xmm10
	vmovdqu	XMMWORD[64+rsi],xmm11
	vmovdqu	XMMWORD[80+rsi],xmm12

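; Add the high half back in; xmm0 is now the updated POLYVAL accumulator.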
	vpxor	xmm0,xmm0,xmm5

	lea	rdi,[96+rdi]
	lea	rsi,[96+rsi]
	jmp	NEAR $L$256_dec_loop1

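; Fewer than 96 bytes remain: hash the six buffered plaintext blocks
; without generating more keystream.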
$L$256_dec_finish_96:
	vmovdqa	xmm6,xmm12
	vmovdqa	XMMWORD[(16-32)+rax],xmm11
	vmovdqa	XMMWORD[(32-32)+rax],xmm10
	vmovdqa	XMMWORD[(48-32)+rax],xmm9
	vmovdqa	XMMWORD[(64-32)+rax],xmm8
	vmovdqa	XMMWORD[(80-32)+rax],xmm7

	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
	vpclmulqdq	xmm1,xmm6,xmm4,0x10
	vpclmulqdq	xmm2,xmm6,xmm4,0x11
	vpclmulqdq	xmm3,xmm6,xmm4,0x00
	vpclmulqdq	xmm4,xmm6,xmm4,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[((-16))+rax]
	vmovdqu	xmm13,XMMWORD[((-16))+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[rax]
	vmovdqu	xmm13,XMMWORD[rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[16+rax]
	vmovdqu	xmm13,XMMWORD[16+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4

	vmovdqu	xmm6,XMMWORD[32+rax]
	vmovdqu	xmm13,XMMWORD[32+rcx]

	vpclmulqdq	xmm4,xmm6,xmm13,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm13,0x01
	vpxor	xmm1,xmm1,xmm4


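; As in the main loop, the oldest block absorbs the running state before
; the final multiply.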
	vmovdqu	xmm6,XMMWORD[((80-32))+rax]
	vpxor	xmm6,xmm6,xmm0
	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
	vpclmulqdq	xmm4,xmm6,xmm5,0x11
	vpxor	xmm2,xmm2,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x00
	vpxor	xmm3,xmm3,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x10
	vpxor	xmm1,xmm1,xmm4
	vpclmulqdq	xmm4,xmm6,xmm5,0x01
	vpxor	xmm1,xmm1,xmm4

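; Recombine the product halves and reduce exactly as in the main loop.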
	vpsrldq	xmm4,xmm1,8
	vpxor	xmm5,xmm2,xmm4
	vpslldq	xmm4,xmm1,8
	vpxor	xmm0,xmm3,xmm4

	vmovdqa	xmm3,XMMWORD[poly]

	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vpalignr	xmm2,xmm0,xmm0,8
	vpclmulqdq	xmm0,xmm0,xmm3,0x10
	vpxor	xmm0,xmm2,xmm0

	vpxor	xmm0,xmm0,xmm5

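; Tail loop: decrypt one 16-byte block at a time with a full AES-256 pass,
; then fold the plaintext into the accumulator via GFMUL with H.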
$L$256_dec_loop2:



	cmp	r9,16
	jb	NEAR $L$256_dec_out
	sub	r9,16

	vmovdqa	xmm2,xmm15
	vpaddd	xmm15,xmm15,XMMWORD[one]

	vpxor	xmm2,xmm2,XMMWORD[r8]
	vaesenc	xmm2,xmm2,XMMWORD[16+r8]
	vaesenc	xmm2,xmm2,XMMWORD[32+r8]
	vaesenc	xmm2,xmm2,XMMWORD[48+r8]
	vaesenc	xmm2,xmm2,XMMWORD[64+r8]
	vaesenc	xmm2,xmm2,XMMWORD[80+r8]
	vaesenc	xmm2,xmm2,XMMWORD[96+r8]
	vaesenc	xmm2,xmm2,XMMWORD[112+r8]
	vaesenc	xmm2,xmm2,XMMWORD[128+r8]
	vaesenc	xmm2,xmm2,XMMWORD[144+r8]
	vaesenc	xmm2,xmm2,XMMWORD[160+r8]
	vaesenc	xmm2,xmm2,XMMWORD[176+r8]
	vaesenc	xmm2,xmm2,XMMWORD[192+r8]
	vaesenc	xmm2,xmm2,XMMWORD[208+r8]
	vaesenclast	xmm2,xmm2,XMMWORD[224+r8]
	vpxor	xmm2,xmm2,XMMWORD[rdi]
	vmovdqu	XMMWORD[rsi],xmm2
	add	rdi,16
	add	rsi,16

	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm1,XMMWORD[((-32))+rcx]
	call	GFMUL

	jmp	NEAR $L$256_dec_loop2

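; Write the final POLYVAL value to [rdx] before the Win64 epilogue.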
$L$256_dec_out:
	vmovdqu	XMMWORD[rdx],xmm0
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_dec:
global	aes256gcmsiv_kdf

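; aes256gcmsiv_kdf derives the per-nonce record key material: rdi points at
; the 16-byte nonce block, rsi at a 96-byte output buffer, and rdx at the
; expanded AES-256 master key (after the Win64 argument remapping below).
; Six counter blocks built from the nonce are encrypted in parallel; the
; caller is expected to assemble the record authentication and encryption
; keys from the results.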
ALIGN	16
aes256gcmsiv_kdf:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_kdf:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_CET_ENDBR




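; Build counter blocks 0..5: shift the 96-bit nonce into the upper three
; dwords (vpshufd 0x90), clear the counter dword with and_mask, then
; increment with the one constant.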
	vmovdqa	xmm1,XMMWORD[rdx]
	vmovdqa	xmm4,XMMWORD[rdi]
	vmovdqa	xmm11,XMMWORD[and_mask]
	vmovdqa	xmm8,XMMWORD[one]
	vpshufd	xmm4,xmm4,0x90
	vpand	xmm4,xmm4,xmm11
	vpaddd	xmm6,xmm4,xmm8
	vpaddd	xmm7,xmm6,xmm8
	vpaddd	xmm11,xmm7,xmm8
	vpaddd	xmm12,xmm11,xmm8
	vpaddd	xmm13,xmm12,xmm8

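; Encrypt all six blocks under the master key: whitening with round key 0
; here, then thirteen vaesenc rounds and a final vaesenclast.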
	vpxor	xmm4,xmm4,xmm1
	vpxor	xmm6,xmm6,xmm1
	vpxor	xmm7,xmm7,xmm1
	vpxor	xmm11,xmm11,xmm1
	vpxor	xmm12,xmm12,xmm1
	vpxor	xmm13,xmm13,xmm1

	vmovdqa	xmm1,XMMWORD[16+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[32+rdx]
	vaesenc	xmm4,xmm4,xmm2
	vaesenc	xmm6,xmm6,xmm2
	vaesenc	xmm7,xmm7,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2
	vaesenc	xmm13,xmm13,xmm2

	vmovdqa	xmm1,XMMWORD[48+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[64+rdx]
	vaesenc	xmm4,xmm4,xmm2
	vaesenc	xmm6,xmm6,xmm2
	vaesenc	xmm7,xmm7,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2
	vaesenc	xmm13,xmm13,xmm2

	vmovdqa	xmm1,XMMWORD[80+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[96+rdx]
	vaesenc	xmm4,xmm4,xmm2
	vaesenc	xmm6,xmm6,xmm2
	vaesenc	xmm7,xmm7,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2
	vaesenc	xmm13,xmm13,xmm2

	vmovdqa	xmm1,XMMWORD[112+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[128+rdx]
	vaesenc	xmm4,xmm4,xmm2
	vaesenc	xmm6,xmm6,xmm2
	vaesenc	xmm7,xmm7,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2
	vaesenc	xmm13,xmm13,xmm2

	vmovdqa	xmm1,XMMWORD[144+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[160+rdx]
	vaesenc	xmm4,xmm4,xmm2
	vaesenc	xmm6,xmm6,xmm2
	vaesenc	xmm7,xmm7,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2
	vaesenc	xmm13,xmm13,xmm2

	vmovdqa	xmm1,XMMWORD[176+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[192+rdx]
	vaesenc	xmm4,xmm4,xmm2
	vaesenc	xmm6,xmm6,xmm2
	vaesenc	xmm7,xmm7,xmm2
	vaesenc	xmm11,xmm11,xmm2
	vaesenc	xmm12,xmm12,xmm2
	vaesenc	xmm13,xmm13,xmm2

	vmovdqa	xmm1,XMMWORD[208+rdx]
	vaesenc	xmm4,xmm4,xmm1
	vaesenc	xmm6,xmm6,xmm1
	vaesenc	xmm7,xmm7,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1

	vmovdqa	xmm2,XMMWORD[224+rdx]
	vaesenclast	xmm4,xmm4,xmm2
	vaesenclast	xmm6,xmm6,xmm2
	vaesenclast	xmm7,xmm7,xmm2
	vaesenclast	xmm11,xmm11,xmm2
	vaesenclast	xmm12,xmm12,xmm2
	vaesenclast	xmm13,xmm13,xmm2


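; Write the six derived blocks (96 bytes) to the output buffer.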
	vmovdqa	XMMWORD[rsi],xmm4
	vmovdqa	XMMWORD[16+rsi],xmm6
	vmovdqa	XMMWORD[32+rsi],xmm7
	vmovdqa	XMMWORD[48+rsi],xmm11
	vmovdqa	XMMWORD[64+rsi],xmm12
	vmovdqa	XMMWORD[80+rsi],xmm13
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_kdf:
%else
; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
ret
%endif