Diffstat (limited to 'vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s')
-rw-r--r--  vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s  2714
1 file changed, 2714 insertions(+), 0 deletions(-)
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
new file mode 100644
index 0000000..1c57e38
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -0,0 +1,2714 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+
+// +build go1.7,amd64,!gccgo,!appengine
+
+#include "textflag.h"
+// General register allocation
+#define oup DI
+#define inp SI
+#define inl BX
+#define adp CX // free to reuse, after we hash the additional data
+#define keyp R8 // free to reuse, once the key is copied to the stack
+#define itr2 R9 // general iterator
+#define itr1 CX // general iterator
+#define acc0 R10
+#define acc1 R11
+#define acc2 R12
+#define t0 R13
+#define t1 R14
+#define t2 R15
+#define t3 R8
+// Register and stack allocation for the SSE code
+#define rStore (0*16)(BP)
+#define sStore (1*16)(BP)
+#define state1Store (2*16)(BP)
+#define state2Store (3*16)(BP)
+#define tmpStore (4*16)(BP)
+#define ctr0Store (5*16)(BP)
+#define ctr1Store (6*16)(BP)
+#define ctr2Store (7*16)(BP)
+#define ctr3Store (8*16)(BP)
+#define A0 X0
+#define A1 X1
+#define A2 X2
+#define B0 X3
+#define B1 X4
+#define B2 X5
+#define C0 X6
+#define C1 X7
+#define C2 X8
+#define D0 X9
+#define D1 X10
+#define D2 X11
+#define T0 X12
+#define T1 X13
+#define T2 X14
+#define T3 X15
+#define A3 T0
+#define B3 T1
+#define C3 T2
+#define D3 T3
+// Register and stack allocation for the AVX2 code
+#define rsStoreAVX2 (0*32)(BP)
+#define state1StoreAVX2 (1*32)(BP)
+#define state2StoreAVX2 (2*32)(BP)
+#define ctr0StoreAVX2 (3*32)(BP)
+#define ctr1StoreAVX2 (4*32)(BP)
+#define ctr2StoreAVX2 (5*32)(BP)
+#define ctr3StoreAVX2 (6*32)(BP)
+#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
+#define AA0 Y0
+#define AA1 Y5
+#define AA2 Y6
+#define AA3 Y7
+#define BB0 Y14
+#define BB1 Y9
+#define BB2 Y10
+#define BB3 Y11
+#define CC0 Y12
+#define CC1 Y13
+#define CC2 Y8
+#define CC3 Y15
+#define DD0 Y4
+#define DD1 Y1
+#define DD2 Y2
+#define DD3 Y3
+#define TT0 DD3
+#define TT1 AA3
+#define TT2 BB3
+#define TT3 CC3
+// ChaCha20 constants
+DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
+// <<< 16 with PSHUFB
+DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
+DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
+DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
+DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
+// <<< 8 with PSHUFB
+DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
+DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
+DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
+DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
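+// Each mask permutes the bytes of every 32-bit lane: for rol16 the lane bytes
+// [b0 b1 b2 b3] become [b2 b3 b0 b1], for rol8 they become [b3 b0 b1 b2]. Per
+// lane that is simply a left rotate of the little-endian word, i.e. a Go-level
+// sketch of what one PSHUFB with these masks achieves is:
+//
+//      v = bits.RotateLeft32(v, 16) // rol16 mask
+//      v = bits.RotateLeft32(v, 8)  // rol8 mask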
+
+DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
+DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
+
+DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
+DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
+DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
+DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
+// Poly1305 key clamp
+DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
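+// ANDing the 32-byte Poly1305 key with this mask is the standard "r" clamp;
+// the all-ones upper 16 bytes leave the "s" half untouched. A Go-level sketch,
+// assuming r0/r1 hold the first 16 key bytes little-endian:
+//
+//      r0 &= 0x0ffffffc0fffffff
+//      r1 &= 0x0ffffffc0ffffffc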
+
+DATA ·sseIncMask<>+0x00(SB)/8, $0x1
+DATA ·sseIncMask<>+0x08(SB)/8, $0x0
+// To load/store the last < 16 bytes in a buffer
+DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
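+// Row n-1 of this table (16 bytes per row) keeps only the low n bytes of a
+// 16-byte block. A Go-level sketch of how the tail code selects it, for n
+// leftover bytes (1 <= n <= 15; andMask here stands for the table as a byte slice):
+//
+//      mask := andMask[(n-1)*16 : n*16]
+//      for i := range block {
+//              block[i] &= mask[i]
+//      }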
+
+GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
+GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
+GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
+GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
+GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
+// No PALIGNR in Go ASM yet (but VPALIGNR is present).
+#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
+#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
+#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
+#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
+#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
+#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
+#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
+#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
+#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
+#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
+#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
+#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
+#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
+#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
+#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
+#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
+#define shiftC0Right shiftC0Left
+#define shiftC1Right shiftC1Left
+#define shiftC2Right shiftC2Left
+#define shiftC3Right shiftC3Left
+#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
+#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
+#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
+#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
+// Some macros
+#define chachaQR(A, B, C, D, T) \
+ PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
+ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
+ PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
+ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
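+// A scalar Go-level sketch of one chachaQR invocation (the macro applies the
+// same quarter round to all four 32-bit lanes of the X registers at once):
+//
+//      func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
+//              a += b; d ^= a; d = bits.RotateLeft32(d, 16)
+//              c += d; b ^= c; b = bits.RotateLeft32(b, 12)
+//              a += b; d ^= a; d = bits.RotateLeft32(d, 8)
+//              c += d; b ^= c; b = bits.RotateLeft32(b, 7)
+//              return a, b, c, d
+//      }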
+
+#define chachaQR_AVX2(A, B, C, D, T) \
+ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
+ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
+ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
+ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
+
+#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
+#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
+#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
+#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
+#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
+
+#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
+#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
+#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
+
+#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
+#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
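+// A Go-level sketch (using math/bits) of what polyMul computes: the 130-bit
+// accumulator acc2:acc1:acc0 is multiplied by the clamped key half r1:r0 and
+// partially reduced mod 2^130 - 5. Since 2^130 = 5 (mod 2^130 - 5), the bits
+// above 2^130 are folded back in as 4*c + c, which is what polyMulReduceStage
+// does with the double shift. Variable names below are illustrative only:
+//
+//      func polyMulReduce(h *[3]uint64, r0, r1 uint64) {
+//              h0, h1, h2 := h[0], h[1], h[2]
+//              // 256-bit product t3:t2:t1:t0 = h * r (schoolbook, as in the stages above)
+//              m0h, m0l := bits.Mul64(h0, r0)
+//              m1h, m1l := bits.Mul64(h1, r0)
+//              m2h, m2l := bits.Mul64(h0, r1)
+//              m3h, m3l := bits.Mul64(h1, r1)
+//              t0 := m0l
+//              t1, c := bits.Add64(m0h, m1l, 0)
+//              t2 := m1h + h2*r0 + c
+//              t1, c = bits.Add64(t1, m2l, 0)
+//              t2, c = bits.Add64(t2, m2h, c)
+//              t3 := h2*r1 + c
+//              t2, c = bits.Add64(t2, m3l, 0)
+//              t3 += m3h + c
+//              // fold c = t >> 130 back in as 4*c + c
+//              h0, h1, h2 = t0, t1, t2&3
+//              h0, c = bits.Add64(h0, t2&^3, 0)
+//              h1, c = bits.Add64(h1, t3, c)
+//              h2 += c
+//              h0, c = bits.Add64(h0, t2>>2|t3<<62, 0)
+//              h1, c = bits.Add64(h1, t3>>2, c)
+//              h2 += c
+//              h[0], h[1], h[2] = h0, h1, h2
+//      }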
+// ----------------------------------------------------------------------------
+TEXT polyHashADInternal<>(SB), NOSPLIT, $0
+ // adp points to beginning of additional data
+ // itr2 holds ad length
+ XORQ acc0, acc0
+ XORQ acc1, acc1
+ XORQ acc2, acc2
+ CMPQ itr2, $13
+ JNE hashADLoop
+
+openFastTLSAD:
+ // Special treatment for the TLS case of 13 bytes
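+	// In the AEAD the AD is zero-padded to a full 16-byte Poly1305 block, so
+	// only the 2^128 padding bit goes into acc2. A Go-level sketch of the three
+	// loads below (ad being the additional data):
+	//
+	//      acc0 = binary.LittleEndian.Uint64(ad[0:8])
+	//      acc1 = binary.LittleEndian.Uint64(ad[5:13]) >> 24 // bytes 8..12, top 3 bytes zero
+	//      acc2 = 1                                          // the 2^128 padding bit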
+ MOVQ (adp), acc0
+ MOVQ 5(adp), acc1
+ SHRQ $24, acc1
+ MOVQ $1, acc2
+ polyMul
+ RET
+
+hashADLoop:
+ // Hash in 16 byte chunks
+ CMPQ itr2, $16
+ JB hashADTail
+ polyAdd(0(adp))
+ LEAQ (1*16)(adp), adp
+ SUBQ $16, itr2
+ polyMul
+ JMP hashADLoop
+
+hashADTail:
+ CMPQ itr2, $0
+ JE hashADDone
+
+ // Hash last < 16 byte tail
+ XORQ t0, t0
+ XORQ t1, t1
+ XORQ t2, t2
+ ADDQ itr2, adp
+
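+	// A Go-level sketch of the loop below: the last n = itr2 bytes are pulled
+	// off the end of the AD into the 128-bit value t1:t0, zero-padded above
+	// (ad and n are illustrative names):
+	//
+	//      var t0, t1 uint64
+	//      for i := n - 1; i >= 0; i-- {
+	//              t1 = t1<<8 | t0>>56
+	//              t0 = t0<<8 | uint64(ad[i])
+	//      }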
+hashADTailLoop:
+ SHLQ $8, t1:t0
+ SHLQ $8, t0
+ MOVB -1(adp), t2
+ XORQ t2, t0
+ DECQ adp
+ DECQ itr2
+ JNE hashADTailLoop
+
+hashADTailFinish:
+ ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+ polyMul
+
+ // Finished AD
+hashADDone:
+ RET
+
+// ----------------------------------------------------------------------------
+// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
+TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+ // For aligned stack access
+ MOVQ SP, BP
+ ADDQ $32, BP
+ ANDQ $-32, BP
+ MOVQ dst+0(FP), oup
+ MOVQ key+24(FP), keyp
+ MOVQ src+48(FP), inp
+ MOVQ src_len+56(FP), inl
+ MOVQ ad+72(FP), adp
+
+ // Check for AVX2 support
+ CMPB ·useAVX2(SB), $1
+ JE chacha20Poly1305Open_AVX2
+
+ // Special optimization, for very short buffers
+ CMPQ inl, $128
+ JBE openSSE128 // About 16% faster
+
+ // For long buffers, prepare the poly key first
+ MOVOU ·chacha20Constants<>(SB), A0
+ MOVOU (1*16)(keyp), B0
+ MOVOU (2*16)(keyp), C0
+ MOVOU (3*16)(keyp), D0
+ MOVO D0, T1
+
+ // Store state on stack for future use
+ MOVO B0, state1Store
+ MOVO C0, state2Store
+ MOVO D0, ctr3Store
+ MOVQ $10, itr2
+
+openSSEPreparePolyKey:
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Left; shiftC0Left; shiftD0Left
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Right; shiftC0Right; shiftD0Right
+ DECQ itr2
+ JNE openSSEPreparePolyKey
+
+ // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+ PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+
+ // Clamp and store the key
+ PAND ·polyClampMask<>(SB), A0
+ MOVO A0, rStore; MOVO B0, sStore
+
+ // Hash AAD
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+
+openSSEMainLoop:
+ CMPQ inl, $256
+ JB openSSEMainLoopDone
+
+ // Load state, increment counter blocks
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+ // Store counters
+ MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+
+	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 (6*2 + 4*1 = 16 blocks of 16 bytes = 256 bytes, matching the 256 bytes decrypted per pass)
+ MOVQ $4, itr1
+ MOVQ inp, itr2
+
+openSSEInternalLoop:
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ polyAdd(0(itr2))
+ shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
+ shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
+ shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
+ polyMulStage1
+ polyMulStage2
+ LEAQ (2*8)(itr2), itr2
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ polyMulStage3
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ polyMulReduceStage
+ shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+ shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+ shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+ DECQ itr1
+ JGE openSSEInternalLoop
+
+ polyAdd(0(itr2))
+ polyMul
+ LEAQ (2*8)(itr2), itr2
+
+ CMPQ itr1, $-6
+ JG openSSEInternalLoop
+
+ // Add in the state
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+ PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+ PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+ PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+
+ // Load - xor - store
+ MOVO D3, tmpStore
+ MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
+ MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
+ MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
+ MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
+ MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
+ MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
+ MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
+ MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
+ MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
+ MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
+ MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
+ MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
+ MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
+ MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
+ MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
+ MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
+ LEAQ 256(inp), inp
+ LEAQ 256(oup), oup
+ SUBQ $256, inl
+ JMP openSSEMainLoop
+
+openSSEMainLoopDone:
+ // Handle the various tail sizes efficiently
+ TESTQ inl, inl
+ JE openSSEFinalize
+ CMPQ inl, $64
+ JBE openSSETail64
+ CMPQ inl, $128
+ JBE openSSETail128
+ CMPQ inl, $192
+ JBE openSSETail192
+ JMP openSSETail256
+
+openSSEFinalize:
+ // Hash in the PT, AAD lengths
+ ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
+ polyMul
+
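+	// A Go-level sketch of the finalization below (h2:h1:h0 is the accumulator,
+	// s0/s1 the second key half from sStore, tag the 16 bytes at inp); the asm
+	// uses CMOVs and plain XOR/OR so everything stays constant time:
+	//
+	//      // subtract p = 2^130 - 5 and keep the result only if it did not borrow
+	//      t0, b := bits.Sub64(h0, 0xfffffffffffffffb, 0)
+	//      t1, b := bits.Sub64(h1, 0xffffffffffffffff, b)
+	//      _, b = bits.Sub64(h2, 3, b)
+	//      if b == 0 {
+	//              h0, h1 = t0, t1
+	//      }
+	//      // add the "s" half of the key mod 2^128, then compare with the tag
+	//      h0, c := bits.Add64(h0, s0, 0)
+	//      h1, _ = bits.Add64(h1, s1, c)
+	//      var computed [16]byte
+	//      binary.LittleEndian.PutUint64(computed[0:], h0)
+	//      binary.LittleEndian.PutUint64(computed[8:], h1)
+	//      ok := subtle.ConstantTimeCompare(computed[:], tag) == 1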
+ // Final reduce
+ MOVQ acc0, t0
+ MOVQ acc1, t1
+ MOVQ acc2, t2
+ SUBQ $-5, acc0
+ SBBQ $-1, acc1
+ SBBQ $3, acc2
+ CMOVQCS t0, acc0
+ CMOVQCS t1, acc1
+ CMOVQCS t2, acc2
+
+ // Add in the "s" part of the key
+ ADDQ 0+sStore, acc0
+ ADCQ 8+sStore, acc1
+
+ // Finally, constant time compare to the tag at the end of the message
+ XORQ AX, AX
+ MOVQ $1, DX
+ XORQ (0*8)(inp), acc0
+ XORQ (1*8)(inp), acc1
+ ORQ acc1, acc0
+ CMOVQEQ DX, AX
+
+ // Return true iff tags are equal
+ MOVB AX, ret+96(FP)
+ RET
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 129 bytes
+openSSE128:
+	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
+ MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
+ MOVQ $10, itr2
+
+openSSE128InnerCipherLoop:
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left; shiftB1Left; shiftB2Left
+ shiftC0Left; shiftC1Left; shiftC2Left
+ shiftD0Left; shiftD1Left; shiftD2Left
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right; shiftB1Right; shiftB2Right
+ shiftC0Right; shiftC1Right; shiftC2Right
+ shiftD0Right; shiftD1Right; shiftD2Right
+ DECQ itr2
+ JNE openSSE128InnerCipherLoop
+
+ // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+ PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
+ PADDL T2, C1; PADDL T2, C2
+ PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+
+ // Clamp and store the key
+ PAND ·polyClampMask<>(SB), A0
+ MOVOU A0, rStore; MOVOU B0, sStore
+
+ // Hash
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+
+openSSE128Open:
+ CMPQ inl, $16
+ JB openSSETail16
+ SUBQ $16, inl
+
+ // Load for hashing
+ polyAdd(0(inp))
+
+ // Load for decryption
+ MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
+ LEAQ (1*16)(inp), inp
+ LEAQ (1*16)(oup), oup
+ polyMul
+
+ // Shift the stream "left"
+ MOVO B1, A1
+ MOVO C1, B1
+ MOVO D1, C1
+ MOVO A2, D1
+ MOVO B2, A2
+ MOVO C2, B2
+ MOVO D2, C2
+ JMP openSSE128Open
+
+openSSETail16:
+ TESTQ inl, inl
+ JE openSSEFinalize
+
+ // We can safely load the CT from the end, because it is padded with the MAC
+ MOVQ inl, itr2
+ SHLQ $4, itr2
+ LEAQ ·andMask<>(SB), t0
+ MOVOU (inp), T0
+ ADDQ inl, inp
+ PAND -16(t0)(itr2*1), T0
+ MOVO T0, 0+tmpStore
+ MOVQ T0, t0
+ MOVQ 8+tmpStore, t1
+ PXOR A1, T0
+
+ // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
+openSSETail16Store:
+ MOVQ T0, t3
+ MOVB t3, (oup)
+ PSRLDQ $1, T0
+ INCQ oup
+ DECQ inl
+ JNE openSSETail16Store
+ ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+ polyMul
+ JMP openSSEFinalize
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 64 bytes of ciphertext
+openSSETail64:
+ // Need to decrypt up to 64 bytes - prepare single block
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+ XORQ itr2, itr2
+ MOVQ inl, itr1
+ CMPQ itr1, $16
+ JB openSSETail64LoopB
+
+openSSETail64LoopA:
+ // Perform ChaCha rounds, while hashing the remaining input
+ polyAdd(0(inp)(itr2*1))
+ polyMul
+ SUBQ $16, itr1
+
+openSSETail64LoopB:
+ ADDQ $16, itr2
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Left; shiftC0Left; shiftD0Left
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Right; shiftC0Right; shiftD0Right
+
+ CMPQ itr1, $16
+ JAE openSSETail64LoopA
+
+ CMPQ itr2, $160
+ JNE openSSETail64LoopB
+
+ PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+
+openSSETail64DecLoop:
+ CMPQ inl, $16
+ JB openSSETail64DecLoopDone
+ SUBQ $16, inl
+ MOVOU (inp), T0
+ PXOR T0, A0
+ MOVOU A0, (oup)
+ LEAQ 16(inp), inp
+ LEAQ 16(oup), oup
+ MOVO B0, A0
+ MOVO C0, B0
+ MOVO D0, C0
+ JMP openSSETail64DecLoop
+
+openSSETail64DecLoopDone:
+ MOVO A0, A1
+ JMP openSSETail16
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+openSSETail128:
+ // Need to decrypt up to 128 bytes - prepare two blocks
+ MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
+ MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
+ XORQ itr2, itr2
+ MOVQ inl, itr1
+ ANDQ $-16, itr1
+
+openSSETail128LoopA:
+ // Perform ChaCha rounds, while hashing the remaining input
+ polyAdd(0(inp)(itr2*1))
+ polyMul
+
+openSSETail128LoopB:
+ ADDQ $16, itr2
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Left; shiftC0Left; shiftD0Left
+ shiftB1Left; shiftC1Left; shiftD1Left
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Right; shiftC0Right; shiftD0Right
+ shiftB1Right; shiftC1Right; shiftD1Right
+
+ CMPQ itr2, itr1
+ JB openSSETail128LoopA
+
+ CMPQ itr2, $160
+ JNE openSSETail128LoopB
+
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
+ PADDL state1Store, B0; PADDL state1Store, B1
+ PADDL state2Store, C0; PADDL state2Store, C1
+ PADDL ctr1Store, D0; PADDL ctr0Store, D1
+
+ MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+ PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+ MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
+
+ SUBQ $64, inl
+ LEAQ 64(inp), inp
+ LEAQ 64(oup), oup
+ JMP openSSETail64DecLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 192 bytes of ciphertext
+openSSETail192:
+ // Need to decrypt up to 192 bytes - prepare three blocks
+ MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
+ MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
+
+ MOVQ inl, itr1
+ MOVQ $160, itr2
+ CMPQ itr1, $160
+ CMOVQGT itr2, itr1
+ ANDQ $-16, itr1
+ XORQ itr2, itr2
+
+openSSETail192LoopA:
+ // Perform ChaCha rounds, while hashing the remaining input
+ polyAdd(0(inp)(itr2*1))
+ polyMul
+
+openSSETail192LoopB:
+ ADDQ $16, itr2
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left; shiftC0Left; shiftD0Left
+ shiftB1Left; shiftC1Left; shiftD1Left
+ shiftB2Left; shiftC2Left; shiftD2Left
+
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right; shiftC0Right; shiftD0Right
+ shiftB1Right; shiftC1Right; shiftD1Right
+ shiftB2Right; shiftC2Right; shiftD2Right
+
+ CMPQ itr2, itr1
+	JB openSSETail192LoopA
+
+ CMPQ itr2, $160
+	JNE openSSETail192LoopB
+
+ CMPQ inl, $176
+	JB openSSETail192Store
+
+ polyAdd(160(inp))
+ polyMul
+
+ CMPQ inl, $192
+	JB openSSETail192Store
+
+ polyAdd(176(inp))
+ polyMul
+
+openSSETail192Store:
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+ PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
+ PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
+ PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
+
+ MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+ PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
+ MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
+
+ MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
+ PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+ MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+
+ SUBQ $128, inl
+ LEAQ 128(inp), inp
+ LEAQ 128(oup), oup
+ JMP openSSETail64DecLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+openSSETail256:
+ // Need to decrypt up to 256 bytes - prepare four blocks
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+ // Store counters
+ MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ XORQ itr2, itr2
+
+openSSETail256Loop:
+	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
+ polyAdd(0(inp)(itr2*1))
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
+ shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
+ shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
+ polyMulStage1
+ polyMulStage2
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ polyMulStage3
+ polyMulReduceStage
+ shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+ shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+ shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+ ADDQ $2*8, itr2
+ CMPQ itr2, $160
+ JB openSSETail256Loop
+ MOVQ inl, itr1
+ ANDQ $-16, itr1
+
+openSSETail256HashLoop:
+ polyAdd(0(inp)(itr2*1))
+ polyMul
+ ADDQ $2*8, itr2
+ CMPQ itr2, itr1
+ JB openSSETail256HashLoop
+
+ // Add in the state
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+ PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+ PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+ PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ MOVO D3, tmpStore
+
+ // Load - xor - store
+ MOVOU (0*16)(inp), D3; PXOR D3, A0
+ MOVOU (1*16)(inp), D3; PXOR D3, B0
+ MOVOU (2*16)(inp), D3; PXOR D3, C0
+ MOVOU (3*16)(inp), D3; PXOR D3, D0
+ MOVOU A0, (0*16)(oup)
+ MOVOU B0, (1*16)(oup)
+ MOVOU C0, (2*16)(oup)
+ MOVOU D0, (3*16)(oup)
+ MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
+ PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
+ MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+ MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
+ PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
+ MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
+ LEAQ 192(inp), inp
+ LEAQ 192(oup), oup
+ SUBQ $192, inl
+ MOVO A3, A0
+ MOVO B3, B0
+ MOVO C3, C0
+ MOVO tmpStore, D0
+
+ JMP openSSETail64DecLoop
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+chacha20Poly1305Open_AVX2:
+ VZEROUPPER
+ VMOVDQU ·chacha20Constants<>(SB), AA0
+ BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
+ BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
+ BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
+ VPADDD ·avx2InitMask<>(SB), DD0, DD0
+
+ // Special optimization, for very short buffers
+ CMPQ inl, $192
+ JBE openAVX2192
+ CMPQ inl, $320
+ JBE openAVX2320
+
+	// For the general case, prepare the poly key first; as a byproduct we get 64 bytes of cipher stream
+ VMOVDQA BB0, state1StoreAVX2
+ VMOVDQA CC0, state2StoreAVX2
+ VMOVDQA DD0, ctr3StoreAVX2
+ MOVQ $10, itr2
+
+openAVX2PreparePolyKey:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
+ DECQ itr2
+ JNE openAVX2PreparePolyKey
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0
+ VPADDD state1StoreAVX2, BB0, BB0
+ VPADDD state2StoreAVX2, CC0, CC0
+ VPADDD ctr3StoreAVX2, DD0, DD0
+
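+	// VPERM2I128 $0x02 gathers the low 128-bit lanes of its two sources, i.e.
+	// rows A and B of the first of the two ChaCha blocks interleaved in each
+	// Y register; $0x13 (used below) gathers the high lanes, i.e. the second block.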
+ VPERM2I128 $0x02, AA0, BB0, TT0
+
+ // Clamp and store poly key
+ VPAND ·polyClampMask<>(SB), TT0, TT0
+ VMOVDQA TT0, rsStoreAVX2
+
+ // Stream for the first 64 bytes
+ VPERM2I128 $0x13, AA0, BB0, AA0
+ VPERM2I128 $0x13, CC0, DD0, BB0
+
+ // Hash AD + first 64 bytes
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+ XORQ itr1, itr1
+
+openAVX2InitialHash64:
+ polyAdd(0(inp)(itr1*1))
+ polyMulAVX2
+ ADDQ $16, itr1
+ CMPQ itr1, $64
+ JNE openAVX2InitialHash64
+
+ // Decrypt the first 64 bytes
+ VPXOR (0*32)(inp), AA0, AA0
+ VPXOR (1*32)(inp), BB0, BB0
+ VMOVDQU AA0, (0*32)(oup)
+ VMOVDQU BB0, (1*32)(oup)
+ LEAQ (2*32)(inp), inp
+ LEAQ (2*32)(oup), oup
+ SUBQ $64, inl
+
+openAVX2MainLoop:
+ CMPQ inl, $512
+ JB openAVX2MainLoopDone
+
+ // Load state, increment counter blocks, store the incremented counters
+ VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+ VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+ XORQ itr1, itr1
+
+openAVX2InternalLoop:
+	// Let's just say this spaghetti loop interleaves one ChaCha double round (a column round and a diagonal round over all four vectors) with 3 poly multiplications
+ // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
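+	// (10 double-round iterations x 3 blocks x 16 bytes = 480 bytes; the last 32
+	// bytes are hashed right after the loop.)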
+ polyAdd(0*8(inp)(itr1*1))
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ polyMulStage1_AVX2
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ polyMulStage2_AVX2
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ polyMulStage3_AVX2
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulReduceStage
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ polyAdd(2*8(inp)(itr1*1))
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ polyMulStage1_AVX2
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulStage2_AVX2
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ polyMulStage3_AVX2
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ polyMulReduceStage
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ polyAdd(4*8(inp)(itr1*1))
+ LEAQ (6*8)(itr1), itr1
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulStage1_AVX2
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ polyMulStage2_AVX2
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ polyMulStage3_AVX2
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulReduceStage
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+ CMPQ itr1, $480
+ JNE openAVX2InternalLoop
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+ VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+ VMOVDQA CC3, tmpStoreAVX2
+
+ // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
+ polyAdd(480(inp))
+ polyMulAVX2
+ VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
+ VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
+ VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
+ VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+ VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+ VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+
+ // and here
+ polyAdd(496(inp))
+ polyMulAVX2
+ VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+ VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+ VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+ VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+ VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
+ VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
+ LEAQ (32*16)(inp), inp
+ LEAQ (32*16)(oup), oup
+ SUBQ $(32*16), inl
+ JMP openAVX2MainLoop
+
+openAVX2MainLoopDone:
+ // Handle the various tail sizes efficiently
+ TESTQ inl, inl
+ JE openSSEFinalize
+ CMPQ inl, $128
+ JBE openAVX2Tail128
+ CMPQ inl, $256
+ JBE openAVX2Tail256
+ CMPQ inl, $384
+ JBE openAVX2Tail384
+ JMP openAVX2Tail512
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes
+openAVX2192:
+ // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
+ VMOVDQA AA0, AA1
+ VMOVDQA BB0, BB1
+ VMOVDQA CC0, CC1
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA AA0, AA2
+ VMOVDQA BB0, BB2
+ VMOVDQA CC0, CC2
+ VMOVDQA DD0, DD2
+ VMOVDQA DD1, TT3
+ MOVQ $10, itr2
+
+openAVX2192InnerCipherLoop:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+ DECQ itr2
+ JNE openAVX2192InnerCipherLoop
+ VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
+ VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
+ VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
+ VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
+ VPERM2I128 $0x02, AA0, BB0, TT0
+
+ // Clamp and store poly key
+ VPAND ·polyClampMask<>(SB), TT0, TT0
+ VMOVDQA TT0, rsStoreAVX2
+
+ // Stream for up to 192 bytes
+ VPERM2I128 $0x13, AA0, BB0, AA0
+ VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x02, AA1, BB1, CC0
+ VPERM2I128 $0x02, CC1, DD1, DD0
+ VPERM2I128 $0x13, AA1, BB1, AA1
+ VPERM2I128 $0x13, CC1, DD1, BB1
+
+openAVX2ShortOpen:
+ // Hash
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+
+openAVX2ShortOpenLoop:
+ CMPQ inl, $32
+ JB openAVX2ShortTail32
+ SUBQ $32, inl
+
+ // Load for hashing
+ polyAdd(0*8(inp))
+ polyMulAVX2
+ polyAdd(2*8(inp))
+ polyMulAVX2
+
+ // Load for decryption
+ VPXOR (inp), AA0, AA0
+ VMOVDQU AA0, (oup)
+ LEAQ (1*32)(inp), inp
+ LEAQ (1*32)(oup), oup
+
+ // Shift stream left
+ VMOVDQA BB0, AA0
+ VMOVDQA CC0, BB0
+ VMOVDQA DD0, CC0
+ VMOVDQA AA1, DD0
+ VMOVDQA BB1, AA1
+ VMOVDQA CC1, BB1
+ VMOVDQA DD1, CC1
+ VMOVDQA AA2, DD1
+ VMOVDQA BB2, AA2
+ JMP openAVX2ShortOpenLoop
+
+openAVX2ShortTail32:
+ CMPQ inl, $16
+ VMOVDQA A0, A1
+ JB openAVX2ShortDone
+
+ SUBQ $16, inl
+
+ // Load for hashing
+ polyAdd(0*8(inp))
+ polyMulAVX2
+
+ // Load for decryption
+ VPXOR (inp), A0, T0
+ VMOVDQU T0, (oup)
+ LEAQ (1*16)(inp), inp
+ LEAQ (1*16)(oup), oup
+ VPERM2I128 $0x11, AA0, AA0, AA0
+ VMOVDQA A0, A1
+
+openAVX2ShortDone:
+ VZEROUPPER
+ JMP openSSETail16
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes
+openAVX2320:
+ // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
+ VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+ VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
+ MOVQ $10, itr2
+
+openAVX2320InnerCipherLoop:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+ DECQ itr2
+ JNE openAVX2320InnerCipherLoop
+
+ VMOVDQA ·chacha20Constants<>(SB), TT0
+ VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
+ VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
+ VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
+ VMOVDQA ·avx2IncMask<>(SB), TT0
+ VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
+ VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
+ VPADDD TT3, DD2, DD2
+
+ // Clamp and store poly key
+ VPERM2I128 $0x02, AA0, BB0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
+ VMOVDQA TT0, rsStoreAVX2
+
+ // Stream for up to 320 bytes
+ VPERM2I128 $0x13, AA0, BB0, AA0
+ VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x02, AA1, BB1, CC0
+ VPERM2I128 $0x02, CC1, DD1, DD0
+ VPERM2I128 $0x13, AA1, BB1, AA1
+ VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x02, AA2, BB2, CC1
+ VPERM2I128 $0x02, CC2, DD2, DD1
+ VPERM2I128 $0x13, AA2, BB2, AA2
+ VPERM2I128 $0x13, CC2, DD2, BB2
+ JMP openAVX2ShortOpen
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+openAVX2Tail128:
+ // Need to decrypt up to 128 bytes - prepare two blocks
+ VMOVDQA ·chacha20Constants<>(SB), AA1
+ VMOVDQA state1StoreAVX2, BB1
+ VMOVDQA state2StoreAVX2, CC1
+ VMOVDQA ctr3StoreAVX2, DD1
+ VPADDD ·avx2IncMask<>(SB), DD1, DD1
+ VMOVDQA DD1, DD0
+
+ XORQ itr2, itr2
+ MOVQ inl, itr1
+ ANDQ $-16, itr1
+ TESTQ itr1, itr1
+ JE openAVX2Tail128LoopB
+
+openAVX2Tail128LoopA:
+ // Perform ChaCha rounds, while hashing the remaining input
+ polyAdd(0(inp)(itr2*1))
+ polyMulAVX2
+
+openAVX2Tail128LoopB:
+ ADDQ $16, itr2
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $4, BB1, BB1, BB1
+ VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $12, DD1, DD1, DD1
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $12, BB1, BB1, BB1
+ VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $4, DD1, DD1, DD1
+ CMPQ itr2, itr1
+ JB openAVX2Tail128LoopA
+ CMPQ itr2, $160
+ JNE openAVX2Tail128LoopB
+
+ VPADDD ·chacha20Constants<>(SB), AA1, AA1
+ VPADDD state1StoreAVX2, BB1, BB1
+ VPADDD state2StoreAVX2, CC1, CC1
+ VPADDD DD0, DD1, DD1
+ VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+
+openAVX2TailLoop:
+ CMPQ inl, $32
+ JB openAVX2Tail
+ SUBQ $32, inl
+
+ // Load for decryption
+ VPXOR (inp), AA0, AA0
+ VMOVDQU AA0, (oup)
+ LEAQ (1*32)(inp), inp
+ LEAQ (1*32)(oup), oup
+ VMOVDQA BB0, AA0
+ VMOVDQA CC0, BB0
+ VMOVDQA DD0, CC0
+ JMP openAVX2TailLoop
+
+openAVX2Tail:
+ CMPQ inl, $16
+ VMOVDQA A0, A1
+ JB openAVX2TailDone
+ SUBQ $16, inl
+
+ // Load for decryption
+ VPXOR (inp), A0, T0
+ VMOVDQU T0, (oup)
+ LEAQ (1*16)(inp), inp
+ LEAQ (1*16)(oup), oup
+ VPERM2I128 $0x11, AA0, AA0, AA0
+ VMOVDQA A0, A1
+
+openAVX2TailDone:
+ VZEROUPPER
+ JMP openSSETail16
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+openAVX2Tail256:
+ // Need to decrypt up to 256 bytes - prepare four blocks
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA DD0, TT1
+ VMOVDQA DD1, TT2
+
+ // Compute the number of iterations that will hash data
+ MOVQ inl, tmpStoreAVX2
+ MOVQ inl, itr1
+ SUBQ $128, itr1
+ SHRQ $4, itr1
+ MOVQ $10, itr2
+ CMPQ itr1, $10
+ CMOVQGT itr2, itr1
+ MOVQ inp, inl
+ XORQ itr2, itr2
+
+openAVX2Tail256LoopA:
+ polyAdd(0(inl))
+ polyMulAVX2
+ LEAQ 16(inl), inl
+
+ // Perform ChaCha rounds, while hashing the remaining input
+openAVX2Tail256LoopB:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+ INCQ itr2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+ CMPQ itr2, itr1
+ JB openAVX2Tail256LoopA
+
+ CMPQ itr2, $10
+ JNE openAVX2Tail256LoopB
+
+ MOVQ inl, itr2
+ SUBQ inp, inl
+ MOVQ inl, itr1
+ MOVQ tmpStoreAVX2, inl
+
+ // Hash the remainder of data (if any)
+openAVX2Tail256Hash:
+ ADDQ $16, itr1
+ CMPQ itr1, inl
+ JGT openAVX2Tail256HashEnd
+	polyAdd(0(itr2))
+ polyMulAVX2
+ LEAQ 16(itr2), itr2
+ JMP openAVX2Tail256Hash
+
+// Store 128 bytes safely, then go to store loop
+openAVX2Tail256HashEnd:
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
+ VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
+ VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
+ VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+
+ VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
+ VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
+ LEAQ (4*32)(inp), inp
+ LEAQ (4*32)(oup), oup
+ SUBQ $4*32, inl
+
+ JMP openAVX2TailLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of ciphertext
+openAVX2Tail384:
+ // Need to decrypt up to 384 bytes - prepare six blocks
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VPADDD ·avx2IncMask<>(SB), DD1, DD2
+ VMOVDQA DD0, ctr0StoreAVX2
+ VMOVDQA DD1, ctr1StoreAVX2
+ VMOVDQA DD2, ctr2StoreAVX2
+
+ // Compute the number of iterations that will hash two blocks of data
+ MOVQ inl, tmpStoreAVX2
+ MOVQ inl, itr1
+ SUBQ $256, itr1
+ SHRQ $4, itr1
+ ADDQ $6, itr1
+ MOVQ $10, itr2
+ CMPQ itr1, $10
+ CMOVQGT itr2, itr1
+ MOVQ inp, inl
+ XORQ itr2, itr2
+
+ // Perform ChaCha rounds, while hashing the remaining input
+openAVX2Tail384LoopB:
+ polyAdd(0(inl))
+ polyMulAVX2
+ LEAQ 16(inl), inl
+
+openAVX2Tail384LoopA:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+ polyAdd(0(inl))
+ polyMulAVX2
+ LEAQ 16(inl), inl
+ INCQ itr2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+
+ CMPQ itr2, itr1
+ JB openAVX2Tail384LoopB
+
+ CMPQ itr2, $10
+ JNE openAVX2Tail384LoopA
+
+ MOVQ inl, itr2
+ SUBQ inp, inl
+ MOVQ inl, itr1
+ MOVQ tmpStoreAVX2, inl
+
+openAVX2Tail384Hash:
+ ADDQ $16, itr1
+ CMPQ itr1, inl
+ JGT openAVX2Tail384HashEnd
+ polyAdd(0(itr2))
+ polyMulAVX2
+ LEAQ 16(itr2), itr2
+ JMP openAVX2Tail384Hash
+
+// Store 256 bytes safely, then go to store loop
+openAVX2Tail384HashEnd:
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
+ VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
+ VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
+ VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+ VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+ VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
+ VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
+ VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
+ VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+ LEAQ (8*32)(inp), inp
+ LEAQ (8*32)(oup), oup
+ SUBQ $8*32, inl
+ JMP openAVX2TailLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of ciphertext
+openAVX2Tail512:
+ VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+ VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+ XORQ itr1, itr1
+ MOVQ inp, itr2
+
+openAVX2Tail512LoopB:
+ polyAdd(0(itr2))
+ polyMulAVX2
+ LEAQ (2*8)(itr2), itr2
+
+openAVX2Tail512LoopA:
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyAdd(0*8(itr2))
+ polyMulAVX2
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ polyAdd(2*8(itr2))
+ polyMulAVX2
+ LEAQ (4*8)(itr2), itr2
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+ INCQ itr1
+ CMPQ itr1, $4
+ JLT openAVX2Tail512LoopB
+
+ CMPQ itr1, $10
+ JNE openAVX2Tail512LoopA
+
+ MOVQ inl, itr1
+ SUBQ $384, itr1
+ ANDQ $-16, itr1
+
+openAVX2Tail512HashLoop:
+ TESTQ itr1, itr1
+ JE openAVX2Tail512HashEnd
+ polyAdd(0(itr2))
+ polyMulAVX2
+ LEAQ 16(itr2), itr2
+ SUBQ $16, itr1
+ JMP openAVX2Tail512HashLoop
+
+openAVX2Tail512HashEnd:
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+ VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
+ VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
+ VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
+ VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+ VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+ VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+ VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+ VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+ VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+
+ LEAQ (12*32)(inp), inp
+ LEAQ (12*32)(oup), oup
+ SUBQ $12*32, inl
+
+ JMP openAVX2TailLoop
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// func chacha20Poly1305Seal(dst, key, src, ad []byte)
+TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
+ // For aligned stack access
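+	// Round BP up to the next 32-byte boundary, so the scratch slots defined above can be accessed with aligned MOVO/VMOVDQA.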
+ MOVQ SP, BP
+ ADDQ $32, BP
+ ANDQ $-32, BP
+ MOVQ dst+0(FP), oup
+ MOVQ key+24(FP), keyp
+ MOVQ src+48(FP), inp
+ MOVQ src_len+56(FP), inl
+ MOVQ ad+72(FP), adp
+
+ CMPB ·useAVX2(SB), $1
+ JE chacha20Poly1305Seal_AVX2
+
+	// Special optimization for very short buffers
+ CMPQ inl, $128
+ JBE sealSSE128 // About 15% faster
+
+ // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
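+	// The counter-0 block only supplies the Poly1305 key (its first 32 bytes of stream); the rest of that block is discarded. Counters 1-3 encrypt the first 192 bytes of plaintext.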
+ MOVOU ·chacha20Constants<>(SB), A0
+ MOVOU (1*16)(keyp), B0
+ MOVOU (2*16)(keyp), C0
+ MOVOU (3*16)(keyp), D0
+
+ // Store state on stack for future use
+ MOVO B0, state1Store
+ MOVO C0, state2Store
+
+ // Load state, increment counter blocks
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+ // Store counters
+ MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVQ $10, itr2
+
+sealSSEIntroLoop:
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
+ shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
+ shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
+
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+ shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+ shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+ DECQ itr2
+ JNE sealSSEIntroLoop
+
+ // Add in the state
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+ PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+ PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+ PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+
+ // Clamp and store the key
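+	// Per Poly1305, r (the first 16 bytes of the counter-0 stream) is clamped to 0x0ffffffc0ffffffc0ffffffc0fffffff; s is the next 16 bytes.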
+ PAND ·polyClampMask<>(SB), A0
+ MOVO A0, rStore
+ MOVO B0, sStore
+
+ // Hash AAD
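+	// polyHashADInternal absorbs ad_len bytes of additional data into the accumulator, zero-padding the last block to 16 bytes (RFC 7539).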
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+
+ MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
+ PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
+ MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
+ MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
+ PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
+ MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
+
+ MOVQ $128, itr1
+ SUBQ $128, inl
+ LEAQ 128(inp), inp
+
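+	// Keep the counter-3 block's stream in the A1/B1/C1/D1 set for sealing the remaining bytes.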
+ MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
+
+ CMPQ inl, $64
+ JBE sealSSE128SealHash
+
+ MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
+ PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
+ MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
+
+ ADDQ $64, itr1
+ SUBQ $64, inl
+ LEAQ 64(inp), inp
+
+ MOVQ $2, itr1
+ MOVQ $8, itr2
+
+ CMPQ inl, $64
+ JBE sealSSETail64
+ CMPQ inl, $128
+ JBE sealSSETail128
+ CMPQ inl, $192
+ JBE sealSSETail192
+
+sealSSEMainLoop:
+ // Load state, increment counter blocks
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+ // Store counters
+ MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+
+sealSSEInnerLoop:
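+	// Interleave ChaCha quarter-rounds with Poly1305 updates, hashing the previously written ciphertext at (oup)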
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ polyAdd(0(oup))
+ shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
+ shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
+ shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
+ polyMulStage1
+ polyMulStage2
+ LEAQ (2*8)(oup), oup
+ MOVO C3, tmpStore
+ chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+ MOVO tmpStore, C3
+ MOVO C1, tmpStore
+ polyMulStage3
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO tmpStore, C1
+ polyMulReduceStage
+ shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+ shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+ shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+ DECQ itr2
+ JGE sealSSEInnerLoop
+ polyAdd(0(oup))
+ polyMul
+ LEAQ (2*8)(oup), oup
+ DECQ itr1
+ JG sealSSEInnerLoop
+
+ // Add in the state
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+ PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+ PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+ PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ MOVO D3, tmpStore
+
+ // Load - xor - store
+ MOVOU (0*16)(inp), D3; PXOR D3, A0
+ MOVOU (1*16)(inp), D3; PXOR D3, B0
+ MOVOU (2*16)(inp), D3; PXOR D3, C0
+ MOVOU (3*16)(inp), D3; PXOR D3, D0
+ MOVOU A0, (0*16)(oup)
+ MOVOU B0, (1*16)(oup)
+ MOVOU C0, (2*16)(oup)
+ MOVOU D0, (3*16)(oup)
+ MOVO tmpStore, D3
+
+ MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
+ PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
+ MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+ MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
+ PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
+ MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
+ ADDQ $192, inp
+ MOVQ $192, itr1
+ SUBQ $192, inl
+ MOVO A3, A1
+ MOVO B3, B1
+ MOVO C3, C1
+ MOVO D3, D1
+ CMPQ inl, $64
+ JBE sealSSE128SealHash
+ MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
+ PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
+ MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
+ LEAQ 64(inp), inp
+ SUBQ $64, inl
+ MOVQ $6, itr1
+ MOVQ $4, itr2
+ CMPQ inl, $192
+ JG sealSSEMainLoop
+
+ MOVQ inl, itr1
+ TESTQ inl, inl
+ JE sealSSE128SealHash
+ MOVQ $6, itr1
+ CMPQ inl, $64
+ JBE sealSSETail64
+ CMPQ inl, $128
+ JBE sealSSETail128
+ JMP sealSSETail192
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 64 bytes of plaintext
+sealSSETail64:
+ // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
+ MOVO ·chacha20Constants<>(SB), A1
+ MOVO state1Store, B1
+ MOVO state2Store, C1
+ MOVO ctr3Store, D1
+ PADDL ·sseIncMask<>(SB), D1
+ MOVO D1, ctr0Store
+
+sealSSETail64LoopA:
+ // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealSSETail64LoopB:
+ chachaQR(A1, B1, C1, D1, T1)
+ shiftB1Left; shiftC1Left; shiftD1Left
+ chachaQR(A1, B1, C1, D1, T1)
+ shiftB1Right; shiftC1Right; shiftD1Right
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+ DECQ itr1
+ JG sealSSETail64LoopA
+
+ DECQ itr2
+ JGE sealSSETail64LoopB
+ PADDL ·chacha20Constants<>(SB), A1
+ PADDL state1Store, B1
+ PADDL state2Store, C1
+ PADDL ctr0Store, D1
+
+ JMP sealSSE128Seal
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of plaintext
+sealSSETail128:
+ // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+
+sealSSETail128LoopA:
+ // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealSSETail128LoopB:
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Left; shiftC0Left; shiftD0Left
+ shiftB1Left; shiftC1Left; shiftD1Left
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Right; shiftC0Right; shiftD0Right
+ shiftB1Right; shiftC1Right; shiftD1Right
+
+ DECQ itr1
+ JG sealSSETail128LoopA
+
+ DECQ itr2
+ JGE sealSSETail128LoopB
+
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
+ PADDL state1Store, B0; PADDL state1Store, B1
+ PADDL state2Store, C0; PADDL state2Store, C1
+ PADDL ctr0Store, D0; PADDL ctr1Store, D1
+
+ MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+ PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
+ MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
+
+ MOVQ $64, itr1
+ LEAQ 64(inp), inp
+ SUBQ $64, inl
+
+ JMP sealSSE128SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 192 bytes of plaintext
+sealSSETail192:
+ // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+
+sealSSETail192LoopA:
+ // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealSSETail192LoopB:
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left; shiftC0Left; shiftD0Left
+ shiftB1Left; shiftC1Left; shiftD1Left
+ shiftB2Left; shiftC2Left; shiftD2Left
+
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right; shiftC0Right; shiftD0Right
+ shiftB1Right; shiftC1Right; shiftD1Right
+ shiftB2Right; shiftC2Right; shiftD2Right
+
+ DECQ itr1
+ JG sealSSETail192LoopA
+
+ DECQ itr2
+ JGE sealSSETail192LoopB
+
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+ PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
+ PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
+ PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
+
+ MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+ PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
+ MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
+ MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
+ PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+ MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+
+ MOVO A2, A1
+ MOVO B2, B1
+ MOVO C2, C1
+ MOVO D2, D1
+ MOVQ $128, itr1
+ LEAQ 128(inp), inp
+ SUBQ $128, inl
+
+ JMP sealSSE128SealHash
+
+// ----------------------------------------------------------------------------
+// Special seal optimization for buffers smaller than 129 bytes
+sealSSE128:
+	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
+ MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
+ MOVQ $10, itr2
+
+sealSSE128InnerCipherLoop:
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left; shiftB1Left; shiftB2Left
+ shiftC0Left; shiftC1Left; shiftC2Left
+ shiftD0Left; shiftD1Left; shiftD2Left
+ chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right; shiftB1Right; shiftB2Right
+ shiftC0Right; shiftC1Right; shiftC2Right
+ shiftD0Right; shiftD1Right; shiftD2Right
+ DECQ itr2
+ JNE sealSSE128InnerCipherLoop
+
+ // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+ PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
+ PADDL T2, C1; PADDL T2, C2
+ PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+ PAND ·polyClampMask<>(SB), A0
+ MOVOU A0, rStore
+ MOVOU B0, sStore
+
+ // Hash
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+ XORQ itr1, itr1
+
+sealSSE128SealHash:
+ // itr1 holds the number of bytes encrypted but not yet hashed
+ CMPQ itr1, $16
+ JB sealSSE128Seal
+ polyAdd(0(oup))
+ polyMul
+
+ SUBQ $16, itr1
+ ADDQ $16, oup
+
+ JMP sealSSE128SealHash
+
+sealSSE128Seal:
+ CMPQ inl, $16
+ JB sealSSETail
+ SUBQ $16, inl
+
+	// Load for encryption
+ MOVOU (inp), T0
+ PXOR T0, A1
+ MOVOU A1, (oup)
+ LEAQ (1*16)(inp), inp
+ LEAQ (1*16)(oup), oup
+
+ // Extract for hashing
+ MOVQ A1, t0
+ PSRLDQ $8, A1
+ MOVQ A1, t1
+ ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+ polyMul
+
+ // Shift the stream "left"
+ MOVO B1, A1
+ MOVO C1, B1
+ MOVO D1, C1
+ MOVO A2, D1
+ MOVO B2, A2
+ MOVO C2, B2
+ MOVO D2, C2
+ JMP sealSSE128Seal
+
+sealSSETail:
+ TESTQ inl, inl
+ JE sealSSEFinalize
+
+	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
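+	// The message is assembled backwards, last byte first, into t3:t2; the block is then XORed with the keystream, and ·andMask<> keeps only the inl valid ciphertext bytes for the Poly1305 update.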
+ MOVQ inl, itr2
+ SHLQ $4, itr2
+ LEAQ ·andMask<>(SB), t0
+ MOVQ inl, itr1
+ LEAQ -1(inp)(inl*1), inp
+ XORQ t2, t2
+ XORQ t3, t3
+ XORQ AX, AX
+
+sealSSETailLoadLoop:
+ SHLQ $8, t2, t3
+ SHLQ $8, t2
+ MOVB (inp), AX
+ XORQ AX, t2
+ LEAQ -1(inp), inp
+ DECQ itr1
+ JNE sealSSETailLoadLoop
+ MOVQ t2, 0+tmpStore
+ MOVQ t3, 8+tmpStore
+ PXOR 0+tmpStore, A1
+ MOVOU A1, (oup)
+ MOVOU -16(t0)(itr2*1), T0
+ PAND T0, A1
+ MOVQ A1, t0
+ PSRLDQ $8, A1
+ MOVQ A1, t1
+ ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+ polyMul
+
+ ADDQ inl, oup
+
+sealSSEFinalize:
+ // Hash in the buffer lengths
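+	// The last Poly1305 block is ad_len and src_len, each as a 64-bit little-endian integer (RFC 7539)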
+ ADDQ ad_len+80(FP), acc0
+ ADCQ src_len+56(FP), acc1
+ ADCQ $1, acc2
+ polyMul
+
+ // Final reduce
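+	// The SUBQ/SBBQ sequence subtracts 2^130 - 5 from the accumulator; if that borrows, the accumulator was already fully reduced and the CMOVs restore the saved value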
+ MOVQ acc0, t0
+ MOVQ acc1, t1
+ MOVQ acc2, t2
+ SUBQ $-5, acc0
+ SBBQ $-1, acc1
+ SBBQ $3, acc2
+ CMOVQCS t0, acc0
+ CMOVQCS t1, acc1
+ CMOVQCS t2, acc2
+
+ // Add in the "s" part of the key
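+	// The tag is (acc + s) mod 2^128 - the carry out of this 128-bit addition is dropped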
+ ADDQ 0+sStore, acc0
+ ADCQ 8+sStore, acc1
+
+ // Finally store the tag at the end of the message
+ MOVQ acc0, (0*8)(oup)
+ MOVQ acc1, (1*8)(oup)
+ RET
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+chacha20Poly1305Seal_AVX2:
+ VZEROUPPER
+ VMOVDQU ·chacha20Constants<>(SB), AA0
+ BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
+ BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
+ BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
+ VPADDD ·avx2InitMask<>(SB), DD0, DD0
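+	// Each YMM register now holds one state row for two consecutive blocks (the two 128-bit lanes carry consecutive counters), so every AA/BB/CC/DD set yields 128 bytes of stream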
+
+	// Special optimizations for very short buffers
+ CMPQ inl, $192
+ JBE seal192AVX2 // 33% faster
+ CMPQ inl, $320
+ JBE seal320AVX2 // 17% faster
+
+	// For the general path, prepare the Poly1305 key first - as a byproduct we get 64 bytes of cipher stream
+ VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
+ VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
+ VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
+ VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
+ VMOVDQA DD3, ctr3StoreAVX2
+ MOVQ $10, itr2
+
+sealAVX2IntroLoop:
+ VMOVDQA CC3, tmpStoreAVX2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA tmpStoreAVX2, CC3
+ VMOVDQA CC1, tmpStoreAVX2
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA tmpStoreAVX2, CC1
+
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
+ VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
+ VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
+ VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
+
+ VMOVDQA CC3, tmpStoreAVX2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA tmpStoreAVX2, CC3
+ VMOVDQA CC1, tmpStoreAVX2
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA tmpStoreAVX2, CC1
+
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
+ VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
+ VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
+ VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
+ DECQ itr2
+ JNE sealAVX2IntroLoop
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+ VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+
+ VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
+ VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
+ VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
+
+ // Clamp and store poly key
+ VPAND ·polyClampMask<>(SB), DD0, DD0
+ VMOVDQA DD0, rsStoreAVX2
+
+ // Hash AD
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+
+ // Can store at least 320 bytes
+ VPXOR (0*32)(inp), AA0, AA0
+ VPXOR (1*32)(inp), CC0, CC0
+ VMOVDQU AA0, (0*32)(oup)
+ VMOVDQU CC0, (1*32)(oup)
+
+ VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+ VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
+ VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
+ VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+ VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
+ VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
+
+ MOVQ $320, itr1
+ SUBQ $320, inl
+ LEAQ 320(inp), inp
+
+ VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
+ CMPQ inl, $128
+ JBE sealAVX2SealHash
+
+ VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
+ VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
+ SUBQ $128, inl
+ LEAQ 128(inp), inp
+
+ MOVQ $8, itr1
+ MOVQ $2, itr2
+
+ CMPQ inl, $128
+ JBE sealAVX2Tail128
+ CMPQ inl, $256
+ JBE sealAVX2Tail256
+ CMPQ inl, $384
+ JBE sealAVX2Tail384
+ CMPQ inl, $512
+ JBE sealAVX2Tail512
+
+	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+
+ VMOVDQA CC3, tmpStoreAVX2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA tmpStoreAVX2, CC3
+ VMOVDQA CC1, tmpStoreAVX2
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA tmpStoreAVX2, CC1
+
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
+ VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
+ VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
+ VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
+
+ VMOVDQA CC3, tmpStoreAVX2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA tmpStoreAVX2, CC3
+ VMOVDQA CC1, tmpStoreAVX2
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA tmpStoreAVX2, CC1
+
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
+ VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
+ VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
+ VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+
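+	// The rounds above stand in for the first pass of sealAVX2InternalLoop; with oup backed up 16 bytes and 9 iterations left, the 448 bytes of ciphertext written so far are hashed exactly once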
+ SUBQ $16, oup // Adjust the pointer
+ MOVQ $9, itr1
+ JMP sealAVX2InternalLoopStart
+
+sealAVX2MainLoop:
+ // Load state, increment counter blocks, store the incremented counters
+ VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+ VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+ MOVQ $10, itr1
+
+sealAVX2InternalLoop:
+ polyAdd(0*8(oup))
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ polyMulStage1_AVX2
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ polyMulStage2_AVX2
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ polyMulStage3_AVX2
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulReduceStage
+
+sealAVX2InternalLoopStart:
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ polyAdd(2*8(oup))
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ polyMulStage1_AVX2
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulStage2_AVX2
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ polyMulStage3_AVX2
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ polyMulReduceStage
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ polyAdd(4*8(oup))
+ LEAQ (6*8)(oup), oup
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulStage1_AVX2
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ polyMulStage2_AVX2
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ polyMulStage3_AVX2
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyMulReduceStage
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+ DECQ itr1
+ JNE sealAVX2InternalLoop
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+ VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+ VMOVDQA CC3, tmpStoreAVX2
+
+ // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
+ polyAdd(0*8(oup))
+ polyMulAVX2
+ LEAQ (4*8)(oup), oup
+ VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
+ VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
+ VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
+ VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+ VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+ VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+
+ // and here
+ polyAdd(-2*8(oup))
+ polyMulAVX2
+ VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+ VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+ VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+ VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+ VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
+ VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
+ LEAQ (32*16)(inp), inp
+ SUBQ $(32*16), inl
+ CMPQ inl, $512
+ JG sealAVX2MainLoop
+
+ // Tail can only hash 480 bytes
+ polyAdd(0*8(oup))
+ polyMulAVX2
+ polyAdd(2*8(oup))
+ polyMulAVX2
+ LEAQ 32(oup), oup
+
+ MOVQ $10, itr1
+ MOVQ $0, itr2
+ CMPQ inl, $128
+ JBE sealAVX2Tail128
+ CMPQ inl, $256
+ JBE sealAVX2Tail256
+ CMPQ inl, $384
+ JBE sealAVX2Tail384
+ JMP sealAVX2Tail512
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes
+seal192AVX2:
+ // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
+ VMOVDQA AA0, AA1
+ VMOVDQA BB0, BB1
+ VMOVDQA CC0, CC1
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA AA0, AA2
+ VMOVDQA BB0, BB2
+ VMOVDQA CC0, CC2
+ VMOVDQA DD0, DD2
+ VMOVDQA DD1, TT3
+ MOVQ $10, itr2
+
+sealAVX2192InnerCipherLoop:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+ DECQ itr2
+ JNE sealAVX2192InnerCipherLoop
+ VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
+ VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
+ VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
+ VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
+ VPERM2I128 $0x02, AA0, BB0, TT0
+
+ // Clamp and store poly key
+ VPAND ·polyClampMask<>(SB), TT0, TT0
+ VMOVDQA TT0, rsStoreAVX2
+
+ // Stream for up to 192 bytes
+ VPERM2I128 $0x13, AA0, BB0, AA0
+ VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x02, AA1, BB1, CC0
+ VPERM2I128 $0x02, CC1, DD1, DD0
+ VPERM2I128 $0x13, AA1, BB1, AA1
+ VPERM2I128 $0x13, CC1, DD1, BB1
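+	// AA0, BB0, CC0, DD0, AA1, BB1 now hold the keystream as consecutive 32-byte chunks; sealAVX2ShortSealLoop consumes AA0 and shifts the queue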
+
+sealAVX2ShortSeal:
+	// Hash AD
+ MOVQ ad_len+80(FP), itr2
+ CALL polyHashADInternal<>(SB)
+ XORQ itr1, itr1
+
+sealAVX2SealHash:
+ // itr1 holds the number of bytes encrypted but not yet hashed
+ CMPQ itr1, $16
+ JB sealAVX2ShortSealLoop
+ polyAdd(0(oup))
+ polyMul
+ SUBQ $16, itr1
+ ADDQ $16, oup
+ JMP sealAVX2SealHash
+
+sealAVX2ShortSealLoop:
+ CMPQ inl, $32
+ JB sealAVX2ShortTail32
+ SUBQ $32, inl
+
+ // Load for encryption
+ VPXOR (inp), AA0, AA0
+ VMOVDQU AA0, (oup)
+ LEAQ (1*32)(inp), inp
+
+ // Now can hash
+ polyAdd(0*8(oup))
+ polyMulAVX2
+ polyAdd(2*8(oup))
+ polyMulAVX2
+ LEAQ (1*32)(oup), oup
+
+ // Shift stream left
+ VMOVDQA BB0, AA0
+ VMOVDQA CC0, BB0
+ VMOVDQA DD0, CC0
+ VMOVDQA AA1, DD0
+ VMOVDQA BB1, AA1
+ VMOVDQA CC1, BB1
+ VMOVDQA DD1, CC1
+ VMOVDQA AA2, DD1
+ VMOVDQA BB2, AA2
+ JMP sealAVX2ShortSealLoop
+
+sealAVX2ShortTail32:
+ CMPQ inl, $16
+ VMOVDQA A0, A1
+ JB sealAVX2ShortDone
+
+ SUBQ $16, inl
+
+ // Load for encryption
+ VPXOR (inp), A0, T0
+ VMOVDQU T0, (oup)
+ LEAQ (1*16)(inp), inp
+
+ // Hash
+ polyAdd(0*8(oup))
+ polyMulAVX2
+ LEAQ (1*16)(oup), oup
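+	// Bring the high 16 bytes of the stream chunk down into A0/A1 for the final partial block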
+ VPERM2I128 $0x11, AA0, AA0, AA0
+ VMOVDQA A0, A1
+
+sealAVX2ShortDone:
+ VZEROUPPER
+ JMP sealSSETail
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes
+seal320AVX2:
+ // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
+ VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+ VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
+ MOVQ $10, itr2
+
+sealAVX2320InnerCipherLoop:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+ DECQ itr2
+ JNE sealAVX2320InnerCipherLoop
+
+ VMOVDQA ·chacha20Constants<>(SB), TT0
+ VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
+ VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
+ VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
+ VMOVDQA ·avx2IncMask<>(SB), TT0
+ VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
+ VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
+ VPADDD TT3, DD2, DD2
+
+ // Clamp and store poly key
+ VPERM2I128 $0x02, AA0, BB0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
+ VMOVDQA TT0, rsStoreAVX2
+
+ // Stream for up to 320 bytes
+ VPERM2I128 $0x13, AA0, BB0, AA0
+ VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x02, AA1, BB1, CC0
+ VPERM2I128 $0x02, CC1, DD1, DD0
+ VPERM2I128 $0x13, AA1, BB1, AA1
+ VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x02, AA2, BB2, CC1
+ VPERM2I128 $0x02, CC2, DD2, DD1
+ VPERM2I128 $0x13, AA2, BB2, AA2
+ VPERM2I128 $0x13, CC2, DD2, BB2
+ JMP sealAVX2ShortSeal
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of plaintext
+sealAVX2Tail128:
+	// Need to encrypt up to 128 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+ VMOVDQA ·chacha20Constants<>(SB), AA0
+ VMOVDQA state1StoreAVX2, BB0
+ VMOVDQA state2StoreAVX2, CC0
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VMOVDQA DD0, DD1
+
+sealAVX2Tail128LoopA:
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealAVX2Tail128LoopB:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ polyAdd(0(oup))
+ polyMul
+ VPALIGNR $4, BB0, BB0, BB0
+ VPALIGNR $8, CC0, CC0, CC0
+ VPALIGNR $12, DD0, DD0, DD0
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ polyAdd(16(oup))
+ polyMul
+ LEAQ 32(oup), oup
+ VPALIGNR $12, BB0, BB0, BB0
+ VPALIGNR $8, CC0, CC0, CC0
+ VPALIGNR $4, DD0, DD0, DD0
+ DECQ itr1
+ JG sealAVX2Tail128LoopA
+ DECQ itr2
+ JGE sealAVX2Tail128LoopB
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA1
+ VPADDD state1StoreAVX2, BB0, BB1
+ VPADDD state2StoreAVX2, CC0, CC1
+ VPADDD DD1, DD0, DD1
+
+ VPERM2I128 $0x02, AA1, BB1, AA0
+ VPERM2I128 $0x02, CC1, DD1, BB0
+ VPERM2I128 $0x13, AA1, BB1, CC0
+ VPERM2I128 $0x13, CC1, DD1, DD0
+ JMP sealAVX2ShortSealLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of plaintext
+sealAVX2Tail256:
+	// Need to encrypt up to 256 bytes - prepare four blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA DD0, TT1
+ VMOVDQA DD1, TT2
+
+sealAVX2Tail256LoopA:
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealAVX2Tail256LoopB:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ polyAdd(0(oup))
+ polyMul
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ polyAdd(16(oup))
+ polyMul
+ LEAQ 32(oup), oup
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+ DECQ itr1
+ JG sealAVX2Tail256LoopA
+ DECQ itr2
+ JGE sealAVX2Tail256LoopB
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
+ VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
+ VPERM2I128 $0x02, AA0, BB0, TT0
+ VPERM2I128 $0x02, CC0, DD0, TT1
+ VPERM2I128 $0x13, AA0, BB0, TT2
+ VPERM2I128 $0x13, CC0, DD0, TT3
+ VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+ VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+ MOVQ $128, itr1
+ LEAQ 128(inp), inp
+ SUBQ $128, inl
+ VPERM2I128 $0x02, AA1, BB1, AA0
+ VPERM2I128 $0x02, CC1, DD1, BB0
+ VPERM2I128 $0x13, AA1, BB1, CC0
+ VPERM2I128 $0x13, CC1, DD1, DD0
+
+ JMP sealAVX2SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of plaintext
+sealAVX2Tail384:
+	// Need to encrypt up to 384 bytes - prepare six blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+ VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
+
+sealAVX2Tail384LoopA:
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealAVX2Tail384LoopB:
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ polyAdd(0(oup))
+ polyMul
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ polyAdd(16(oup))
+ polyMul
+ LEAQ 32(oup), oup
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+ DECQ itr1
+ JG sealAVX2Tail384LoopA
+ DECQ itr2
+ JGE sealAVX2Tail384LoopB
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
+ VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
+ VPERM2I128 $0x02, AA0, BB0, TT0
+ VPERM2I128 $0x02, CC0, DD0, TT1
+ VPERM2I128 $0x13, AA0, BB0, TT2
+ VPERM2I128 $0x13, CC0, DD0, TT3
+ VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+ VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+ VPERM2I128 $0x02, AA1, BB1, TT0
+ VPERM2I128 $0x02, CC1, DD1, TT1
+ VPERM2I128 $0x13, AA1, BB1, TT2
+ VPERM2I128 $0x13, CC1, DD1, TT3
+ VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
+ VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
+ MOVQ $256, itr1
+ LEAQ 256(inp), inp
+ SUBQ $256, inl
+ VPERM2I128 $0x02, AA2, BB2, AA0
+ VPERM2I128 $0x02, CC2, DD2, BB0
+ VPERM2I128 $0x13, AA2, BB2, CC0
+ VPERM2I128 $0x13, CC2, DD2, DD0
+
+ JMP sealAVX2SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of plaintext
+sealAVX2Tail512:
+	// Need to encrypt up to 512 bytes - prepare eight blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+ VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+ VMOVDQA ctr3StoreAVX2, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+
+sealAVX2Tail512LoopA:
+ polyAdd(0(oup))
+ polyMul
+ LEAQ 16(oup), oup
+
+sealAVX2Tail512LoopB:
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ polyAdd(0*8(oup))
+ polyMulAVX2
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ polyAdd(2*8(oup))
+ polyMulAVX2
+ LEAQ (4*8)(oup), oup
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+ VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+ VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+ VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+ VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+ VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+ VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+ VMOVDQA tmpStoreAVX2, CC3
+ VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+ VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+ VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+
+ DECQ itr1
+ JG sealAVX2Tail512LoopA
+ DECQ itr2
+ JGE sealAVX2Tail512LoopB
+
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+ VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+ VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+ VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+ VMOVDQA CC3, tmpStoreAVX2
+ VPERM2I128 $0x02, AA0, BB0, CC3
+ VPXOR (0*32)(inp), CC3, CC3
+ VMOVDQU CC3, (0*32)(oup)
+ VPERM2I128 $0x02, CC0, DD0, CC3
+ VPXOR (1*32)(inp), CC3, CC3
+ VMOVDQU CC3, (1*32)(oup)
+ VPERM2I128 $0x13, AA0, BB0, CC3
+ VPXOR (2*32)(inp), CC3, CC3
+ VMOVDQU CC3, (2*32)(oup)
+ VPERM2I128 $0x13, CC0, DD0, CC3
+ VPXOR (3*32)(inp), CC3, CC3
+ VMOVDQU CC3, (3*32)(oup)
+
+ VPERM2I128 $0x02, AA1, BB1, AA0
+ VPERM2I128 $0x02, CC1, DD1, BB0
+ VPERM2I128 $0x13, AA1, BB1, CC0
+ VPERM2I128 $0x13, CC1, DD1, DD0
+ VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+ VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+
+ VPERM2I128 $0x02, AA2, BB2, AA0
+ VPERM2I128 $0x02, CC2, DD2, BB0
+ VPERM2I128 $0x13, AA2, BB2, CC0
+ VPERM2I128 $0x13, CC2, DD2, DD0
+ VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+ VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+
+ MOVQ $384, itr1
+ LEAQ 384(inp), inp
+ SUBQ $384, inl
+ VPERM2I128 $0x02, AA3, BB3, AA0
+ VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
+ VPERM2I128 $0x13, AA3, BB3, CC0
+ VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+
+ JMP sealAVX2SealHash
+
+// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), NOSPLIT, $0-24
+ MOVL eaxArg+0(FP), AX
+ MOVL ecxArg+4(FP), CX
+ CPUID
+ MOVL AX, eax+8(FP)
+ MOVL BX, ebx+12(FP)
+ MOVL CX, ecx+16(FP)
+ MOVL DX, edx+20(FP)
+ RET
+
+// func xgetbv() (eax, edx uint32)
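+// Reads XCR0 (with ECX=0), so the caller can check that the OS preserves AVX register state.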
+TEXT ·xgetbv(SB),NOSPLIT,$0-8
+ MOVL $0, CX
+ XGETBV
+ MOVL AX, eax+0(FP)
+ MOVL DX, edx+4(FP)
+ RET