aboutsummaryrefslogtreecommitdiffhomepage
path: root/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/crypto/argon2/blamka_amd64.s')
-rw-r--r--vendor/golang.org/x/crypto/argon2/blamka_amd64.s252
1 files changed, 252 insertions, 0 deletions
diff --git a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
new file mode 100644
index 0000000..8a83f7c
--- /dev/null
+++ b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
@@ -0,0 +1,252 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!gccgo,!appengine
+
+#include "textflag.h"
+
+DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
+DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
+
+DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
+DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
+
+#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
+ MOVO v4, t1; \
+ MOVO v5, v4; \
+ MOVO t1, v5; \
+ MOVO v6, t1; \
+ PUNPCKLQDQ v6, t2; \
+ PUNPCKHQDQ v7, v6; \
+ PUNPCKHQDQ t2, v6; \
+ PUNPCKLQDQ v7, t2; \
+ MOVO t1, v7; \
+ MOVO v2, t1; \
+ PUNPCKHQDQ t2, v7; \
+ PUNPCKLQDQ v3, t2; \
+ PUNPCKHQDQ t2, v2; \
+ PUNPCKLQDQ t1, t2; \
+ PUNPCKHQDQ t2, v3
+
+#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
+ MOVO v4, t1; \
+ MOVO v5, v4; \
+ MOVO t1, v5; \
+ MOVO v2, t1; \
+ PUNPCKLQDQ v2, t2; \
+ PUNPCKHQDQ v3, v2; \
+ PUNPCKHQDQ t2, v2; \
+ PUNPCKLQDQ v3, t2; \
+ MOVO t1, v3; \
+ MOVO v6, t1; \
+ PUNPCKHQDQ t2, v3; \
+ PUNPCKLQDQ v7, t2; \
+ PUNPCKHQDQ t2, v6; \
+ PUNPCKLQDQ t1, t2; \
+ PUNPCKHQDQ t2, v7
+
+#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
+ MOVO v0, t0; \
+ PMULULQ v2, t0; \
+ PADDQ v2, v0; \
+ PADDQ t0, v0; \
+ PADDQ t0, v0; \
+ PXOR v0, v6; \
+ PSHUFD $0xB1, v6, v6; \
+ MOVO v4, t0; \
+ PMULULQ v6, t0; \
+ PADDQ v6, v4; \
+ PADDQ t0, v4; \
+ PADDQ t0, v4; \
+ PXOR v4, v2; \
+ PSHUFB c40, v2; \
+ MOVO v0, t0; \
+ PMULULQ v2, t0; \
+ PADDQ v2, v0; \
+ PADDQ t0, v0; \
+ PADDQ t0, v0; \
+ PXOR v0, v6; \
+ PSHUFB c48, v6; \
+ MOVO v4, t0; \
+ PMULULQ v6, t0; \
+ PADDQ v6, v4; \
+ PADDQ t0, v4; \
+ PADDQ t0, v4; \
+ PXOR v4, v2; \
+ MOVO v2, t0; \
+ PADDQ v2, t0; \
+ PSRLQ $63, v2; \
+ PXOR t0, v2; \
+ MOVO v1, t0; \
+ PMULULQ v3, t0; \
+ PADDQ v3, v1; \
+ PADDQ t0, v1; \
+ PADDQ t0, v1; \
+ PXOR v1, v7; \
+ PSHUFD $0xB1, v7, v7; \
+ MOVO v5, t0; \
+ PMULULQ v7, t0; \
+ PADDQ v7, v5; \
+ PADDQ t0, v5; \
+ PADDQ t0, v5; \
+ PXOR v5, v3; \
+ PSHUFB c40, v3; \
+ MOVO v1, t0; \
+ PMULULQ v3, t0; \
+ PADDQ v3, v1; \
+ PADDQ t0, v1; \
+ PADDQ t0, v1; \
+ PXOR v1, v7; \
+ PSHUFB c48, v7; \
+ MOVO v5, t0; \
+ PMULULQ v7, t0; \
+ PADDQ v7, v5; \
+ PADDQ t0, v5; \
+ PADDQ t0, v5; \
+ PXOR v5, v3; \
+ MOVO v3, t0; \
+ PADDQ v3, t0; \
+ PSRLQ $63, v3; \
+ PXOR t0, v3
+
+#define LOAD_MSG_0(block, off) \
+ MOVOU 8*(off+0)(block), X0; \
+ MOVOU 8*(off+2)(block), X1; \
+ MOVOU 8*(off+4)(block), X2; \
+ MOVOU 8*(off+6)(block), X3; \
+ MOVOU 8*(off+8)(block), X4; \
+ MOVOU 8*(off+10)(block), X5; \
+ MOVOU 8*(off+12)(block), X6; \
+ MOVOU 8*(off+14)(block), X7
+
+#define STORE_MSG_0(block, off) \
+ MOVOU X0, 8*(off+0)(block); \
+ MOVOU X1, 8*(off+2)(block); \
+ MOVOU X2, 8*(off+4)(block); \
+ MOVOU X3, 8*(off+6)(block); \
+ MOVOU X4, 8*(off+8)(block); \
+ MOVOU X5, 8*(off+10)(block); \
+ MOVOU X6, 8*(off+12)(block); \
+ MOVOU X7, 8*(off+14)(block)
+
+#define LOAD_MSG_1(block, off) \
+ MOVOU 8*off+0*8(block), X0; \
+ MOVOU 8*off+16*8(block), X1; \
+ MOVOU 8*off+32*8(block), X2; \
+ MOVOU 8*off+48*8(block), X3; \
+ MOVOU 8*off+64*8(block), X4; \
+ MOVOU 8*off+80*8(block), X5; \
+ MOVOU 8*off+96*8(block), X6; \
+ MOVOU 8*off+112*8(block), X7
+
+#define STORE_MSG_1(block, off) \
+ MOVOU X0, 8*off+0*8(block); \
+ MOVOU X1, 8*off+16*8(block); \
+ MOVOU X2, 8*off+32*8(block); \
+ MOVOU X3, 8*off+48*8(block); \
+ MOVOU X4, 8*off+64*8(block); \
+ MOVOU X5, 8*off+80*8(block); \
+ MOVOU X6, 8*off+96*8(block); \
+ MOVOU X7, 8*off+112*8(block)
+
+#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
+ LOAD_MSG_0(block, off); \
+ HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+ SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
+ HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+ SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
+ STORE_MSG_0(block, off)
+
+#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
+ LOAD_MSG_1(block, off); \
+ HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+ SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
+ HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+ SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
+ STORE_MSG_1(block, off)
+
+// func blamkaSSE4(b *block)
+TEXT ·blamkaSSE4(SB), 4, $0-8
+ MOVQ b+0(FP), AX
+
+ MOVOU ·c40<>(SB), X10
+ MOVOU ·c48<>(SB), X11
+
+ BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
+ BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
+
+ BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
+ BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
+ RET
+
+// func mixBlocksSSE2(out, a, b, c *block)
+TEXT ·mixBlocksSSE2(SB), 4, $0-32
+ MOVQ out+0(FP), DX
+ MOVQ a+8(FP), AX
+ MOVQ b+16(FP), BX
+ MOVQ a+24(FP), CX
+ MOVQ $128, BP
+
+loop:
+ MOVOU 0(AX), X0
+ MOVOU 0(BX), X1
+ MOVOU 0(CX), X2
+ PXOR X1, X0
+ PXOR X2, X0
+ MOVOU X0, 0(DX)
+ ADDQ $16, AX
+ ADDQ $16, BX
+ ADDQ $16, CX
+ ADDQ $16, DX
+ SUBQ $2, BP
+ JA loop
+ RET
+
+// func xorBlocksSSE2(out, a, b, c *block)
+TEXT ·xorBlocksSSE2(SB), 4, $0-32
+ MOVQ out+0(FP), DX
+ MOVQ a+8(FP), AX
+ MOVQ b+16(FP), BX
+ MOVQ a+24(FP), CX
+ MOVQ $128, BP
+
+loop:
+ MOVOU 0(AX), X0
+ MOVOU 0(BX), X1
+ MOVOU 0(CX), X2
+ MOVOU 0(DX), X3
+ PXOR X1, X0
+ PXOR X2, X0
+ PXOR X3, X0
+ MOVOU X0, 0(DX)
+ ADDQ $16, AX
+ ADDQ $16, BX
+ ADDQ $16, CX
+ ADDQ $16, DX
+ SUBQ $2, BP
+ JA loop
+ RET
+
+// func supportsSSE4() bool
+TEXT ·supportsSSE4(SB), 4, $0-1
+ MOVL $1, AX
+ CPUID
+ SHRL $19, CX // Bit 19 indicates SSE4 support
+ ANDL $1, CX // CX != 0 if support SSE4
+ MOVB CX, ret+0(FP)
+ RET