summaryrefslogtreecommitdiff
path: root/arch/x86/crypto/aesni-intel_asm.S
diff options
context:
space:
mode:
authorHuang Ying <ying.huang@intel.com>2009-01-18 16:28:34 +1100
committerHerbert Xu <herbert@gondor.apana.org.au>2009-02-18 16:48:06 +0800
commit54b6a1bd5364aca95cd6ffae00f2b64c6511122c (patch)
treeb1e288b009df7fefa92ce001d8709b04dd20663f /arch/x86/crypto/aesni-intel_asm.S
parent1cac2cbc76b9f3fce0d4ccc374e724e7f2533a47 (diff)
crypto: aes-ni - Add support to Intel AES-NI instructions for x86_64 platform
Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) instructions that are going to be introduced in the next generation of Intel processor, as of 2009. These instructions enable fast and secure data encryption and decryption, using the Advanced Encryption Standard (AES), defined by FIPS Publication number 197. The architecture introduces six instructions that offer full hardware support for AES. Four of them support high performance data encryption and decryption, and the other two instructions support the AES key expansion procedure. The white paper can be downloaded from: http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf AES may be used in soft_irq context, but MMX/SSE context can not be touched safely in soft_irq context. So in_interrupt() is checked, if in IRQ or soft_irq context, the general x86_64 implementation are used instead. Signed-off-by: Huang Ying <ying.huang@intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S896
1 files changed, 896 insertions, 0 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
new file mode 100644
index 000000000000..caba99601703
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -0,0 +1,896 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Kahraman Akdemir
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.text
+
+#define STATE1 %xmm0
+#define STATE2 %xmm4
+#define STATE3 %xmm5
+#define STATE4 %xmm6
+#define STATE STATE1
+#define IN1 %xmm1
+#define IN2 %xmm7
+#define IN3 %xmm8
+#define IN4 %xmm9
+#define IN IN1
+#define KEY %xmm2
+#define IV %xmm3
+
+#define KEYP %rdi
+#define OUTP %rsi
+#define INP %rdx
+#define LEN %rcx
+#define IVP %r8
+#define KLEN %r9d
+#define T1 %r10
+#define TKEYP T1
+#define T2 %r11
+
+_key_expansion_128:
+_key_expansion_256a:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+
+_key_expansion_192a:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ movaps %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movaps %xmm6, (%rcx)
+ shufps $0b01001110, %xmm2, %xmm1
+ movaps %xmm1, 16(%rcx)
+ add $0x20, %rcx
+ ret
+
+_key_expansion_192b:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+
+_key_expansion_256b:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+ ret
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+ movups (%rsi), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (%rdi)
+ lea 0x10(%rdi), %rcx # key addr
+ movl %edx, 480(%rdi)
+ pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
+ cmp $24, %dl
+ jb .Lenc_key128
+ je .Lenc_key192
+ movups 0x10(%rsi), %xmm2 # other user key
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+ # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ call _key_expansion_256a
+ # aeskeygenassist $0x1, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ call _key_expansion_256b
+ # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ call _key_expansion_256a
+ # aeskeygenassist $0x2, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ call _key_expansion_256b
+ # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ call _key_expansion_256a
+ # aeskeygenassist $0x4, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ call _key_expansion_256b
+ # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ call _key_expansion_256a
+ # aeskeygenassist $0x8, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ call _key_expansion_256b
+ # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ call _key_expansion_256a
+ # aeskeygenassist $0x10, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ call _key_expansion_256b
+ # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ call _key_expansion_256a
+ # aeskeygenassist $0x20, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ call _key_expansion_256b
+ # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ call _key_expansion_256a
+ jmp .Ldec_key
+.Lenc_key192:
+ movq 0x10(%rsi), %xmm2 # other user key
+ # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ call _key_expansion_192a
+ # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ call _key_expansion_192b
+ # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ call _key_expansion_192a
+ # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ call _key_expansion_192b
+ # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ call _key_expansion_192a
+ # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ call _key_expansion_192b
+ # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ call _key_expansion_192a
+ # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
+ call _key_expansion_192b
+ jmp .Ldec_key
+.Lenc_key128:
+ # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ call _key_expansion_128
+ # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ call _key_expansion_128
+ # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ call _key_expansion_128
+ # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ call _key_expansion_128
+ # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ call _key_expansion_128
+ # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ call _key_expansion_128
+ # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
+ call _key_expansion_128
+ # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
+ call _key_expansion_128
+ # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
+ call _key_expansion_128
+ # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
+ call _key_expansion_128
+.Ldec_key:
+ sub $0x10, %rcx
+ movaps (%rdi), %xmm0
+ movaps (%rcx), %xmm1
+ movaps %xmm0, 240(%rcx)
+ movaps %xmm1, 240(%rdi)
+ add $0x10, %rdi
+ lea 240-16(%rcx), %rsi
+.align 4
+.Ldec_key_loop:
+ movaps (%rdi), %xmm0
+ # aesimc %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
+ movaps %xmm1, (%rsi)
+ add $0x10, %rdi
+ sub $0x10, %rsi
+ cmp %rcx, %rdi
+ jb .Ldec_key_loop
+ xor %rax, %rax
+ ret
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+ movl 480(KEYP), KLEN # key length
+ movups (INP), STATE # input
+ call _aesni_enc1
+ movups STATE, (OUTP) # output
+ ret
+
+/*
+ * _aesni_enc1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_enc1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Lenc128
+ lea 0x20(TKEYP), TKEYP
+ je .Lenc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps -0x50(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+.align 4
+.Lenc192:
+ movaps -0x40(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps -0x30(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+.align 4
+.Lenc128:
+ movaps -0x20(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps -0x10(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps (TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x10(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x20(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x30(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x40(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x50(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x60(TKEYP), KEY
+ # aesenc KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ movaps 0x70(TKEYP), KEY
+ # aesenclast KEY, STATE # last round
+ .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
+ ret
+
+/*
+ * _aesni_enc4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_enc4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4enc128
+ lea 0x20(TKEYP), TKEYP
+ je .L4enc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps -0x50(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+#.align 4
+.L4enc192:
+ movaps -0x40(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps -0x30(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+#.align 4
+.L4enc128:
+ movaps -0x20(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps -0x10(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps (TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x10(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x20(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x30(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x40(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x50(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x60(TKEYP), KEY
+ # aesenc KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ # aesenc KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+ # aesenc KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+ # aesenc KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ movaps 0x70(TKEYP), KEY
+ # aesenclast KEY, STATE1 # last round
+ .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
+ # aesenclast KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2
+ # aesenclast KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
+ # aesenclast KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
+ ret
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+ mov 480(KEYP), KLEN # key length
+ add $240, KEYP
+ movups (INP), STATE # input
+ call _aesni_dec1
+ movups STATE, (OUTP) #output
+ ret
+
+/*
+ * _aesni_dec1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_dec1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Ldec128
+ lea 0x20(TKEYP), TKEYP
+ je .Ldec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps -0x50(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+.align 4
+.Ldec192:
+ movaps -0x40(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps -0x30(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+.align 4
+.Ldec128:
+ movaps -0x20(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps -0x10(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps (TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x10(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x20(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x30(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x40(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x50(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x60(TKEYP), KEY
+ # aesdec KEY, STATE
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ movaps 0x70(TKEYP), KEY
+ # aesdeclast KEY, STATE # last round
+ .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
+ ret
+
+/*
+ * _aesni_dec4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_dec4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4dec128
+ lea 0x20(TKEYP), TKEYP
+ je .L4dec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps -0x50(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+.align 4
+.L4dec192:
+ movaps -0x40(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps -0x30(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+.align 4
+.L4dec128:
+ movaps -0x20(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps -0x10(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps (TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x10(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x20(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x30(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x40(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x50(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x60(TKEYP), KEY
+ # aesdec KEY, STATE1
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ # aesdec KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+ # aesdec KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xea
+ # aesdec KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ movaps 0x70(TKEYP), KEY
+ # aesdeclast KEY, STATE1 # last round
+ .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
+ # aesdeclast KEY, STATE2
+ .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2
+ # aesdeclast KEY, STATE3
+ .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
+ # aesdeclast KEY, STATE4
+ .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
+ ret
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+ test LEN, LEN # check length
+ jz .Lecb_enc_ret
+ mov 480(KEYP), KLEN
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+ cmp $64, LEN
+ jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_enc4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_enc_loop4
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+ movups (INP), STATE1
+ call _aesni_enc1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+ ret
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+ test LEN, LEN
+ jz .Lecb_dec_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+ cmp $64, LEN
+ jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_dec4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_dec_loop4
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+ movups (INP), STATE1
+ call _aesni_dec1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+ ret
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+ cmp $16, LEN
+ jb .Lcbc_enc_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE # load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+ movups (INP), IN # load input
+ pxor IN, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP) # store output
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_enc_loop
+ movups STATE, (IVP)
+.Lcbc_enc_ret:
+ ret
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+ cmp $16, LEN
+ jb .Lcbc_dec_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ cmp $64, LEN
+ jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+ movups (INP), IN1
+ movaps IN1, STATE1
+ movups 0x10(INP), IN2
+ movaps IN2, STATE2
+ movups 0x20(INP), IN3
+ movaps IN3, STATE3
+ movups 0x30(INP), IN4
+ movaps IN4, STATE4
+ call _aesni_dec4
+ pxor IV, STATE1
+ pxor IN1, STATE2
+ pxor IN2, STATE3
+ pxor IN3, STATE4
+ movaps IN4, IV
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lcbc_dec_loop4
+ cmp $16, LEN
+ jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+ movups (INP), IN
+ movaps IN, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ movups STATE, (OUTP)
+ movaps IN, IV
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_dec_loop1
+ movups IV, (IVP)
+.Lcbc_dec_ret:
+ ret