/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd.
 */

/*
 * Syntax: GNU as, AArch64, run through the C preprocessor.
 *
 * ENTRY()/ENDPROC() come from <linux/linkage.h>; CPU_LE() and mov_q come
 * from <asm/assembler.h>.
 *
 * NOTE(review): v8-v15 are callee-saved under AAPCS64 (low 64 bits) and are
 * used below without being preserved. Presumably every caller brackets
 * these routines with a kernel-mode NEON save/restore - confirm.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Register aliases shared by every code path.
	 * XH and IN1 name the same register (v7) and must not be live at the
	 * same time.
	 */
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	/*
	 * Aliases used by the 8-bit polynomial multiply (p8) code path.
	 * They occupy v8-v27, which overlaps the p64/GCM aliases further
	 * down, so the two sets cannot be in use together.
	 */
	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	/* Aliases used by the 64-bit polynomial multiply (p64) and GCM paths. */
	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	/*
	 * __pmull_p64 / __pmull2_p64 - polynomial multiply of the low
	 * (pmull) or high (pmull2) 64-bit halves of \rn and \rm, giving a
	 * 128-bit result in \rd.
	 */
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	/*
	 * __pmull_p8 / __pmull2_p8 - drop-in counterparts of the two macros
	 * above that only use the 8-bit form of pmull.
	 *
	 * They place the operand \ad rotated by 1, 2 and 3 bytes in t3, t5
	 * and t7 (ext for the low half, tbl with perm1-3 for the high half)
	 * and then dispatch on the name of the second operand to
	 * __pmull_p8_SHASH, __pmull_p8_SHASH2 or __pmull2_p8_SHASH, which
	 * supply the matching precomputed rotations of that operand.
	 *
	 * \bd must therefore be the literal name SHASH or SHASH2 (SHASH only
	 * for the pmull2 flavour), and __pmull_pre_p8 must have run first.
	 */
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	/*
	 * __pmull_p8_tail - shared body of the 8-bit multiply emulation.
	 *
	 *   \rq       destination register (name only, no arrangement)
	 *   \ad       operand A, including its arrangement (.8b or .16b)
	 *   \bd       operand B, including its arrangement
	 *   \nb       arrangement applied to t3/t5/t7 and \b1-\b4 (8b or 16b)
	 *   \t        mnemonic suffix: empty for pmull, 2 for pmull2
	 *   \b1-\b4   registers holding B rotated by 1-4 bytes
	 *
	 * On entry t3, t5 and t7 hold A rotated by 1, 2 and 3 bytes.
	 * Uses the k00_16 and k32_48 masks set up by __pmull_pre_p8.
	 * Clobbers t3-t9.
	 */
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	/*
	 * __pmull_pre_p64 - one-time setup for the p64 flavour of
	 * __pmull_ghash.
	 *
	 * In:  x3 = key structure, SHASH already loaded from offset 0.
	 * Out: HH, HH3, HH4 = the 48 bytes found at x3 + 16 (presumably
	 *      higher powers of the hash key - confirm against the code
	 *      that fills the structure);
	 *      SHASH2 = { SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi };
	 *      HH34   = { HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi };
	 *      MASK   = 0xc200000000000000 in both 64-bit lanes.
	 * Clobbers x8 and T1.
	 */
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	/*
	 * __pmull_pre_p8 - one-time setup for the p8 flavour of
	 * __pmull_ghash.
	 *
	 * In:  SHASH already loaded.
	 * Out: SHASH2 = SHASH.lo ^ SHASH.hi in both 64-bit lanes;
	 *      k00_16, k32_48 = masks used by __pmull_p8_tail;
	 *      perm1-perm3 = tbl index vectors that rotate each 64-bit
	 *      half of a vector by 1, 2 and 3 bytes;
	 *      sh1-sh4 = SHASH with each half rotated by 1-4 bytes;
	 *      ss1-ss4 = low half of SHASH2 rotated by 1-4 bytes.
	 * Clobbers x5 and T1.
	 */
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	// Expects XL, XM, XH, T1 and MASK as set up by the caller and
	// leaves its result split across XL and T2; the caller completes
	// it with "eor T2, T2, XH" followed by "eor XL, XL, T2".
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	// Same calling contract as __pmull_reduce_p64, built from shifts
	// and XORs only; MASK is not used.
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	/*
	 * __pmull_ghash - body of pmull_ghash_update_{p64,p8}.
	 *
	 *   \pn   p64 or p8: selects __pmull_pre_\pn, __pmull_\pn,
	 *         __pmull2_\pn and __pmull_reduce_\pn
	 *
	 * Register usage, as read by the code below:
	 *   w0   number of 16-byte blocks at x2
	 *   x1   dg[]: digest, loaded into XL on entry, stored on exit
	 *   x2   src, advanced as blocks are consumed
	 *   x3   key structure (SHASH at offset 0)
	 *   x4   optional head block, processed before src; NULL if none
	 *
	 * The block count is only tested after a block has been processed,
	 * so with x4 == NULL the caller must pass w0 != 0.
	 *
	 * The p64 flavour handles blocks one at a time until the remaining
	 * count is a multiple of four and then switches to the 4-way loop
	 * at label 1.
	 */
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

	/*
	 * 4-way loop. The four input blocks land in XM3, XH3, TT3 and TT4
	 * (in memory order). The newest block (TT4) is multiplied by SHASH,
	 * TT3 by HH, the second block by HH3 and the oldest block, after
	 * being folded into the running digest XL, by HH4. All partial
	 * products are accumulated before a single reduction.
	 */
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

	/* single block path, shared by p8 and p64 */
2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 *
	 * Two entry points share this prototype: the _p64 variant uses the
	 * 64-bit polynomial multiply, the _p8 variant only the 8-bit one.
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	/*
	 * Aliases for the GCM code: two key stream blocks and two input
	 * blocks. They share v12-v15 with XM3/XH3/TT3/TT4 (and t5-t8).
	 */
	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	/*
	 * load_round_keys - load the AES round keys so that the last eleven
	 * always end up in v21-v31.
	 *
	 *   \rounds   round count (w register)
	 *   \rk       pointer to the first round key (x register); it is
	 *             advanced by the post-indexed loads
	 *
	 * Fewer than 12 rounds loads v21-v31, exactly 12 loads v19-v31 and
	 * anything larger loads v17-v31. Clobbers the condition flags.
	 */
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	/* enc_round - one full AES round (aese + aesmc) of \state with \key */
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	/*
	 * enc_block - encrypt \state in place using the round keys laid out
	 * by load_round_keys. \rounds selects how many of the leading rounds
	 * (v17-v20) are executed. The final round is aese with v30 followed
	 * by an XOR with v31, without aesmc. Clobbers the condition flags.
	 */
	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

	/*
	 * pmull_gcm_do_crypt - combined AES-CTR and GHASH, two blocks per
	 * loop iteration.
	 *
	 *   \enc   1 builds the encrypt flavour, 0 the decrypt flavour
	 *
	 * Register usage, as read by the code below:
	 *   w0     number of blocks; reduced by 2 per iteration and only
	 *          compared against zero afterwards, so it must be even
	 *          and non-zero
	 *   x1     dg[]: digest, loaded into XL on entry, stored on exit
	 *   x2     dst, written 32 bytes per iteration
	 *   x3     src, read 32 bytes per iteration
	 *   x4     key structure: SHASH at offset 0, HH at offset 16
	 *   x5     ctr[]: bytes 0-7 are copied into both key stream blocks
	 *          unchanged, bytes 8-15 are a big-endian counter that is
	 *          advanced by 2 per iteration and written back on exit
	 *   x6     round keys to load via load_round_keys, or NULL, in
	 *          which case v17-v31 are used as they are
	 *   w7     number of AES rounds
	 *   [sp]   (\enc == 1 only) ks[]: 32-byte key stream buffer
	 *
	 * Encrypt: the input is XORed with the key stream held in KS0/KS1
	 * (initially read from ks[]) before it is hashed and stored, while
	 * the key stream for the following pair of blocks is generated; the
	 * last generated pair is written back to ks[] on exit.
	 *
	 * Decrypt: the input is hashed as it is, and XORed with the key
	 * stream generated in the same iteration before it is stored.
	 *
	 * The AES rounds and the GHASH arithmetic are interleaved
	 * instruction by instruction (presumably for scheduling); keep the
	 * order intact when editing.
	 *
	 * Clobbers x8-x11 and the condition flags, and advances x2-x4.
	 */
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]			// ks[] is passed on the stack
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f				// round keys supplied?

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

	/* extra leading rounds; flags still hold "cmp w7, #12" from above */
2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  rk[], int rounds, u8 ks[])
	 *
	 * rk is the seventh argument (x6), rounds the eighth (w7) and ks
	 * the ninth, read from the stack. The element type of rk[] is not
	 * visible in this file - see the C declaration.
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  rk[], int rounds)
	 *
	 * rk is the seventh argument (x6) and rounds the eighth (w7). The
	 * element type of rk[] is not visible in this file - see the C
	 * declaration.
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 *
	 * Encrypts the single 16-byte block at src into dst. When rk is
	 * non-NULL the round keys are loaded into v17-v31 first; they are
	 * left there on return. When rk is NULL, v17-v31 are used as they
	 * are.
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)