blob: 6af9b1a6dbd6c0261744d077a9b12d72a04bf2a0 [file] [log] [blame]
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* Copyright (c) 2015, Linaro Limited
*/
/*
* - AES cipher for ARMv8 with Crypto Extensions
* - Chaining mode wrappers for AES
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
#define ENTRY(func) \
.global func ; \
.type func , %function ; \
func :
#define ENDPROC(func) \
.size func , .-func
.arch armv8-a+crypto
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
ld1 {v17.16b-v18.16b}, [\rk], #32
1111: ld1 {v19.16b-v20.16b}, [\rk], #32
2222: ld1 {v21.16b-v24.16b}, [\rk], #64
ld1 {v25.16b-v28.16b}, [\rk], #64
ld1 {v29.16b-v31.16b}, [\rk]
.endm
/* prepare for encryption with key in rk[] */
.macro enc_prepare, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
/* prepare for encryption (again) but with new key in rk[] */
.macro enc_switch_key, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
/* prepare for decryption with key in rk[] */
.macro dec_prepare, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b
aes\mc \i0\().16b, \i0\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
aes\mc \i1\().16b, \i1\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\mc \i2\().16b, \i2\().16b
aes\de \i3\().16b, \k\().16b
aes\mc \i3\().16b, \i3\().16b
.endif
.endif
.endm
/* up to 4 interleaved encryption rounds with the same round key */
.macro round_Nx, enc, k, i0, i1, i2, i3
.ifc \enc, e
do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
.else
do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
.endif
.endm
/* up to 4 interleaved final rounds */
.macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\de \i3\().16b, \k\().16b
.endif
.endif
eor \i0\().16b, \i0\().16b, \k2\().16b
.ifnb \i1
eor \i1\().16b, \i1\().16b, \k2\().16b
.ifnb \i3
eor \i2\().16b, \i2\().16b, \k2\().16b
eor \i3\().16b, \i3\().16b, \k2\().16b
.endif
.endif
.endm
/* up to 4 interleaved blocks */
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
round_Nx \enc, v17, \i0, \i1, \i2, \i3
round_Nx \enc, v18, \i0, \i1, \i2, \i3
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
round_Nx \enc, v20, \i0, \i1, \i2, \i3
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
round_Nx \enc, \key, \i0, \i1, \i2, \i3
.endr
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
.endm
.macro encrypt_block, in, rounds, t0, t1, t2
do_block_Nx e, \rounds, \in
.endm
.macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1
.endm
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.endm
.macro decrypt_block, in, rounds, t0, t1, t2
do_block_Nx d, \rounds, \in
.endm
.macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1
.endm
.macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
.endm
.text
.align 4
/*
* There are several ways to instantiate this code:
* - no interleave, all inline
* - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
* - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
* - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
* - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
*
* Macros imported by this code:
* - enc_prepare - setup NEON registers for encryption
* - dec_prepare - setup NEON registers for decryption
* - enc_switch_key - change to new key after having prepared for encryption
* - encrypt_block - encrypt a single block
* - decrypt block - decrypt a single block
* - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
* - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
* - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
* - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
*/
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP ldp x29, x30, [sp],#16
#if INTERLEAVE == 2
aes_encrypt_block2x:
encrypt_block2x v0, v1, w3, x2, x6, w7
ret
ENDPROC(aes_encrypt_block2x)
aes_decrypt_block2x:
decrypt_block2x v0, v1, w3, x2, x6, w7
ret
ENDPROC(aes_decrypt_block2x)
#elif INTERLEAVE == 4
aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
ret
ENDPROC(aes_encrypt_block4x)
aes_decrypt_block4x:
decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
ret
ENDPROC(aes_decrypt_block4x)
#else
#error INTERLEAVE should equal 2 or 4
#endif
.macro do_encrypt_block2x
bl aes_encrypt_block2x
.endm
.macro do_decrypt_block2x
bl aes_decrypt_block2x
.endm
.macro do_encrypt_block4x
bl aes_encrypt_block4x
.endm
.macro do_decrypt_block4x
bl aes_decrypt_block4x
.endm
#else
#define FRAME_PUSH
#define FRAME_POP
.macro do_encrypt_block2x
encrypt_block2x v0, v1, w3, x2, x6, w7
.endm
.macro do_decrypt_block2x
decrypt_block2x v0, v1, w3, x2, x6, w7
.endm
.macro do_encrypt_block4x
encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
.endm
.macro do_decrypt_block4x
decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
.endm
#endif
/*
* uint32_t ce_aes_sub(uint32_t in) - use the aese instruction to
* perform the AES sbox substitution on each byte in 'input'
*/
ENTRY(ce_aes_sub)
dup v1.4s, w0
movi v0.16b, #0
aese v0.16b, v1.16b
umov w0, v0.s[0]
ret
ENDPROC(ce_aes_sub)
/*
* void ce_aes_invert(void *dst, void *src)
*/
ENTRY(ce_aes_invert)
ld1 {v0.16b}, [x1]
aesimc v1.16b, v0.16b
st1 {v1.16b}, [x0]
ret
ENDPROC(ce_aes_invert)
/*
* ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
* int rounds, int blocks, int first)
* ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
* int rounds, int blocks, int first)
*/
ENTRY(ce_aes_ecb_encrypt)
FRAME_PUSH
cbz w5, .LecbencloopNx
enc_prepare w3, x2, x5
.LecbencloopNx:
#if INTERLEAVE >= 2
subs w4, w4, #INTERLEAVE
bmi .Lecbenc1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
do_encrypt_block2x
st1 {v0.16b-v1.16b}, [x0], #32
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
do_encrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64
#endif
b .LecbencloopNx
.Lecbenc1x:
adds w4, w4, #INTERLEAVE
beq .Lecbencout
#endif
.Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
encrypt_block v0, w3, x2, x5, w6
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lecbencloop
.Lecbencout:
FRAME_POP
ret
ENDPROC(ce_aes_ecb_encrypt)
ENTRY(ce_aes_ecb_decrypt)
FRAME_PUSH
cbz w5, .LecbdecloopNx
dec_prepare w3, x2, x5
.LecbdecloopNx:
#if INTERLEAVE >= 2
subs w4, w4, #INTERLEAVE
bmi .Lecbdec1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
do_decrypt_block2x
st1 {v0.16b-v1.16b}, [x0], #32
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
do_decrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64
#endif
b .LecbdecloopNx
.Lecbdec1x:
adds w4, w4, #INTERLEAVE
beq .Lecbdecout
#endif
.Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */
decrypt_block v0, w3, x2, x5, w6
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lecbdecloop
.Lecbdecout:
FRAME_POP
ret
ENDPROC(ce_aes_ecb_decrypt)
/*
* aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
* aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
ENTRY(ce_aes_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
.Lcbcencloop4x:
subs w4, w4, #4
bmi .Lcbcenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b
encrypt_block v1, w3, x2, x6, w7
eor v2.16b, v2.16b, v1.16b
encrypt_block v2, w3, x2, x6, w7
eor v3.16b, v3.16b, v2.16b
encrypt_block v3, w3, x2, x6, w7
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v3.16b
b .Lcbcencloop4x
.Lcbcenc1x:
adds w4, w4, #4
beq .Lcbcencout
.Lcbcencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
encrypt_block v4, w3, x2, x6, w7
st1 {v4.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcencloop
.Lcbcencout:
st1 {v4.16b}, [x5] /* return iv */
ret
ENDPROC(ce_aes_cbc_encrypt)
ENTRY(ce_aes_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v7.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
.LcbcdecloopNx:
subs w4, w4, #4
bmi .Lcbcdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
bl aes_decrypt_block4x
sub x1, x1, #16
eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b
ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
b .LcbcdecloopNx
.Lcbcdec1x:
adds w4, w4, #4
beq .Lcbcdecout
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
st1 {v7.16b}, [x5] /* return iv */
ldp x29, x30, [sp], #16
ret
ENDPROC(ce_aes_cbc_decrypt)
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 ctr[], int first)
*/
ENTRY(ce_aes_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x6
ld1 {v4.16b}, [x5]
umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx:
subs w4, w4, #4
bmi .Lctr1x
add w7, w6, #1
mov v0.16b, v4.16b
add w8, w6, #2
mov v1.16b, v4.16b
add w9, w6, #3
mov v2.16b, v4.16b
rev w7, w7
mov v3.16b, v4.16b
rev w8, w8
mov v1.s[3], w7
rev w9, w9
mov v2.s[3], w8
mov v3.s[3], w9
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b
ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
st1 {v0.16b-v3.16b}, [x0], #64
add x6, x6, #4
rev x7, x6
ins v4.d[1], x7
cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
adds w4, w4, #4
beq .Lctrout
.Lctrloop:
mov v0.16b, v4.16b
encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
ins v4.d[1], x7
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x0], #16
bne .Lctrloop
.Lctrout:
st1 {v4.16b}, [x5] /* return next CTR value */
ldp x29, x30, [sp], #16
ret
.Lctrtailblock:
st1 {v0.16b}, [x0]
ldp x29, x30, [sp], #16
ret
.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
rev x7, x7 /* ... to handle the carry */
add x7, x7, #1
rev x7, x7
ins v4.d[0], x7
b .Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
.ltorg
/*
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int blocks, u8 const rk2[], u8 iv[])
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int blocks, u8 const rk2[], u8 iv[])
*/
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
and \tmp\().16b, \tmp\().16b, \const\().16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
.Lxts_mul_x:
.word 1, 0, 0x87, 0
ENTRY(ce_aes_xts_encrypt)
FRAME_PUSH
ld1 {v4.16b}, [x6]
enc_prepare w3, x5, x6
encrypt_block v4, w3, x5, x6, w7 /* first tweak */
enc_switch_key w3, x2, x6
ldr q7, .Lxts_mul_x
b .LxtsencNx
.LxtsencloopNx:
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
subs w4, w4, #INTERLEAVE
bmi .Lxtsenc1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
do_encrypt_block2x
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
st1 {v0.16b-v1.16b}, [x0], #32
cbz w4, .LxtsencoutNx
next_tweak v4, v5, v7, v8
b .LxtsencNx
.LxtsencoutNx:
mov v4.16b, v5.16b
b .Lxtsencout
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v7, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b
do_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsencout
b .LxtsencloopNx
#endif
.Lxtsenc1x:
adds w4, w4, #INTERLEAVE
beq .Lxtsencout
#endif
.Lxtsencloop:
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
encrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsencout
next_tweak v4, v4, v7, v8
b .Lxtsencloop
.Lxtsencout:
next_tweak v4, v4, v7, v8
st1 {v4.16b}, [x6], #16
FRAME_POP
ret
ENDPROC(ce_aes_xts_encrypt)
ENTRY(ce_aes_xts_decrypt)
FRAME_PUSH
ld1 {v4.16b}, [x6]
enc_prepare w3, x5, x6
encrypt_block v4, w3, x5, x6, w7 /* first tweak */
dec_prepare w3, x2, x6
ldr q7, .Lxts_mul_x
b .LxtsdecNx
.LxtsdecloopNx:
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
subs w4, w4, #INTERLEAVE
bmi .Lxtsdec1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
do_decrypt_block2x
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
st1 {v0.16b-v1.16b}, [x0], #32
cbz w4, .LxtsdecoutNx
next_tweak v4, v5, v7, v8
b .LxtsdecNx
.LxtsdecoutNx:
mov v4.16b, v5.16b
b .Lxtsdecout
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v7, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b
do_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsdecout
b .LxtsdecloopNx
#endif
.Lxtsdec1x:
adds w4, w4, #INTERLEAVE
beq .Lxtsdecout
#endif
.Lxtsdecloop:
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsdecout
next_tweak v4, v4, v7, v8
b .Lxtsdecloop
.Lxtsdecout:
FRAME_POP
next_tweak v4, v4, v7, v8
st1 {v4.16b}, [x6], #16
ret
ENDPROC(ce_aes_xts_decrypt)