| /* |
| * AVX2 implementation of MORUS-1280 |
| * |
| * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> |
| * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. |
| * |
| * This program is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 as published |
| * by the Free Software Foundation. |
| */ |
| |
| #include <linux/linkage.h> |
| #include <asm/frame.h> |
| |
| #define SHUFFLE_MASK(i0, i1, i2, i3) \ |
| (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) |
| |
| #define MASK1 SHUFFLE_MASK(3, 0, 1, 2) |
| #define MASK2 SHUFFLE_MASK(2, 3, 0, 1) |
| #define MASK3 SHUFFLE_MASK(1, 2, 3, 0) |
| |
| #define STATE0 %ymm0 |
| #define STATE0_LOW %xmm0 |
| #define STATE1 %ymm1 |
| #define STATE2 %ymm2 |
| #define STATE3 %ymm3 |
| #define STATE4 %ymm4 |
| #define KEY %ymm5 |
| #define MSG %ymm5 |
| #define MSG_LOW %xmm5 |
| #define T0 %ymm6 |
| #define T0_LOW %xmm6 |
| #define T1 %ymm7 |
| |
| .section .rodata.cst32.morus1280_const, "aM", @progbits, 32 |
| .align 32 |
| .Lmorus1280_const: |
| .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d |
| .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 |
| .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 |
| .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd |
| |
| .section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 |
| .align 32 |
| .Lmorus1280_counter: |
| .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 |
| .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f |
| .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 |
| .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f |
| |
| .text |
| |
| .macro morus1280_round s0, s1, s2, s3, s4, b, w |
| vpand \s1, \s2, T0 |
| vpxor T0, \s0, \s0 |
| vpxor \s3, \s0, \s0 |
| vpsllq $\b, \s0, T0 |
| vpsrlq $(64 - \b), \s0, \s0 |
| vpxor T0, \s0, \s0 |
| vpermq $\w, \s3, \s3 |
| .endm |
| |
| /* |
| * __morus1280_update: internal ABI |
| * input: |
| * STATE[0-4] - input state |
| * MSG - message block |
| * output: |
| * STATE[0-4] - output state |
| * changed: |
| * T0 |
| */ |
| __morus1280_update: |
| morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 |
| vpxor MSG, STATE1, STATE1 |
| morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 |
| vpxor MSG, STATE2, STATE2 |
| morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 |
| vpxor MSG, STATE3, STATE3 |
| morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 |
| vpxor MSG, STATE4, STATE4 |
| morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 |
| ret |
| ENDPROC(__morus1280_update) |
| |
| /* |
| * __morus1280_update_zero: internal ABI |
| * input: |
| * STATE[0-4] - input state |
| * output: |
| * STATE[0-4] - output state |
| * changed: |
| * T0 |
| */ |
| __morus1280_update_zero: |
| morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 |
| morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 |
| morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 |
| morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 |
| morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 |
| ret |
| ENDPROC(__morus1280_update_zero) |
| |
| /* |
| * __load_partial: internal ABI |
| * input: |
| * %rsi - src |
| * %rcx - bytes |
| * output: |
| * MSG - message block |
| * changed: |
| * %r8 |
| * %r9 |
| */ |
| __load_partial: |
| xor %r9d, %r9d |
| vpxor MSG, MSG, MSG |
| |
| mov %rcx, %r8 |
| and $0x1, %r8 |
| jz .Lld_partial_1 |
| |
| mov %rcx, %r8 |
| and $0x1E, %r8 |
| add %rsi, %r8 |
| mov (%r8), %r9b |
| |
| .Lld_partial_1: |
| mov %rcx, %r8 |
| and $0x2, %r8 |
| jz .Lld_partial_2 |
| |
| mov %rcx, %r8 |
| and $0x1C, %r8 |
| add %rsi, %r8 |
| shl $16, %r9 |
| mov (%r8), %r9w |
| |
| .Lld_partial_2: |
| mov %rcx, %r8 |
| and $0x4, %r8 |
| jz .Lld_partial_4 |
| |
| mov %rcx, %r8 |
| and $0x18, %r8 |
| add %rsi, %r8 |
| shl $32, %r9 |
| mov (%r8), %r8d |
| xor %r8, %r9 |
| |
| .Lld_partial_4: |
| movq %r9, MSG_LOW |
| |
| mov %rcx, %r8 |
| and $0x8, %r8 |
| jz .Lld_partial_8 |
| |
| mov %rcx, %r8 |
| and $0x10, %r8 |
| add %rsi, %r8 |
| pshufd $MASK2, MSG_LOW, MSG_LOW |
| pinsrq $0, (%r8), MSG_LOW |
| |
| .Lld_partial_8: |
| mov %rcx, %r8 |
| and $0x10, %r8 |
| jz .Lld_partial_16 |
| |
| vpermq $MASK2, MSG, MSG |
| movdqu (%rsi), MSG_LOW |
| |
| .Lld_partial_16: |
| ret |
| ENDPROC(__load_partial) |
| |
| /* |
| * __store_partial: internal ABI |
| * input: |
| * %rdx - dst |
| * %rcx - bytes |
| * output: |
| * T0 - message block |
| * changed: |
| * %r8 |
| * %r9 |
| * %r10 |
| */ |
| __store_partial: |
| mov %rcx, %r8 |
| mov %rdx, %r9 |
| |
| cmp $16, %r8 |
| jl .Lst_partial_16 |
| |
| movdqu T0_LOW, (%r9) |
| vpermq $MASK2, T0, T0 |
| |
| sub $16, %r8 |
| add $16, %r9 |
| |
| .Lst_partial_16: |
| movq T0_LOW, %r10 |
| |
| cmp $8, %r8 |
| jl .Lst_partial_8 |
| |
| mov %r10, (%r9) |
| pextrq $1, T0_LOW, %r10 |
| |
| sub $8, %r8 |
| add $8, %r9 |
| |
| .Lst_partial_8: |
| cmp $4, %r8 |
| jl .Lst_partial_4 |
| |
| mov %r10d, (%r9) |
| shr $32, %r10 |
| |
| sub $4, %r8 |
| add $4, %r9 |
| |
| .Lst_partial_4: |
| cmp $2, %r8 |
| jl .Lst_partial_2 |
| |
| mov %r10w, (%r9) |
| shr $16, %r10 |
| |
| sub $2, %r8 |
| add $2, %r9 |
| |
| .Lst_partial_2: |
| cmp $1, %r8 |
| jl .Lst_partial_1 |
| |
| mov %r10b, (%r9) |
| |
| .Lst_partial_1: |
| ret |
| ENDPROC(__store_partial) |
| |
| /* |
| * void crypto_morus1280_avx2_init(void *state, const void *key, |
| * const void *iv); |
| */ |
| ENTRY(crypto_morus1280_avx2_init) |
| FRAME_BEGIN |
| |
| /* load IV: */ |
| vpxor STATE0, STATE0, STATE0 |
| movdqu (%rdx), STATE0_LOW |
| /* load key: */ |
| vmovdqu (%rsi), KEY |
| vmovdqa KEY, STATE1 |
| /* load all ones: */ |
| vpcmpeqd STATE2, STATE2, STATE2 |
| /* load all zeros: */ |
| vpxor STATE3, STATE3, STATE3 |
| /* load the constant: */ |
| vmovdqa .Lmorus1280_const, STATE4 |
| |
| /* update 16 times with zero: */ |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| call __morus1280_update_zero |
| |
| /* xor-in the key again after updates: */ |
| vpxor KEY, STATE1, STATE1 |
| |
| /* store the state: */ |
| vmovdqu STATE0, (0 * 32)(%rdi) |
| vmovdqu STATE1, (1 * 32)(%rdi) |
| vmovdqu STATE2, (2 * 32)(%rdi) |
| vmovdqu STATE3, (3 * 32)(%rdi) |
| vmovdqu STATE4, (4 * 32)(%rdi) |
| |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_init) |
| |
| /* |
| * void crypto_morus1280_avx2_ad(void *state, const void *data, |
| * unsigned int length); |
| */ |
| ENTRY(crypto_morus1280_avx2_ad) |
| FRAME_BEGIN |
| |
| cmp $32, %rdx |
| jb .Lad_out |
| |
| /* load the state: */ |
| vmovdqu (0 * 32)(%rdi), STATE0 |
| vmovdqu (1 * 32)(%rdi), STATE1 |
| vmovdqu (2 * 32)(%rdi), STATE2 |
| vmovdqu (3 * 32)(%rdi), STATE3 |
| vmovdqu (4 * 32)(%rdi), STATE4 |
| |
| mov %rsi, %r8 |
| and $0x1F, %r8 |
| jnz .Lad_u_loop |
| |
| .align 4 |
| .Lad_a_loop: |
| vmovdqa (%rsi), MSG |
| call __morus1280_update |
| sub $32, %rdx |
| add $32, %rsi |
| cmp $32, %rdx |
| jge .Lad_a_loop |
| |
| jmp .Lad_cont |
| .align 4 |
| .Lad_u_loop: |
| vmovdqu (%rsi), MSG |
| call __morus1280_update |
| sub $32, %rdx |
| add $32, %rsi |
| cmp $32, %rdx |
| jge .Lad_u_loop |
| |
| .Lad_cont: |
| /* store the state: */ |
| vmovdqu STATE0, (0 * 32)(%rdi) |
| vmovdqu STATE1, (1 * 32)(%rdi) |
| vmovdqu STATE2, (2 * 32)(%rdi) |
| vmovdqu STATE3, (3 * 32)(%rdi) |
| vmovdqu STATE4, (4 * 32)(%rdi) |
| |
| .Lad_out: |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_ad) |
| |
| /* |
| * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, |
| * unsigned int length); |
| */ |
| ENTRY(crypto_morus1280_avx2_enc) |
| FRAME_BEGIN |
| |
| cmp $32, %rcx |
| jb .Lenc_out |
| |
| /* load the state: */ |
| vmovdqu (0 * 32)(%rdi), STATE0 |
| vmovdqu (1 * 32)(%rdi), STATE1 |
| vmovdqu (2 * 32)(%rdi), STATE2 |
| vmovdqu (3 * 32)(%rdi), STATE3 |
| vmovdqu (4 * 32)(%rdi), STATE4 |
| |
| mov %rsi, %r8 |
| or %rdx, %r8 |
| and $0x1F, %r8 |
| jnz .Lenc_u_loop |
| |
| .align 4 |
| .Lenc_a_loop: |
| vmovdqa (%rsi), MSG |
| vmovdqa MSG, T0 |
| vpxor STATE0, T0, T0 |
| vpermq $MASK3, STATE1, T1 |
| vpxor T1, T0, T0 |
| vpand STATE2, STATE3, T1 |
| vpxor T1, T0, T0 |
| vmovdqa T0, (%rdx) |
| |
| call __morus1280_update |
| sub $32, %rcx |
| add $32, %rsi |
| add $32, %rdx |
| cmp $32, %rcx |
| jge .Lenc_a_loop |
| |
| jmp .Lenc_cont |
| .align 4 |
| .Lenc_u_loop: |
| vmovdqu (%rsi), MSG |
| vmovdqa MSG, T0 |
| vpxor STATE0, T0, T0 |
| vpermq $MASK3, STATE1, T1 |
| vpxor T1, T0, T0 |
| vpand STATE2, STATE3, T1 |
| vpxor T1, T0, T0 |
| vmovdqu T0, (%rdx) |
| |
| call __morus1280_update |
| sub $32, %rcx |
| add $32, %rsi |
| add $32, %rdx |
| cmp $32, %rcx |
| jge .Lenc_u_loop |
| |
| .Lenc_cont: |
| /* store the state: */ |
| vmovdqu STATE0, (0 * 32)(%rdi) |
| vmovdqu STATE1, (1 * 32)(%rdi) |
| vmovdqu STATE2, (2 * 32)(%rdi) |
| vmovdqu STATE3, (3 * 32)(%rdi) |
| vmovdqu STATE4, (4 * 32)(%rdi) |
| |
| .Lenc_out: |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_enc) |
| |
| /* |
| * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, |
| * unsigned int length); |
| */ |
| ENTRY(crypto_morus1280_avx2_enc_tail) |
| FRAME_BEGIN |
| |
| /* load the state: */ |
| vmovdqu (0 * 32)(%rdi), STATE0 |
| vmovdqu (1 * 32)(%rdi), STATE1 |
| vmovdqu (2 * 32)(%rdi), STATE2 |
| vmovdqu (3 * 32)(%rdi), STATE3 |
| vmovdqu (4 * 32)(%rdi), STATE4 |
| |
| /* encrypt message: */ |
| call __load_partial |
| |
| vmovdqa MSG, T0 |
| vpxor STATE0, T0, T0 |
| vpermq $MASK3, STATE1, T1 |
| vpxor T1, T0, T0 |
| vpand STATE2, STATE3, T1 |
| vpxor T1, T0, T0 |
| |
| call __store_partial |
| |
| call __morus1280_update |
| |
| /* store the state: */ |
| vmovdqu STATE0, (0 * 32)(%rdi) |
| vmovdqu STATE1, (1 * 32)(%rdi) |
| vmovdqu STATE2, (2 * 32)(%rdi) |
| vmovdqu STATE3, (3 * 32)(%rdi) |
| vmovdqu STATE4, (4 * 32)(%rdi) |
| |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_enc_tail) |
| |
| /* |
| * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, |
| * unsigned int length); |
| */ |
| ENTRY(crypto_morus1280_avx2_dec) |
| FRAME_BEGIN |
| |
| cmp $32, %rcx |
| jb .Ldec_out |
| |
| /* load the state: */ |
| vmovdqu (0 * 32)(%rdi), STATE0 |
| vmovdqu (1 * 32)(%rdi), STATE1 |
| vmovdqu (2 * 32)(%rdi), STATE2 |
| vmovdqu (3 * 32)(%rdi), STATE3 |
| vmovdqu (4 * 32)(%rdi), STATE4 |
| |
| mov %rsi, %r8 |
| or %rdx, %r8 |
| and $0x1F, %r8 |
| jnz .Ldec_u_loop |
| |
| .align 4 |
| .Ldec_a_loop: |
| vmovdqa (%rsi), MSG |
| vpxor STATE0, MSG, MSG |
| vpermq $MASK3, STATE1, T0 |
| vpxor T0, MSG, MSG |
| vpand STATE2, STATE3, T0 |
| vpxor T0, MSG, MSG |
| vmovdqa MSG, (%rdx) |
| |
| call __morus1280_update |
| sub $32, %rcx |
| add $32, %rsi |
| add $32, %rdx |
| cmp $32, %rcx |
| jge .Ldec_a_loop |
| |
| jmp .Ldec_cont |
| .align 4 |
| .Ldec_u_loop: |
| vmovdqu (%rsi), MSG |
| vpxor STATE0, MSG, MSG |
| vpermq $MASK3, STATE1, T0 |
| vpxor T0, MSG, MSG |
| vpand STATE2, STATE3, T0 |
| vpxor T0, MSG, MSG |
| vmovdqu MSG, (%rdx) |
| |
| call __morus1280_update |
| sub $32, %rcx |
| add $32, %rsi |
| add $32, %rdx |
| cmp $32, %rcx |
| jge .Ldec_u_loop |
| |
| .Ldec_cont: |
| /* store the state: */ |
| vmovdqu STATE0, (0 * 32)(%rdi) |
| vmovdqu STATE1, (1 * 32)(%rdi) |
| vmovdqu STATE2, (2 * 32)(%rdi) |
| vmovdqu STATE3, (3 * 32)(%rdi) |
| vmovdqu STATE4, (4 * 32)(%rdi) |
| |
| .Ldec_out: |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_dec) |
| |
| /* |
| * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, |
| * unsigned int length); |
| */ |
| ENTRY(crypto_morus1280_avx2_dec_tail) |
| FRAME_BEGIN |
| |
| /* load the state: */ |
| vmovdqu (0 * 32)(%rdi), STATE0 |
| vmovdqu (1 * 32)(%rdi), STATE1 |
| vmovdqu (2 * 32)(%rdi), STATE2 |
| vmovdqu (3 * 32)(%rdi), STATE3 |
| vmovdqu (4 * 32)(%rdi), STATE4 |
| |
| /* decrypt message: */ |
| call __load_partial |
| |
| vpxor STATE0, MSG, MSG |
| vpermq $MASK3, STATE1, T0 |
| vpxor T0, MSG, MSG |
| vpand STATE2, STATE3, T0 |
| vpxor T0, MSG, MSG |
| vmovdqa MSG, T0 |
| |
| call __store_partial |
| |
| /* mask with byte count: */ |
| movq %rcx, T0_LOW |
| vpbroadcastb T0_LOW, T0 |
| vmovdqa .Lmorus1280_counter, T1 |
| vpcmpgtb T1, T0, T0 |
| vpand T0, MSG, MSG |
| |
| call __morus1280_update |
| |
| /* store the state: */ |
| vmovdqu STATE0, (0 * 32)(%rdi) |
| vmovdqu STATE1, (1 * 32)(%rdi) |
| vmovdqu STATE2, (2 * 32)(%rdi) |
| vmovdqu STATE3, (3 * 32)(%rdi) |
| vmovdqu STATE4, (4 * 32)(%rdi) |
| |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_dec_tail) |
| |
| /* |
| * void crypto_morus1280_avx2_final(void *state, void *tag_xor, |
| * u64 assoclen, u64 cryptlen); |
| */ |
| ENTRY(crypto_morus1280_avx2_final) |
| FRAME_BEGIN |
| |
| /* load the state: */ |
| vmovdqu (0 * 32)(%rdi), STATE0 |
| vmovdqu (1 * 32)(%rdi), STATE1 |
| vmovdqu (2 * 32)(%rdi), STATE2 |
| vmovdqu (3 * 32)(%rdi), STATE3 |
| vmovdqu (4 * 32)(%rdi), STATE4 |
| |
| /* xor state[0] into state[4]: */ |
| vpxor STATE0, STATE4, STATE4 |
| |
| /* prepare length block: */ |
| vpxor MSG, MSG, MSG |
| vpinsrq $0, %rdx, MSG_LOW, MSG_LOW |
| vpinsrq $1, %rcx, MSG_LOW, MSG_LOW |
| vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ |
| |
| /* update state: */ |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| call __morus1280_update |
| |
| /* xor tag: */ |
| vmovdqu (%rsi), MSG |
| |
| vpxor STATE0, MSG, MSG |
| vpermq $MASK3, STATE1, T0 |
| vpxor T0, MSG, MSG |
| vpand STATE2, STATE3, T0 |
| vpxor T0, MSG, MSG |
| vmovdqu MSG, (%rsi) |
| |
| FRAME_END |
| ret |
| ENDPROC(crypto_morus1280_avx2_final) |