| /* |
| * Multi-buffer SHA1 algorithm hash compute routine |
| * |
| * This file is provided under a dual BSD/GPLv2 license. When using or |
| * redistributing this file, you may do so under either license. |
| * |
| * GPL LICENSE SUMMARY |
| * |
| * Copyright(c) 2014 Intel Corporation. |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of version 2 of the GNU General Public License as |
| * published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, but |
| * WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * Contact Information: |
| * James Guilford <james.guilford@intel.com> |
| * Tim Chen <tim.c.chen@linux.intel.com> |
| * |
| * BSD LICENSE |
| * |
| * Copyright(c) 2014 Intel Corporation. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Intel Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived |
| * from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include <linux/linkage.h> |
| #include "sha1_mb_mgr_datastruct.S" |
| |
## code to compute eight-lane (oct) SHA1 using AVX2 (256-bit ymm registers)
## outer calling routine takes care of save and restore of XMM/YMM registers
| |
## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
| ## |
| ## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 |
| ## Linux preserves: rdi rbp r8 |
| ## |
| ## clobbers ymm0-15 |
| |
| |
| # TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 |
| # "transpose" data in {r0...r7} using temps {t0...t1} |
| # Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} |
| # r0 = {a7 a6 a5 a4 a3 a2 a1 a0} |
| # r1 = {b7 b6 b5 b4 b3 b2 b1 b0} |
| # r2 = {c7 c6 c5 c4 c3 c2 c1 c0} |
| # r3 = {d7 d6 d5 d4 d3 d2 d1 d0} |
| # r4 = {e7 e6 e5 e4 e3 e2 e1 e0} |
| # r5 = {f7 f6 f5 f4 f3 f2 f1 f0} |
| # r6 = {g7 g6 g5 g4 g3 g2 g1 g0} |
| # r7 = {h7 h6 h5 h4 h3 h2 h1 h0} |
| # |
| # Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} |
| # r0 = {h0 g0 f0 e0 d0 c0 b0 a0} |
| # r1 = {h1 g1 f1 e1 d1 c1 b1 a1} |
| # r2 = {h2 g2 f2 e2 d2 c2 b2 a2} |
| # r3 = {h3 g3 f3 e3 d3 c3 b3 a3} |
| # r4 = {h4 g4 f4 e4 d4 c4 b4 a4} |
| # r5 = {h5 g5 f5 e5 d5 c5 b5 a5} |
| # r6 = {h6 g6 f6 e6 d6 c6 b6 a6} |
| # r7 = {h7 g7 f7 e7 d7 c7 b7 a7} |
| # |
| |
| .macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1 |
| # process top half (r0..r3) {a...d} |
| vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0} |
| vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2} |
| vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0} |
| vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2} |
| vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1} |
| vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2} |
| vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3} |
| vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0} |
| |
| # use r2 in place of t0 |
| # process bottom half (r4..r7) {e...h} |
| vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0} |
| vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2} |
| vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0} |
| vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2} |
| vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1} |
| vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2} |
| vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3} |
| vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0} |
| |
| vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6 |
| vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2 |
| vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5 |
| vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1 |
| vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7 |
| vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3 |
| vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4 |
| vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0 |
| |
| .endm |
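
# The transpose runs in two stages: the vshufps sequence interleaves 4x4
# blocks of 32-bit words within each 128-bit lane, then vperm2f128
# recombines the low/high lanes of the intermediate results.  After
# TRANSPOSE8, output register N holds word N from all eight input
# registers, i.e. the lane-interleaved layout the round code expects.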
| ## |
| ## Magic functions defined in FIPS 180-1 |
| ## |
| # macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D))) |
| .macro MAGIC_F0 regF regB regC regD regT |
| vpxor \regD, \regC, \regF |
| vpand \regB, \regF, \regF |
| vpxor \regD, \regF, \regF |
| .endm |
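
# MAGIC_F0 is the SHA-1 Ch() function, (B & C) | (~B & D), computed in the
# equivalent three-instruction form D ^ (B & (C ^ D)) to avoid a NOT.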
| |
| # macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D) |
| .macro MAGIC_F1 regF regB regC regD regT |
| vpxor \regC, \regD, \regF |
| vpxor \regB, \regF, \regF |
| .endm |
| |
| # macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D)) |
| .macro MAGIC_F2 regF regB regC regD regT |
| vpor \regC, \regB, \regF |
| vpand \regC, \regB, \regT |
| vpand \regD, \regF, \regF |
| vpor \regT, \regF, \regF |
| .endm |
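
# MAGIC_F2 is the SHA-1 Maj() function.  The four instructions compute
# (B & C) | ((B | C) & D), which is algebraically equal to
# (B & C) | (B & D) | (C & D).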
| |
| # macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D) |
| .macro MAGIC_F3 regF regB regC regD regT |
| MAGIC_F1 \regF,\regB,\regC,\regD,\regT |
| .endm |
| |
| # PROLD reg, imm, tmp |
| .macro PROLD reg imm tmp |
| vpsrld $(32-\imm), \reg, \tmp |
| vpslld $\imm, \reg, \reg |
| vpor \tmp, \reg, \reg |
| .endm |
| |
| .macro PROLD_nd reg imm tmp src |
| vpsrld $(32-\imm), \src, \tmp |
| vpslld $\imm, \src, \reg |
| vpor \tmp, \reg, \reg |
| .endm |
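
# Both macros implement a 32-bit rotate-left by \imm on every dword lane,
# roughly x = (x << imm) | (x >> (32 - imm)).  PROLD rotates \reg in
# place; PROLD_nd (non-destructive) writes the rotation of \src into \reg.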
| |
| .macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC |
| vpaddd \immCNT, \regE, \regE |
| vpaddd \memW*32(%rsp), \regE, \regE |
| PROLD_nd \regT, 5, \regF, \regA |
| vpaddd \regT, \regE, \regE |
| \MAGIC \regF, \regB, \regC, \regD, \regT |
| PROLD \regB, 30, \regT |
| vpaddd \regF, \regE, \regE |
| .endm |
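
# Rough per-lane C equivalent of one 00..15 round (illustrative only,
# applied to all 8 lanes at once):
#   E += K + W[i] + rol32(A, 5) + MAGIC(B, C, D);
#   B  = rol32(B, 30);
# The caller then renames the working variables with ROTATE_ARGS.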
| |
| .macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC |
| vpaddd \immCNT, \regE, \regE |
| offset = ((\memW - 14) & 15) * 32 |
| vmovdqu offset(%rsp), W14 |
| vpxor W14, W16, W16 |
| offset = ((\memW - 8) & 15) * 32 |
| vpxor offset(%rsp), W16, W16 |
| offset = ((\memW - 3) & 15) * 32 |
| vpxor offset(%rsp), W16, W16 |
| vpsrld $(32-1), W16, \regF |
| vpslld $1, W16, W16 |
| vpor W16, \regF, \regF |
| |
| ROTATE_W |
| |
| offset = ((\memW - 0) & 15) * 32 |
| vmovdqu \regF, offset(%rsp) |
| vpaddd \regF, \regE, \regE |
| PROLD_nd \regT, 5, \regF, \regA |
| vpaddd \regT, \regE, \regE |
| \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D) |
| PROLD \regB,30, \regT |
| vpaddd \regF, \regE, \regE |
| .endm |
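
# Rounds 16..79 extend the message schedule in place, keeping a 16-entry
# circular buffer of 32-byte (8-lane) words on the stack:
#   W[i % 16] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
# W16 and W15 cache W[i-16] and W[i-15] in registers across rounds via
# ROTATE_W, so only three of the four terms are reloaded from the stack.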
| |
| ######################################################################## |
| ######################################################################## |
| ######################################################################## |
| |
| ## FRAMESZ plus pushes must be an odd multiple of 8 |
| YMM_SAVE = (15-15)*32 |
| FRAMESZ = 32*16 + YMM_SAVE |
| _YMM = FRAMESZ - YMM_SAVE |
| |
| #define VMOVPS vmovups |
| |
| IDX = %rax |
| inp0 = %r9 |
| inp1 = %r10 |
| inp2 = %r11 |
| inp3 = %r12 |
| inp4 = %r13 |
| inp5 = %r14 |
| inp6 = %r15 |
| inp7 = %rcx |
| arg1 = %rdi |
| arg2 = %rsi |
| RSP_SAVE = %rdx |
| |
| # ymm0 A |
| # ymm1 B |
| # ymm2 C |
| # ymm3 D |
| # ymm4 E |
| # ymm5 F AA |
| # ymm6 T0 BB |
| # ymm7 T1 CC |
| # ymm8 T2 DD |
| # ymm9 T3 EE |
| # ymm10 T4 TMP |
| # ymm11 T5 FUN |
| # ymm12 T6 K |
| # ymm13 T7 W14 |
| # ymm14 T8 W15 |
| # ymm15 T9 W16 |
| |
| |
| A = %ymm0 |
| B = %ymm1 |
| C = %ymm2 |
| D = %ymm3 |
| E = %ymm4 |
| F = %ymm5 |
| T0 = %ymm6 |
| T1 = %ymm7 |
| T2 = %ymm8 |
| T3 = %ymm9 |
| T4 = %ymm10 |
| T5 = %ymm11 |
| T6 = %ymm12 |
| T7 = %ymm13 |
| T8 = %ymm14 |
| T9 = %ymm15 |
| |
| AA = %ymm5 |
| BB = %ymm6 |
| CC = %ymm7 |
| DD = %ymm8 |
| EE = %ymm9 |
| TMP = %ymm10 |
| FUN = %ymm11 |
| K = %ymm12 |
| W14 = %ymm13 |
| W15 = %ymm14 |
| W16 = %ymm15 |
| |
| .macro ROTATE_ARGS |
| TMP_ = E |
| E = D |
| D = C |
| C = B |
| B = A |
| A = TMP_ |
| .endm |
| |
| .macro ROTATE_W |
| TMP_ = W16 |
| W16 = W15 |
| W15 = W14 |
| W14 = TMP_ |
| .endm |
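
# ROTATE_ARGS and ROTATE_W move no data: they rebind the assembler symbols
# (A..E, W14..W16) to different ymm registers, so each round's "shift" of
# the working variables costs zero instructions.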
| |
# 8 streams x 5 32-bit words per digest x 4 bytes per word
| #define DIGEST_SIZE (8*5*4) |
| |
| .align 32 |
| |
# void sha1_x8_avx2(void *args, UINT32 size)
# arg 1 : pointer to the argument structure (layout per sha1_mb_mgr_datastruct.S):
#         the 8 interleaved SHA1 digest lanes followed by an array[8] of
#         pointers to the input data, one per lane
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
| ENTRY(sha1_x8_avx2) |
| |
| # save callee-saved clobbered registers to comply with C function ABI |
| push %r12 |
| push %r13 |
| push %r14 |
| push %r15 |
| |
# save rsp
| mov %rsp, RSP_SAVE |
| sub $FRAMESZ, %rsp |
| |
# align rsp to 32 bytes
| and $~0x1F, %rsp |
| |
| ## Initialize digests |
| vmovdqu 0*32(arg1), A |
| vmovdqu 1*32(arg1), B |
| vmovdqu 2*32(arg1), C |
| vmovdqu 3*32(arg1), D |
| vmovdqu 4*32(arg1), E |
| |
| ## transpose input onto stack |
| mov _data_ptr+0*8(arg1),inp0 |
| mov _data_ptr+1*8(arg1),inp1 |
| mov _data_ptr+2*8(arg1),inp2 |
| mov _data_ptr+3*8(arg1),inp3 |
| mov _data_ptr+4*8(arg1),inp4 |
| mov _data_ptr+5*8(arg1),inp5 |
| mov _data_ptr+6*8(arg1),inp6 |
| mov _data_ptr+7*8(arg1),inp7 |
| |
| xor IDX, IDX |
| lloop: |
| vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F |
| I=0 |
| .rep 2 |
| VMOVPS (inp0, IDX), T0 |
| VMOVPS (inp1, IDX), T1 |
| VMOVPS (inp2, IDX), T2 |
| VMOVPS (inp3, IDX), T3 |
| VMOVPS (inp4, IDX), T4 |
| VMOVPS (inp5, IDX), T5 |
| VMOVPS (inp6, IDX), T6 |
| VMOVPS (inp7, IDX), T7 |
| |
| TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 |
| vpshufb F, T0, T0 |
| vmovdqu T0, (I*8)*32(%rsp) |
| vpshufb F, T1, T1 |
| vmovdqu T1, (I*8+1)*32(%rsp) |
| vpshufb F, T2, T2 |
| vmovdqu T2, (I*8+2)*32(%rsp) |
| vpshufb F, T3, T3 |
| vmovdqu T3, (I*8+3)*32(%rsp) |
| vpshufb F, T4, T4 |
| vmovdqu T4, (I*8+4)*32(%rsp) |
| vpshufb F, T5, T5 |
| vmovdqu T5, (I*8+5)*32(%rsp) |
| vpshufb F, T6, T6 |
| vmovdqu T6, (I*8+6)*32(%rsp) |
| vpshufb F, T7, T7 |
| vmovdqu T7, (I*8+7)*32(%rsp) |
| add $32, IDX |
| I = (I+1) |
| .endr |
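
# At this point the 16 x 32-byte area at (%rsp) holds the current block's
# 16 message words for all eight streams: entry i is a 32-byte row
# containing word i of lanes 0..7, already byte-swapped to big-endian.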
| # save old digests |
| vmovdqu A,AA |
| vmovdqu B,BB |
| vmovdqu C,CC |
| vmovdqu D,DD |
| vmovdqu E,EE |
| |
| ## |
| ## perform 0-79 steps |
| ## |
| vmovdqu K00_19(%rip), K |
| ## do rounds 0...15 |
| I = 0 |
| .rep 16 |
| SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 |
| ROTATE_ARGS |
| I = (I+1) |
| .endr |
| |
| ## do rounds 16...19 |
| vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16 |
| vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15 |
| .rep 4 |
| SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 |
| ROTATE_ARGS |
| I = (I+1) |
| .endr |
| |
| ## do rounds 20...39 |
| vmovdqu K20_39(%rip), K |
| .rep 20 |
| SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 |
| ROTATE_ARGS |
| I = (I+1) |
| .endr |
| |
| ## do rounds 40...59 |
| vmovdqu K40_59(%rip), K |
| .rep 20 |
| SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 |
| ROTATE_ARGS |
| I = (I+1) |
| .endr |
| |
| ## do rounds 60...79 |
| vmovdqu K60_79(%rip), K |
| .rep 20 |
| SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 |
| ROTATE_ARGS |
| I = (I+1) |
| .endr |
| |
| vpaddd AA,A,A |
| vpaddd BB,B,B |
| vpaddd CC,C,C |
| vpaddd DD,D,D |
| vpaddd EE,E,E |
| |
| sub $1, arg2 |
| jne lloop |
| |
| # write out digests |
| vmovdqu A, 0*32(arg1) |
| vmovdqu B, 1*32(arg1) |
| vmovdqu C, 2*32(arg1) |
| vmovdqu D, 3*32(arg1) |
| vmovdqu E, 4*32(arg1) |
| |
| # update input pointers |
| add IDX, inp0 |
| add IDX, inp1 |
| add IDX, inp2 |
| add IDX, inp3 |
| add IDX, inp4 |
| add IDX, inp5 |
| add IDX, inp6 |
| add IDX, inp7 |
mov inp0, _data_ptr + 0*8(arg1)
| mov inp1, _data_ptr + 1*8(arg1) |
| mov inp2, _data_ptr + 2*8(arg1) |
| mov inp3, _data_ptr + 3*8(arg1) |
| mov inp4, _data_ptr + 4*8(arg1) |
| mov inp5, _data_ptr + 5*8(arg1) |
| mov inp6, _data_ptr + 6*8(arg1) |
| mov inp7, _data_ptr + 7*8(arg1) |
| |
| ################ |
| ## Postamble |
| |
| mov RSP_SAVE, %rsp |
| |
| # restore callee-saved clobbered registers |
| pop %r15 |
| pop %r14 |
| pop %r13 |
| pop %r12 |
| |
| ret |
| ENDPROC(sha1_x8_avx2) |
| |
| |
| .section .rodata.cst32.K00_19, "aM", @progbits, 32 |
| .align 32 |
| K00_19: |
| .octa 0x5A8279995A8279995A8279995A827999 |
| .octa 0x5A8279995A8279995A8279995A827999 |
| |
| .section .rodata.cst32.K20_39, "aM", @progbits, 32 |
| .align 32 |
| K20_39: |
| .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 |
| .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 |
| |
| .section .rodata.cst32.K40_59, "aM", @progbits, 32 |
| .align 32 |
| K40_59: |
| .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC |
| .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC |
| |
| .section .rodata.cst32.K60_79, "aM", @progbits, 32 |
| .align 32 |
| K60_79: |
| .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 |
| .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 |
| |
| .section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 |
| .align 32 |
| PSHUFFLE_BYTE_FLIP_MASK: |
| .octa 0x0c0d0e0f08090a0b0405060700010203 |
| .octa 0x0c0d0e0f08090a0b0405060700010203 |
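
# The mask above reverses the byte order within each 32-bit word
# (vpshufb indices 3,2,1,0 per dword), converting the little-endian loads
# to the big-endian word order SHA-1 operates on.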